Scrapy exception handling

404, 503 and other error responses can be handled directly in a downloader middleware.

settings.py

DOWNLOADER_MIDDLEWARES = {
    # Priority 200: process_response/process_exception run in descending priority
    # order, so the built-in default middlewares handle the response first.
    'test_scrapy.middlewares.ProcessAllException': 200,
}

from scrapy.http import HtmlResponse
from scrapy.exceptions import IgnoreRequest


class ProcessAllException(object):

    def process_response(self, request, response, spider):
        # Handle error responses, plus the fake responses built in process_exception
        if response.status >= 400 or hasattr(response, 'exception'):
            print('process_response :', response.status, response)
            if hasattr(response, 'exception'):
                print('exception:', response.exception)
            raise IgnoreRequest(response)
        return response

    def process_exception(self, request, exception, spider):
        status_code = 400

        # Build a fake response and hand it on to process_response
        obj = HtmlResponse(url=request.url, status=status_code, request=request)

        # Carry the original exception on the fake response
        obj.exception = exception
        return obj
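
For reference, a minimal spider sketch that would exercise this middleware. The spider name, the httpbin URL and the project layout are just placeholders; the point is that when process_response raises IgnoreRequest, Scrapy calls the request's errback.

# testspider.py -- minimal sketch; URL and names are placeholders
import scrapy
from scrapy.exceptions import IgnoreRequest


class TestSpider(scrapy.Spider):
    name = 'test'

    def start_requests(self):
        # httpbin.org/status/404 is just a convenient way to force an error response
        yield scrapy.Request('https://httpbin.org/status/404',
                             callback=self.parse, errback=self.on_error)

    def parse(self, response):
        self.logger.info('got %s %s', response.status, response.url)

    def on_error(self, failure):
        # The IgnoreRequest raised in the middleware ends up here
        if failure.check(IgnoreRequest):
            self.logger.info('request ignored: %r', failure.value)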

It is convenient to merge the class above with the proxy middleware, so a dead proxy can be dropped at the same time; the version below stores proxies in Redis.

redisconn.py:

import redis
import threading


def singleton(cls):
    """Thread-safe singleton decorator using double-checked locking."""
    instance = None
    lock = threading.Lock()

    def oncall(*args, **kwargs):
        nonlocal instance
        if not instance:
            with lock:
                if not instance:
                    instance = cls(*args, **kwargs)
        return instance
    return oncall


@singleton
class RedisObject:
    def __init__(self, host="localhost", port=6379, password="", max_conn=1000):
        # One shared connection pool and one client for the whole process
        self.__pool = redis.ConnectionPool(max_connections=max_conn,
                                           host=host, port=port, password=password)
        self.__conn = redis.StrictRedis(connection_pool=self.pool())

    def pool(self):
        return self.__pool

    def conn(self):
        return self.__conn

    def __del__(self):
        self.close()

    def close(self):
        self.__pool.disconnect()
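
The middleware below expects the Redis set proxy_sets to already hold some proxy URLs. A quick sketch of seeding it through RedisObject; the import path and the proxy addresses are assumptions about the project layout, not part of the original code:

# seed_proxies.py -- sketch only; import path and proxy addresses are assumptions
from test_scrapy.spiders.redisconn import RedisObject

conn = RedisObject().conn()
conn.sadd('proxy_sets',
          'http://user:pass@10.0.0.1:8080',   # proxy with credentials
          'http://10.0.0.2:3128')             # anonymous proxy

# RedisObject is decorated with @singleton, so every call returns the same instance
assert RedisObject() is RedisObject()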

proxymiddleware.py

import random
from .spiders import redisconn
from scrapy.http import HtmlResponse
from scrapy.exceptions import IgnoreRequest
from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware


class RandomProxy(HttpProxyMiddleware):
    def __init__(self, encoding):
        super().__init__(encoding)
        self.redis = redisconn.RedisObject()
        self.conn = self.redis.conn()
        # Load the proxy pool from the Redis set once at startup
        self.proxy_list = []
        for proxy in self.conn.sscan_iter('proxy_sets', count=10):
            self.proxy_list.append(str(proxy, encoding="utf8"))

    def process_request(self, request, spider):
        if 'proxy' in request.meta or not self.proxy_list:
            # Respect an explicitly set proxy, or fall back to the default behaviour
            return super().process_request(request, spider)
        else:
            # Pick a random proxy and attach it (plus credentials, if any) to the request
            proxy = random.choice(self.proxy_list)
            creds, proxy_url = self._get_proxy(proxy, None)
            request.meta['proxy'] = proxy_url
            if creds:
                request.headers['Proxy-Authorization'] = b'Basic ' + creds

    def process_response(self, request, response, spider):
        if response.status >= 400 or hasattr(response, 'exception'):
            proxy_url = request.meta.get('proxy')
            print('process_response :', response.status, response, proxy_url)
            # Drop the failing proxy from Redis and from the in-memory pool
            if proxy_url:
                self.conn.srem('proxy_sets', proxy_url)
                if proxy_url in self.proxy_list:
                    self.proxy_list.remove(proxy_url)
            if hasattr(response, 'exception'):
                print('exception:', response.exception)
            raise IgnoreRequest(response)
        return response

    def process_exception(self, request, exception, spider):
        status_code = 400
        # Build a fake response so the exception is handled in process_response
        obj = HtmlResponse(url=request.url, status=status_code, request=request)
        obj.exception = exception
        return obj
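
To use RandomProxy it has to be registered in settings.py. A sketch, assuming proxymiddleware.py sits inside the test_scrapy package; the built-in HttpProxyMiddleware is disabled so the proxy is only set once, by the subclass:

# settings.py -- sketch; the module path is an assumption about the project layout
DOWNLOADER_MIDDLEWARES = {
    # disable the stock proxy middleware; RandomProxy (its subclass) replaces it
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None,
    # keep the low priority so the built-in middlewares see the response first
    'test_scrapy.proxymiddleware.RandomProxy': 200,
}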
