404, 503 and similar errors (as well as download exceptions) can be handled directly in a downloader middleware.
settings.py
DOWNLOADER_MIDDLEWARES = {
    # priority 200, so the other default middlewares get to process responses/exceptions first
    'test_scrapy.middlewares.ProcessAllException': 200,
}
from scrapy.http import HtmlResponse
from scrapy.exceptions import IgnoreRequest
class ProcessAllException(object):

    def process_response(self, request, response, spider):
        # handle error responses (HTTP >= 400) and the fake responses built in process_exception
        if response.status >= 400 or hasattr(response, 'exception'):
            print('process_response:', response.status, response)
            if hasattr(response, 'exception'):
                print('exception:', response.exception)
            raise IgnoreRequest(response)
        return response

    def process_exception(self, request, exception, spider):
        status_code = 400
        # build a fake response and hand it on to process_response
        obj = HtmlResponse(url=request.url, status=status_code, request=request)
        # carry the original exception along on the response object
        obj.exception = exception
        return obj
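Because process_response raises IgnoreRequest, Scrapy calls the errback of the dropped request instead of its callback. A minimal spider sketch to observe this (the spider name and URL are placeholders):

import scrapy
from scrapy.exceptions import IgnoreRequest

class DemoSpider(scrapy.Spider):
    name = 'demo'                      # placeholder name

    def start_requests(self):
        # placeholder URL that is expected to return 404/503
        yield scrapy.Request('http://example.com/missing',
                             callback=self.parse, errback=self.on_error)

    def parse(self, response):
        pass

    def on_error(self, failure):
        # the IgnoreRequest raised in the middleware arrives here wrapped in a Failure
        if failure.check(IgnoreRequest):
            self.logger.info('request dropped by middleware: %r', failure.value)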
It is convenient to merge the class above with a proxy middleware, so dead proxies can be dropped at the same time; the version below uses Redis.
redisconn.py:
import redis
import threading
def singleton(cls):
    # decorator that turns a class into a thread-safe, lazily created singleton
    instance = None
    lock = threading.Lock()

    def oncall(*args, **kwargs):
        nonlocal instance
        if not instance:
            with lock:
                # double-checked locking: re-test after acquiring the lock
                if not instance:
                    instance = cls(*args, **kwargs)
        return instance
    return oncall

@singleton
class RedisObject:
    def __init__(self, host="localhost", port=6379, password="", max_conn=1000):
        # one shared connection pool; the client checks connections out of it
        self.__pool = redis.ConnectionPool(max_connections=max_conn,
                                           host=host, port=port, password=password)
        self.__conn = redis.StrictRedis(connection_pool=self.pool())

    def pool(self):
        return self.__pool

    def conn(self):
        return self.__conn

    def __del__(self):
        self.close()

    def close(self):
        self.__pool.disconnect()
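A quick usage sketch for the singleton, which also shows one way to seed the proxy_sets key that the middleware below reads. The Redis connection details and the proxy URLs are placeholders:

from redisconn import RedisObject   # adjust the import path to your project layout

r1 = RedisObject()
r2 = RedisObject()
assert r1 is r2                      # the decorator always returns the same instance

conn = r1.conn()
# seed the proxy pool consumed by RandomProxy (placeholder proxies)
conn.sadd('proxy_sets', 'http://10.0.0.1:8080', 'http://10.0.0.2:3128')
print(conn.scard('proxy_sets'))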
proxymiddleware.py
import random
from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware
from scrapy.http import HtmlResponse
from scrapy.exceptions import IgnoreRequest
from .spiders import redisconn

class RandomProxy(HttpProxyMiddleware):

    def __init__(self, encoding):
        super().__init__(encoding)
        self.redis = redisconn.RedisObject()
        self.conn = self.redis.conn()
        # load the current proxy pool from Redis into memory
        self.proxy_list = []
        for proxy in self.conn.sscan_iter('proxy_sets', count=10):
            self.proxy_list.append(str(proxy, encoding="utf8"))
    def process_request(self, request, spider):
        if 'proxy' in request.meta or not self.proxy_list:
            # a proxy was already set, or the pool is empty: fall back to the default behaviour
            return super().process_request(request, spider)
        else:
            proxy = random.choice(self.proxy_list)
            creds, proxy_url = self._get_proxy(proxy, None)
            request.meta['proxy'] = proxy_url
            if creds:
                request.headers['Proxy-Authorization'] = b'Basic ' + creds
    def process_response(self, request, response, spider):
        if response.status >= 400 or hasattr(response, 'exception'):
            proxy = request.meta.get('proxy')
            print('process_response:', response.status, response, proxy)
            if proxy:
                # the proxy failed: remove it from Redis and from the in-memory pool
                self.conn.srem('proxy_sets', proxy)
                if proxy in self.proxy_list:
                    self.proxy_list.remove(proxy)
            if hasattr(response, 'exception'):
                print('exception:', response.exception)
            raise IgnoreRequest(response)
        return response

    def process_exception(self, request, exception, spider):
        status_code = 400
        # forge a response carrying the exception so process_response gets to see it
        obj = HtmlResponse(url=request.url, status=status_code, request=request)
        obj.exception = exception
        return obj
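Finally, the custom middleware has to replace the stock HttpProxyMiddleware in settings.py. A sketch, assuming the file lives at test_scrapy/proxymiddleware.py (adjust the dotted path to your project):

DOWNLOADER_MIDDLEWARES = {
    # disable the built-in proxy middleware (default priority 750) ...
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None,
    # ... and install the Redis-backed replacement in its slot
    'test_scrapy.proxymiddleware.RandomProxy': 750,
}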