```python
# In settings.py
import datetime

LOG_LEVEL = 'DEBUG'  # set the log level to DEBUG
startDate = datetime.datetime.now().strftime('%Y%m%d')
LOG_FILE = f"redisClawerSlaver_1_log{startDate}.txt"  # write the log to a file

# Tip: in DEBUG mode a full day's log grows so large that Notepad++ cannot open it;
# UltraEdit can.
```
Analyzing the log later and searching for "failed", I found a large number of timeout errors, accounting for roughly ninety percent of all errors, and links from both sites appeared among them:
Conclusion 1: a bold guess at a few possible causes:

1. The local network has slowed down.
2. The crawled sites have changed, including changes to the content being crawled.
3. The Abuyun proxy is malfunctioning.
4. A bug in the crawler itself.
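To back a guess like this with numbers, the error distribution can be tallied straight from the DEBUG log. Below is a minimal sketch, not part of the original project; the log file name and the keywords it matches are assumptions:

```python
# Rough tally of error types in the DEBUG log; the file name and the
# matched keywords are assumptions, not taken from the original project.
from collections import Counter

counts = Counter()
with open('redisClawerSlaver_1_log20180317.txt', encoding='utf-8') as log:
    for line in log:
        if 'Retrying' not in line and 'Gave up retrying' not in line:
            continue
        lowered = line.lower()
        if 'timeout' in lowered:
            counts['timeout'] += 1
        elif 'dns' in lowered:
            counts['dns'] += 1
        elif 'connection refused' in lowered:
            counts['refused'] += 1
        else:
            counts['other'] += 1

total = sum(counts.values()) or 1
for kind, n in counts.most_common():
    print(f'{kind}: {n} ({n / total:.1%})')
```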
**Reflection and improvement:** at this step I really should have kept detailed error statistics and recorded them in a database; this needs to be improved later on. As a supplement, something along these lines:
```python
# In spider.py
import re

from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import (
    TimeoutError, TCPTimedOutError, DNSLookupError, ConnectionRefusedError,
)

# inside the parse callback: follow the next page and register the errback
yield response.follow(
    url=re.sub(r'page=\d+', f'page={page}', url, count=1),
    meta={'dont_redirect': True, 'key': response.meta['key']},
    callback=self.galance,
    errback=self.error,  # record the error
)

RETRY = 4  # maximum number of retries, matching the value in settings

# errback on the spider class
def error(self, failure):
    if failure.check(HttpError):
        response = failure.value.response
        if response.meta['depth'] < RETRY:
            # retry manually, bypassing the dupefilter
            failure.request.dont_filter = True
            yield failure.request
        else:
            yield {
                'url': response.url, 'error': 'HttpError', 'depth': response.meta['depth'],
                'priority': response.request.priority, 'status': response.status,
                'callback': response.request.callback.__name__,
                'key': response.meta.get('key') or response.meta.get('item', {}).get('key', ''),
            }  # for the error log
    elif failure.check(TimeoutError, TCPTimedOutError, ConnectionRefusedError, DNSLookupError):
        request = failure.request
        yield {
            'url': request.url,
            'error': 'TimeoutError',
            'priority': request.priority,
            'callback': request.callback.__name__,
            'key': request.meta.get('key') or request.meta.get('item', {}).get('key', ''),
        }  # for the error log; only reached after the final timeout
    else:
        request = failure.request
        yield {'url': request.url, 'error': 'UnknownError', 'priority': request.priority,
               'callback': request.callback.__name__}  # for the error log
```
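Since the errback above yields plain dicts, persisting them only takes an item pipeline. A minimal sketch follows, using SQLite so it is self-contained; the pipeline class, database file, and table name are assumptions rather than the project's actual storage:

```python
# pipelines.py -- sketch for persisting the error dicts yielded by the
# errback above; SQLite and the class/table names are assumptions.
import sqlite3


class ErrorStatsPipeline:
    def open_spider(self, spider):
        self.conn = sqlite3.connect('error_stats.db')
        self.conn.execute(
            'CREATE TABLE IF NOT EXISTS errors '
            '(url TEXT, error TEXT, priority INTEGER, callback TEXT, key TEXT)'
        )

    def process_item(self, item, spider):
        # only persist the error records; pass normal items through untouched
        if isinstance(item, dict) and 'error' in item:
            self.conn.execute(
                'INSERT INTO errors VALUES (?, ?, ?, ?, ?)',
                (item.get('url'), item.get('error'), item.get('priority'),
                 item.get('callback'), item.get('key', '')),
            )
            self.conn.commit()
        return item

    def close_spider(self, spider):
        self.conn.close()
```

It would then be enabled through `ITEM_PIPELINES` in settings.py.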
Typical timeout entries from the log:

```
2018-03-17 00:10:29 [scrapy.downloadermiddlewares.retry] DEBUG: Retrying (failed 1 times): User timeout caused connection failure: Getting https://www.amazon.com/s/ref=lp_3734591_nr_n_6/143-5700955-1921713?fst=as%3Aoff&rh=n%3A1055398%2Cn%3A%211063498%2Cn%3A1063278%2Cn%3A1063282%2Cn%3A3734591%2Cn%3A3734671&bbn=3734591&ie=UTF8&qid=1517900687&rnid=3734591 took longer than 20.0 seconds..
2018-03-17 00:10:29 [scrapy.downloadermiddlewares.retry] DEBUG: Retrying (failed 1 times): User timeout caused connection failure: Getting https://www.amazon.com/s/ref=lp_3422251_nr_n_8/134-3091133-0771864?fst=as%3Aoff&rh=n%3A3375251%2Cn%3A%213375301%2Cn%3A10971181011%2Cn%3A3407731%2Cn%3A3422251%2Cn%3A7261122011&bbn=3422251&ie=UTF8&qid=1517900684&rnid=3422251 took longer than 20.0 seconds..
```
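The 20-second limit and the "Retrying (failed 1 times)" messages come from Scrapy's download timeout and built-in RetryMiddleware. A sketch of the relevant settings, with values inferred from the log rather than copied from the project:

```python
# settings.py -- values inferred from the log output above, not taken
# verbatim from the original project.
DOWNLOAD_TIMEOUT = 20   # produces "took longer than 20.0 seconds"
RETRY_ENABLED = True    # RetryMiddleware logs "Retrying (failed N times)"
RETRY_TIMES = 3         # extra attempts before giving up
```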
Speed test results for Amazon detail pages, fetched directly versus through the proxy:
```
No_proxy Totalurls:20, successCount:20, totalSuccessTime:68.56400000000001, avgTime:3.4282000000000004, connectFailCount:0, proxyFailCount:0
Amazon   Totalurls:20, successCount:14, totalSuccessTime:104.4075,           avgTime:7.457678571428572,  connectFailCount:0, proxyFailCount:6
```
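For reference, a comparison like the one above could be produced with a small timing script; the sketch below uses `requests` with placeholder URLs and proxy settings and is not the original benchmark:

```python
# Minimal timing sketch (not the original benchmark): measures success count
# and average fetch time for a list of URLs, optionally through a proxy.
import time
import requests


def benchmark(urls, proxies=None, timeout=20):
    success, fail, total_time = 0, 0, 0.0
    for url in urls:
        start = time.time()
        try:
            resp = requests.get(url, proxies=proxies, timeout=timeout)
            if resp.status_code == 200:
                success += 1
                total_time += time.time() - start
            else:
                fail += 1
        except requests.RequestException:
            fail += 1
    avg = total_time / success if success else 0.0
    print(f'Totalurls:{len(urls)}, successCount:{success}, '
          f'totalSuccessTime:{total_time}, avgTime:{avg}, failCount:{fail}')


# usage with placeholder URLs / proxy:
# benchmark(detail_urls)                                                       # direct
# benchmark(detail_urls, proxies={'https': 'http://user:pass@proxy-host:port'})  # via proxy
```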
A sample entry where the page is crawled successfully:

```
2018-03-22 14:09:02 [scrapy.core.engine] DEBUG: Crawled (200) (referer: www.amazon.com)
```

When the proxy is added in middleware.py, however, the same page cannot be fetched:
```
2018-03-21 11:09:22 [scrapy.downloadermiddlewares.retry] DEBUG: Retrying <GET https://www.amztracker.com/unicorn.php?rank=189599&category=Sports+%26+Outdoors> (failed 1 times): User timeout caused connection failure: Getting https://www.amztracker.com/unicorn.php?rank=189599&category=Sports+%26+Outdoors took longer than 20.0 seconds..
```
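For context, the proxy is wired in through a downloader middleware. A minimal sketch of the usual pattern is below; the endpoint and credentials are placeholders, and this is not the project's actual middleware.py:

```python
# middlewares.py -- generic proxy downloader middleware sketch; the endpoint
# and credentials are placeholders, not the project's real values.
import base64

PROXY_SERVER = 'http://proxy-host:9020'   # placeholder proxy endpoint
PROXY_USER = 'proxy_user'                 # placeholder credentials
PROXY_PASS = 'proxy_pass'


class ProxyMiddleware:
    def process_request(self, request, spider):
        # route the request through the proxy with basic auth
        request.meta['proxy'] = PROXY_SERVER
        auth = base64.b64encode(f'{PROXY_USER}:{PROXY_PASS}'.encode()).decode()
        request.headers['Proxy-Authorization'] = 'Basic ' + auth
```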