Scrapy ships with built-in request deduplication, implemented by the RFPDupeFilter class. Let's look at the relevant source code.
def request_seen(self, request):
    fp = self.request_fingerprint(request)
    if fp in self.fingerprints:
        return True
    self.fingerprints.add(fp)
    if self.file:
        self.file.write(fp + os.linesep)

def request_fingerprint(self, request):
    return request_fingerprint(request)
def request_fingerprint(request, include_headers=None):
    if include_headers:
        include_headers = tuple(to_bytes(h.lower())
                                for h in sorted(include_headers))
    cache = _fingerprint_cache.setdefault(request, {})
    if include_headers not in cache:
        fp = hashlib.sha1()
        fp.update(to_bytes(request.method))
        fp.update(to_bytes(canonicalize_url(request.url)))
        fp.update(request.body or b'')
        if include_headers:
            for hdr in include_headers:
                if hdr in request.headers:
                    fp.update(hdr)
                    for v in request.headers.getlist(hdr):
                        fp.update(v)
        cache[include_headers] = fp.hexdigest()
    return cache[include_headers]
RFPDupeFilter defines the request_seen() method: it computes a SHA-1 fingerprint over the whole request (method + canonicalized URL + body, plus optional headers) and stores it in a set() for deduplication.
Because the fingerprint covers the whole request rather than just the URL, this default scheme filters out relatively few duplicates.
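To see what the default filter treats as a duplicate, here is a minimal sketch (assuming Scrapy is installed; the example URLs are made up):

from scrapy import Request
from scrapy.utils.request import request_fingerprint

r1 = Request("http://example.com/page?a=1&b=2")
r2 = Request("http://example.com/page?b=2&a=1")  # same URL after canonicalization
r3 = Request("http://example.com/page?a=1&b=2", method="POST", body="x=1")

print(request_fingerprint(r1) == request_fingerprint(r2))  # True: treated as duplicates
print(request_fingerprint(r1) == request_fingerprint(r3))  # False: method and body differ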
Below we write a custom filter that deduplicates based on the request URL alone.
from scrapy.dupefilters import RFPDupeFilter


class URLFilter(RFPDupeFilter):
    """Deduplicate by URL only."""

    def __init__(self, path=None, debug=False):
        self.urls_seen = set()
        RFPDupeFilter.__init__(self, path, debug)

    def request_seen(self, request):
        if request.url in self.urls_seen:
            return True
        else:
            self.urls_seen.add(request.url)
Configure settings.py:
DUPEFILTER_CLASS = 'project_name.module_name.URLFilter'
With this approach, the set lives only in memory, so its contents are lost when the crawl finishes; the next time the spider is scheduled it will re-crawl URLs it has already visited.
To build an incremental crawler, we can instead cache the crawled URLs in a Redis set.
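The whole idea rests on two Redis set commands; a quick sketch (assuming a local Redis and the redis-py package, with a made-up key name):

from redis import StrictRedis

r = StrictRedis(host="localhost", port=6379, db=10, password="1234")
r.sadd("seen_urls", "http://example.com/item/1")              # remember a crawled URL
print(r.sismember("seen_urls", "http://example.com/item/1"))  # True: already crawled
print(r.sismember("seen_urls", "http://example.com/item/2"))  # False: not crawled yet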
1. Define a custom filter that checks, before crawling, whether the URL has already been crawled.
import os

from scrapy.dupefilters import RFPDupeFilter


class URLRedisFilter(RFPDupeFilter):
    """Deduplicate by URL only, backed by Redis."""

    def __init__(self, path=None, debug=False):
        RFPDupeFilter.__init__(self, path, debug)
        self.dupefilter = UrlFilterAndAdd()

    def request_seen(self, request):
        # Check Redis first (the two added lines).
        if self.dupefilter.check_url(request.url):
            return True
        # Keep the original fingerprint-based deduplication for pages within
        # this run, otherwise the spider can easily loop on intermediate pages.
        fp = self.request_fingerprint(request)
        if fp in self.fingerprints:
            return True
        self.fingerprints.add(fp)
        if self.file:
            self.file.write(fp + os.linesep)
import hashlib

from redis import ConnectionPool, StrictRedis
from w3lib.url import canonicalize_url


class UrlFilterAndAdd(object):
    def __init__(self):
        redis_config = {
            "host": "localhost",  # Redis host
            "port": 6379,
            "password": "1234",
            "db": 10,
        }
        pool = ConnectionPool(**redis_config)
        self.pool = pool
        self.redis = StrictRedis(connection_pool=pool)
        self.key = "spider_redis_key"

    def url_sha1(self, url):
        fp = hashlib.sha1()
        fp.update(canonicalize_url(url).encode("utf-8"))
        return fp.hexdigest()

    def check_url(self, url):
        sha1 = self.url_sha1(url)
        # Only check membership here; do not add the URL, so that start URLs
        # and intermediate URLs (e.g. listing pages) never end up in the cache.
        return self.redis.sismember(self.key, sha1)

    def add_url(self, url):
        sha1 = self.url_sha1(url)
        return self.redis.sadd(self.key, sha1)
Note: URLRedisFilter only checks whether a URL has already been seen; it never adds URLs to the Redis cache itself.
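A quick illustration of that split, assuming the Redis instance configured above is reachable (the URL is made up):

f = UrlFilterAndAdd()
print(f.check_url("http://example.com/item/1"))  # False on the first run
f.add_url("http://example.com/item/1")           # done by the pipeline, not the filter
print(f.check_url("http://example.com/item/1"))  # True on later runs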
2. Modify the pipeline so that a URL is added to Redis only after its data has been scraped, which keeps intermediate links out of the cache.
class MySpiderPipeline(object):
    def __init__(self):
        self.dupefilter = UrlFilterAndAdd()

    def process_item(self, item, spider):
        # Add the URL to Redis only after its data has been scraped.
        print("add>>url:", item['crawl_url'])
        self.dupefilter.add_url(item['crawl_url'])
        return item
print("add>>url:", item['crawl_url'])
self.dupefilter.add_url(item['crawl_url'])
return item
Configure in settings.py:
ITEM_PIPELINES = {
    'project_name.pipelines.MySpiderPipeline': 300,
}
DUPEFILTER_CLASS = 'project_name.module_name.URLRedisFilter'
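For the pipeline to do its job, each item must carry the page URL in a crawl_url field. The spider, selectors, and field names below are hypothetical; this is only a sketch of how the pieces fit together:

import scrapy


class MySpider(scrapy.Spider):
    name = "my_spider"
    start_urls = ["http://example.com/list"]

    def parse(self, response):
        # Listing pages are only followed; they never produce items,
        # so their URLs never reach the pipeline or Redis.
        for href in response.css("a.detail::attr(href)").getall():
            yield response.follow(href, callback=self.parse_detail)

    def parse_detail(self, response):
        # The detail page URL is stored in crawl_url, which the pipeline
        # writes to Redis after the item has been scraped.
        yield {
            "crawl_url": response.url,
            "title": response.css("title::text").get(),
        }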
With the current approach, the data in Redis keeps growing without bound; trimming or expiring it is left as a later optimization.