Bloom filter principle: only touched on briefly here. In short, a Bloom filter is a bit array plus several hash functions; it can tell you that an element is definitely not in a set, or probably is, using far less memory than storing the elements themselves.
When running distributed crawls with scrapy-redis over massive amounts of data, memory usage becomes a real concern. So how can we optimize it?
For a rough sense of scale: suppose fingerprints for 100 million URLs take about 2 GB of memory; a Bloom filter covering the same URLs only needs a few hundred MB.
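To make that claim concrete, here is a quick back-of-the-envelope sizing sketch using the standard Bloom filter formulas (this calculation is mine, not part of the original code; the 100-million-URL count and the 1% false-positive rate are assumptions):

import math

n = 100_000_000        # number of URLs we expect to insert (assumption)
p = 0.01               # acceptable false-positive rate (assumption)

# Optimal bit-array size and number of hash functions for a Bloom filter:
#   m = -n * ln(p) / (ln 2)^2,   k = (m / n) * ln 2
m = -n * math.log(p) / (math.log(2) ** 2)
k = (m / n) * math.log(2)

print(f"bits needed: {m / 8 / 1024 / 1024:.0f} MB")   # roughly 114 MB
print(f"hash funcs : {k:.1f}")                        # roughly 6.6, round up to 7

So about 114 MB of bits handles 100 million URLs at a 1% false-positive rate, versus gigabytes for storing the raw fingerprints in a Redis set.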
Rather than explaining it at length, let's go straight to the code.
Reference implementation (worth reading through):
# -*- coding: utf-8 -*-
# @Time    : 2019/5/27 9:19
# @Author  : hccfm
# @File    : Bloomtest.py
# @Software: PyCharm
"""
Bloom filter.
A hand-written class for learning and testing.
"""
import redis


class HashMap():
    """
    A basic hash function.
    """
    def __init__(self, m, seed):
        """
        :param m: number of bits in the bit array
        :param seed: seed that makes each HashMap instance produce a different hash
        """
        self.m = m
        self.seed = seed

    def hash(self, value):
        ret = 0
        for i in range(len(value)):
            ret += self.seed * ret + ord(value[i])  # ord() gives the character's code point
        return (self.m - 1) & ret  # keep the result within the bit-array range [0, m)


BLOOMFILTER_HASH_NUMBER = 6
BLOOMFILTER_BIT = 30


class Redis_BloomFilter():
    """
    The Bloom filter's bit array is stored in Redis.
    """
    def __init__(self, server, key, bit=BLOOMFILTER_BIT, hash_number=BLOOMFILTER_HASH_NUMBER):
        """
        Initialize the Bloom filter.
        Bit-array size: 1 << 30 = 1,073,741,824 bits = 2^30 / 8 / 1024 / 1024 = 128 MB
        Rough fingerprint capacity: 2^30 / 6 ≈ 178,956,970
        so it can store data on the scale of hundreds of millions of URLs.
        :param server: redis connection
        :param key: bloomfilter key
        :param bit: m = 2 ^ bit
        :param hash_number: number of hash functions
        """
        self.m = 1 << bit
        self.seeds = range(hash_number)
        self.maps = [HashMap(self.m, seed) for seed in self.seeds]  # create hash_number HashMap objects
        self.server = server
        self.key = key

    def exists(self, value):
        """
        Check whether a value is (probably) in the Bloom filter.
        :param value:
        :return:
        """
        if not value:
            return False
        exist = 1
        for hash_map in self.maps:
            offset = hash_map.hash(value)
            exist = exist & self.server.getbit(self.key, offset)  # if any hashed bit is 0, the value is definitely not present
        return exist

    def insert(self, value):
        """
        Insert a value into the filter.
        :param value:
        :return:
        """
        for hash_map in self.maps:
            offset = hash_map.hash(value)
            self.server.setbit(self.key, offset, 1)


if __name__ == '__main__':
    pool = redis.ConnectionPool(host='127.0.0.1', port=6379)
    r = redis.Redis(connection_pool=pool)
    # bf = Redis_BloomFilter(server=r, key='hccfm_bf1')  # with the default bit=30 this allocates a 128 MB bit string in Redis
    bf = Redis_BloomFilter(server=r, key='hccfm_bf', bit=10, hash_number=5)
    bf.insert('hccfm1')
    bf.insert('hccfm2')
    result = bf.exists('hccfm3')
    print(bool(result))  # False
    result = bf.exists('hccfm1')
    print(bool(result))  # True
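If you are curious what ends up in Redis after running the test above, the filter's key is just a plain string whose bytes hold the bit array (a quick check, reusing the r connection and the hccfm_bf key from the test):

# With bit=10 the bit array is 2^10 = 1024 bits, so the value is at most 128 bytes.
print(r.type('hccfm_bf'))    # b'string'; SETBIT/GETBIT operate on an ordinary string value
print(r.strlen('hccfm_bf'))  # number of bytes Redis allocated, at most 128 here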
Integrating with scrapy-redis (two approaches):
Approach 1: modify the scrapy-redis source (not recommended, but worth trying once because it helps you understand how the pieces fit together):
The main changes are to the RFPDupeFilter class (scrapy_redis/dupefilter.py) and the Scheduler class (scrapy_redis/scheduler.py).
Dupe filter class (RFPDupeFilter)
__init__()
def __init__(self, server, key, debug=False):
    """Initialize the duplicates filter.
    Parameters
    ----------
    server : redis.StrictRedis
        The redis server instance.
    key : str
        Redis key Where to store fingerprints.
    debug : bool, optional
        Whether to log filtered requests.
    """
    self.server = server
    self.key = key
    self.debug = debug
    self.logdupes = True
Change it to:
def __init__(self, server, key, debug, bit, hash_number):
    """Initialize the duplicates filter.
    Parameters
    ----------
    server : redis.StrictRedis
        The redis server instance.
    key : str
        Redis key Where to store fingerprints.
    debug : bool, optional
        Whether to log filtered requests.
    bit : int
        Bloom filter bit-array size exponent (m = 2 ^ bit).
    hash_number : int
        Number of hash functions used by the Bloom filter.
    """
    self.server = server
    self.key = key
    self.debug = debug
    self.bit = bit
    self.hash_number = hash_number
    self.logdupes = True
    self.bf = BloomFilter(server, self.key, bit, hash_number)  # create the Bloom filter instance (requires an import, see below)
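The BloomFilter class referenced above has to live somewhere inside the scrapy-redis package. One way to do it (my suggestion, not part of the original walkthrough) is to save a class like the Redis_BloomFilter shown earlier into a new module such as scrapy_redis/bloomfilter.py under the name BloomFilter, and import it at the top of dupefilter.py:

from .bloomfilter import BloomFilter  # hypothetical module name; adjust to wherever you saved the class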
from_settings():
@classmethod
def from_settings(cls, settings):
    """Returns an instance from given settings.
    This uses by default the key ``dupefilter:<timestamp>``. When using the
    ``scrapy_redis.scheduler.Scheduler`` class, this method is not used as
    it needs to pass the spider name in the key.
    Parameters
    ----------
    settings : scrapy.settings.Settings
    Returns
    -------
    RFPDupeFilter
        A RFPDupeFilter instance.
    """
    server = get_redis_from_settings(settings)
    # XXX: This creates one-time key. needed to support to use this
    # class as standalone dupefilter with scrapy's default scheduler
    # if scrapy passes spider on open() method this wouldn't be needed
    # TODO: Use SCRAPY_JOB env as default and fallback to timestamp.
    key = defaults.DUPEFILTER_KEY % {'timestamp': int(time.time())}
    debug = settings.getbool('DUPEFILTER_DEBUG')
    return cls(server, key=key, debug=debug)
Change it to:
@classmethod
def from_settings(cls, settings):
    """Returns an instance from given settings.
    This uses by default the key ``dupefilter:<timestamp>``. When using the
    ``scrapy_redis.scheduler.Scheduler`` class, this method is not used as
    it needs to pass the spider name in the key.
    Parameters
    ----------
    settings : scrapy.settings.Settings
    Returns
    -------
    RFPDupeFilter
        A RFPDupeFilter instance.
    """
    server = get_redis_from_settings(settings)
    # XXX: This creates one-time key. needed to support to use this
    # class as standalone dupefilter with scrapy's default scheduler
    # if scrapy passes spider on open() method this wouldn't be needed
    # TODO: Use SCRAPY_JOB env as default and fallback to timestamp.
    key = defaults.DUPEFILTER_KEY % {'timestamp': int(time.time())}
    debug = settings.getbool('DUPEFILTER_DEBUG', DUPEFILTER_DEBUG)
    bit = settings.getint('BLOOMFILTER_BIT', BLOOMFILTER_BIT)
    hash_number = settings.getint('BLOOMFILTER_HASH_NUMBER', BLOOMFILTER_HASH_NUMBER)
    return cls(server, key=key, debug=debug, bit=bit, hash_number=hash_number)
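One thing the walkthrough above glosses over: after these changes the dupefilter will still deduplicate through the original Redis set unless request_seen() is also rewritten to go through the Bloom filter. A minimal sketch of that change (modelled on how the scrapy-redis-bloomfilter package does it; this method replaces the original sadd-based version in RFPDupeFilter):

def request_seen(self, request):
    """Returns True if the request's fingerprint has already been seen."""
    fp = self.request_fingerprint(request)
    if self.bf.exists(fp):   # probably seen before, treat as a duplicate
        return True
    self.bf.insert(fp)       # first time we see it, record it and let it through
    return False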
Scheduler class (Scheduler)
open():
def open(self, spider):
    self.spider = spider

    try:
        self.queue = load_object(self.queue_cls)(
            server=self.server,
            spider=spider,
            key=self.queue_key % {'spider': spider.name},
            serializer=self.serializer,
        )
    except TypeError as e:
        raise ValueError("Failed to instantiate queue class '%s': %s",
                         self.queue_cls, e)

    try:
        self.df = load_object(self.dupefilter_cls)(
            server=self.server,
            key=self.dupefilter_key % {'spider': spider.name},
            debug=spider.settings.getbool('DUPEFILTER_DEBUG'),
        )
    except TypeError as e:
        raise ValueError("Failed to instantiate dupefilter class '%s': %s",
                         self.dupefilter_cls, e)

    if self.flush_on_start:
        self.flush()
    # notice if there are requests already in the queue to resume the crawl
    if len(self.queue):
        spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue))
Change it to:
def open(self, spider):
    self.spider = spider

    try:
        self.queue = load_object(self.queue_cls)(
            server=self.server,
            spider=spider,
            key=self.queue_key % {'spider': spider.name},
            serializer=self.serializer,
        )
    except TypeError as e:
        raise ValueError("Failed to instantiate queue class '%s': %s",
                         self.queue_cls, e)

    try:
        self.df = load_object(self.dupefilter_cls)(
            server=self.server,
            key=self.dupefilter_key % {'spider': spider.name},
            debug=spider.settings.getbool('DUPEFILTER_DEBUG'),
            bit=spider.settings.getint('BLOOMFILTER_BIT', BLOOMFILTER_BIT),  # added
            hash_number=spider.settings.getint('BLOOMFILTER_HASH_NUMBER', BLOOMFILTER_HASH_NUMBER),  # added
        )
    except TypeError as e:
        raise ValueError("Failed to instantiate dupefilter class '%s': %s",
                         self.dupefilter_cls, e)

    if self.flush_on_start:
        self.flush()
    # notice if there are requests already in the queue to resume the crawl
    if len(self.queue):
        spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue))
Note that scheduler.py (and dupefilter.py, which uses the same constants in from_settings) needs this import at the top:
from .defaults import BLOOMFILTER_BIT, BLOOMFILTER_HASH_NUMBER
Also:
The default values need to be defined in the defaults module (defaults.py); when BLOOMFILTER_BIT and BLOOMFILTER_HASH_NUMBER are not set in settings.py, these defaults are used:
BLOOMFILTER_HASH_NUMBER = 6
BLOOMFILTER_BIT = 30
Approach 2: install via pip (much simpler, recommended)
pip install scrapy-redis-bloomfilter
Then configure it in settings.py:
# Distributed crawl configuration
# DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis_bloomfilter.dupefilter.RFPDupeFilter"  # deduplicate request URLs with the Bloom filter
SCHEDULER = "scrapy_redis_bloomfilter.scheduler.Scheduler"
SCHEDULER_PERSIST = True
LOG_LEVEL = "DEBUG"
# BLOOMFILTER_BIT = 30           # Bloom filter bit-array size exponent (m = 2 ^ bit)
# BLOOMFILTER_HASH_NUMBER = 10   # number of hash functions
ITEM_PIPELINES = {
    # 'scrapy_redis.pipelines.RedisPipeline': 400,
    'scrapy_redis_bloomfilter.pipelines.RedisPipeline': 400,
}
# REDIS_HOST = '10.250.24.60'  # IP of the Redis master host
REDIS_HOST = '127.0.0.1'  # IP of the Redis master host
REDIS_PORT = 6379
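With these settings in place, any ordinary spider in the project will have its requests deduplicated through the Bloom filter; the spider code itself does not need to change. A minimal sketch (the spider name and start URL are made up for illustration):

import scrapy

class DemoSpider(scrapy.Spider):
    # hypothetical spider, just to show that no Bloom-filter-specific code is needed here
    name = 'demo'
    start_urls = ['http://example.com/']

    def parse(self, response):
        # every Request yielded here goes through the scheduler and the
        # Bloom-filter-backed RFPDupeFilter configured in settings.py
        for href in response.css('a::attr(href)').getall():
            yield response.follow(href, callback=self.parse)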