Scrapy distributed performance optimization (Bloom filter integration)

Bloom filter principle, in one sentence: each element is mapped by several hash functions onto a fixed-size bit array, so membership tests may return false positives but never false negatives, which is why it needs far less memory than storing every fingerprint outright.

When running distributed Scrapy crawls over massive amounts of data, the memory used by the request-fingerprint set becomes a real concern. So how do we optimize it?

Suppose we have 100 million URLs: storing their fingerprints directly takes on the order of 2 GB of memory, whereas a Bloom filter only needs a few hundred MB.
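A quick back-of-the-envelope check of that claim. The numbers below (a 2^30-bit array and 6 hash functions) match the defaults used in the implementation further down; the figures are illustrative, not measurements:

import math

# Rough Bloom filter sizing: m bits, n stored fingerprints, k hash functions.
m = 1 << 30          # 2**30 bits -> 128 MB bit array
n = 100_000_000      # 100 million URLs
k = 6                # number of hash functions

memory_mb = m / 8 / 1024 / 1024
# Standard Bloom filter false-positive approximation: (1 - e^(-kn/m))^k
fp_rate = (1 - math.exp(-k * n / m)) ** k

print(f"memory: {memory_mb:.0f} MB")           # 128 MB
print(f"false positive rate: {fp_rate:.2%}")   # about 0.6%

By contrast, keeping 100 million 40-character SHA1 fingerprints in a plain Redis set costs several GB once Redis's per-member overhead is included.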

Rather than explain it further in prose, let's go straight to the code.

Reference implementation (worth reading through):

# -*- coding: utf-8 -*-

# @Time    : 2019/5/27 9:19
# @Author  : hccfm
# @File    : Bloomtest.py
# @Software: PyCharm

"""
布隆过滤器。
手写类学习与测试。
"""
import redis


class HashMap():
    """
    Basic hash algorithm
    """

    def __init__(self, m, seed):
        """
        :param m: size of the bit array
        :param seed: seed used so that each instance produces a different hash
        """
        self.m = m
        self.seed = seed

    def hash(self, value):
        ret = 0
        for i in range(len(value)):
            ret += self.seed * ret + ord(value[i])    # ord() returns the character's code point

        return (self.m - 1) & ret


BLOOMFILTER_HASH_NUMBER = 6
BLOOMFILTER_BIT = 30

class Redis_BloomFilter():
    """
    The Bloom filter's bit array is stored in Redis.
    """

    def __init__(self, server, key, bit=BLOOMFILTER_BIT, hash_number=BLOOMFILTER_HASH_NUMBER):
        """
        Initialize the Bloom filter.
        Bit array size: 1 << 30 = 1,073,741,824 bits = 2^30 / 8 / 1024 / 1024 = 128 MB
        Maximum number of fingerprints: 2^30 / 6 ≈ 178,956,970
        which is enough for data at the hundred-million scale.
        :param server: redis server connection
        :param key: bloomfilter key
        :param bit: m = 2 ^ bit
        :param hash_number: number of hash functions
        """
        self.m = 1 << bit
        self.seeds = range(hash_number)
        self.maps = [HashMap(self.m, seed) for seed in self.seeds]   # create hash_number HashMap objects, one per hash function
        self.server = server
        self.key = key

    def exists(self, value):
        """
        Check whether a value is (probably) in the Bloom filter.
        :param value:
        :return:
        """
        if not value:
            return False
        exist = 1
        for map in self.maps:
            offset = map.hash(value)
            exist = exist & self.server.getbit(self.key, offset)  # AND the bits from every hash function; if any bit is 0 the value is definitely absent

        return exist

    def insert(self, value):
        """
        Insert a value into the filter.
        :param value:
        :return:
        """
        for m in self.maps:
            offset = m.hash(value)
            self.server.setbit(self.key, offset, 1)



if __name__ == '__main__':

    pool = redis.ConnectionPool(host='127.0.0.1', port=6379)
    r = redis.Redis(connection_pool=pool)

    # bf = Redis_BloomFilter(server=r, key='hccfm_bf1')  # with the defaults this allocates a 128 MB bit array in Redis
    bf = Redis_BloomFilter(server=r, key='hccfm_bf', bit=10, hash_number=5)
    bf.insert('hccfm1')
    bf.insert('hccfm2')
    result = bf.exists('hccfm3')
    print(bool(result))             # False
    result = bf.exists('hccfm1')
    print(bool(result))             # True






 

Integrating with scrapy-redis (two approaches):

Approach 1: modify the scrapy-redis source (not recommended, but worth trying; it helps you understand how it all works):

The main changes are to the RFPDupeFilter class and the Scheduler class.

Dupe filter class (RFPDupeFilter)

__init__()

    def __init__(self, server, key, debug=False):
        """Initialize the duplicates filter.

        Parameters
        ----------
        server : redis.StrictRedis
            The redis server instance.
        key : str
            Redis key Where to store fingerprints.
        debug : bool, optional
            Whether to log filtered requests.

        """
        self.server = server
        self.key = key
        self.debug = debug
        self.logdupes = True

Change it to:

    def __init__(self, server, key, debug, bit, hash_number):
        """Initialize the duplicates filter.

        Parameters
        ----------
        server : redis.StrictRedis
            The redis server instance.
        key : str
            Redis key Where to store fingerprints.
        debug : bool, optional
            Whether to log filtered requests.

        """
        self.server = server
        self.key = key
        self.debug = debug
        self.bit = bit
        self.hash_number = hash_number
        self.logdupes = True
        self.bf = BloomFilter(server, self.key, bit, hash_number)    # create the Bloom filter instance (the BloomFilter class needs to be imported)
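Note that changing __init__() and from_settings() alone is not enough: request_seen() also has to be rewritten so that lookups go through the Bloom filter instead of the Redis set. A minimal sketch, using the exists()/insert() API of the class above (verify it against your scrapy-redis version):

    def request_seen(self, request):
        """Returns True if the request's fingerprint has already been seen."""
        fp = self.request_fingerprint(request)
        if self.bf.exists(fp):
            return True
        self.bf.insert(fp)
        return False
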
from_settings():
    @classmethod
    def from_settings(cls, settings):
        """Returns an instance from given settings.

        This uses by default the key ``dupefilter:``. When using the
        ``scrapy_redis.scheduler.Scheduler`` class, this method is not used as
        it needs to pass the spider name in the key.

        Parameters
        ----------
        settings : scrapy.settings.Settings

        Returns
        -------
        RFPDupeFilter
            A RFPDupeFilter instance.


        """
        server = get_redis_from_settings(settings)
        # XXX: This creates one-time key. needed to support to use this
        # class as standalone dupefilter with scrapy's default scheduler
        # if scrapy passes spider on open() method this wouldn't be needed
        # TODO: Use SCRAPY_JOB env as default and fallback to timestamp.
        key = defaults.DUPEFILTER_KEY % {'timestamp': int(time.time())}
        debug = settings.getbool('DUPEFILTER_DEBUG')
        return cls(server, key=key, debug=debug)

 

Change it to:

    @classmethod
    def from_settings(cls, settings):
        """Returns an instance from given settings.

        This uses by default the key ``dupefilter:``. When using the
        ``scrapy_redis.scheduler.Scheduler`` class, this method is not used as
        it needs to pass the spider name in the key.

        Parameters
        ----------
        settings : scrapy.settings.Settings

        Returns
        -------
        RFPDupeFilter
            A RFPDupeFilter instance.


        """
        server = get_redis_from_settings(settings)
        # XXX: This creates one-time key. needed to support to use this
        # class as standalone dupefilter with scrapy's default scheduler
        # if scrapy passes spider on open() method this wouldn't be needed
        # TODO: Use SCRAPY_JOB env as default and fallback to timestamp.
        key = defaults.DUPEFILTER_KEY % {'timestamp': int(time.time())}
        debug = settings.getbool('DUPEFILTER_DEBUG', DUPEFILTER_DEBUG)
        bit = settings.getint('BLOOMFILTER_BIT', BLOOMFILTER_BIT)
        hash_number = settings.getint('BLOOMFILTER_HASH_NUMBER', BLOOMFILTER_HASH_NUMBER)
        return cls(server, key=key, debug=debug, bit=bit, hash_number=hash_number)
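With this change both parameters become tunable from the project's settings.py, using the same setting names as the pip-based setup later on; example values:

# settings.py (example values)
BLOOMFILTER_BIT = 30          # bit array of 2**30 bits (128 MB)
BLOOMFILTER_HASH_NUMBER = 6   # number of hash functions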

 

Scheduler class (Scheduler)

open():

    def open(self, spider):
        self.spider = spider

        try:
            self.queue = load_object(self.queue_cls)(
                server=self.server,
                spider=spider,
                key=self.queue_key % {'spider': spider.name},
                serializer=self.serializer,
            )
        except TypeError as e:
            raise ValueError("Failed to instantiate queue class '%s': %s",
                             self.queue_cls, e)

        try:
            self.df = load_object(self.dupefilter_cls)(
                server=self.server,
                key=self.dupefilter_key % {'spider': spider.name},
                debug=spider.settings.getbool('DUPEFILTER_DEBUG'),
            )
        except TypeError as e:
            raise ValueError("Failed to instantiate dupefilter class '%s': %s",
                             self.dupefilter_cls, e)

        if self.flush_on_start:
            self.flush()
        # notice if there are requests already in the queue to resume the crawl
        if len(self.queue):
            spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue))

Change it to:

    def open(self, spider):
        self.spider = spider
        
        try:
            self.queue = load_object(self.queue_cls)(
                server=self.server,
                spider=spider,
                key=self.queue_key % {'spider': spider.name},
                serializer=self.serializer,
            )
        except TypeError as e:
            raise ValueError("Failed to instantiate queue class '%s': %s",
                             self.queue_cls, e)
        
        try:
            self.df = load_object(self.dupefilter_cls)(
                server=self.server,
                key=self.dupefilter_key % {'spider': spider.name},
                debug=spider.settings.getbool('DUPEFILTER_DEBUG'),
                bit=spider.settings.getint('BLOOMFILTER_BIT', BLOOMFILTER_BIT),    # added
                hash_number=spider.settings.getint('BLOOMFILTER_HASH_NUMBER', BLOOMFILTER_HASH_NUMBER)    # added
            )
        except TypeError as e:
            raise ValueError("Failed to instantiate dupefilter class '%s': %s",
                             self.dupefilter_cls, e)
        
        if self.flush_on_start:
            self.flush()
        # notice if there are requests already in the queue to resume the crawl
        if len(self.queue):
            spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue))

Note that scheduler.py then also needs this import:

from .defaults import BLOOMFILTER_BIT, BLOOMFILTER_HASH_NUMBER

At the same time, the two values need to be defined in defaults.py as well; they are the fallbacks used whenever settings.py does not set them:

BLOOMFILTER_HASH_NUMBER = 6
BLOOMFILTER_BIT = 30
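The modified from_settings() above also falls back to DUPEFILTER_DEBUG, so it is assumed defaults.py defines that too, and dupefilter.py needs to import both the defaults and the Bloom filter class. The module name bloomfilter.py below is an assumption; adjust it to wherever you saved the class:

# defaults.py (assumed addition, referenced by the modified from_settings)
DUPEFILTER_DEBUG = False

# dupefilter.py (assumed imports; bloomfilter.py is wherever you put the class)
from .defaults import BLOOMFILTER_BIT, BLOOMFILTER_HASH_NUMBER, DUPEFILTER_DEBUG
from .bloomfilter import BloomFilter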

 

Approach 2: install the package with pip (much simpler, recommended)

pip install scrapy-redis-bloomfilter

 

Then configure the following in settings.py:

# Distributed crawling configuration
# DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis_bloomfilter.dupefilter.RFPDupeFilter"   # deduplicate requests with the Bloom filter
SCHEDULER = "scrapy_redis_bloomfilter.scheduler.Scheduler"
SCHEDULER_PERSIST = True
LOG_LEVEL = "DEBUG"
# BLOOMFILTER_BIT = 30             # Bloom filter bit array size (m = 2 ** 30)
# BLOOMFILTER_HASH_NUMBER = 10     # number of hash functions

ITEM_PIPELINES = {
   # 'scrapy_redis.pipelines.RedisPipeline': 400,
   'scrapy_redis_bloomfilter.pipelines.RedisPipeline': 400,
}

# REDIS_HOST = '10.250.24.60'    # IP of the master Redis host
REDIS_HOST = '127.0.0.1'         # IP of the master Redis host
REDIS_PORT = 6379
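A minimal spider to go with these settings, as a sketch only: the spider name, redis_key and seed URL are placeholders, and it assumes scrapy-redis is installed alongside so that RedisSpider can read start URLs from Redis.

from scrapy_redis.spiders import RedisSpider


class DemoSpider(RedisSpider):
    name = 'demo'
    redis_key = 'demo:start_urls'   # Redis list the spider pops start URLs from

    def parse(self, response):
        # Every request scheduled here is deduplicated through the Bloom filter.
        yield {'url': response.url, 'title': response.css('title::text').get()}

Start the crawl on each worker with scrapy crawl demo, then push a seed URL from redis-cli, e.g. lpush demo:start_urls http://example.com.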
