7 - A First Look at Distributed Crawlers

Redis basics

  1. CONFIG GET * //get all configuration settings
  2. CONFIG SET loglevel "notice" //set a configuration option
  3. PING //check that Redis is up and reachable
  4. SET CLASS 1803 //create a key
  5. KEYS * //list keys
  6. EXISTS CLASS //check whether a key exists
  7. DEL CLASS //delete a key
  8. EXISTS CLASS //check again: the key is now gone
  9. SET CLASS 1803 EX 3 //create a key with an expiration time in seconds
  10. SET CLASS 1803 PX 3000 //create a key with an expiration time in milliseconds
  11. EXPIRE CLASS 3 //set an expiration time on an existing key
  12. PERSIST CLASS //remove the expiration time
  13. SADD student zhangsan //add a member to a set
  14. SCARD student //return the number of members in a set
  15. LPUSH //push an element onto the head of a list
  16. LPOP //pop an element from the head of a list
  17. BLPOP //blocking pop: wait until the list has an element, then pop it
  18. SMEMBERS //return all members of a set

For more commands, see: http://redisdoc.com/list/
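The same commands can also be issued from Python with the redis-py client used throughout this article. A minimal sketch (the key names simply mirror the examples above):

from redis import Redis

rds = Redis('127.0.0.1', port=6379)
rds.set('CLASS', 1803, ex=3)                 # SET CLASS 1803 EX 3
print(rds.keys('*'))                         # KEYS *
print(rds.exists('CLASS'))                   # EXISTS CLASS
rds.sadd('student', 'zhangsan')              # SADD student zhangsan
print(rds.scard('student'))                  # SCARD student
print(rds.smembers('student'))               # SMEMBERS student
rds.lpush('spider:urls', 'https://m.sogou.com/')  # LPUSH spider:urls ...
print(rds.blpop('spider:urls'))              # BLPOP -> (b'spider:urls', b'https://m.sogou.com/')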

Distributed crawler steps

  1. Write the crawler script redis_spider.py
  2. Deploy
  • Server: start redis-server
  • Server-side redis_spider.py: rds = Redis('127.0.0.1', 6379)
  • Node-side redis_spider.py: rds = Redis('<server ip>', 6379)
    e.g. rds = Redis('10.31.161.59', 6379)
  3. Run the same script on every machine (see the sketch below)

Server: python redis_spider.py

Node: python redis_spider.py
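Since the server and node copies of redis_spider.py differ only in the Redis host they connect to, one convenient option (an assumption of this sketch, not part of the original script) is to pass the host on the command line instead of editing the file on every machine:

# sketch: python redis_spider.py                 -> connects to 127.0.0.1 (server)
#         python redis_spider.py 10.31.161.59    -> connects to the server's LAN ip (node)
import sys
from redis import Redis

redis_host = sys.argv[1] if len(sys.argv) > 1 else '127.0.0.1'
rds = Redis(redis_host, 6379)
print(rds.ping())  # True if the connection works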

Deploying Redis on a LAN for distributed crawling

[Server-side configuration]
Edit redis.conf and add the machine's LAN address to the bind directive so that other hosts can connect:
bind 127.0.0.1 10.31.161.59
Save the file and restart redis-server:
./redis-server /path/to/redis.conf

[Connecting from a client]
redis-cli -h 10.31.161.59 -p 6379
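To verify the connection from Python rather than redis-cli, a quick check (reusing the example LAN address above) looks like this:

from redis import Redis

rds = Redis(host='10.31.161.59', port=6379)
print(rds.ping())  # prints True when the server is reachable from this machine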

Extracting data from Redis

"""
假设,redis数据库中有一个键名为('spider:items'的list,现在
我们要把这个键中的数据及时地转存到其它存储介质中.('spider:items'中的数据是实时生成的)

方案:
由于'spider:items'中的数据是实时生成的,我们要及时地完成转存数据,可以每隔一段时间从
redis的'spider:items'中取出一条数据,保存到其它存储介质中,代码试下思路如下:
"""


from redis import Redis
import time

rds = Redis('127.0.0.1', port=6379)
while True:
    # Watch the 'spider:items' key in Redis and pull one item roughly every 2 seconds.
    # Note that blpop is used here: it blocks until an item is available.
    key, value = rds.blpop('spider:items')
    time.sleep(2)
    # Save value to another storage medium ...
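The "save to another storage medium" step is left open above. As one possible sketch, assuming each item pushed to 'spider:items' is already a serialized string and items.jl is just a hypothetical local file name, the loop could append every item to a JSON Lines file:

from redis import Redis

rds = Redis('127.0.0.1', port=6379)
with open('items.jl', 'a', encoding='utf-8') as f:
    while True:
        # blpop blocks until an item arrives, so no extra sleep is needed here
        _, value = rds.blpop('spider:items')
        f.write(value.decode('utf-8') + '\n')
        f.flush()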

Designing a retry decorator


import time
import requests

RETRY_TIME = 3
DOWNLOAD_DELAY = 2


class Retry(object):
    """Retry decorator: call the wrapped function up to `retries` times,
    sleeping `delay` seconds between attempts."""

    def __init__(self, retries=3, delay=0):
        self.retries = retries
        self.delay = delay

    def __call__(self, func):
        def wrapper(*args, **kwargs):
            for i in range(self.retries):
                try:
                    result = func(*args, **kwargs)
                except Exception as e:
                    # Attempt failed: report the error, wait, then try again.
                    print(e)
                    time.sleep(self.delay)
                    continue
                else:
                    return result
        return wrapper


@Retry(RETRY_TIME, DOWNLOAD_DELAY)
def fetch(url):
    print(f'Start fetch {url}')
    resp = requests.get(url, timeout=5)
    print(resp.status_code)


fetch('http://www.baidu.com')
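Note that when every attempt raises, the wrapper above simply returns None. If the caller needs to tell "all retries failed" apart from a normal empty result, one possible variant (RetryRaise is a name introduced here, not part of the original) re-raises the last exception instead:

import time

class RetryRaise(object):
    """Like Retry, but re-raise the last exception once all attempts fail."""

    def __init__(self, retries=3, delay=0):
        self.retries = retries
        self.delay = delay

    def __call__(self, func):
        def wrapper(*args, **kwargs):
            last_exc = None
            for _ in range(self.retries):
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    last_exc = e
                    time.sleep(self.delay)
            raise last_exc
        return wrapper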

Complete distributed crawler code

"""
爬虫:
    for url in urls:
        url -> 发送请求 ->  获得response ->  解析response -> 保存数据

多线程爬虫:
    urls 保存在本地内存中
    work(url -> 发送请求 ->  获得response ->  解析response -> 保存数据)
    启用多个work

多线程分布式爬虫:
    urls 保存在redis内存数据库中
    多台电脑 从redis内存数据库取url,
    每台电脑执行的操作是:work(url -> 发送请求 ->  获得response ->  解析response -> 保存数据)

"""
from redis import Redis
import threading
from lxml import etree
import requests
import time
import sys

IDLE = 0     # idle state
WORKING = 1  # working state
REDIS_SPIDER_URLS_KEY = 'spider:urls'
start_url = 'https://m.sogou.com/'
ALLOW_DOMAIN = 'sogou.com'
MAX_DOWNLOAD_THREAD = 10
RETRY_TIME = 3
DOWNLOAD_DELAY = 10
crawled_url = set()  # crawled urls (the Redis set REDIS_CRAWLED_URL below is what the code actually uses)
REDIS_CRAWLED_URL = 'crawled_url'
rds = Redis('127.0.0.1', 6379)


class Retry(object):
    """Retry decorator: call the wrapped function up to `retries` times,
    sleeping `delay` seconds between attempts."""

    def __init__(self, retries=3, delay=0):
        self.retries = retries
        self.delay = delay

    def __call__(self, func):
        def wrapper(*args, **kwargs):
            for i in range(self.retries):
                try:
                    result = func(*args, **kwargs)
                except Exception as e:
                    print(e)
                    time.sleep(self.delay)
                    continue
                else:
                    return result
        return wrapper



@Retry(RETRY_TIME, DOWNLOAD_DELAY)
def fetch(url):
    """
    Download a page. Return the page source as a str on success, otherwise return None.
    :param url:
    :return: page content as str, or None
    """
    # Only download the page if this is a new url; a url that was crawled before is skipped.
    # sadd returns 1 when the url was not yet in the set of crawled urls.
    print(url)
    print(rds.smembers(REDIS_CRAWLED_URL))
    if rds.sadd(REDIS_CRAWLED_URL, url) == 1:
        print(f'Start fetching .. {url}')
        resp = requests.get(url)
        print(resp.status_code)
        if resp.status_code == 200:
            return resp.text
    return None


def parse(html):
    """
    Parse the html and extract all urls from it.
    :param html: page source
    :return: a list of urls, or None
    """
    if html is None:
        return None
    tree = etree.HTML(html)
    urls = tree.xpath('//a/@href')
    new_urls = []
    print(urls)
    for url in urls:
        # keep only absolute links that stay within the allowed domain
        if ALLOW_DOMAIN in url and url.startswith('http'):
            print(url)
            new_urls.append(url)
    if not new_urls:
        return None
    return new_urls



class Spider(threading.Thread):
    """Crawler thread."""

    def __init__(self):
        super(Spider, self).__init__()
        self.status = IDLE
        # Daemon threads let the whole process exit once main() calls sys.exit().
        self.daemon = True

    @property
    def is_idle(self):
        """
        Return True if this thread is currently idle.
        """
        return self.status == IDLE

    def run(self):
        while True:
            # Take a url from the Redis url queue; block while the queue is empty.
            url = rds.blpop(REDIS_SPIDER_URLS_KEY)[1].decode('utf-8')
            print(url)
            # Start crawling the page behind this url.
            self.status = WORKING
            # Download the page source.
            html = fetch(url)
            # Parse it.
            urls = parse(html)
            print(urls)
            if urls is not None:
                new_urls = set(urls)  # deduplicate the urls found on the page
                rds.lpush(REDIS_SPIDER_URLS_KEY, *new_urls)
            # This url is finished; mark the thread idle again.
            self.status = IDLE


def all_is_idle(spiders):
    """
    Return True if every crawler thread is idle, otherwise False.
    :param spiders: all crawler threads
    :return: bool
    """
    return all(spider.is_idle for spider in spiders)



def main(n):
    # Seed the url queue and create n crawler threads.
    rds.lpush(REDIS_SPIDER_URLS_KEY, start_url)
    spider_pools = []
    for i in range(n):
        spider = Spider()
        spider_pools.append(spider)

    for spider in spider_pools:
        spider.start()

    while True:
        # Watch the pending-url queue in Redis.
        # When the queue is empty and every thread is idle, the crawl is finished.
        if rds.llen(REDIS_SPIDER_URLS_KEY) == 0 and all_is_idle(spider_pools):
            print("All threads have finished crawling.")
            sys.exit(0)
        time.sleep(2)


if __name__ == '__main__':
    main(MAX_DOWNLOAD_THREAD)
