(三)分布式爬虫(2)——豆瓣小组爬虫案例

之前写的都是单机爬虫,在一个机器上可以运行,这节就研究一下分布式爬虫应该怎么写。
scrapy-redis创建项目的过程与之前的scrapy一样:都是先用命令行创建项目,然后再创建爬虫。等这个爬虫的主体逻辑都写完之后,再改一下爬虫继承的类名,修改一下settings.py文件就OK了。


现在就开始这个案例,默认是安装好了redis和docker。
如果没有安装好,可以在linux上安装一个docker然后pull一个redis镜像到本地即可。docker安装过程。
OK,现在开始,首先分析一下需要爬取的页面,豆瓣小组讨论。

豆瓣小组

有网址,还有7个可以点击的标签页,其中“精选”就是当前页面,剩下的六个都有各自的URL后缀。我们选择使用CrawlSpider进行链接匹配。


OK,上爬虫代码:

# -*- coding: utf-8 -*-

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from .. import utils, items, LinkExtractorRule
import re

class SpiderSpider(CrawlSpider):
    """Crawl Douban group pages and emit topic, group and comment items.

    Links are classified by the ``process_value`` hooks in
    ``LinkExtractorRule`` and routed to ``parse_topic``, ``parse_group`` or
    ``parse_comment`` through ``rules``.
    """

    name = 'spider'
    allowed_domains = ['www.douban.com']
    # Alternative start URL (disabled):
    # start_urls = ['https://www.douban.com/group/topic/124785371/']
    start_urls = ['https://www.douban.com/group/explore']

    # Patterns are compiled once at class creation instead of on every response.
    _NUMERIC_ID_RE = re.compile(r"/(\d+)/")
    _GROUP_SLUG_RE = re.compile(r"/group/([^/?#]+)")
    _SIDEBAR_GROUP_RE = re.compile(r"/group/(\S+)/\?ref=sidebar")
    _DATE_RE = re.compile(r"\d+-\d+-\d+")

    # Pagination links.
    page_link = LinkExtractor(process_value=LinkExtractorRule.page_process_value, allow_domains=('www.douban.com',))
    # Discussion topics listed on each page.
    topic_link = LinkExtractor(process_value=LinkExtractorRule.topic_process_value, allow_domains=('www.douban.com',))
    # Group home pages.
    group_link = LinkExtractor(process_value=LinkExtractorRule.group_process_value, allow_domains=('www.douban.com',))
    # Comment pagination inside a topic.
    comment_link = LinkExtractor(process_value=LinkExtractorRule.comment_process_value, allow_domains=('www.douban.com',))

    # Matching rules. Callbacks must not be named parse(): CrawlSpider uses
    # parse() itself to drive the rules.
    rules = (
        Rule(page_link, follow=True),
        Rule(topic_link, callback="parse_topic", follow=True),
        Rule(group_link, callback="parse_group", follow=False),
        Rule(comment_link, callback="parse_comment", follow=True),
    )

    @staticmethod
    def _search(pattern, text, group=0, default=""):
        """Return ``group`` of the first ``pattern`` match in ``text``.

        Returns ``default`` when ``text`` is empty or nothing matches, so a
        page with unexpected markup does not abort the callback with an
        ``AttributeError`` on ``None.group()``.
        """
        match = pattern.search(text) if text else None
        return match.group(group) if match else default

    def parse_topic(self, response):
        """Yield the TopicItem for a topic page, then its first-page comments."""
        print(response.url)
        topic_id = self._search(self._NUMERIC_ID_RE, response.url, group=1)
        if not topic_id:
            # Without an id the item could not be linked to its comments.
            self.logger.warning("no topic id in %s, skipping", response.url)
            return

        topic_item = items.TopicItem()
        topic_item["id"] = topic_id
        topic_item["title"] = utils.is_None(response.xpath('//*[@class="tablecc"]'))[0]
        topic_item["person_name"] = utils.is_None(response.xpath('//*[@class="from"]/a/text()'))[0]
        topic_item["content"] = utils.is_None(response.xpath('//*[@class="topic-content"]'))[0]
        group_href = utils.is_None(
            response.xpath('//*[@class="group-item"]/div[@class="info"]/div[@class="title"]/a/@href')
        )[0]
        # The group is identified by the path segment after /group/ in the
        # sidebar link; it is treated as an opaque slug, not as a number.
        topic_item["group_id"] = self._search(self._SIDEBAR_GROUP_RE, group_href, group=1)
        yield topic_item
        yield from self.get_comments(response, topic_id)

    def parse_group(self, response):
        """Yield the GroupItem for a group home page."""
        # Use the same slug that parse_topic stores as group_id so the two
        # items can be joined; fall back to the first numeric path segment.
        group_id = self._search(self._GROUP_SLUG_RE, response.url, group=1)
        if not group_id:
            group_id = self._search(self._NUMERIC_ID_RE, response.url, group=1)

        board = '//*[@id="content"]/div[@class="grid-16-8 clearfix"]/div[@class="article"]/div[@class="group-board"]'
        group_item = items.GroupItem()
        group_item["id"] = group_id
        group_item["name"] = utils.is_None(response.xpath('//*[@id="group-info"]/div/h1/text()'))[0]
        group_item["leader"] = utils.is_None(response.xpath(board + '/p/a/text()'))[0]
        created = utils.is_None(response.xpath(board + '/p/text()'))[0]
        # "0000-00-00" is the placeholder used when no date can be found.
        group_item["time"] = self._search(self._DATE_RE, created, default="0000-00-00")
        group_item["content"] = utils.is_None(response.xpath(board + '/div[@class="group-intro"]'))[0]
        yield group_item

    def parse_comment(self, response):
        """Yield the CommentItems found on a comment pagination page."""
        topic_id = self._search(self._NUMERIC_ID_RE, response.url, group=1)
        if not topic_id:
            self.logger.warning("no topic id in %s, skipping", response.url)
            return
        yield from self.get_comments(response, topic_id)

    def get_comments(self, response, topic_id):
        """Return a list of CommentItems for every ``li`` under ``#comments``.

        Each item carries ``topic_id`` so it can be linked back to its topic.
        """
        item_list = []
        for li in response.xpath('//*[@id="comments"]').xpath("./li"):
            item = items.CommentItem()
            item["id"] = utils.is_None(li.xpath("./@data-cid"))[0]
            item["person_name"] = utils.is_None(
                li.xpath('./div[@class="reply-doc content"]/div[@class="bg-img-green"]/h4/a/text()')
            )[0]
            item["content"] = utils.is_None(li.xpath('./div[@class="reply-doc content"]/p/text()'))[0]
            item["topic_id"] = topic_id
            item_list.append(item)
        return item_list

首先保证数据可以正常的爬取到,然后再改成scrapy-redis项目。
首先修改settings.py

# -*- coding: utf-8 -*-

BOT_NAME = 'douban_group_spider'
SPIDER_MODULES = ['douban_group_spider.spiders']
NEWSPIDER_MODULE = 'douban_group_spider.spiders'

# Use the duplicate filter that ships with scrapy-redis.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Use the scrapy-redis scheduler.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Schedule requests through a queue.
SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"
# Allow pausing: the request records kept in redis are not discarded.
SCHEDULER_PERSIST = True

ROBOTSTXT_OBEY = False  # do not obey robots.txt

DOWNLOAD_DELAY = 1  # delay between requests

COOKIES_ENABLED = False

# NOTE(review): the first entry is named like a spider middleware but is
# registered as a downloader middleware -- confirm this is intended.
DOWNLOADER_MIDDLEWARES = {
    'douban_group_spider.middlewares.DoubanGroupSpiderSpiderMiddleware': 543,
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 560,
    'douban_group_spider.middlewares.ABProxyMiddleware': 550,
    'douban_group_spider.middlewares.UserAgentMiddleware': 600,
}

ITEM_PIPELINES = {
    'douban_group_spider.pipelines.DoubanGroupSpiderPipeline': 300,
    # Stores scraped items in redis; must be enabled.
    'scrapy_redis.pipelines.RedisPipeline': 900,
}


# Abuyun IP proxy configuration, including account and password.
import base64
PROXY_USER = "通行证书"
PROXY_PASS = "通行密钥"
PROXY_HOST = "HTTP隧道服务器地址"
PROXY_PORT = "端口"
# HTTP Basic credentials use the standard base64 alphabet, not the URL-safe
# one. Encoding as UTF-8 keeps this module importable even while the
# non-ASCII placeholders above have not been replaced yet.
PROXY_AUTH = "Basic " + base64.b64encode((PROXY_USER + ":" + PROXY_PASS).encode("utf-8")).decode("ascii")
PROXY_SERVER = "http://" + PROXY_HOST + ":" + PROXY_PORT


# Redis connection settings.
REDIS_HOST = '192.168.1.130'
REDIS_PORT = 9901
REDIS_DB = 0
# No whitespace may appear between host and port, otherwise the URL is malformed.
REDIS_URL = 'redis://{}:{}/{}'.format(REDIS_HOST, REDIS_PORT, REDIS_DB)

然后修改spider文件

# -*- coding: utf-8 -*-

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import  Rule
from scrapy_redis.spiders import RedisCrawlSpider
from .. import utils, items, LinkExtractorRule
import re

#docker run --name redis-test -i -t -p 0.0.0.0:9901:6379 -d redis /bin/bash   docker后台运行redis命令(宿主机端口与settings.py中的REDIS_PORT=9901保持一致)

class SpiderSpider(RedisCrawlSpider):
    """Redis-fed variant of the Douban group spider.

    No ``start_urls`` are declared here; ``redis_key`` names the Redis key
    under which the start URLs are queued.
    """

    redis_key = "douban_spider:start_urls"
    allowed_domains = ["www.douban.com"]
    name = "spider"

    # The link extractors, rules and parse callbacks are identical to the
    # CrawlSpider version.
    # .......

OK,此时这个项目就是scrapy-redis项目了,可以放到几个机器上运行。
首先运行爬虫,multiprocessing是关于多进程运行的库。

from scrapy import cmdline
from multiprocessing import Pool as Process_Pool
from time import sleep


def run_spider(number):
    """Run one ``scrapy crawl spider`` inside the current worker process.

    Scrapy's ``cmdline.execute`` ends by calling ``sys.exit``. ``SystemExit``
    is caught here so the pool worker can still report the task as finished.
    Otherwise the worker dies before posting a result and ``Pool.join`` may
    wait forever.

    :param number: index of this spider process, used only for the log line.
    :return: the exit code passed to ``sys.exit``, or ``None``.
    """
    print("进程" + str(number))
    try:
        cmdline.execute(["scrapy", "crawl", "spider"])
    except SystemExit as exc:
        return exc.code
    return None


if __name__ == '__main__':
    spider_count = 5
    # Size the pool explicitly: Pool() defaults to the CPU count, which may be
    # smaller than the number of spiders requested. maxtasksperchild=1 gives
    # every crawl a fresh process, so one process never starts a second crawl.
    p_pool = Process_Pool(processes=spider_count, maxtasksperchild=1)
    results = []
    for i in range(spider_count):
        results.append(p_pool.apply_async(run_spider, args=(i,)))
        sleep(1)
    p_pool.close()
    p_pool.join()
    # Surface failures that apply_async would otherwise swallow silently.
    for i, result in enumerate(results):
        try:
            result.get()
        except Exception as exc:
            print("spider process %d failed: %r" % (i, exc))
    # Kept from the original; presumably lets child output flush -- TODO confirm.
    sleep(5)

然后添加一条start_url到redis中。

# coding=utf-8
from redis import StrictRedis,ConnectionPool

# Queue the seed URL under the key used as the spider's redis_key, going
# through a redis connection pool that is released afterwards.
START_URLS_KEY = "douban_spider:start_urls"
SEED_URL = "https://www.douban.com/group/explore"

pool = ConnectionPool.from_url("redis://@192.168.1.130:9901/0")
redis = StrictRedis(connection_pool=pool)
redis.lpush(START_URLS_KEY, SEED_URL)
pool.disconnect()

此时所有的爬虫就开始工作了。
最后就是把爬到的数据从redis中存入到mysql中。
完整代码

你可能感兴趣的:((三)分布式爬虫(2)——豆瓣小组爬虫案例)