之前写的都是单机爬虫,在一个机器上可以运行,这节就研究一下分布式爬虫应该怎么写。
scrapy-redis创建项目的过程与之前的scrapy一样:先用命令行创建项目,然后再创建爬虫。等爬虫的主体逻辑都写完之后,再改一下爬虫继承的类名,并修改一下settings.py文件就OK了。
现在就开始这个案例,默认是安装好了redis和docker。
如果没有安装好,可以在linux上安装一个docker然后pull一个redis镜像到本地即可。docker安装过程。
OK,现在开始,首先分析一下需要爬取的页面,豆瓣小组讨论。
有网址,还有7个可以点击的标签页,其中“精选”就是当前页面,剩下的六个都有各自的URL后缀。我们选择使用CrawlSpider进行链接匹配。
OK,上爬虫代码:
# -*- coding: utf-8 -*-
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from .. import utils, items, LinkExtractorRule
import re
class SpiderSpider(CrawlSpider):
    """Crawl Douban group pages and emit topic, group and comment items.

    Links are selected by the ``process_value`` hooks of the project-local
    ``LinkExtractorRule`` module and dispatched through ``rules``.  Every
    node value is read through the project-local ``utils.is_None`` helper
    and indexed with ``[0]``.
    NOTE(review): ``utils.is_None`` presumably substitutes a default (``""``)
    when the XPath matches nothing -- confirm in ``utils``.
    """

    name = 'spider'
    allowed_domains = ['www.douban.com']
    # start_urls = ['https://www.douban.com/group/topic/124785371/']
    start_urls = ['https://www.douban.com/group/explore']

    # Patterns are compiled once here instead of inside every callback.
    # First all-digit path segment of a URL, e.g. ".../topic/123/" -> "123".
    _NUMERIC_ID_RE = re.compile(r"/(\d+)/")
    # Group identifier inside a sidebar link ".../group/<id>/?ref=sidebar".
    _GROUP_REF_RE = re.compile(r"/group/(\S+)/\?ref=sidebar")
    # Raw string: the original non-raw "\d+\-\d+\-\d+" relied on invalid escapes.
    _DATE_RE = re.compile(r"\d+-\d+-\d+")
    # Placeholder stored when a group page shows no creation date.
    _MISSING_DATE = "0000-00-00"
    # Common prefix of every XPath that reads the group description board.
    _GROUP_BOARD_XPATH = (
        '//*[@id="content"]/div[@class="grid-16-8 clearfix"]'
        '/div[@class="article"]/div[@class="group-board"]'
    )

    # Pagination links.
    page_link = LinkExtractor(
        process_value=LinkExtractorRule.page_process_value,
        allow_domains=('www.douban.com',),
    )
    # Discussion topics listed on each page.
    topic_link = LinkExtractor(
        process_value=LinkExtractorRule.topic_process_value,
        allow_domains=('www.douban.com',),
    )
    # Group home pages.
    group_link = LinkExtractor(
        process_value=LinkExtractorRule.group_process_value,
        allow_domains=('www.douban.com',),
    )
    # Pagination of a topic's comments.
    comment_link = LinkExtractor(
        process_value=LinkExtractorRule.comment_process_value,
        allow_domains=('www.douban.com',),
    )

    # Matching rules.  A callback must not be named parse(): CrawlSpider uses
    # that method itself.
    rules = (
        Rule(page_link, follow=True),
        Rule(topic_link, callback="parse_topic", follow=True),
        Rule(group_link, callback="parse_group", follow=False),
        Rule(comment_link, callback="parse_comment", follow=True),
    )

    def _numeric_id(self, url):
        """Return the first all-digit path segment of *url*, or None if absent."""
        match = self._NUMERIC_ID_RE.search(url)
        return match.group(1) if match else None

    def parse_topic(self, response):
        """Yield the TopicItem of a topic page followed by that page's comments.

        Pages whose URL carries no numeric id are skipped with a warning
        instead of raising AttributeError.
        """
        self.logger.debug("Parsing topic %s", response.url)
        topic_id = self._numeric_id(response.url)
        if topic_id is None:
            self.logger.warning("No numeric id in topic URL, skipped: %s", response.url)
            return

        topic_item = items.TopicItem()
        topic_item["id"] = topic_id
        topic_item["title"] = utils.is_None(response.xpath('//*[@class="tablecc"]'))[0]
        topic_item["person_name"] = utils.is_None(response.xpath('//*[@class="from"]/a/text()'))[0]
        topic_item["content"] = utils.is_None(response.xpath('//*[@class="topic-content"]'))[0]

        group_href = utils.is_None(response.xpath(
            '//*[@class="group-item"]/div[@class="info"]/div[@class="title"]/a/@href'
        ))[0]
        group_match = self._GROUP_REF_RE.search(group_href) if group_href else None
        if group_match is None:
            # Keep the topic even when the sidebar link is missing or has an
            # unexpected shape; only the group reference is left empty.
            self.logger.warning("No group link found on topic page: %s", response.url)
        topic_item["group_id"] = group_match.group(1) if group_match else ""
        yield topic_item

        yield from self.get_comments(response, topic_id)

    def parse_group(self, response):
        """Yield the GroupItem described by a group home page.

        Pages whose URL carries no numeric id are skipped with a warning.
        """
        group_id = self._numeric_id(response.url)
        if group_id is None:
            self.logger.warning("No numeric id in group URL, skipped: %s", response.url)
            return

        board = self._GROUP_BOARD_XPATH
        group_item = items.GroupItem()
        group_item["id"] = group_id
        group_item["name"] = utils.is_None(response.xpath('//*[@id="group-info"]/div/h1/text()'))[0]
        group_item["leader"] = utils.is_None(response.xpath(board + "/p/a/text()"))[0]

        board_text = utils.is_None(response.xpath(board + "/p/text()"))[0]
        date_match = self._DATE_RE.search(board_text) if board_text else None
        # Fall back to the placeholder both when the text is empty and when
        # it contains no date (the latter used to raise AttributeError).
        group_item["time"] = date_match.group() if date_match else self._MISSING_DATE

        group_item["content"] = utils.is_None(response.xpath(board + '/div[@class="group-intro"]'))[0]
        yield group_item

    def parse_comment(self, response):
        """Yield the CommentItems found on a follow-up comment page of a topic."""
        topic_id = self._numeric_id(response.url)
        if topic_id is None:
            self.logger.warning("No numeric id in comment page URL, skipped: %s", response.url)
            return
        yield from self.get_comments(response, topic_id)

    def get_comments(self, response, topic_id):
        """Return a list with one CommentItem per ``<li>`` under ``#comments``.

        :param response: page whose comment list is read.
        :param topic_id: id stored in every item's ``topic_id`` field.
        :return: list of ``items.CommentItem`` (empty when there are no comments).
        """
        item_list = []
        reply_xpath = './div[@class="reply-doc content"]'
        for li in response.xpath('//*[@id="comments"]').xpath("./li"):
            item = items.CommentItem()
            item["id"] = utils.is_None(li.xpath("./@data-cid"))[0]
            item["person_name"] = utils.is_None(
                li.xpath(reply_xpath + '/div[@class="bg-img-green"]/h4/a/text()')
            )[0]
            item["content"] = utils.is_None(li.xpath(reply_xpath + "/p/text()"))[0]
            item["topic_id"] = topic_id
            item_list.append(item)
        return item_list
首先保证数据可以正常的爬取到,然后再改成scrapy-redis项目。
首先修改settings.py
# -*- coding: utf-8 -*-
import base64

# Scrapy settings for the douban_group_spider project, configured for
# scrapy-redis so that several crawler processes can share one request queue.

BOT_NAME = 'douban_group_spider'
SPIDER_MODULES = ['douban_group_spider.spiders']
NEWSPIDER_MODULE = 'douban_group_spider.spiders'

# Use the duplicate filter shipped with scrapy-redis.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Use the scrapy-redis scheduler.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Schedule requests through a queue.
SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"
# Allow pausing: the request records kept in redis are not discarded.
SCHEDULER_PERSIST = True

ROBOTSTXT_OBEY = False  # do not obey robots.txt
DOWNLOAD_DELAY = 1  # delay between requests
COOKIES_ENABLED = False

DOWNLOADER_MIDDLEWARES = {
    # NOTE(review): the class name suggests a *spider* middleware -- confirm
    # that it really belongs under DOWNLOADER_MIDDLEWARES.
    'douban_group_spider.middlewares.DoubanGroupSpiderSpiderMiddleware': 543,
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 560,
    'douban_group_spider.middlewares.ABProxyMiddleware': 550,
    'douban_group_spider.middlewares.UserAgentMiddleware': 600,
}

ITEM_PIPELINES = {
    'douban_group_spider.pipelines.DoubanGroupSpiderPipeline': 300,
    # Stores scraped items in redis; must be enabled.
    'scrapy_redis.pipelines.RedisPipeline': 900,
}

# Abuyun IP proxy configuration, including account and password.
# The four values below are placeholders to be replaced with real ones.
PROXY_USER = "通行证书"
PROXY_PASS = "通行密钥"
PROXY_HOST = "HTTP隧道服务器地址"
PROXY_PORT = "端口"
# for Python3
# Encode as UTF-8 rather than ASCII: the result is identical for ASCII
# credentials, but non-ASCII values (such as the placeholders above) no
# longer raise UnicodeEncodeError and make the whole settings module
# unimportable.
# NOTE(review): HTTP Basic auth normally uses the standard Base64 alphabet;
# the urlsafe variant is kept from the original -- confirm with the provider.
PROXY_AUTH = "Basic " + base64.urlsafe_b64encode(
    (PROXY_USER + ":" + PROXY_PASS).encode("utf-8")
).decode("utf8")
PROXY_SERVER = "http://" + PROXY_HOST + ":" + PROXY_PORT

# Redis database configuration.
REDIS_HOST = '192.168.1.130'
REDIS_PORT = 9901
REDIS_DB = 0
# No whitespace is allowed inside the URL (the original ': ' separator put a
# space between host and port).
REDIS_URL = "redis://{}:{}/{}".format(REDIS_HOST, REDIS_PORT, REDIS_DB)
然后修改spider文件
# -*- coding: utf-8 -*-
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule
from scrapy_redis.spiders import RedisCrawlSpider
from .. import utils, items, LinkExtractorRule
import re
#docker run --name redis-test -i -t -p 0.0.0.0:9901:6379 -d redis /bin/bash docker后台运行redis命令(宿主机端口需与settings.py中的REDIS_PORT=9901保持一致)
class SpiderSpider(RedisCrawlSpider):
    # scrapy-redis variant of the spider: only the base class and the source
    # of the start URLs differ from the CrawlSpider version.
    name = 'spider'
    allowed_domains = ['www.douban.com']
    # start_urls = ['https://www.douban.com/group/topic/124785371/']
    # start_urls = ['https://www.douban.com/group/explore']
    # Redis key holding the start URLs; replaces the hard-coded start_urls
    # above.  A start URL has to be pushed to this key for the crawl to begin.
    redis_key = "douban_spider:start_urls"
    # The remaining logic is identical to the CrawlSpider version.
    # .......
OK,此时这个项目就是scrapy-redis项目了,可以放到几台机器上运行。
首先运行爬虫,multiprocessing是关于多进程运行的库。
from scrapy import cmdline
from multiprocessing import Pool as Process_Pool
from time import sleep
def run_spider(number):
    """Print the worker index, then start ``scrapy crawl spider`` in this process.

    :param number: index of the worker, used only for the printed label.
    """
    # print() applies str() to each argument, so the output equals
    # "线程" + str(number).
    print("线程", number, sep="")
    crawl_argv = ["scrapy", "crawl", "spider"]
    cmdline.execute(crawl_argv)
if __name__ == '__main__':
    # Number of spider processes to run side by side.
    spider_count = 5
    # Size the pool explicitly: Pool() without arguments creates one worker
    # per CPU, so on a machine with fewer than ``spider_count`` cores some of
    # the submitted spiders would stay queued until a running one exits.
    p_pool = Process_Pool(processes=spider_count)
    for i in range(spider_count):
        p_pool.apply_async(run_spider, args=(i,))
        # Stagger the start-up of the spiders by one second each.
        sleep(1)
    # No further tasks will be submitted; wait for the workers to finish.
    p_pool.close()
    p_pool.join()
    sleep(5)
然后添加一条start_url到redis中。
# coding=utf-8
from redis import StrictRedis,ConnectionPool
# Redis connection pool.
pool = ConnectionPool.from_url("redis://@192.168.1.130:9901/0")
try:
    redis = StrictRedis(connection_pool=pool)
    # Seed the crawl: the key is the same string as the spider's ``redis_key``.
    redis.lpush("douban_spider:start_urls", "https://www.douban.com/group/explore")
finally:
    # Release the pooled connections even when the push fails.
    pool.disconnect()
此时所有的爬虫就开始工作了。
最后就是把爬到的数据从redis中存入到mysql中。
完整代码