scrapy-redis is a Redis-based Scrapy component that makes it easy to build a simple distributed crawler. At its core it provides three things: a shared scheduler (the request queue lives in Redis), a shared duplicate filter (request fingerprints live in Redis), and an item pipeline that writes scraped items into Redis.
The walkthrough below crawls the Dingdian novel site to give you a quick feel for how scrapy-redis is used.
items.py:
import scrapy


class SrTestItem(scrapy.Item):
    # Fields scraped from each novel's detail page.
    auth = scrapy.Field()              # author name
    last_update_time = scrapy.Field()  # last update time shown on the page
    url = scrapy.Field()               # URL of the detail page
spiders/sr_spider.py:
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule
from scrapy_redis.spiders import RedisCrawlSpider

from sr_test.items import SrTestItem


class DingdianSpider(RedisCrawlSpider):
    # A RedisCrawlSpider has no hard-coded start_urls: the start URL is
    # pushed into Redis instead, by default under the key
    # '<name>:start_urls' ('dingdian:start_urls' here) -- see the seeding
    # example after this spider.
    name = 'dingdian'
    # start_urls = ['http://www.23us.so/']

    rules = (
        # Category listing pages: just follow them to find more links.
        Rule(LinkExtractor(allow=r'/list/\d+_\d+\.html'), follow=True),
        # Novel detail pages: extract the item fields.
        Rule(LinkExtractor(allow=r'/xiaoshuo/\d+\.html'), callback='parse_item'),
    )

    def parse_item(self, response):
        item = SrTestItem()
        item['auth'] = response.xpath('//*[@id="at"]/tr[1]/td[2]/text()').extract_first()
        item['last_update_time'] = response.xpath('//*[@id="at"]/tr[2]/td[3]/text()').extract_first()
        item['url'] = response.url
        return item
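Because the spider defines no start_urls, the crawl sits idle until a start URL appears in Redis. A minimal way to seed it (a sketch, assuming Redis runs on localhost; 'dingdian:start_urls' follows scrapy-redis's default '<name>:start_urls' pattern):

import redis

# Push the entry URL; every waiting spider instance competes to pop it.
r = redis.Redis(host='localhost', port=6379)
r.lpush('dingdian:start_urls', 'http://www.23us.so/')

The same can be done from redis-cli with: lpush dingdian:start_urls http://www.23us.so/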
settings.py (this is the important part!):
BOT_NAME = 'sr_test'
SPIDER_MODULES = ['sr_test.spiders']
NEWSPIDER_MODULE = 'sr_test.spiders'
ROBOTSTXT_OBEY = False

# Share one request queue in Redis across every spider instance.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Keep request fingerprints in Redis so deduplication works across machines.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Connection to the shared Redis instance.
REDIS_URL = 'redis://localhost:6379'
# Push every scraped item onto the Redis list '<spider>:items'.
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 300,
}
# Keep the queue and fingerprints in Redis when the spider closes, so a
# crawl can be paused and resumed.
SCHEDULER_PERSIST = True
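With RedisPipeline enabled, every item is serialized to JSON and appended to the list dingdian:items. A quick sanity check (a sketch, assuming the crawl has already produced at least one item):

import json

import redis

r = redis.Redis(host='localhost', port=6379)
# Peek at the head of the items list without removing anything.
raw = r.lrange('dingdian:items', 0, 0)
if raw:
    print(json.loads(raw[0]))

To actually run distributed, start scrapy crawl dingdian on several machines (or in several terminals) pointed at the same REDIS_URL; they share the queue, the dupefilter, and the items list automatically.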
insert_items_to_mongodb.py (a helper script that moves the items out of Redis and into MongoDB):
import json

import pymongo
import redis


def insert_to_mongo():
    # Connect to Redis, where RedisPipeline left the items.
    r = redis.Redis(host='localhost', port=6379)
    # Connect to MongoDB.
    client = pymongo.MongoClient(host='localhost', port=27017)
    db = client['dd']
    coll = db['dingdian']
    # Loop forever, moving items over as they arrive.
    while True:
        # Block until an item is available, then pop it from Redis.
        source, data = r.blpop(['dingdian:items'])
        # Deserialize the JSON string into a dict.
        item = json.loads(data)
        # Write the dict into MongoDB.
        coll.insert_one(item)
        print('insert mongodb :', item)


if __name__ == '__main__':
    insert_to_mongo()
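Since blpop is called with no timeout, the script above blocks forever and never exits on its own. A slightly friendlier variant (a sketch; the idle_timeout parameter is illustrative, not part of the original script) stops once the queue has been empty for a while:

import json

import pymongo
import redis


def insert_to_mongo(idle_timeout=30):
    r = redis.Redis(host='localhost', port=6379)
    coll = pymongo.MongoClient(host='localhost', port=27017)['dd']['dingdian']
    while True:
        # blpop returns None when the timeout expires with nothing to pop.
        popped = r.blpop(['dingdian:items'], timeout=idle_timeout)
        if popped is None:
            print('no new items for %s seconds, exiting' % idle_timeout)
            break
        _, data = popped
        coll.insert_one(json.loads(data))


if __name__ == '__main__':
    insert_to_mongo()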