Storing the Full Text of Daomubiji with Redis

settings.py

# -*- coding: utf-8 -*-

BOT_NAME = 'CrawlWithRedis'

SPIDER_MODULES = ['CrawlWithRedis.spiders']
NEWSPIDER_MODULE = 'CrawlWithRedis.spiders'

ITEM_PIPELINES = {'CrawlWithRedis.pipelines.CrawlWithRedisPipeline':300}

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Redis settings (scrapy-redis)
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_PERSIST = True
SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"
# Share one duplicates filter through Redis across all spider instances
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
REDIS_URL = None
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379

# MongoDB settings
MONGODB_HOST    = '127.0.0.1'
MONGODB_PORT    = 27017
MONGODB_DBNAME  = 'XiaoYunKeji'
MONGODB_DOCNAME = 'daomubiji'
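
Before starting the crawl it is worth confirming that both backends are reachable with the values above. A minimal check, assuming the redis and pymongo client packages are installed and both servers are running locally:

# Connectivity check only -- not part of the project code.
import redis
import pymongo

r = redis.StrictRedis(host='127.0.0.1', port=6379)
print(r.ping())                      # True if Redis answers

client = pymongo.MongoClient('127.0.0.1', 27017)
print(client['XiaoYunKeji'].name)    # the database the pipeline will write to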

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy import Item, Field

class CrawlWithRedisItem(Item):
    bookName    = Field()
    bookTitle   = Field()
    chapterNum  = Field()
    chapterName = Field()
    chapterURL  = Field()
    text        = Field()
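
An Item behaves like a dict, which is what lets the pipeline below call dict(item) before inserting it into MongoDB. A quick illustration with hypothetical values:

# Items support dict-style assignment and conversion.
item = CrawlWithRedisItem()
item['bookName'] = '盗墓笔记'
item['chapterName'] = '七星鲁王'
print(dict(item))   # {'bookName': '盗墓笔记', 'chapterName': '七星鲁王'}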

pipelines.py

# -*- coding: utf-8 -*-
# Don't forget to add your pipeline to the ITEM_PIPELINES setting

import pymongo


class CrawlWithRedisPipeline(object):

    def __init__(self, host, port, db_name, doc_name):
        # Connect to MongoDB, select the database, then the chapter collection
        client = pymongo.MongoClient(host=host, port=port)
        self.post = client[db_name][doc_name]

    @classmethod
    def from_crawler(cls, crawler):
        # Pull the connection parameters from the project settings
        settings = crawler.settings
        return cls(
            host=settings['MONGODB_HOST'],
            port=settings['MONGODB_PORT'],
            db_name=settings['MONGODB_DBNAME'],
            doc_name=settings['MONGODB_DOCNAME'],
        )

    def process_item(self, item, spider):
        # Convert the item to a dict and store it as one document
        self.post.insert_one(dict(item))
        return item
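
After a crawl you can spot-check what the pipeline wrote directly from pymongo. A small sketch, assuming the MongoDB settings above:

import pymongo

client = pymongo.MongoClient('127.0.0.1', 27017)
collection = client['XiaoYunKeji']['daomubiji']
# Print one stored chapter to confirm documents are being inserted.
print(collection.find_one({}, {'bookName': 1, 'chapterName': 1, 'chapterURL': 1}))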

spiders.py

# -*- coding: utf-8 -*-
from scrapy.http import Request
from scrapy.selector import Selector
from scrapy_redis.spiders import RedisSpider
from CrawlWithRedis.items import CrawlWithRedisItem

class CrawlWithRedisSpider(RedisSpider):
    name = "CrawlWithRedis"
    # The spider reads its start URLs from this Redis list
    redis_key = 'CrawlWithRedis:start_urls'
    # The entry page (http://daomubiji.com/) is pushed into that list rather
    # than hard-coded here; see the seeding example after the spider code.

    def parse(self, response):
        selector = Selector(response)
        tables = selector.xpath('//table')
        for each_table in tables:
            bookName = each_table.xpath('tr/td[@colspan="3"]/center/h2/text()').extract_first()
            contents = each_table.xpath('tr/td/a/text()').extract()
            urls     = each_table.xpath('tr/td/a/@href').extract()
            for i in range(len(urls)):
                item = CrawlWithRedisItem()
                item['bookName'] = bookName
                item['chapterURL'] = urls[i]

                try:
                    item['bookTitle']  = contents[i].split(' ')[0]
                    item['chapterNum'] = contents[i].split(' ')[1]
                except Exception:
                    # Skip entries that do not follow the "title num name" format
                    continue
                try:
                    item['chapterName'] = contents[i].split(' ')[2]
                except Exception:
                    # Fall back to the last three characters of the second field
                    item['chapterName'] = contents[i].split(' ')[1][-3:]
                yield Request(urls[i], callback=self.parse_content, meta={'item': item})

    def parse_content(self, response):
        # Retrieve the item handed over from parse() via meta
        item = response.meta['item']
        selector = Selector(response)
        contents = selector.xpath('//div[@class="content"]/p/text()').extract()
        text = "\n".join(contents)
        item['text'] = text
        yield item
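
A RedisSpider does not start crawling until a URL appears under its redis_key, so the entry page has to be pushed into that list first. One way to seed it from Python (redis-cli lpush works just as well):

import redis

r = redis.StrictRedis(host='127.0.0.1', port=6379)
# Push the entry page onto the list the spider is waiting on.
r.lpush('CrawlWithRedis:start_urls', 'http://daomubiji.com/')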

With the versions I am currently running (Scrapy 1.2, Python 3.5) this still raises an error at runtime; see the Scrapy GitHub repository. I have not found a solution yet, and posts online suggest it may be a version-compatibility problem. I will test it on CentOS another day.
