Scrapy and MongoDB

Target site: http://www.daomubiji.com/
Output: scraped data stored in MongoDB

First, configure settings.py:

COOKIES_ENABLED = True

# 'douban' is the Scrapy project package name used here; adjust it to match your own project
ITEM_PIPELINES = {
    'douban.pipelines.NovelPipeline': 300,
}

# Custom MongoDB settings that the pipeline reads
MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
MONGODB_DBNAME = 'novel'
MONGODB_DOCNAME = 'Book'
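
The MONGODB_* keys are custom settings; the pipeline shown later simply imports them from the settings module. As an alternative sketch (not what this post does), the same values could be read through Scrapy's standard from_crawler hook:

# Alternative sketch: reading the custom settings via Scrapy's settings API
# instead of importing the settings module directly.
class NovelPipeline(object):
    def __init__(self, host, port, dbname, docname):
        self.host = host
        self.port = port
        self.dbname = dbname
        self.docname = docname

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        return cls(
            host=settings.get('MONGODB_HOST'),
            port=settings.getint('MONGODB_PORT'),
            dbname=settings.get('MONGODB_DBNAME'),
            docname=settings.get('MONGODB_DOCNAME'),
        )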

Define the Item:

import scrapy


class NovelItem(scrapy.Item):
    book_name = scrapy.Field()      # book name from the book page header
    book_title = scrapy.Field()     # book title parsed from the chapter link text
    book_desc = scrapy.Field()      # book description
    chapter_num = scrapy.Field()    # chapter number
    chapter_name = scrapy.Field()   # chapter name
    chapter_url = scrapy.Field()    # chapter URL

Write the spider:

import scrapy
from scrapy import Request

from douban.items import NovelItem  # 'douban' is the project package name used in settings; adjust if yours differs


class NovelSpider(scrapy.Spider):
    name = 'novel'
    start_urls = ['http://www.daomubiji.com/']

    def parse(self, response):
        # Each link on the home page points to one book's chapter index
        book_urls = response.css('.article-content a::attr(href)').extract()
        for book_url in book_urls:
            print(book_url)
            yield Request(book_url, callback=self.parse_book)

    def parse_book(self, response):
        book_name = response.css('.focusbox-title::text').extract()[0]
        book_desc = response.css('.focusbox-text::text').extract()[0]
        articles = response.css('.excerpt-c3')
        for article in articles:
            item = NovelItem()
            # Link text is space-separated: book title, chapter number, chapter name
            content = article.css('a::text').extract()[0].split(' ')
            chapter_url = article.css('a::attr(href)').extract()[0]
            if len(content) == 4:
                # Drop the extra leading token when the text splits into four parts
                del content[0]
            book_title = content[0]
            chapter_num = content[1]
            try:
                chapter_name = content[2]
            except IndexError:
                # No separate chapter name: reuse the tail of the chapter number
                chapter_name = content[1][-3:]
            item['book_name'] = book_name
            item['book_title'] = book_title
            item['book_desc'] = book_desc
            item['chapter_num'] = chapter_num
            item['chapter_name'] = chapter_name
            item['chapter_url'] = chapter_url

            yield item
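
To see how the chapter link text is broken apart, here is a small standalone sketch of the same split-and-fallback logic used in parse_book; the sample strings are made up for illustration and real link text on the site may differ:

# Standalone illustration of the link-text parsing above (sample strings are hypothetical).
def parse_link_text(text):
    content = text.split(' ')
    if len(content) == 4:
        del content[0]                       # drop the extra leading token
    book_title = content[0]
    chapter_num = content[1]
    try:
        chapter_name = content[2]
    except IndexError:
        chapter_name = content[1][-3:]       # fall back to the tail of the chapter number
    return book_title, chapter_num, chapter_name

print(parse_link_text('七星鲁王 第一章 血尸'))   # three parts: title, number, name
print(parse_link_text('大结局 下'))              # only two parts, so the fallback branch runs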

NovelPipeline, which connects to MongoDB and stores the items:

import pymongo

# The constants come from the project's settings module configured above
# (adjust the package name if your project is not called 'douban')
from douban.settings import MONGODB_HOST, MONGODB_PORT, MONGODB_DBNAME, MONGODB_DOCNAME


class NovelPipeline(object):
    def __init__(self):
        host = MONGODB_HOST
        port = MONGODB_PORT
        dbname = MONGODB_DBNAME
        client = pymongo.MongoClient(host=host, port=port)
        tdb = client[dbname]
        self.post = tdb[MONGODB_DOCNAME]

    def process_item(self, item, spider):
        book_info = dict(item)
        # insert() is deprecated/removed in newer pymongo versions; use insert_one instead
        self.post.insert_one(book_info)
        return item
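
To check that the items really land in MongoDB, you can query the collection directly with pymongo; a minimal sketch, assuming the connection settings above:

# Quick check of the stored data, using the same connection settings as the pipeline.
import pymongo

client = pymongo.MongoClient(host='127.0.0.1', port=27017)
collection = client['novel']['Book']

print(collection.count_documents({}))        # total number of chapters stored
for doc in collection.find().limit(3):       # a few sample documents
    print(doc['book_name'], doc['chapter_num'], doc['chapter_name'])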

Finally, run main.py to start the crawl; the scraped chapters are written to the Book collection in MongoDB.
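
main.py itself is not shown in the original post; a minimal sketch (assuming the spider name 'novel' defined above) could be:

# Minimal launcher sketch: starts the 'novel' spider from inside the project directory.
from scrapy.cmdline import execute

execute(['scrapy', 'crawl', 'novel'])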
