Scrapy: Scraping a Novel Site

Scrapy is quite fast: this project crawls the full set of novels on the site and saves the results to a MongoDB database. PyCharm works well for developing it.

Create the project: scrapy startproject daomubiji

Run the spider: scrapy crawl daomubi
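
For reference, startproject generates the standard Scrapy layout below; run the crawl command from the directory that contains scrapy.cfg. (daomubi.py is the spider file we add ourselves under spiders/.)

daomubiji/
    scrapy.cfg
    daomubiji/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            daomubi.py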

settings.py

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3722.400 QQBrowser/10.5.3738.400',
}
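
The pipeline below only runs if it is registered, and Scrapy obeys robots.txt by default, which often blocks this kind of crawl. A minimal sketch of the two extra settings, assuming the default dotted path for the project's pipeline class:

# Register the MongoDB pipeline; the number sets execution order (0-1000)
ITEM_PIPELINES = {
    'daomubiji.pipelines.DaomubijiPipeline': 300,
}

# Disable robots.txt checking if the site disallows crawlers
ROBOTSTXT_OBEY = False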

items.py

import scrapy

class DaomubijiItem(scrapy.Item):
    title = scrapy.Field()      # novel title
    section = scrapy.Field()    # chapter title
    content = scrapy.Field()    # chapter body text

spider

import scrapy
from daomubiji.items import DaomubijiItem


class DaomubiSpider(scrapy.Spider):
    name = 'daomubi'
    allowed_domains = ['daomubiji.com']

    def start_requests(self):
        # The eight books live at dao-mu-bi-ji-1 through dao-mu-bi-ji-8
        for i in range(1, 9):
            yield scrapy.Request('http://www.daomubiji.com/dao-mu-bi-ji-{}'.format(i), callback=self.parse)

    def parse(self, response):
        title = response.xpath('//h1[@class="focusbox-title"]/text()').extract_first()  # novel title
        chapters = response.xpath('//article[@class="excerpt excerpt-c3"]')
        for chapter in chapters:
            detail_href = chapter.xpath('./a/@href').extract_first()  # link to the chapter body
            # Build a fresh item per chapter: reusing one shared item across
            # concurrent requests would let callbacks overwrite each other.
            item = DaomubijiItem()
            item['title'] = title
            yield scrapy.Request(url=detail_href, meta={'item': item}, callback=self.get_content)

    def get_content(self, response):
        item = response.meta['item']
        item['section'] = response.xpath('//h1[@class="article-title"]/text()').extract_first()  # chapter title
        pages = response.xpath('//article[@class="article-content"]//p/text()').extract()  # body paragraphs
        item['content'] = ''.join(pages)
        yield item
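
As an aside, on Scrapy 1.7+ the same title hand-off can be done with cb_kwargs instead of meta, which keeps meta free for scheduler internals. A minimal sketch of the two callbacks rewritten that way:

    def parse(self, response):
        title = response.xpath('//h1[@class="focusbox-title"]/text()').extract_first()
        for chapter in response.xpath('//article[@class="excerpt excerpt-c3"]'):
            detail_href = chapter.xpath('./a/@href').extract_first()
            # cb_kwargs values arrive as keyword arguments of the callback
            yield scrapy.Request(url=detail_href, cb_kwargs={'title': title}, callback=self.get_content)

    def get_content(self, response, title):
        item = DaomubijiItem()
        item['title'] = title
        item['section'] = response.xpath('//h1[@class="article-title"]/text()').extract_first()
        item['content'] = ''.join(response.xpath('//article[@class="article-content"]//p/text()').extract())
        yield item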

pipelines.py

import pymongo
from scrapy.item import Item


class DaomubijiPipeline(object):

    def open_spider(self, spider):
        # Connect to a local MongoDB server
        self.client = pymongo.MongoClient(host='localhost', port=27017)
        # Use (and implicitly create) the daomubiji database
        self.db = self.client.daomubiji

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # Store each chapter as a document in the xiaoshuo collection
        collection = self.db.xiaoshuo
        post = dict(item) if isinstance(item, Item) else item
        collection.insert_one(post)
        return item
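
After a crawl finishes, a quick pymongo query confirms the documents landed. A minimal check, assuming the daomubiji database and xiaoshuo collection used above:

import pymongo

client = pymongo.MongoClient('localhost', 27017)
collection = client.daomubiji.xiaoshuo
print(collection.count_documents({}))  # total chapters stored
print(collection.find_one({}, {'title': 1, 'section': 1}))  # peek at one document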
