使用 Scrapy 框架爬取 1000 本 EPUB 格式的玄幻小说

github源码链接:https://github.com/chengchaoccss/CCcode.git

spider部分

# -*- coding: utf-8 -*-
import scrapy
from epub.items import EpubItem

class EpubdownloadSpider(scrapy.Spider):
    """Collect EPUB download links for the novels listed on ixdzs.com.

    The crawl has two hops: ``parse`` handles the 50 listing pages in
    ``start_urls`` and follows every "epub_down" link it finds, and
    ``newparse`` handles each of those pages and yields one ``EpubItem``.
    """

    name = 'epubdownload'
    # Listing pages 1..50. The literal prefix stays inside the comprehension
    # because class-level names are not visible from a comprehension body.
    start_urls = [
        'http://www.ixdzs.com/sort/1/index_0_2_0_{}.html/'.format(page)
        for page in range(1, 51)
    ]

    def parse(self, response):
        """Follow every EPUB download-page link found on a listing page.

        Args:
            response: The downloaded listing page.

        Yields:
            scrapy.Request: One request per matching link, handled by
            ``newparse``.
        """
        hrefs = response.xpath(
            '//a[contains(@href,"/d") and contains(@href,"epub_down")]/@href'
        ).extract()
        for href in hrefs:
            # urljoin resolves relative and root-relative hrefs correctly,
            # whereas plain concatenation produced "host//path" URLs.
            yield scrapy.Request(
                url=response.urljoin(href),
                callback=self.newparse,
            )

    def newparse(self, response):
        """Build an ``EpubItem`` from a single download page.

        Args:
            response: The downloaded "epub_down" page of one novel.

        Yields:
            EpubItem: ``name`` is the page heading without its last five
            characters; ``down_url`` is the download URL truncated right
            after its final ``=``. Nothing is yielded when the page lacks
            either the download link or the heading.
        """
        link = response.xpath(
            '//a[contains(@href,"down?id=") and contains(@href,"p=6")]/@href'
        ).extract_first()
        title = response.xpath('//h1[@itemprop="name"]/text()').extract_first()
        if link is None or title is None:
            # A page with an unexpected layout must not abort the callback.
            self.logger.warning(
                'Skipping %s: download link or title not found', response.url
            )
            return

        # Keep everything up to and including the last "=", i.e. drop the
        # value of the final query parameter.
        prefix, _, _ = response.urljoin(link).rpartition('=')

        item = EpubItem()
        # Presumably the heading ends with a fixed five-character suffix
        # (TODO confirm against the live page). Slicing with [:-5] also
        # avoids the negative-index wrap-around that len(title) - 5 caused
        # for headings shorter than five characters.
        item['name'] = title[:-5]
        item['down_url'] = prefix + '='
        yield item

pipelines部分

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo
import os
import json

class EpubPipeline(object):
    """Pass-through pipeline that leaves every item untouched."""

    def process_item(self, item, spider):
        """Return ``item`` unchanged; ``spider`` is not used."""
        return item


class MongoPipeline(object):
    """Item pipeline that stores every scraped item in MongoDB.

    Each item is written to a collection named after the item's class.
    """

    def __init__(self, mongo_uri, mongo_db):
        """Remember the connection settings; no connection is opened yet.

        Args:
            mongo_uri: Host or URI handed to ``pymongo.MongoClient``.
            mongo_db: Name of the database the items are written to.
        """
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline from the ``MONGO_URI``/``MONGO_DB`` settings."""
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB'),
        )

    def open_spider(self, spider):
        """Open the MongoDB client and select the configured database."""
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        """Insert ``item`` into the collection named after its class.

        Args:
            item: The scraped item; it is converted with ``dict(item)``.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``, so later pipelines can keep processing it.
        """
        collection = item.__class__.__name__
        # Collection.insert() was deprecated in PyMongo 3.0 and removed in
        # 4.0; insert_one() is the supported single-document call.
        self.db[collection].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        """Close the MongoDB client opened in ``open_spider``."""
        self.client.close()

class JsonPipeline(object):
    """Item pipeline that appends every item to ``news.json`` as JSON lines.

    The file lives in the current working directory and receives one JSON
    document per line.
    """

    def process_item(self, item, spider):
        """Append ``item`` to ``news.json`` and pass it on.

        Args:
            item: The scraped item; it is serialised via ``dict(item)``.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``, so later pipelines can keep processing it.
        """
        filename = os.path.join(os.getcwd(), 'news.json')
        # ensure_ascii=False keeps non-ASCII text readable instead of
        # emitting \uXXXX escape sequences.
        line = json.dumps(dict(item), ensure_ascii=False) + '\n'
        # Because the text is not ASCII-escaped, the file encoding must be
        # pinned: the locale default may be unable to encode it.
        # NOTE: the ``encoding`` keyword requires Python 3.
        with open(filename, 'a', encoding='utf-8') as f:
            f.write(line)
        return item

settings部分


# Name of the bot implemented by this Scrapy project.
BOT_NAME = 'epub'

# Modules searched for spiders, and the module new spiders are created in.
SPIDER_MODULES = ['epub.spiders']
NEWSPIDER_MODULE = 'epub.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'epub (+http://www.yourdomain.com)'

# robots.txt rules are not honoured by this crawler.
ROBOTSTXT_OBEY = False

# Connection settings read by MongoPipeline.from_crawler.
MONGO_URI = 'localhost'
MONGO_DB = 'epubdownload'

# Enabled item pipelines. Scrapy runs them in ascending order of the number,
# so JsonPipeline (350) sees each item before MongoPipeline (400).
ITEM_PIPELINES = {
    'epub.pipelines.MongoPipeline': 400,
    'epub.pipelines.JsonPipeline': 350,
}

items部分

import scrapy


class EpubItem(scrapy.Item):
    """One scraped novel: its title and its download URL."""

    # Title of the novel, as filled in by the spider.
    name = scrapy.Field()
    # Download URL the spider built for the novel.
    down_url = scrapy.Field()

爬取结果存储到mongodb数据库

![爬取结果存储到 MongoDB 数据库的截图](https://i.loli.net/2019/04/10/5cacc5ed027da.png)

上面爬取到的链接是直接下载链接,复制到浏览器便会直接下载小说!

你可能感兴趣的:(python,scrapy,爬虫)