Crawling novels from qidian.com with Python Scrapy

  A civil-engineering guy, sophomore year, muddling along without noticing old age creeping up.



items.py

import scrapy


class QidianItem(scrapy.Item):

    # Book title, later used as the output file name
    title = scrapy.Field()
    # Chapter title plus chapter text
    content = scrapy.Field()

pipelines.py
import os


class QidianPipeline(object):

    def process_item(self, item, spider):
        # One text file per book; item.get('title') is the book title
        path = r'/home/administrator/PycharmProjects/untitled/qidian/qidian/en'
        file_name = os.path.join(path, str(item.get('title')) + '.txt')
        # Append each chapter to the book's file as it arrives
        with open(file_name, mode='a', encoding='utf-8') as f:
            f.write(item.get('content'))
        return item
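
The hard-coded output directory has to exist before the first item arrives, otherwise the open() call fails. A small optional hardening (a sketch, not part of the original code) is to create the directory once in the pipeline's open_spider() hook, which Scrapy calls when the spider starts:

import os


class QidianPipeline(object):

    def open_spider(self, spider):
        # Runs once at spider startup; make sure the target directory exists
        self.path = r'/home/administrator/PycharmProjects/untitled/qidian/qidian/en'
        os.makedirs(self.path, exist_ok=True)

    def process_item(self, item, spider):
        # Same behaviour as above: append each chapter to the book's file
        file_name = os.path.join(self.path, str(item.get('title')) + '.txt')
        with open(file_name, mode='a', encoding='utf-8') as f:
            f.write(item.get('content'))
        return item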

settings.py
# Only log messages of level ERROR and above
LOG_LEVEL = 'ERROR'

# Send the log to a file instead of the console
LOG_FILE = 'log.txt'
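
The pipeline above only runs if it is registered in settings.py. The module path below assumes the Scrapy project is named qidian (as the directory used in pipelines.py suggests); adjust it if your project layout differs. The DOWNLOAD_DELAY line is an optional extra to slow the crawl down a little:

# Enable the pipeline so process_item() is called for every yielded item
ITEM_PIPELINES = {
    'qidian.pipelines.QidianPipeline': 300,
}

# Optional: wait half a second between requests to go easy on the site
DOWNLOAD_DELAY = 0.5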

The spider file
import scrapy
from ..items import QidianItem


class XiaoshuoSpider(scrapy.Spider):
    name = 'xiaoshuo'
    allowed_domains = ['qidian.com']
    # Listing pages 100-199 of the "all works" index, 20 books per page
    start_urls = [
        "https://www.qidian.com/all?orderId=&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0&page=" + str(page) for page in range(100, 200)
    ]
    def parse(self, response):
        # Collect the URL of every book on the listing page and hand each one to parse_info
        for url_ in response.xpath("//h4/a/@href").extract():
            yield scrapy.Request('https:' + str(url_), callback=self.parse_info)
    def parse_info(self, response):
        item = QidianItem()
        # Novel title; stored on the item and later used as the file name
        title = response.xpath("//h1/em/text()").extract_first()
        print("Start crawling novel: " + title)
        item['title'] = title
        # "Read for free" link to the first chapter; hand it to parse_content
        info_url = response.xpath("//a[@id='readBtn']/@href").extract_first()
        yield scrapy.Request("https:" + str(info_url), meta={'item': item}, callback=self.parse_content)
    def parse_content(self, response):
        # The item created in parse_info travels along with the request via meta
        item = response.meta['item']
        # Chapter title
        name = response.xpath("//h3[@class='j_chapterName']/text()").extract_first() + "\n"
        print("Crawling novel ******《" + item['title'] + "》****** chapter: ---------------" + name)
        # Chapter title followed by every paragraph of the chapter body
        content = name
        for str_ in response.xpath("//div[@class='read-content j_readContent']//p/text()").extract():
            # Drop the leading full-width space of each paragraph
            content = content + str_[1:]
        # Store the chapter title and text in the content field
        item['content'] = content
        # Link to the next chapter; following it recursively walks the whole book
        next_url = 'https:' + response.xpath("//a[@id='j_chapterNext']/@href").extract_first()
        # Paid (VIP) chapters only expose a short preview, so a short body means
        # the free part of the book is over
        if len(content) > 250:
            yield item
            yield scrapy.Request(str(next_url), meta={'item': item}, callback=self.parse_content)
        else:
            print("The free chapters are finished!")
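
With everything in place, the crawl is started from the project root with scrapy crawl xiaoshuo (the spider is addressed by its name attribute); because of the settings above, only errors are printed and the full log goes to log.txt, while one .txt file per book is appended to in the pipeline's output directory. If you prefer launching it from an IDE, a small helper script works too; run.py is a hypothetical file name, not part of the original project:

# run.py - assumed helper to start the crawl from an IDE; place it in the project root
from scrapy.cmdline import execute

# Equivalent to running "scrapy crawl xiaoshuo" on the command line
execute(['scrapy', 'crawl', 'xiaoshuo'])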




