scrapy | Scraping qianlu novels in practice | Source code and explanation

Target novel: 《疆域》
There is plenty here that could be optimized, but my skills fall short, so treat this purely as a reference.
qianlu.py

# -*- coding: utf-8 -*-
import re

import scrapy


class QianluSpider(scrapy.Spider):
    name = 'qianlu'
    # allowed_domains = ['qianlu.com']
    start_urls = ['https://www.qianluxiaoshuo.com/0/994/index.html']

    def parse(self, response):
        # Walk the table of contents: one <li> per chapter
        selectors = response.xpath('//ul[@class="chapters"]/li')
        for selector in selectors:
            # Chapter title
            title = selector.xpath('./a/text()').get()
            # Chapter link: resolve the relative href against the site root
            a_html = response.urljoin(selector.xpath('./a/@href').get())
            # Build a fresh dict per chapter; reusing a single dict across
            # iterations would leave every yielded item sharing the same data
            item = {"title": title, "a_html": a_html}
            # Hand the partial item to the chapter callback, which yields it
            # once the update time and the content have been filled in
            yield scrapy.Request(
                a_html,
                callback=self.parse_content,
                meta={"item": item},
            )
    # Scrape the body of each chapter
    def parse_content(self, response):
        item = response.meta["item"]
        # The "ainfo" block mixes the update time with other junk,
        # so gather all of its text first
        data = ''.join(response.xpath('//div[@class="ainfo"]//text()').getall())
        # Regex out the digit groups,
        # e.g. ['2019', '07', '31', '19', '57', '2935']
        data = re.findall(r'\d+', data)
        # Keep the first five groups (year, month, day, hour, minute)
        # and drop the trailing word count (the 2935)
        item["data"] = '-'.join(data[:5])  # e.g. 2019-07-31-19-57

        # Chapter body
        cont_list = response.xpath('//div[@class="acontent"]')
        # Collected text fragments
        text_list = []
        # Clumsy, but it works: grab the title again from the chapter page
        title = response.xpath('//div[@class="atitle"]/text()').get()
        item["title"] = title
        text_list.append(title)
        text_list.append('\n')
        for con in cont_list:
            for t in con.xpath('.//text()').getall():
                text_list.append(str(t))
        # Store the raw text; stripping the invalid characters (\r and the
        # \u2003 indentation) is left to the pipeline
        item["content"] = ''.join(text_list)
        yield item
         



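To see what the date handling does on its own, here is a standalone sketch; the info string below is a made-up approximation of what the "ainfo" block yields, not text captured from the site:

import re

# Hypothetical text scraped from the "ainfo" div
info = "更新时间:2019年07月31日 19:57  字数:2935"
groups = re.findall(r'\d+', info)  # ['2019', '07', '31', '19', '57', '2935']
print('-'.join(groups[:5]))        # 2019-07-31-19-57
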
pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import os
import re

# Create the output folder once, when the module is imported
if not os.path.exists('./qianluTXTLibs'):
    os.mkdir('./qianluTXTLibs')


class NovelspiderPipeline(object):

    def process_item(self, item, spider):
        # Strip the invalid characters: carriage returns and the em spaces
        # (\u2003) the site uses for paragraph indentation
        content = re.sub(r"[\r\u2003]", "", item["content"])
        # File name = chapter title + update time
        name = str(item['title']) + str(item['data'])
        path = os.path.join('./qianluTXTLibs', name)
        with open(path, 'w', encoding='utf-8') as fp:
            fp.write(content)
        return item

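A quick way to sanity-check the pipeline outside Scrapy is to feed it a hand-built item. The sample below is hypothetical, and the import assumes you run it from the directory that contains pipelines.py:

from pipelines import NovelspiderPipeline

# Hypothetical item, shaped like what the spider yields
item = {
    "title": "第一章",
    "data": "2019-07-31-19-57",
    "content": "\u2003\u2003章节正文……\r\n",
}
NovelspiderPipeline().process_item(item, spider=None)
# -> writes ./qianluTXTLibs/第一章2019-07-31-19-57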
settings.py

Add request headers (a browser User-Agent is enough here), enable the pipeline via ITEM_PIPELINES, and set:

ROBOTSTXT_OBEY = False
LOG_LEVEL = 'WARNING'
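
Put together, a minimal sketch of the changed settings; 'novelSpider' is an assumed project name and the User-Agent string is only an example:

# settings.py (only the lines changed from the generated defaults)
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'  # example UA
ROBOTSTXT_OBEY = False
LOG_LEVEL = 'WARNING'
ITEM_PIPELINES = {
    # 'novelSpider' is an assumed project/module name
    'novelSpider.pipelines.NovelspiderPipeline': 300,
}

With that in place, run the spider from the project root: scrapy crawl qianlu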

Results:
[Screenshots: the generated chapter files, images 1 and 2]
Rough around the edges, but readable.
