Scrapy爬取笔趣阁获取全部内容,清洗过滤,得到想要的数据

简单的创建Scrapy项目的步骤我就不多说了,浪费时间

spider

# -*- coding: utf-8 -*-
import scrapy
from ..items import BiItem
import re


class QuSpider(scrapy.Spider):
    """Crawl the xbiquge.la novel index and emit one ``BiItem`` per chapter.

    The crawl has three stages:

    1. ``parse``  - the "all novels" listing page; follows every novel link.
    2. ``parse2`` - a novel's index page; follows every chapter link.
    3. ``parse3`` - a chapter page; builds the item with title and cleaned text.
    """

    name = 'qu'
    allowed_domains = ['xbiquge.la']
    start_urls = ['http://www.xbiquge.la/xiaoshuodaquan/']

    # Characters and sequences stripped from the chapter text. Compiled once
    # instead of on every call. The pattern text is unchanged from the
    # original implementation.
    # NOTE(review): inside a raw string, alternatives such as ``\\n+`` match a
    # literal backslash followed by "n", not a newline; real whitespace is
    # already removed by ``\s`` in the first character class. Confirm intent.
    _NOISE_RE = re.compile(
        r"[\s+\.\!\/_,$%^*(+\"\')]+|[+——?【】?~@#¥%……&*]+|\\n+|\\r+|(\\xa0)+|(\\u3000)+|\\t"
    )

    def parse(self, response):
        """Yield a request for every novel linked from the listing page.

        :param response: response for one of ``start_urls``.
        :return: generator of ``scrapy.Request`` handled by ``parse2``.
        """
        novel_links = response.xpath("//div[@id='main']//ul//a/@href").extract()
        for href in novel_links:
            # urljoin leaves absolute URLs untouched and resolves relative ones.
            yield scrapy.Request(url=response.urljoin(href), callback=self.parse2)

    def parse2(self, response):
        """Yield a request for every chapter linked from a novel's index page.

        Only the chapter links are used here; ``BiItem`` has no fields for
        book-level metadata, so none is extracted.

        :param response: response for a novel's index page.
        :return: generator of ``scrapy.Request`` handled by ``parse3``.
        """
        chapter_links = response.xpath(
            "//div[@id='wrapper']//div[@class='box_con']//dl//a/@href"
        ).extract()
        for href in chapter_links:
            # Resolve against the page URL rather than a hard-coded host so
            # both root-relative and page-relative hrefs work.
            yield scrapy.Request(url=response.urljoin(href), callback=self.parse3)

    def parse3(self, response):
        """Build the item for a single chapter page.

        :param response: response for a chapter page.
        :return: ``BiItem`` with ``section_title`` (``None`` when the heading
            is missing) and ``section`` (empty string when the page has no
            content text).
        """
        item = BiItem()
        item['section_title'] = response.xpath(
            "//div[@class='box_con']/div[@class='bookname']/h1/text()"
        ).extract_first()

        fragments = response.xpath(
            "//div[@class='box_con']/div[@id='content']/text()"
        ).extract()
        # Join first, then clean once. This also yields '' for a page with no
        # text nodes instead of failing on an unbound local.
        item['section'] = self._NOISE_RE.sub("", "".join(fragments))

        return item

settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for bi project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

# Name of the bot / project.
BOT_NAME = 'bi'

# Where Scrapy looks for spiders, and where new ones are generated.
SPIDER_MODULES = ['bi.spiders']
NEWSPIDER_MODULE = 'bi.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'bi (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# Number of concurrent requests (lower means a slower crawl)
CONCURRENT_REQUESTS = 1

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# Download delay (larger values make requests slower)
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'bi.middlewares.BiSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'bi.middlewares.BiDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# Enables the MySQL-writing pipeline with order value 300.
ITEM_PIPELINES = {
   'bi.pipelines.BiPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
# Defaults to False; True enables the AutoThrottle extension
AUTOTHROTTLE_ENABLED = True
# The initial download delay
# NOTE(review): the original note gave the default as 3 seconds; the Scrapy
# docs appear to list 5.0 - confirm against the installed version.
AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# Original note: defaults to 60 seconds.
# NOTE(review): this equals AUTOTHROTTLE_START_DELAY and is close to
# DOWNLOAD_DELAY (3), which presumably leaves AutoThrottle very little room
# to adjust the delay - confirm this is intended.
AUTOTHROTTLE_MAX_DELAY = 5
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# Use the HTTP cache

HTTPCACHE_ENABLED = True

# NOTE(review): 0 presumably means cached responses never expire - confirm
# against the HttpCacheMiddleware settings documentation.
HTTPCACHE_EXPIRATION_SECS = 0

HTTPCACHE_DIR = 'httpcache'

HTTPCACHE_IGNORE_HTTP_CODES = []

HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql


class BiPipeline(object):
    """Item pipeline that writes each scraped chapter into MySQL.

    A single connection and cursor are opened when the pipeline is created
    and released in ``close_spider``.
    """

    # Statement text is unchanged from the original implementation.
    # NOTE(review): ``section_title=(section_title)`` assigns the column to
    # itself, so a duplicate key leaves the existing row untouched - confirm
    # that is the intended behaviour.
    _INSERT_SQL = ("INSERT INTO home_sectionmodel(section_title, section )"
                   " VALUES (%s, %s) on duplicate key update section_title=(section_title)")

    def __init__(self):
        # NOTE(review): connection parameters (including the password) are
        # hard-coded; consider reading them from the project settings.
        self.connect = pymysql.connect(host='127.0.0.1', user='root', password='123456', db='fiction_2', port=3306,
                                       charset='utf8mb4')
        self.cursor = self.connect.cursor()
        print('连接成功######################################', self.cursor)

    def process_item(self, item, spider):
        """Insert one chapter row and commit.

        A database error is logged with its traceback and the transaction is
        rolled back; the item is still returned so the crawl keeps going.

        :param item: mapping with ``section_title`` and ``section`` keys.
        :param spider: the running spider; its ``logger`` is used on failure.
        :return: the unchanged ``item``.
        """
        try:
            self.cursor.execute(self._INSERT_SQL,
                                (item['section_title'], item['section']))
            self.connect.commit()
        except pymysql.MySQLError:
            spider.logger.exception('Failed to store chapter %r',
                                    item['section_title'])
            # Discard the failed statement so it cannot leak into the next commit.
            self.connect.rollback()
        return item

    def close_spider(self, spider):
        """Release the cursor and the connection when the spider finishes.

        :param spider: the spider being closed (unused).
        """
        self.cursor.close()
        self.connect.close()

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class BiItem(scrapy.Item):
    """One scraped chapter.

    Fields:
        section_title: heading of the chapter page.
        section: cleaned text of the chapter.
    """

    section_title = scrapy.Field()
    section = scrapy.Field()

数据是全部都拿到了。我现在有个问题想问一下:django创建的关联表里有一个关联id,我想把数据存到mysql的关联表中,但没有思路,有没有大神可以帮忙解答一下,谢谢了

 

你可能感兴趣的:(scrapy)