I won't go over creating the Scrapy project itself; that would just waste time.
spider
# -*- coding: utf-8 -*-
import re

import scrapy

from ..items import BiItem


class QuSpider(scrapy.Spider):
    name = 'qu'
    allowed_domains = ['xbiquge.la']
    start_urls = ['http://www.xbiquge.la/xiaoshuodaquan/']

    def parse(self, response):
        # Links to every book's detail page
        home_url = response.xpath("//div[@id='main']//ul//a/@href").extract()
        for home in home_url:
            yield scrapy.Request(url=home, callback=self.parse2)

    def parse2(self, response):
        # item = BiItem()
        two_detail = response.xpath("//div[@id='wrapper']")
        for two in two_detail:
            # Book title
            title = two.xpath(".//div[@class='box_con']//div[@id='info']/h1/text()").extract_first()
            # Book author
            author = two.xpath(".//div[@class='box_con']//div[@id='info']/p/text()").extract_first()
            # Book category
            type = two.xpath(".//div[@class='con_top']/a[2]/text()").extract_first()
            # Book synopsis
            intro = two.xpath(".//div[@id='intro']/p[2]/text()").extract_first()
            # Book cover image
            cover = two.xpath(".//div[@id='sidebar']/div[@id='fmimg']/img/@src").extract_first()
            # Last update time
            turnover_time = two.xpath(".//div[@id='info']/p[3]/text()").extract_first()
            # Chapter titles
            section_title = two.xpath(".//div[@class='box_con']//dl//a/text()").extract()
            # item['title'] = title
            # item['author'] = author
            # item['type'] = type
            # item['intro'] = intro
            # item['cover'] = cover
            # yield item
            # Chapter URLs
            section_url = two.xpath(".//div[@class='box_con']//dl//a/@href").extract()
            for section in section_url:
                url = 'http://www.xbiquge.la' + section
                yield scrapy.Request(url=url, callback=self.parse3)

    def parse3(self, response):
        item = BiItem()
        # Chapter title
        section_title = response.xpath("//div[@class='box_con']/div[@class='bookname']/h1/text()").extract_first()
        item['section_title'] = section_title
        # Chapter body text (a list of text nodes)
        section_lists = response.xpath("//div[@class='box_con']/div[@id='content']/text()").extract()
        # Join the text nodes into one string
        section_text = ''.join(section_lists)
        # Strip whitespace and stray punctuation; \s already matches \n, \r, \t,
        # \xa0 and \u3000 in Python 3, so no extra escapes are needed
        section = re.sub(r"[\s.!/_,$%^*(+\"')]+|[+——?【】?~@#¥%……&*]+", "", section_text)
        item['section'] = section
        return item
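Note that parse2 extracts the book-level fields (title, author, type, intro, cover) but never yields them or passes them to parse3, so each chapter item reaches the pipeline with no link back to its book. Below is a minimal sketch of one way to carry the book title along with every chapter request, using cb_kwargs (available since Scrapy 1.7; request.meta works the same way on older versions). Both methods belong to QuSpider above, and the book_title field is my own assumption, it would also have to be added to BiItem:
# Sketch only: pass the book title down so each chapter knows its parent book.
def parse2(self, response):
    for two in response.xpath("//div[@id='wrapper']"):
        title = two.xpath(".//div[@id='info']/h1/text()").extract_first()
        for section in two.xpath(".//div[@class='box_con']//dl//a/@href").extract():
            yield scrapy.Request(url='http://www.xbiquge.la' + section,
                                 callback=self.parse3,
                                 cb_kwargs={'book_title': title})

def parse3(self, response, book_title):
    item = BiItem()
    item['book_title'] = book_title  # hypothetical extra field, see note above
    item['section_title'] = response.xpath(
        "//div[@class='box_con']/div[@class='bookname']/h1/text()").extract_first()
    item['section'] = ''.join(
        response.xpath("//div[@class='box_con']/div[@id='content']/text()").extract())
    return item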
settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for bi project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'bi'
SPIDER_MODULES = ['bi.spiders']
NEWSPIDER_MODULE = 'bi.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'bi (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
# Number of concurrent requests (smaller = slower crawl)
CONCURRENT_REQUESTS = 1
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# Download delay in seconds (larger = slower requests)
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'bi.middlewares.BiSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'bi.middlewares.BiDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'bi.pipelines.BiPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
# Default False; True enables the AutoThrottle extension
AUTOTHROTTLE_ENABLED = True
# The initial download delay
# Initial download delay (defaults to 5.0 seconds)
AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# Default 60 seconds; the maximum download delay under high latency
AUTOTHROTTLE_MAX_DELAY = 5
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# Enable the HTTP cache
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql


class BiPipeline(object):
    def __init__(self):
        self.connect = pymysql.connect(host='127.0.0.1', user='root', password='123456', db='fiction_2',
                                       port=3306, charset='utf8mb4')
        self.cursor = self.connect.cursor()
        print('MySQL connection established', self.cursor)

    def process_item(self, item, spider):
        try:
            # Insert the chapter; on a duplicate key keep the existing row
            self.cursor.execute("INSERT INTO home_sectionmodel(section_title, section)"
                                " VALUES (%s, %s) ON DUPLICATE KEY UPDATE section_title=(section_title)",
                                (item['section_title'], item['section']))
            self.connect.commit()
        except Exception as error:
            print('Insert failed:', error)
        return item
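One more side note: the pipeline opens a MySQL connection in __init__ but never closes it. Scrapy calls close_spider on each pipeline when the spider finishes, so the connection could be released there, roughly like this (added to BiPipeline):
    def close_spider(self, spider):
        # Called once when the spider closes; release the database resources
        self.cursor.close()
        self.connect.close()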
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class BiItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # title = scrapy.Field()
    # author = scrapy.Field()
    # type = scrapy.Field()
    # intro = scrapy.Field()
    # cover = scrapy.Field()
    section_title = scrapy.Field()
    section = scrapy.Field()
All the data is coming through now. I do have a question, though: Django created related tables for me, linked by a foreign-key id, and I want to write the scraped data into those related MySQL tables, but I have no idea how to approach it. Could anyone help me out? Thanks.
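Not a definitive answer, but one common approach, assuming the Django models produced a parent table such as home_bookmodel(id, title, ...) and that home_sectionmodel has a book_id foreign-key column (both names are guesses about the actual schema): insert or look up the parent book row first, grab its primary key (cursor.lastrowid after an INSERT, or a SELECT if it already exists), and pass that id as the foreign-key value when inserting each chapter. Combined with the book_title sketch shown after the spider code above, process_item could look roughly like this:
# Sketch only; table and column names are assumptions about the Django schema.
def process_item(self, item, spider):
    # Look up (or create) the parent book row and fetch its primary key
    self.cursor.execute("SELECT id FROM home_bookmodel WHERE title=%s", (item['book_title'],))
    row = self.cursor.fetchone()
    if row:
        book_id = row[0]
    else:
        self.cursor.execute("INSERT INTO home_bookmodel(title) VALUES (%s)", (item['book_title'],))
        book_id = self.cursor.lastrowid  # auto-increment id of the new row
    # Insert the chapter with the foreign key pointing at its book
    self.cursor.execute(
        "INSERT INTO home_sectionmodel(section_title, section, book_id) VALUES (%s, %s, %s)",
        (item['section_title'], item['section'], book_id))
    self.connect.commit()
    return item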