spiders.py
# -*- coding: utf-8 -*-
import scrapy
# Import the item class
from ..items import BooksItem


class BooksSpider(scrapy.Spider):
    name = 'books'
    allowed_domains = ['qishu.cc']
    start_urls = ['http://www.qishu.cc/xuanhuan/list1_1.html']

    def parse(self, response):
        # Use XPath to find the links to the novel detail pages
        detail_links = response.xpath('//span[@class="mainSoftName"]/a/@href').extract()
        # Loop over all novel links
        for link in detail_links:
            # Build the absolute URL
            detail_url = 'http://www.qishu.cc' + link
            # Yield a request object
            yield scrapy.Request(
                url=detail_url,
                callback=self.parse_detail  # callback that parses the detail page
            )
        # Find the next page
        next_href = response.xpath('//dfn/a/@href').extract_first('')
        if next_href:
            # Build the absolute URL of the next page
            next_url = 'http://www.qishu.cc' + next_href
            yield scrapy.Request(next_url)

    # Parse the detail page
    def parse_detail(self, response):
        # Novel title
        title = response.xpath('//dt[@id="downInfoTitle"]/text()').extract_first('')
        # Novel metadata rows
        info = response.xpath('//dd[@class="downInfoRowL"]/text()').extract()
        results = []
        # Loop over all metadata entries
        for msg in info:
            # Keep only entries that do not contain '\r'
            if '\r' not in msg:
                results.append(msg)
        # If exactly six values were collected, unpack them; otherwise mark them all as unknown
        if len(results) == 6:
            yxhj = results[0]  # runtime environment
            xsyy = results[1]  # language
            xslx = results[2]  # genre
            xszz = results[3]  # author
            xsdx = results[4]  # file size
            gxsj = results[5]  # last updated
        else:
            yxhj = xsyy = xslx = xszz = xsdx = gxsj = '未知'
        # Download link of the novel
        download_url = response.xpath('//div[@id="downAddress"]/a[2]/@href').extract_first('')
        # Create a BooksItem object
        book = BooksItem()
        book['title'] = title
        book['yxhj'] = yxhj
        book['xsyy'] = xsyy
        book['xslx'] = xslx
        book['xszz'] = xszz
        book['xsdx'] = xsdx
        book['gxsj'] = gxsj
        # FilesPipeline expects the URL(s) to be wrapped in a list
        book['download_url'] = [download_url]
        yield book
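
The XPath expressions above are tied to the current markup of qishu.cc, so it is worth checking them interactively before launching the full crawl. A quick sketch using scrapy shell (only the list URL and the selectors from the spider are assumed):

    # Run from the project directory:
    #   scrapy shell http://www.qishu.cc/xuanhuan/list1_1.html
    response.xpath('//span[@class="mainSoftName"]/a/@href').extract()   # detail-page links
    response.xpath('//dfn/a/@href').extract_first('')                   # next-page link

If either expression returns an empty result, the site layout has likely changed and the selectors need to be updated.
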
items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class BooksSpiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class BooksItem(scrapy.Item):
    title = scrapy.Field()         # title
    yxhj = scrapy.Field()          # runtime environment
    xsyy = scrapy.Field()          # language
    xslx = scrapy.Field()          # genre
    xszz = scrapy.Field()          # author
    xsdx = scrapy.Field()          # file size
    gxsj = scrapy.Field()          # last updated
    download_url = scrapy.Field()  # download link
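
A scrapy.Item behaves like a dict restricted to its declared fields, which is why the spider can assign book['title'] while a typo in a field name fails immediately. A minimal illustration (the sample values are made up; it assumes the project package is importable):

    from Books_Spider.items import BooksItem

    book = BooksItem(title='demo')
    book['xszz'] = 'some author'   # declared field: OK
    # book['author'] = 'x'         # undeclared field: raises KeyError
    print(dict(book))              # {'title': 'demo', 'xszz': 'some author'}
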
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import xlwt
import scrapy
from scrapy.pipelines.files import FilesPipeline


class BooksSpiderPipeline(object):
    # Create the workbook and write the header row when the pipeline is instantiated
    def __init__(self):
        self.workbook = xlwt.Workbook(encoding='utf-8')
        self.sheet = self.workbook.add_sheet(u'小说信息表')
        self.sheet.write(0, 0, '小说标题')
        self.sheet.write(0, 1, '运行环境')
        self.sheet.write(0, 2, '小说语言')
        self.sheet.write(0, 3, '小说类型')
        self.sheet.write(0, 4, '小说作者')
        self.sheet.write(0, 5, '小说大小')
        self.sheet.write(0, 6, '更新时间')
        self.sheet.write(0, 7, '下载地址')
        self.count = 1

    def close_spider(self, spider):
        self.workbook.save(u'奇书小说信息.xls')

    def process_item(self, item, spider):
        self.sheet.write(self.count, 0, item['title'])
        self.sheet.write(self.count, 1, item['yxhj'])
        self.sheet.write(self.count, 2, item['xsyy'])
        self.sheet.write(self.count, 3, item['xslx'])
        self.sheet.write(self.count, 4, item['xszz'])
        self.sheet.write(self.count, 5, item['xsdx'])
        self.sheet.write(self.count, 6, item['gxsj'])
        self.sheet.write(self.count, 7, item['download_url'][0])
        self.count += 1
        return item


# Custom download pipeline that controls the file name each novel is saved under
class MyFilesPipeline(FilesPipeline):
    def get_media_requests(self, item, info):
        url = item['download_url'][0]
        request = scrapy.Request(
            url=url,
            meta={'item': item}
        )
        return [request]

    def file_path(self, request, response=None, info=None):
        item = request.meta['item']
        title = item['title']
        return '%s.txt' % title
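
The Excel pipeline only depends on the item keys, so it can be sanity-checked outside of a crawl. A minimal sketch, assuming the project package is importable and using made-up sample values:

    from Books_Spider.pipelines import BooksSpiderPipeline

    sample = {
        'title': 'demo', 'yxhj': 'PC', 'xsyy': '简体中文', 'xslx': '玄幻',
        'xszz': 'unknown', 'xsdx': '1 MB', 'gxsj': '2019-01-01',
        'download_url': ['http://www.qishu.cc/demo.txt'],
    }

    pipeline = BooksSpiderPipeline()
    pipeline.process_item(sample, spider=None)   # writes row 1
    pipeline.close_spider(spider=None)           # saves 奇书小说信息.xls
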
debug.py
# -*- coding: utf-8 -*-
from scrapy.cmdline import execute
execute(['scrapy', 'crawl', 'books'])
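
debug.py simply invokes the scrapy crawl command programmatically, which makes it easy to run the spider under an IDE debugger. An alternative entry point, sketched with Scrapy's CrawlerProcess API, stays in the current process and returns when the crawl finishes:

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    process = CrawlerProcess(get_project_settings())
    process.crawl('books')   # spider name, as declared in BooksSpider.name
    process.start()          # blocks until the crawl is done
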
settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for Books_Spider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'Books_Spider'
SPIDER_MODULES = ['Books_Spider.spiders']
NEWSPIDER_MODULE = 'Books_Spider.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'Books_Spider (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'Books_Spider.middlewares.BooksSpiderSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'Books_Spider.middlewares.BooksSpiderDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'Books_Spider.pipelines.BooksSpiderPipeline': 300,
    # 'Books_Spider.pipelines.MyFilesPipeline': 301,
}
# Item field that holds the URLs the novels are downloaded from
FILES_URLS_FIELD = 'download_url'
# Directory the downloaded novels are stored in
FILES_STORE = 'novels'
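# Note: the novels are only downloaded when MyFilesPipeline is enabled; uncomment
# its entry in ITEM_PIPELINES above to save each novel as a .txt under FILES_STORE.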
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
middlewares.py
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class BooksSpiderSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class BooksSpiderDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)