spider.py
# -*- coding: utf-8 -*-
import hashlib
from datetime import datetime

import scrapy

from novel.items import NovelItem, ChapterItem


class A17kSpider(scrapy.Spider):
    name = '17k'
    allowed_domains = [
        'all.17k.com',
        'www.17k.com',
    ]
    start_urls = ['http://all.17k.com/lib/book/2_0_0_0_2_0_1_0_1.html']

    def parse(self, response):
        # Each data row of the listing table describes one novel; row 0 is the header.
        detail = response.css("table tbody tr")
        for tab in detail[1:]:
            novel_item = NovelItem()  # fresh item per row, so yielded items don't share state
            novel_name = tab.css('.td3 a::text').extract_first()
            # Hash each name on its own; reusing one md5 object would accumulate input across rows.
            uuid_id = hashlib.md5(novel_name.encode("utf-8")).hexdigest()
            novel_item['uuid'] = uuid_id
            novel_item['novel_name'] = novel_name
            novel_item['author'] = tab.css(".td6 a::text").extract_first()
            novel_item['status'] = tab.css(".td8 em::text").extract_first().strip()
            novel_item['word_number'] = tab.css(".td5::text").extract_first()
            novel_item['lastest_chapter'] = tab.css(".td4 a::text").extract_first()
            novel_item['category'] = tab.css(".td2 a::text").extract_first()
            novel_item['update_time'] = datetime.strptime(
                tab.css(".td7::text").extract_first(), "%Y-%m-%d %H:%M")
            yield novel_item

            # Each row carries a .jt link to its chapter list; extracting it here keeps the
            # request paired with this novel's uuid instead of the last row's.
            book_id = tab.css(".jt::attr(href)").re_first(r"\d+\.html")
            if book_id:
                url = "http://www.17k.com/list/%s" % book_id
                yield scrapy.Request(url, callback=self.chapter, meta={'uuid': uuid_id})

        # Follow the listing's "next page" link.
        next_url = response.css(".page a:nth-last-child(4)::attr(href)").extract_first()
        if next_url is not None:
            yield response.follow(next_url, callback=self.parse)

    def chapter(self, response):
        uuid_id = response.meta['uuid']
        chapter_urls = response.css(".Volume dd ::attr(href)").extract()
        for index, item_url in enumerate(chapter_urls):
            yield response.follow(item_url, callback=self.content,
                                  meta={'uuid': uuid_id, 'index': index})

    def content(self, response):
        chapter_item = ChapterItem()
        chapter_item['chapter_name'] = response.css(".readAreaBox.content h1::text").extract_first().strip()
        chapter_item['content'] = response.css(".readAreaBox.content .p").extract_first()
        chapter_item['novel_id'] = response.meta['uuid']
        chapter_item['sequence'] = response.meta['index']
        yield chapter_item
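The CSS selectors above are tied to the current markup of the 17k.com listing and reading pages, so they are worth a quick sanity check before a full crawl. The scrapy shell session below is a sketch of that check, assuming the listing URL from start_urls is still reachable:

# scrapy shell "http://all.17k.com/lib/book/2_0_0_0_2_0_1_0_1.html"
# Inside the shell, probe the selectors used in parse():
rows = response.css("table tbody tr")
len(rows)                                               # table rows; the first one is the header
rows[1].css(".td3 a::text").extract_first()             # novel name of the first data row
rows[1].css(".td6 a::text").extract_first()             # author
rows[1].css(".jt::attr(href)").re_first(r"\d+\.html")   # chapter-list id for this row
response.css(".page a:nth-last-child(4)::attr(href)").extract_first()  # next-page link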
items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class NovelItem(scrapy.Item):
    uuid = scrapy.Field()
    novel_name = scrapy.Field()
    author = scrapy.Field()
    status = scrapy.Field()
    word_number = scrapy.Field()
    lastest_chapter = scrapy.Field()  # spelling kept to match the app01_novelitem column name
    category = scrapy.Field()
    update_time = scrapy.Field()

    def get_insert_sql(self):
        # Statement and parameters consumed by MysqlTwistedPipeline.do_insert().
        insert_sql = """
            insert into app01_novelitem(uuid, novel_name, author, status, word_number,
                                        lastest_chapter, category, update_time)
            values (%s, %s, %s, %s, %s, %s, %s, %s)
        """
        params = (self['uuid'], self['novel_name'],
                  self['author'], self['status'],
                  self['word_number'], self['lastest_chapter'],
                  self['category'], self['update_time'])
        return insert_sql, params


class ChapterItem(scrapy.Item):
    chapter_name = scrapy.Field()
    content = scrapy.Field()
    novel_id = scrapy.Field()
    sequence = scrapy.Field()

    def get_insert_sql(self):
        insert_sql = """
            insert into app01_chapteritem(chapter_name, content, novel_id, sequence)
            values (%s, %s, %s, %s)
        """
        params = (self['chapter_name'], self['content'], self['novel_id'], self['sequence'])
        return insert_sql, params
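The app01_novelitem / app01_chapteritem table names suggest the tables are created elsewhere (the prefix looks like a Django app called app01). If they do not exist yet, the sketch below creates tables that match the INSERT statements above; the column types, lengths, and key choices are assumptions rather than something taken from the original project, and the connection values mirror the MySQL settings further down.

# Hypothetical schema matching get_insert_sql(); adjust types to the real models.
import pymysql

DDL = [
    """
    CREATE TABLE IF NOT EXISTS app01_novelitem (
        uuid            VARCHAR(32) PRIMARY KEY,
        novel_name      VARCHAR(255),
        author          VARCHAR(128),
        status          VARCHAR(32),
        word_number     VARCHAR(32),
        lastest_chapter VARCHAR(255),
        category        VARCHAR(64),
        update_time     DATETIME
    ) DEFAULT CHARSET = utf8
    """,
    """
    CREATE TABLE IF NOT EXISTS app01_chapteritem (
        id           INT AUTO_INCREMENT PRIMARY KEY,
        chapter_name VARCHAR(255),
        content      LONGTEXT,
        novel_id     VARCHAR(32),
        `sequence`   INT
    ) DEFAULT CHARSET = utf8
    """,
]

conn = pymysql.connect(host='127.0.0.1', user='root', password='', db='novel', charset='utf8')
with conn.cursor() as cursor:
    for statement in DDL:
        cursor.execute(statement)
conn.commit()
conn.close()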
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
from twisted.enterprise import adbapi


class NovelPipeline(object):
    def process_item(self, item, spider):
        return item


class MysqlTwistedPipeline(object):
    """Writes items to MySQL asynchronously through twisted's adbapi connection pool."""

    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbparams = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            password=settings['MYSQL_PASSWORD'],
            charset="utf8",
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool("pymysql", **dbparams)
        return cls(dbpool)

    def process_item(self, item, spider):
        # runInteraction executes do_insert in a worker thread and returns a Deferred.
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error, item, spider)
        return item  # pass the item on to any later pipelines

    def handle_error(self, failure, item, spider):
        print(failure)

    def do_insert(self, cursor, item):
        insert_sql, params = item.get_insert_sql()
        print(insert_sql, params)
        cursor.execute(insert_sql, params)
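dbpool.runInteraction hands do_insert a cursor-like transaction object on a worker thread, and any exception it raises reaches handle_error as a twisted Failure. If the bare print turns out to be too quiet once the crawl grows, a variant along these lines (a sketch, not part of the original code) routes the error through Scrapy's normal logging and keeps the offending item visible:

    def handle_error(self, failure, item, spider):
        # Log via the spider's logger so the failure shows up alongside the usual Scrapy output.
        spider.logger.error("MySQL insert failed for %r: %s", item, failure)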
middlewares.py
from fake_useragent import UserAgent


class RandomUserAgentMiddleware(object):
    """Downloader middleware that puts a random User-Agent on every outgoing request."""

    def __init__(self, crawler):
        super(RandomUserAgentMiddleware, self).__init__()
        self.ua = UserAgent()
        # RANDOM_UA_TYPE selects which fake_useragent attribute to use, e.g. "random" or "chrome".
        self.ua_type = crawler.settings.get('RANDOM_UA_TYPE', 'random')

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        def get_ua():
            return getattr(self.ua, self.ua_type)
        request.headers.setdefault("User-Agent", get_ua())
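RANDOM_UA_TYPE is resolved with getattr against the fake_useragent.UserAgent object, so any attribute that library exposes (random, chrome, firefox, and so on) is a valid value for the setting. A quick standalone check, assuming fake_useragent can reach or has already cached its user-agent data:

from fake_useragent import UserAgent

ua = UserAgent()
print(ua.random)              # what the middleware sends when RANDOM_UA_TYPE = "random"
print(getattr(ua, "chrome"))  # the same lookup the middleware does for RANDOM_UA_TYPE = "chrome"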
settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for novel project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'novel'
SPIDER_MODULES = ['novel.spiders']
NEWSPIDER_MODULE = 'novel.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'novel (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'novel.middlewares.NovelSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'novel.middlewares.RandomUserAgentMiddleware': 543,
    # Disable the stock UserAgentMiddleware (priority 400): it runs first and pre-sets the
    # User-Agent header, which would make the setdefault() in the custom middleware a no-op.
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'novel.pipelines.MysqlTwistedPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
RANDOM_UA_TYPE = "random"
MYSQL_HOST = '127.0.0.1'
MYSQL_USER = 'root'
MYSQL_PASSWORD = ''
MYSQL_DBNAME = 'novel'
That's all for now. The main addition in this round is setting the User-Agent header in the downloader middleware; Bloom-filter deduplication, distributed crawling, proxies, and the like will be added later.