# -*- coding: utf-8 -*-
import scrapy

from epub.items import EpubItem


class EpubdownloadSpider(scrapy.Spider):
    name = 'epubdownload'
    # allowed_domains = ['www.ixdzs.com']
    # Paginated category listing, pages 1-50 (no trailing slash after .html).
    start_urls = ['http://www.ixdzs.com/sort/1/index_0_2_0_%d.html' % i
                  for i in range(1, 51)]

    def parse(self, response):
        # Links on the listing page that lead to an epub download page.
        hrefs = response.xpath(
            '//a[contains(@href, "/d") and contains(@href, "epub_down")]/@href'
        ).extract()
        for href in hrefs:
            # urljoin avoids the double slash that naive concatenation produces.
            yield scrapy.Request(url=response.urljoin(href), callback=self.newparse)

    def newparse(self, response):
        item = EpubItem()
        link = response.xpath(
            '//a[contains(@href, "down?id=") and contains(@href, "p=6")]/@href'
        ).extract()
        url = response.urljoin(link[0])
        # Drop everything after the last '=' (the p=6 format code) so the
        # stored URL ends in 'p=' and a format id can be appended later.
        parts = url.split('=')
        item['down_url'] = '='.join(parts[:-1]) + '='
        name = response.xpath('//h1[@itemprop="name"]/text()').extract_first()
        # The page title carries a fixed 5-character suffix; strip it.
        item['name'] = name[:-5]
        yield item
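The split('=') / join('=') dance in newparse() just drops the final p=6 value, so the stored down_url ends in p= and a format code can be re-appended at download time. A standalone sketch of that trimming, using a hypothetical link:

```python
# Illustration of the URL trimming in newparse(); the link is hypothetical.
url = 'http://www.ixdzs.com/down?id=123&p=6'
parts = url.split('=')              # ['http://www.ixdzs.com/down?id', '123&p', '6']
down_url = '='.join(parts[:-1]) + '='
print(down_url)                     # http://www.ixdzs.com/down?id=123&p=
```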
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
import os

import pymongo


class EpubPipeline(object):
    def process_item(self, item, spider):
        return item
class MongoPipeline(object):
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # Read the connection settings from settings.py.
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        # One collection per item class; insert() is deprecated in
        # pymongo 3.x, so use insert_one().
        name = item.__class__.__name__
        self.db[name].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()
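Because process_item() keys the collection off item.__class__.__name__, the documents land in an EpubItem collection. A quick sanity check, assuming a local MongoDB with the MONGO_URI/MONGO_DB values from settings.py:

```python
# Quick check of what MongoPipeline stored (assumes a local MongoDB).
import pymongo

client = pymongo.MongoClient('localhost')      # MONGO_URI from settings.py
db = client['epubdownload']                    # MONGO_DB from settings.py
print(db['EpubItem'].count_documents({}))      # number of stored items
print(db['EpubItem'].find_one())               # e.g. {'name': ..., 'down_url': ...}
```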
class JsonPipeline(object):
    def process_item(self, item, spider):
        base_dir = os.getcwd()
        filename = os.path.join(base_dir, 'news.json')
        # Append each item to the json file as one dumps() line.
        # ensure_ascii=False is required, otherwise non-ASCII data is
        # written as escape sequences such as "\xe15".
        with open(filename, 'a', encoding='utf-8') as f:
            line = json.dumps(dict(item), ensure_ascii=False) + '\n'
            f.write(line)
        return item
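JsonPipeline writes one JSON object per line (JSON Lines), so reading the results back is a loop over the file. A small sketch; the field names follow from EpubItem:

```python
# Read news.json back; each line is one JSON object.
import json

with open('news.json', encoding='utf-8') as f:
    for line in f:
        item = json.loads(line)
        print(item['name'], item['down_url'])
```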
BOT_NAME = 'epub'

SPIDER_MODULES = ['epub.spiders']
NEWSPIDER_MODULE = 'epub.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'epub (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

ITEM_PIPELINES = {
    # 'tutorial.pipelines.TextPipeline': 300,
    'epub.pipelines.JsonPipeline': 350,
    'epub.pipelines.MongoPipeline': 400,
}

MONGO_URI = 'localhost'
MONGO_DB = 'epubdownload'
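MONGO_URI here is a bare hostname, which pymongo.MongoClient accepts directly. If the database lives on another host or needs credentials, a full connection string works in the same setting; the values below are placeholders:

```python
# Full-URI alternative for MONGO_URI (host, port and credentials are placeholders).
MONGO_URI = 'mongodb://user:password@127.0.0.1:27017'
```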
import scrapy


class EpubItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()      # book title, suffix stripped in the spider
    down_url = scrapy.Field()  # download URL ending in 'p='
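An EpubItem behaves like a dict with a fixed key set, which is what lets both pipelines call dict(item). A minimal interactive check with made-up values:

```python
# Hypothetical values, just to show the item's shape.
from epub.items import EpubItem

item = EpubItem()
item['name'] = '示例书名'
item['down_url'] = 'http://www.ixdzs.com/down?id=1&p='
print(dict(item))   # {'name': '示例书名', 'down_url': 'http://www.ixdzs.com/down?id=1&p='}
```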
![](https://i.loli.net/2019/04/10/5cacc5ed027da.png)