1. Spider file
# -*- coding: utf-8 -*-
import scrapy
import copy

from gtshe.items import GtsheItem


class MusicSpider(scrapy.Spider):
    name = 'music'
    allowed_domains = ['jitashe.org']
    start_urls = ["https://www.jitashe.org"]
    # cookie string copied from a logged-in browser session
    cookie = "yGhj_40fe_saltkey=IDLlBPKk; yGhj_40fe_lastvisit=1551500410; yGhj_40fe_auth=e9f8%2FuHsl%2BbAhP%2BGint%2FUgLktBBjjf3EvlF0TXj4ZXWHe4Z%2Bcbge1LRi21zG6TL19UdsJLoP8sZmZAE%2B3iutAYxvfGg; yGhj_40fe_lastcheckfeed=644660%7C1551504015; yGhj_40fe_lip=60.176.42.168%2C1551504015; yGhj_40fe_pushuid=8430419; yGhj_40fe_pushgid=49982; yGhj_40fe_connect_is_bind=1; yGhj_40fe_st_p=644660%7C1551540151%7C13551060efe5a9679df4c4e9e02ed7a1; yGhj_40fe_viewid=tid_1336137; yGhj_40fe_ulastactivity=1551587332%7C0; yGhj_40fe_checkpm=1; yGhj_40fe_noticeTitle=1; Hm_lvt_4ad169a3774e8f5be3c7945513632bde=1551504009,1551515651,1551522942,1551587351; Hm_lpvt_4ad169a3774e8f5be3c7945513632bde=1551587351; yGhj_40fe_lastact=1551587332%09misc.php%09patch"
    # turn the raw cookie string into the dict Scrapy expects;
    # maxsplit=1 keeps any '=' inside a cookie value intact
    cookies = {i.split("=", 1)[0]: i.split("=", 1)[1] for i in cookie.split("; ")}

    def start_requests(self):
        yield scrapy.Request(
            self.start_urls[0],
            callback=self.parse,
            cookies=self.cookies
        )
    def parse(self, response):
        new = "https://www.jitashe.org/guide/newtab/t1/"
        hot = "https://www.jitashe.org/guide/hottab/t1/"
        item = GtsheItem()
        item['cat'] = "new"
        yield scrapy.Request(
            new,
            callback=self.parse1,
            meta={'item': copy.deepcopy(item)},
            cookies=self.cookies
        )
        item['cat'] = "hot"
        yield scrapy.Request(
            hot,
            callback=self.parse1,
            meta={'item': copy.deepcopy(item)},
            cookies=self.cookies
        )
    def parse1(self, response):
        url_list = ["https://www.jitashe.org" + i for i in response.xpath("//a[@class='title']/@href").extract()]
        name_list = response.xpath("//a[@class='title']/text()").extract()
        next_item = copy.deepcopy(response.meta['item'])
        print(url_list)
        for index, url in enumerate(url_list):
            item = response.meta['item']
            item['name'] = name_list[index]
            yield scrapy.Request(
                url=url,
                meta={'item': copy.deepcopy(item)},
                callback=self.parse2,
                cookies=self.cookies  # cookies must be sent on every request to keep the logged-in session alive
            )
        # follow the pagination link, if any
        next_url = response.xpath('//a[@class="nxt"]/@href').extract()
        if len(next_url) != 0:
            yield scrapy.Request(
                url="https://www.jitashe.org/" + next_url[0],
                meta={'item': copy.deepcopy(next_item)},
                callback=self.parse1,
                cookies=self.cookies
            )
    def parse2(self, response):
        url = response.xpath("//a[@id='gtp_download']/@href").extract_first()
        print(url)
        if url is not None:
            gtp_url = "https://www.jitashe.org" + url
            item = response.meta['item']
            item['url'] = gtp_url
            print(gtp_url)
            print("start crawling: " + item['name'])
            yield item
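The copy.deepcopy(item) calls are the load-bearing detail here: Scrapy schedules requests asynchronously, so without a copy every pending request would hold a reference to the same item, and the last item['name'] assignment would overwrite all earlier ones. A minimal stand-alone illustration of the pitfall (plain dicts, no Scrapy, names invented for the demo):

import copy

item = {'cat': 'new'}
shared = [item for _ in range(3)]                  # three references to ONE dict
copied = [copy.deepcopy(item) for _ in range(3)]   # three independent dicts
for i, it in enumerate(shared):
    it['name'] = 'song-%d' % i                     # each write clobbers the previous one
for i, it in enumerate(copied):
    it['name'] = 'song-%d' % i
print([it['name'] for it in shared])               # ['song-2', 'song-2', 'song-2']
print([it['name'] for it in copied])               # ['song-0', 'song-1', 'song-2']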
2. Settings file
# -*- coding: utf-8 -*-
# scrapy-redis configuration
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER_PERSIST = True
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
REDIS_URL = "redis://127.0.0.1:6379"
BOT_NAME = 'gtshe'
SPIDER_MODULES = ['gtshe.spiders']
NEWSPIDER_MODULE = 'gtshe.spiders'
#COOKIES_DEBUG=True
#LOG_LEVEL="WARNING"
USER_AGENT_LIST = [
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
"Opera/8.0 (Windows NT 5.1; U; en)",
"Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
"Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36"
]
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'gtshe (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
FILES_STORE = "G:/Eclipse_p/scrapy/gtshe/gtp_forum"
# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 1
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en',
}
DOWNLOADER_MIDDLEWARES = {
'gtshe.middlewares.GtsheDownloaderMiddleware': 543,
}
ITEM_PIPELINES = {
'gtshe.pipelines.GtshePipeline': 300,
}
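Because SCHEDULER_PERSIST = True and the scrapy_redis scheduler and dupefilter are enabled, the crawl state lives in Redis and survives restarts, which is also what lets several spider processes share one queue. Under scrapy_redis's default key names, pending requests sit in 'music:requests' (a sorted set for the default priority queue) and seen-request fingerprints in 'music:dupefilter'. A quick way to peek at that state with redis-py, as a sketch that assumes those defaults:

import redis

r = redis.StrictRedis.from_url("redis://127.0.0.1:6379")
print("pending requests:", r.zcard("music:requests"))     # drops to 0 as the queue drains
print("seen fingerprints:", r.scard("music:dupefilter"))  # grows as URLs are deduplicated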
3. Item file
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class GtsheItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()  # tab title, filled in parse1
    url = scrapy.Field()   # direct GTP download link, filled in parse2
    cat = scrapy.Field()   # listing the tab came from: "new" or "hot"
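A scrapy.Item behaves like a dict restricted to its declared fields; assigning a key that was not declared raises a KeyError, which catches typos early. A two-line demonstration (the field value is made up):

from gtshe.items import GtsheItem

item = GtsheItem()
item['cat'] = 'new'        # fine: declared above
# item['tittle'] = 'oops'  # would raise KeyError: GtsheItem does not support field: tittle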
4. Downloader middleware
# -*- coding: utf-8 -*-
# Downloader middleware that rotates the User-Agent header
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
import random


class GtsheDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # Scrapy acts as if the downloader middleware does not modify the
    # passed objects.
    def process_request(self, request, spider):
        # pick a random User-Agent from USER_AGENT_LIST in settings.py
        request.headers['User-Agent'] = random.choice(spider.settings.get('USER_AGENT_LIST'))
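The rotation logic can be sanity-checked without running a crawl by calling process_request directly on a bare Request and a stub spider; a rough sketch (StubSpider and the placeholder user-agent strings are invented for the test):

from scrapy.http import Request
from scrapy.settings import Settings

class StubSpider(object):
    settings = Settings({'USER_AGENT_LIST': ['ua-one', 'ua-two']})

mw = GtsheDownloaderMiddleware()
req = Request('https://www.jitashe.org')
mw.process_request(req, StubSpider())
print(req.headers['User-Agent'])  # b'ua-one' or b'ua-two'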
5. Pipeline file
# -*- coding: utf-8 -*-
import scrapy
import os
from scrapy.utils.misc import md5sum
from scrapy.pipelines.files import FilesPipeline
try:
    from cStringIO import StringIO as BytesIO  # Python 2
except ImportError:
    from io import BytesIO  # Python 3


class GtshePipeline(FilesPipeline):
    def get_media_requests(self, item, spider):
        yield scrapy.Request(item['url'], meta={'item': item})

    # store each file under a directory named after its category;
    # the actual file name is appended later in file_downloaded
    def file_path(self, request, response=None, info=None):
        item = request.meta['item']
        return item['cat'] + "/"

    # the file name cannot be derived from the download URL, so it is
    # read from the Content-Disposition response header instead
    def file_downloaded(self, response, request, info):
        path = self.file_path(request, response=response, info=info)
        file_name = response.headers.get('Content-Disposition')
        #print(response.headers)
        if file_name is None:
            print("no Content-Disposition header, shutting the spider down")
            os._exit(0)  # kills the whole process immediately
        # the header looks like: attachment; filename="xxx.gtp"
        path = path + str(file_name, 'utf-8').split("\"")[1]
        buf = BytesIO(response.body)
        checksum = md5sum(buf)
        buf.seek(0)
        self.store.persist_file(path, buf, info)
        return checksum
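The split("\"")[1] parsing above only works when the header has exactly the shape attachment; filename="...". If the site ever varies that, the standard library can take over the parsing; a sketch using cgi.parse_header (the helper name and its None fallback are mine, and note the cgi module is deprecated in very recent Python versions):

from cgi import parse_header

def filename_from_disposition(raw):
    # raw is the bytes value of the Content-Disposition header, or None
    if raw is None:
        return None
    _, params = parse_header(raw.decode('utf-8'))
    return params.get('filename')

print(filename_from_disposition(b'attachment; filename="tab.gp5"'))  # tab.gp5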