使用Scrapy下载千图网首页图片的高清样张,系统为Ubuntu 14.04
千图网反爬:
1、访问频率限制,延时需设置适当,频率上限后会要求输入4位验证码(3-6次),仍持续该频率将被封号4-6小时(测试数据)
2、经测试,当被检测为恶意访问后,网站未对IP进行封杀,只是封帐号
综上可知其反爬技术并不算复杂,可使用验证码破解、降低爬取频率、组建cookie池等反反爬措施。
此项目重点是介绍如何使用scrapy的imagepipeline管道下载图片,所以未对反爬采取任何措施,登录方式采用cookie登录。
注意: 重点查看管道类(pipeline)如何写的,如要修改图片存储路径的话,则必须重写file_path方法,具体细节不做详细说明,实践一下就明白了。
QTWspider.py(核心程序)
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy import FormRequest,Request
from scrapy.exceptions import CloseSpider
from QianTuWSpider.items import QiantuwspiderItem
from Cookies_ import cookies #cookie登录
class QtwspiderSpider(CrawlSpider):
    """Crawl the www.58pic.com homepage and follow every picture card to
    collect the URLs of its high-definition sample images.

    The site throttles by account rather than by IP, so this demo simply
    logs in with a pre-captured cookie dict and relies on DOWNLOAD_DELAY
    in settings.py; no CAPTCHA handling is implemented.
    """
    name = 'QTWspider'
    allowed_domains = ['58pic.com']
    start_urls = ['http://www.58pic.com/']

    def start_requests(self):
        # Log in by attaching the pre-captured cookies (a dict) to the very
        # first request; Scrapy's cookie middleware keeps the session alive.
        return [FormRequest(self.start_urls[0], cookies=cookies, callback=self.parse)]

    def cookies_verify(self, response):
        """Abort the whole crawl if the login cookie has expired.

        The logged-in homepage contains the marker text u'您的ID:'; when it
        is absent we are browsing anonymously and every follow-up page would
        be useless, so the spider is closed immediately.

        :param response: homepage response to inspect
        :raises CloseSpider: when the marker text is missing
        """
        if u'您的ID:' not in response.text:
            raise CloseSpider('#############COOKIES has expired##############')

    def parse(self, response):
        """Extract (name, url) pairs for every picture card on the homepage
        and follow each url to the picture's detail page.

        :param response: homepage response
        :return: yields one Request per unique picture url
        """
        self.cookies_verify(response)
        # 8 topic groups of picture cards on the homepage.
        groups = response.xpath('//div[@class="topic-wrap h-lg roll-box "]//div[@class="clearfix"]')
        for group in groups:
            names = group.xpath('div//p[@class="card-title"]/text()').extract()
            urls = group.xpath('div//a/@href').extract()
            # BUGFIX: the original deduplicated urls with list(set(...)),
            # which destroys ordering (so names[n] no longer matched
            # urls[n]) and could shrink the url list below len(names),
            # raising IndexError. Deduplicate while preserving order and
            # keep each title paired with its own url instead.
            seen = set()
            for img_name, img_url in zip(names, urls):
                if img_url in seen:
                    continue
                seen.add(img_url)
                request = Request(url=img_url, callback=self.parse_hd_img)
                request.meta['name'] = img_name
                yield request

    def parse_hd_img(self, response):
        """Build one item from a picture's detail page.

        A detail page may expose several HD sample URLs; ImagesPipeline
        later downloads everything listed in item['image_urls'].

        :param response: picture detail page, with meta['name'] attached
        :return: yields one populated QiantuwspiderItem
        """
        item = QiantuwspiderItem()
        item['image_name'] = response.meta['name']
        item['image_urls'] = response.xpath('//img[@id="show-area-pic"]/@src').extract()
        yield item
Cookies_.py (cookies,与QTWspider.py同目录)
# Raw Cookie header captured from a logged-in browser session.
cookis_test = '''
qt_visitor_id=%22616d9dde98a45ee772990388431cb125%22; awake=0; message2=1; Hm_lvt_644763986e48f2374d9118a9ae189e14=1521181445,1521203255; loginBackUrl=%22http%3A%5C%2F%5C%2Fwww.58pic.com%5C%2F%22; auth_id=%2241077492%7C%5Cu98ce%7C1521808374%7C42d61b3a90f428b7cb0e4feaf3ce5557%22; success_target_path=%22http%3A%5C%2F%5C%2Fwww.58pic.com%5C%2F%22; sns=%7B%22token%22%3A%7B%22access_token%22%3A%220905E77F82F603F80857EDC13477DF84%22%2C%22expires_in%22%3A%227776000%22%2C%22refresh_token%22%3A%220191FF5732A5B82F839D2839F66DA754%22%2C%22openid%22%3A%223613C90D7E2B388E51549E29EA0EEE7F%22%7D%2C%22type%22%3A%22qq%22%7D; ssid=%225aabb9765368b1.48665150%22; qt_risk_visitor_id=%2238f044dbbae69bad2aed744993eb75ce%22; newbieTask=%22%7B%5C%22is_login%5C%22%3A1%2C%5C%22is_search%5C%22%3A0%2C%5C%22is_download%5C%22%3A0%2C%5C%22is_keep%5C%22%3A0%2C%5C%22login_count%5C%22%3A1%2C%5C%22upload_material%5C%22%3A0%2C%5C%22is_task_complete%5C%22%3A0%2C%5C%22task1%5C%22%3A0%2C%5C%22task2%5C%22%3A0%2C%5C%22task3%5C%22%3A0%7D%22; censor=%2220180316%22; newspeople=100; Hm_lpvt_644763986e48f2374d9118a9ae189e14=1521204266
'''

# Parse the "k1=v1; k2=v2; ..." cookie header into the dict Scrapy expects.
# BUGFIX: the original used split('=') and kept index 1, which
#   (a) left the leading space from "; " in every key after the first,
#   (b) truncated any value that itself contains '=', and
#   (c) produced bogus entries from the blank lines around the
#       triple-quoted string.
# strip() + partition('=') fixes all three while keeping the same output
# shape (a flat str->str dict named `cookies`).
cookies = {}
for pair in cookis_test.strip().split(';'):
    key, _, value = pair.strip().partition('=')
    if key:  # skip empty fragments defensively
        cookies[key] = value
items.py
from scrapy.item import Item, Field
class QiantuwspiderItem(Item):
    """Item consumed by the ImagesPipeline subclass in pipelines.py.

    The field names flagged "fixed" below are part of the ImagesPipeline
    contract and must not be renamed; the remaining ones are project
    conventions used to carry extra data through the pipeline.
    """
    image_urls = Field()     # fixed name: source URLs the pipeline downloads
    images = Field()         # fixed name: populated by the pipeline
    image_results = Field()  # fixed name: raw download results
    image_name = Field()     # custom: per-item title, used as the folder name
    image_paths = Field()    # custom: filled in by item_completed() with stored paths
pipelines.py
import hashlib

from scrapy.exceptions import DropItem
from scrapy.http import Request
from scrapy.pipelines.images import ImagesPipeline
class QiantuwspiderPipeline(ImagesPipeline):
    """ImagesPipeline subclass that stores each item's pictures in a
    sub-directory named after item['image_name']."""

    def get_media_requests(self, item, info):
        """Emit one download Request per URL in item['image_urls'],
        carrying the item in meta so file_path() can read its name.

        :param item: the QiantuwspiderItem being processed
        :param info: pipeline bookkeeping object (unused here)
        """
        for image_url in item['image_urls']:
            yield Request(image_url, meta={'item': item})

    def file_path(self, request, response=None, info=None):
        """Return the storage path, relative to IMAGES_STORE, for one image.

        BUGFIX: the original used request.url[-4:] as the file name, which
        collides whenever two URLs share their last four characters and can
        yield junk names such as '.jpg.jpg'. Hash the full URL instead
        (the same scheme Scrapy's default file_path uses), so file names
        are unique and deterministic per URL.
        """
        image_name = request.meta['item']['image_name']
        url_hash = hashlib.sha1(request.url.encode('utf-8')).hexdigest()
        return 'full/' + image_name + '/' + url_hash + '.jpg'

    def item_completed(self, results, item, info):
        """Collect the stored paths of the successful downloads into
        item['image_paths']; drop the item when nothing was downloaded.

        :param results: list of (ok, result_dict) pairs, one per image
        :raises DropItem: when no image downloaded successfully
        :return: the item, augmented with 'image_paths'
        """
        image_paths = [result['path'] for ok, result in results if ok]
        if not image_paths:
            raise DropItem('Item contains no images')
        item['image_paths'] = image_paths
        return item
settings.py
# --- Project identity and module discovery ---
BOT_NAME = 'QianTuWSpider'
SPIDER_MODULES = ['QianTuWSpider.spiders']
NEWSPIDER_MODULE = 'QianTuWSpider.spiders'

# Uncomment to reduce console noise:
#LOG_LEVEL = 'INFO'

# Masquerade as a desktop Chrome browser.
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'

# Be polite: one second between requests (the site rate-limits accounts).
DOWNLOAD_DELAY = 1

# Enable the custom image-download pipeline.
ITEM_PIPELINES = {'QianTuWSpider.pipelines.QiantuwspiderPipeline': 1}

# Directory where downloaded images are stored.
IMAGES_STORE = '/home/eli/Desktop/QianTuW'

# Thumbnail sizes generated alongside each full-size image.
IMAGES_THUMBS = {'small': (50, 50), 'big': (270, 270)}
如有疑问,欢迎留言。