Scrapy is an application framework for crawling websites and extracting structured data. It is used across a wide range of domains: data mining, information processing, archiving historical data, and so on. With Scrapy it is usually straightforward to build a crawler that scrapes a given site's content or images. [Baidu Baike]
scrapy startproject <project-name>
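startproject generates a project skeleton roughly like the following (directory names track the project name you pass in; the exact files vary slightly by Scrapy version):

myproject/
    scrapy.cfg            # deploy configuration
    myproject/            # the project's Python module
        __init__.py
        items.py          # item definitions
        middlewares.py    # spider and downloader middlewares
        pipelines.py      # item pipelines
        settings.py       # project settings
        spiders/          # spiders live here
            __init__.py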
Because Scrapy is a crawler framework, a project usually bundles several spiders together, so after creating the project you still need to create a spider:
cd <project-name>
scrapy genspider <spider-name> <start-url>
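genspider drops a minimal spider skeleton into the spiders/ directory, roughly like this (the example name and domain here are placeholders):

import scrapy


class ExampleSpider(scrapy.Spider):
    name = 'example'                      # the name used with `scrapy crawl`
    allowed_domains = ['example.com']     # off-site requests get filtered out
    start_urls = ['http://example.com/']  # crawl entry point(s)

    def parse(self, response):
        # default callback for the responses to start_urls
        pass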
Below is something I wrote for someone a long time ago: a crawler for recycling (trade-in) price data.
Scrapy is non-blocking throughout. The spider yields items, which are then handed to the pipeline for processing: the spider file contains only the page-parsing logic, while the real work happens in the pipeline.
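For the pipeline to actually receive the yielded items it must be enabled in settings.py. A minimal sketch, assuming the project below is named aihuishou (the number is the ordering among pipelines; lower runs first):

# settings.py
ITEM_PIPELINES = {
    'aihuishou.pipelines.AihuishouPipeline': 300,
}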
Spider
# -*- coding: utf-8 -*-
import scrapy
from scrapy.selector import Selector
from scrapy.http import Request
from aihuishou.items import AihuishouItem, DownloadImg


class HuishouSpider(scrapy.Spider):
    name = 'huishou'
    allowed_domains = ['aihuishou.com']
    start_urls = ['https://www.aihuishou.com/']

    def parse(self, response):
        # walk the category panels on the home page
        hxs = Selector(response=response).xpath('//div[@class="category-panel"]')
        for obj in hxs:
            href = obj.xpath('.//a[@class="morebrands"]/@href').extract_first().strip()
            href = href.replace("/", "")
            # only follow the five categories we care about; the slug doubles as the flag
            if href in ('shouji', 'pingban', 'laptop', 'sheying', 'digital'):
                yield Request(url="https://www.aihuishou.com/%s" % href,
                              meta={'flag': href}, callback=self.c_item)

    def c_item(self, response):
        # brand/category listing: follow every entry and record it
        hxs = Selector(response=response).xpath('//div[@class="main-right"]/ul/li')
        for obj in hxs:
            href = obj.xpath('.//a/@href').extract_first().strip()
            yield Request(url="https://www.aihuishou.com/%s" % href,
                          meta=response.meta, callback=self.c_item)
            title = obj.xpath('.//p/text()').extract_first().rstrip()
            img = obj.xpath('.//img/@src').extract_first().strip()
            lst = {response.meta['flag']: title + '-----' + img.split('/')[-1]}
            yield AihuishouItem(_class=lst)  # write the category record
            yield DownloadImg(imgurl=img, imgpath='class/images/')  # download the category image

        # product list page: follow pagination and each product
        classification = Selector(response=response).xpath(
            '//div[@class="list-box-wrapper"]//dd[@class="active"]/a/text()').extract()
        if classification:
            page = Selector(response=response).xpath(
                '//div[@class="product-list-pager"]/a[@class="next no_hover"]/@href').extract_first()
            items = Selector(response=response).xpath('//div[@class="product-list-wrapper"]/ul/li')
            if page:
                yield Request(url="https://www.aihuishou.com/%s" % page,
                              meta=response.meta, callback=self.c_item)
            for obj in items:
                item_href = obj.xpath('.//a/@href').extract_first()
                yield Request(url="https://www.aihuishou.com/%s" % item_href,
                              meta=response.meta, callback=self.c_item)

        # product detail page: collect name, price, image and the property groups
        item_info = Selector(response=response).xpath('//div[@id="group-property"]/div[@class="left"]')
        item_info2 = Selector(response=response).xpath('//div[@id="group-property"]/div[@class="right"]')
        if item_info and item_info2:
            category = Selector(response=response).xpath(
                '//input[@id="product_brand"]/@value').extract_first().rstrip()
            item_name = item_info.xpath('.//h1/text()').extract_first().rstrip()
            img = item_info.xpath('.//img/@src').extract_first().strip()
            imgurl = (img.split('?')[0]).split('/')[-1]
            price = item_info.xpath(
                './/ul[@class="clearfix section-price"]/@data-highest-price').extract_first().strip()
            price = price.split('?')[0]
            item_multi = list()
            item_single = dict()
            for obj in item_info2.xpath('.//ul'):
                item_choice = obj.xpath('./@data-ppn-name').extract_first()
                if item_choice:
                    # single-choice property group, keyed by its name
                    item_single[item_choice] = obj.xpath('.//div/text()').extract()
                else:
                    # multi-choice properties, collected into one flat list
                    item_multi.extend(obj.xpath('.//span[@class="property_value"]/text()').extract())
            lst = {response.meta['flag']: category + '-----' + item_name + '-----' + price + '-----'
                   + imgurl + '-----' + str(item_single) + '-----' + str(item_multi)}
            yield AihuishouItem(_item=lst)  # write the product record
            yield DownloadImg(imgurl=img, imgpath='item/images/')  # download the product image
Pipeline
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import requests


class AihuishouPipeline(object):
    def process_item(self, item, spider):
        if item.get('imgurl'):
            # DownloadImg item: fetch the image and save it under imgpath
            url = item.get('imgurl')
            res = requests.get(url)
            # drop any query string before deriving the file name
            img_path = item.get('imgpath') + (url.split('?')[0]).split('/')[-1]
            with open(img_path, 'wb') as f:
                f.write(res.content)
        if item.get('_item'):
            # AihuishouItem product record: append one line per flag
            for c, n in item.get('_item').items():
                with open('item/%s.txt' % c, 'a+', encoding='utf-8') as f:
                    f.write(n + '\n')
        if item.get('_class'):
            # AihuishouItem category record: append one line per flag
            for c, n in item.get('_class').items():
                with open('class/%s.txt' % c, 'a+', encoding='utf-8') as f:
                    f.write(n + '\n')
        return item
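A side note: requests.get blocks, so every image download stalls the otherwise non-blocking reactor. Scrapy's built-in ImagesPipeline downloads through the scheduler instead. A minimal sketch of switching over (not what this project used; image_urls/images are that pipeline's default field names, and Pillow must be installed):

# settings.py
ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
IMAGES_STORE = 'images'  # root directory for the downloaded files

# items.py — the fields ImagesPipeline expects:
#   image_urls = scrapy.Field()  # list of URLs to fetch
#   images = scrapy.Field()      # populated with the download results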
Item
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class DownloadImg(scrapy.Item):
    imgurl = scrapy.Field()   # source URL of the image to download
    imgpath = scrapy.Field()  # local directory to save it into


class AihuishouItem(scrapy.Item):
    _class = scrapy.Field()  # category record: {flag: 'title-----image name'}
    _item = scrapy.Field()   # product record: {flag: 'category-----name-----price-----...'}
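scrapy.Item behaves like a dict, except that only declared fields can be set. A quick illustration with made-up values:

item = AihuishouItem(_class={'shouji': 'iPhone 8-----abc.jpg'})
item['_item'] = {}    # fine: _item is a declared field
# item['other'] = 1   # would raise KeyError: 'other' is not a declared field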
scrapy crawl <spider-name>
# run with log output suppressed
scrapy crawl <spider-name> --nolog
# to run a single spider file without a project, cd into the spider's directory
# and replace "crawl" with "runspider <file name>"
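A spider can also be launched from a plain Python script instead of the scrapy CLI. A minimal sketch using Scrapy's CrawlerProcess, run from the project root so the project settings are picked up (spider name taken from the code above):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('huishou')  # same name as with `scrapy crawl`
process.start()           # blocks until the crawl finishes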