先来看个小案例:使用scrapy爬取某度图片。
'''
创建项目及爬虫文件:
1.scrapy startproject baiduimgs
2.cd baiduimgs
3.scrapy genspider bdimg image.baidu.com
'''
# -*- coding: utf-8 -*-
import scrapy
import re
import os
class BdimgSpider(scrapy.Spider):
    """Spider that saves the thumbnails of a Baidu image search to disk.

    ``parse`` extracts every ``thumbURL`` value from the search result page
    and requests it; ``get_img`` writes each response body to
    ``dir/<counter>.jpg`` relative to the current working directory.
    """

    name = 'bdimgs'
    allowed_domains = ['image.baidu.com']
    start_urls = ['https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&fm=index&pos=history&word=%E7%8C%AB%E5%92%AA']
    # Running counter used to number the saved image files.
    num = 0

    def parse(self, response):
        """Request every thumbnail URL found in the search result page.

        Args:
            response: Response for one of ``start_urls``.

        Yields:
            scrapy.Request: One request per ``thumbURL`` match, handled by
            ``get_img``.
        """
        text = response.text
        img_urls = re.findall('"thumbURL":"(.*?)"', text)
        for img_url in img_urls:
            # dont_filter=True asks Scrapy not to filter this request.
            # NOTE(review): presumably needed because thumbnails may be served
            # from hosts outside allowed_domains -- confirm.
            yield scrapy.Request(img_url, dont_filter=True, callback=self.get_img)

    def get_img(self, response):
        """Write one downloaded image to ``dir/<num>.jpg``.

        Args:
            response: Response whose ``body`` holds the image bytes.
        """
        # Create the target directory without a separate existence check, so
        # there is no window between "check" and "create".
        os.makedirs("dir", exist_ok=True)
        filename = "dir/%s.jpg" % self.num
        # The augmented assignment stores the counter on the instance; the
        # class attribute only supplies the initial value 0.
        self.num += 1
        with open(filename, "wb") as f:
            f.write(response.body)
注意:上面是直接在爬虫文件中保存图片;下面改为通过 Item 和管道(pipeline)保存,依次是爬虫文件、items.py、pipelines.py:
# -*- coding: utf-8 -*-
import scrapy
import re
import os
from ..items import BaiduimgsItem #引入创建字段的类
class BdimgSpider(scrapy.Spider):
    """Spider that hands downloaded thumbnails to the item pipeline.

    ``parse`` requests every ``thumbURL`` found in the search result page;
    ``get_img`` wraps each response body in a ``BaiduimgsItem``.
    """

    name = 'bdimgs'
    allowed_domains = ['image.baidu.com']
    start_urls = ['https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&fm=index&pos=history&word=%E7%8C%AB%E5%92%AA']
    # Not referenced by any method of this version of the spider.
    num = 0

    def parse(self, response):
        """Yield one request per thumbnail URL in the search result page.

        Args:
            response: Response for one of ``start_urls``.

        Yields:
            scrapy.Request: Request for a thumbnail, handled by ``get_img``.
        """
        for match in re.finditer('"thumbURL":"(.*?)"', response.text):
            thumb_url = match.group(1)
            yield scrapy.Request(
                thumb_url,
                dont_filter=True,
                callback=self.get_img,
            )

    def get_img(self, response):
        """Wrap the downloaded image bytes in an item.

        Args:
            response: Response whose ``body`` holds the image bytes.

        Yields:
            BaiduimgsItem: Item whose ``img_data`` field is the response body.
        """
        yield BaiduimgsItem(img_data=response.body)
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class BaiduimgsItem(scrapy.Item):
    """Item carrying one downloaded image."""

    # Filled by the spider with the body of an image response.
    img_data = scrapy.Field()
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import os
class BaiduimgsPipeline(object):
    """Item pipeline that writes each item's image bytes to ``dir_pipe/``.

    Files are named ``<counter>.jpg``, with the counter starting at 0 for
    each pipeline instance.
    """

    # Running counter used to number the saved image files.
    num = 0

    def process_item(self, item, spider):
        """Save ``item["img_data"]`` to ``dir_pipe/<num>.jpg``.

        Args:
            item: Mapping-like item with an ``img_data`` entry holding bytes.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``, unchanged.
        """
        # Create the target directory without a separate existence check, so
        # there is no window between "check" and "create".
        os.makedirs("dir_pipe", exist_ok=True)
        filename = "dir_pipe/%s.jpg" % self.num
        # The augmented assignment stores the counter on the instance; the
        # class attribute only supplies the initial value 0.
        self.num += 1
        with open(filename, "wb") as f:
            f.write(item["img_data"])
        return item
注意:要在settings.py文件中开启管道!!!
# -*- coding: utf-8 -*-
import scrapy
import re
import os
from ..items import BaiduimgsPipeItem
class BdimgSpider(scrapy.Spider):
    """Spider that only collects thumbnail URLs.

    It does not download anything itself: ``parse`` puts the URLs into a
    ``BaiduimgsPipeItem`` and yields that item.
    """

    name = 'bdimgs'
    allowed_domains = ['image.baidu.com']
    start_urls = ['https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&fm=index&pos=history&word=%E7%8C%AB%E5%92%AA']

    def parse(self, response):
        """Yield a single item holding every thumbnail URL on the page.

        Args:
            response: Response for one of ``start_urls``.

        Yields:
            BaiduimgsPipeItem: Item whose ``image_urls`` field is the list of
            ``thumbURL`` matches.
        """
        # NOTE: the field value is the list of image URLs, not image bytes.
        yield BaiduimgsPipeItem(
            image_urls=re.findall('"thumbURL":"(.*?)"', response.text),
        )
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class BaiduimgsPipeItem(scrapy.Item):
    """Item carrying the image URLs to be downloaded."""

    # Filled by the spider with the list of thumbnail URLs. ``image_urls`` is
    # the field name Scrapy's ImagesPipeline reads by default.
    image_urls = scrapy.Field()
# Item pipelines enabled for this project.
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    # Hand-written pipeline, left disabled in this configuration:
    # 'baiduimgs.pipelines.BaiduimgsPipeline': 300,
    # NOTE: this built-in image pipeline must be enabled.
    'scrapy.pipelines.images.ImagesPipeline': 300,
}

# NOTE: a storage path for the media pipeline must be specified.
IMAGES_STORE = r'E:\Py_Spider_High\spiderpro\scrapy_1\baiduimgs\dir0'