items.py中设置
爬虫文件设置
使用媒体管道(Media Pipeline)
管道文件的位置如下:
from scrapy.pipelines.images import ImagesPipeline
from scrapy.pipelines.files import FilesPipeline
from scrapy.pipelines.media import MediaPipeline
在settings.py
文件中给IMAGES_STORE
赋值,就可以指定图片的保存路径。
并且默认情况下,文件名是通过对url使用SHA1 hash得来的。
使用媒体管道,必须在项目的settings.py文件中,设置:
对于Images Pipeline,设置为
ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
对于Files Pipeline,设置为
ITEM_PIPELINES = {'scrapy.pipelines.files.FilesPipeline': 1}
【注意:我们可同时使用Images Pipeline和Files Pipeline】
还必须设置一个有效的路径,用于保存下载的文件或者图片,否则即使设置了ITEM_PIPELINES,管道也不可用。
对于Images Pipeline,设置IMAGES_STORE:
IMAGES_STORE = '目标路径'
对于Files Pipeline,设置FILES_STORE:
FILES_STORE = '目标路径'
支持的存储其他的特征生成图片缩略图
Images Pipeline能为下载的图片自动生成缩略图。为了使用这个特性,我们必须设置字典类型的IMAGES_THUMBS,
例如:
# Thumbnail sizes to generate: each key names a thumbnail variant and maps
# to the (width, height) bound used when producing it.
IMAGES_THUMBS = {
    'small': (50, 50),
    'big': (270, 270),
}
当用这个特性时,Images Pipeline将用规定的每一种尺寸生成缩略图,格式如下:
<IMAGES_STORE>/thumbs/<size_name>/<image_id>.jpg
这里:
<IMAGES_STORE>是指在settings
文件中设置的IMAGES_STORE
路径
<size_name>是指IMAGES_THUMBS
的字典的键(例如small、big)
<image_id>是image url
的SHA1 hash
例如:
<IMAGES_STORE>/full/63bbfea82b8880ed33cdb762aa11fab722a90a24.jpg
<IMAGES_STORE>/thumbs/small/63bbfea82b8880ed33cdb762aa11fab722a90a24.jpg
<IMAGES_STORE>/thumbs/big/63bbfea82b8880ed33cdb762aa11fab722a90a24.jpg
过滤小图片
当我们用Images Pipeline
时,通过声明可允许的最小尺寸(设置IMAGES_MIN_HEIGHT
和
IMAGES_MIN_WIDTH
),过滤太小的图片。
例如:
# Minimum accepted image size in pixels. The setting names must use the
# plural "IMAGES_" prefix: ImagesPipeline.from_settings reads
# 'IMAGES_MIN_HEIGHT' and 'IMAGES_MIN_WIDTH', so "IMAGE_MIN_*" is ignored.
IMAGES_MIN_HEIGHT = 110
IMAGES_MIN_WIDTH = 110
注意:这些尺寸的限制不会影响缩略图生成
默认情况下,没有限制,所有的图片都会被处理。
修改图片名称
查看下ImagesPipeline的源码,发现可以重写file_path函数以修改图片名称,例如:
def file_path(self, request, response=None, info=None):
    """Return the storage path for an image, named after the URL's last segment.

    Intended as an override of ``ImagesPipeline.file_path``. As a side
    effect, every requested URL is appended (one per line) to
    ``image_urls.txt`` in the current working directory.

    :param request: request object exposing the image address as ``.url``
    :param response: unused; kept for signature compatibility
    :param info: unused; kept for signature compatibility
    :returns: ``'full/<last "/"-separated segment of the URL>'``
    """
    # Use a context manager so the log file is flushed and closed right away
    # instead of relying on garbage collection of an anonymous file object.
    with open("image_urls.txt", "a") as url_log:
        url_log.write(request.url + "\n")
    image_guid = request.url.split('/')[-1]
    return 'full/%s' % (image_guid)
而ImagesPipeline的源码如下:
class ImagesPipeline(FilesPipeline):
    """Abstract pipeline that implement the image thumbnail generation logic

    Extends FilesPipeline: every downloaded image is re-encoded as JPEG,
    images smaller than MIN_WIDTH x MIN_HEIGHT are rejected, and one
    thumbnail per entry of THUMBS is stored next to the full-size image.
    """

    MEDIA_NAME = 'image'
    # Minimum accepted dimensions; get_images() raises for smaller images.
    # Both are overwritten by from_settings().
    MIN_WIDTH = 0
    MIN_HEIGHT = 0
    # Maps a thumbnail id to the size passed to convert_image().
    THUMBS = {}
    # Fallback item field names used when the settings do not override them.
    DEFAULT_IMAGES_URLS_FIELD = 'image_urls'
    DEFAULT_IMAGES_RESULT_FIELD = 'images'

    @classmethod
    def from_settings(cls, settings):
        """Configure the pipeline from ``settings`` and return an instance.

        Note that the values are stored on the class itself (``cls.*``),
        not on the returned instance. The instance is built from the
        ``IMAGES_STORE`` setting.
        """
        cls.MIN_WIDTH = settings.getint('IMAGES_MIN_WIDTH', 0)
        cls.MIN_HEIGHT = settings.getint('IMAGES_MIN_HEIGHT', 0)
        cls.EXPIRES = settings.getint('IMAGES_EXPIRES', 90)
        cls.THUMBS = settings.get('IMAGES_THUMBS', {})
        # Copy the AWS credentials onto the 's3' entry of STORE_SCHEMES
        # (STORE_SCHEMES is not defined in this class; presumably inherited
        # from FilesPipeline -- TODO confirm).
        s3store = cls.STORE_SCHEMES['s3']
        s3store.AWS_ACCESS_KEY_ID = settings['AWS_ACCESS_KEY_ID']
        s3store.AWS_SECRET_ACCESS_KEY = settings['AWS_SECRET_ACCESS_KEY']

        cls.IMAGES_URLS_FIELD = settings.get('IMAGES_URLS_FIELD', cls.DEFAULT_IMAGES_URLS_FIELD)
        cls.IMAGES_RESULT_FIELD = settings.get('IMAGES_RESULT_FIELD', cls.DEFAULT_IMAGES_RESULT_FIELD)
        store_uri = settings['IMAGES_STORE']
        return cls(store_uri)

    def file_downloaded(self, response, request, info):
        """Delegate to image_downloaded() and return its result."""
        return self.image_downloaded(response, request, info)

    def image_downloaded(self, response, request, info):
        """Persist every image yielded by get_images() and return a checksum.

        The checksum is computed with md5sum() from the buffer of the first
        yielded image only (the full-size one, see get_images()).
        """
        checksum = None
        for path, image, buf in self.get_images(response, request, info):
            if checksum is None:
                # Rewind before hashing: the buffer was just written to.
                buf.seek(0)
                checksum = md5sum(buf)
            width, height = image.size
            self.store.persist_file(
                path, buf, info,
                meta={'width': width, 'height': height},
                headers={'Content-Type': 'image/jpeg'})
        return checksum

    def get_images(self, response, request, info):
        """Yield ``(path, image, buffer)`` for the image and its thumbnails.

        The first tuple is the full-size converted image; one more tuple
        follows for each entry of THUMBS.

        Raises ImageException when the downloaded image is narrower than
        MIN_WIDTH or shorter than MIN_HEIGHT.
        """
        path = self.file_path(request, response=response, info=info)
        orig_image = Image.open(StringIO(response.body))

        width, height = orig_image.size
        if width < self.MIN_WIDTH or height < self.MIN_HEIGHT:
            raise ImageException("Image too small (%dx%d < %dx%d)" %
                                 (width, height, self.MIN_WIDTH, self.MIN_HEIGHT))

        image, buf = self.convert_image(orig_image)
        yield path, image, buf

        # NOTE(review): dict.iteritems() exists only on Python 2.
        for thumb_id, size in self.THUMBS.iteritems():
            thumb_path = self.thumb_path(request, thumb_id, response=response, info=info)
            # Thumbnails are derived from the already converted image,
            # not from orig_image.
            thumb_image, thumb_buf = self.convert_image(image, size)
            yield thumb_path, thumb_image, thumb_buf

    def convert_image(self, image, size=None):
        """Return ``(image, buffer)`` with the image in RGB, saved as JPEG.

        When ``size`` is given, the returned image is a copy reduced with
        ``thumbnail(size, Image.ANTIALIAS)``.
        """
        if image.format == 'PNG' and image.mode == 'RGBA':
            # Paste the PNG onto a (255, 255, 255) background, passing the
            # image itself as the paste mask as well, then convert to RGB.
            background = Image.new('RGBA', image.size, (255, 255, 255))
            background.paste(image, image)
            image = background.convert('RGB')
        elif image.mode != 'RGB':
            image = image.convert('RGB')

        if size:
            # Resize a copy so the image object passed in is left untouched.
            image = image.copy()
            image.thumbnail(size, Image.ANTIALIAS)

        buf = StringIO()
        image.save(buf, 'JPEG')
        return image, buf

    def get_media_requests(self, item, info):
        """Return one Request per URL found in the item's IMAGES_URLS_FIELD.

        A missing field is treated as an empty list.
        """
        return [Request(x) for x in item.get(self.IMAGES_URLS_FIELD, [])]

    def item_completed(self, results, item, info):
        """Store the successful results on the item and return the item.

        ``results`` is iterated as ``(ok, value)`` pairs; only values whose
        ``ok`` is truthy are kept. Nothing is stored when the item does not
        declare IMAGES_RESULT_FIELD in ``item.fields``.
        """
        if self.IMAGES_RESULT_FIELD in item.fields:
            item[self.IMAGES_RESULT_FIELD] = [x for ok, x in results if ok]
        return item

    def file_path(self, request, response=None, info=None):
        """Return the storage path ``'full/<sha1 hex digest of the url>.jpg'``.

        For backward compatibility ``request`` may also be a plain URL, and
        an overridden file_key()/image_key() takes precedence; both cases
        emit a ScrapyDeprecationWarning.
        """
        ## start of deprecation warning block (can be removed in the future)
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn('ImagesPipeline.image_key(url) and file_key(url) methods are deprecated, '
                          'please use file_path(request, response=None, info=None) instead',
                          category=ScrapyDeprecationWarning, stacklevel=1)

        # check if called from image_key or file_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        # detect if file_key() or image_key() methods have been overridden
        # (the implementations in this class carry a `_base` attribute, set
        # right after their definitions; an override defined elsewhere lacks it)
        if not hasattr(self.file_key, '_base'):
            _warn()
            return self.file_key(url)
        elif not hasattr(self.image_key, '_base'):
            _warn()
            return self.image_key(url)
        ## end of deprecation warning block

        image_guid = hashlib.sha1(url).hexdigest() # change to request.url after deprecation
        return 'full/%s.jpg' % (image_guid)

    def thumb_path(self, request, thumb_id, response=None, info=None):
        """Return ``'thumbs/<thumb_id>/<sha1 hex digest of the url>.jpg'``.

        For backward compatibility ``request`` may also be a plain URL, and
        an overridden thumb_key() takes precedence; both cases emit a
        ScrapyDeprecationWarning.
        """
        ## start of deprecation warning block (can be removed in the future)
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn('ImagesPipeline.thumb_key(url) method is deprecated, please use '
                          'thumb_path(request, thumb_id, response=None, info=None) instead',
                          category=ScrapyDeprecationWarning, stacklevel=1)

        # check if called from thumb_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        # detect if thumb_key() method has been overridden
        # (the implementation in this class carries a `_base` attribute)
        if not hasattr(self.thumb_key, '_base'):
            _warn()
            return self.thumb_key(url, thumb_id)
        ## end of deprecation warning block

        thumb_guid = hashlib.sha1(url).hexdigest() # change to request.url after deprecation
        return 'thumbs/%s/%s.jpg' % (thumb_id, thumb_guid)

    # deprecated
    def file_key(self, url):
        """Deprecated: forwards to image_key()."""
        return self.image_key(url)
    # Marker checked by file_path() to tell this base implementation apart
    # from an override.
    file_key._base = True

    # deprecated
    def image_key(self, url):
        """Deprecated: forwards to file_path() with the URL as first argument."""
        return self.file_path(url)
    # Marker checked by file_path() to tell this base implementation apart
    # from an override.
    image_key._base = True

    # deprecated
    def thumb_key(self, url, thumb_id):
        """Deprecated: forwards to thumb_path() with the URL as first argument."""
        return self.thumb_path(url, thumb_id)
    # Marker checked by thumb_path() to tell this base implementation apart
    # from an override.
    thumb_key._base = True
爬取以后得到的是
一个字段存储提取的图片的链接,第二个字段是下载图片后保留的图片信息,分别是校验码、路径以及图片URL