Scrapy ImagesPipeline: Simple Image Scraping and Downloading

#Scrapy ImagesPipeline & FilesPipeline

  • Scrapy ships with pipelines dedicated to handling downloads of images and files:
from scrapy.pipelines.images import ImagesPipeline
from scrapy.pipelines.files import FilesPipeline
  • ImagesPipeline and FilesPipeline are used in very similar ways; this article mainly uses ImagesPipeline.
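  • For comparison, a minimal sketch of enabling FilesPipeline (FILES_STORE, file_urls and files are the standard Scrapy setting and default item fields; the priority value is arbitrary):
ITEM_PIPELINES = {
    'scrapy.pipelines.files.FilesPipeline': 1,
}
FILES_STORE = './files'   # where downloaded files are saved
# items then carry file_urls (input URLs) and files (download results)
  • For reference, here is the Scrapy source of the ImagesPipeline (scrapy/pipelines/images.py):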
"""
Images Pipeline

See documentation in topics/media-pipeline.rst
"""
import functools
import hashlib
import six

try:
    from cStringIO import StringIO as BytesIO
except ImportError:
    from io import BytesIO

from PIL import Image

from scrapy.utils.misc import md5sum
from scrapy.utils.python import to_bytes
from scrapy.http import Request
from scrapy.settings import Settings
from scrapy.exceptions import DropItem
#TODO: from scrapy.pipelines.media import MediaPipeline
from scrapy.pipelines.files import FileException, FilesPipeline


class NoimagesDrop(DropItem):
    """Product with no images exception"""


class ImageException(FileException):
    """General image error exception"""


class ImagesPipeline(FilesPipeline):
    """Abstract pipeline that implement the image thumbnail generation logic

    """

    MEDIA_NAME = 'image'

    # Uppercase attributes kept for backward compatibility with code that subclasses
    # ImagesPipeline. They may be overridden by settings.
    MIN_WIDTH = 0
    MIN_HEIGHT = 0
    EXPIRES = 90
    THUMBS = {}
    DEFAULT_IMAGES_URLS_FIELD = 'image_urls'
    DEFAULT_IMAGES_RESULT_FIELD = 'images'

    def __init__(self, store_uri, download_func=None, settings=None):
        super(ImagesPipeline, self).__init__(store_uri, settings=settings,
                                             download_func=download_func)

        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)

        resolve = functools.partial(self._key_for_pipe,
                                    base_class_name="ImagesPipeline",
                                    settings=settings)
        self.expires = settings.getint(
            resolve("IMAGES_EXPIRES"), self.EXPIRES
        )

        if not hasattr(self, "IMAGES_RESULT_FIELD"):
            self.IMAGES_RESULT_FIELD = self.DEFAULT_IMAGES_RESULT_FIELD
        if not hasattr(self, "IMAGES_URLS_FIELD"):
            self.IMAGES_URLS_FIELD = self.DEFAULT_IMAGES_URLS_FIELD

        self.images_urls_field = settings.get(
            resolve('IMAGES_URLS_FIELD'),
            self.IMAGES_URLS_FIELD
        )
        self.images_result_field = settings.get(
            resolve('IMAGES_RESULT_FIELD'),
            self.IMAGES_RESULT_FIELD
        )
        self.min_width = settings.getint(
            resolve('IMAGES_MIN_WIDTH'), self.MIN_WIDTH
        )
        self.min_height = settings.getint(
            resolve('IMAGES_MIN_HEIGHT'), self.MIN_HEIGHT
        )
        self.thumbs = settings.get(
            resolve('IMAGES_THUMBS'), self.THUMBS
        )

    @classmethod
    def from_settings(cls, settings):
        s3store = cls.STORE_SCHEMES['s3']
        s3store.AWS_ACCESS_KEY_ID = settings['AWS_ACCESS_KEY_ID']
        s3store.AWS_SECRET_ACCESS_KEY = settings['AWS_SECRET_ACCESS_KEY']
        s3store.POLICY = settings['IMAGES_STORE_S3_ACL']

        gcs_store = cls.STORE_SCHEMES['gs']
        gcs_store.GCS_PROJECT_ID = settings['GCS_PROJECT_ID']

        store_uri = settings['IMAGES_STORE']
        return cls(store_uri, settings=settings)

    def file_downloaded(self, response, request, info):
        return self.image_downloaded(response, request, info)

    def image_downloaded(self, response, request, info):
        checksum = None
        for path, image, buf in self.get_images(response, request, info):
            if checksum is None:
                buf.seek(0)
                checksum = md5sum(buf)
            width, height = image.size
            self.store.persist_file(
                path, buf, info,
                meta={'width': width, 'height': height},
                headers={'Content-Type': 'image/jpeg'})
        return checksum

    def get_images(self, response, request, info):
        path = self.file_path(request, response=response, info=info)
        orig_image = Image.open(BytesIO(response.body))

        width, height = orig_image.size
        if width < self.min_width or height < self.min_height:
            raise ImageException("Image too small (%dx%d < %dx%d)" %
                                 (width, height, self.min_width, self.min_height))

        image, buf = self.convert_image(orig_image)
        yield path, image, buf

        for thumb_id, size in six.iteritems(self.thumbs):
            thumb_path = self.thumb_path(request, thumb_id, response=response, info=info)
            thumb_image, thumb_buf = self.convert_image(image, size)
            yield thumb_path, thumb_image, thumb_buf

    def convert_image(self, image, size=None):
        if image.format == 'PNG' and image.mode == 'RGBA':
            background = Image.new('RGBA', image.size, (255, 255, 255))
            background.paste(image, image)
            image = background.convert('RGB')
        elif image.mode == 'P':
            image = image.convert("RGBA")
            background = Image.new('RGBA', image.size, (255, 255, 255))
            background.paste(image, image)
            image = background.convert('RGB')
        elif image.mode != 'RGB':
            image = image.convert('RGB')

        if size:
            image = image.copy()
            image.thumbnail(size, Image.ANTIALIAS)

        buf = BytesIO()
        image.save(buf, 'JPEG')
        return image, buf

    def get_media_requests(self, item, info):
        return [Request(x) for x in item.get(self.images_urls_field, [])]

    def item_completed(self, results, item, info):
        if isinstance(item, dict) or self.images_result_field in item.fields:
            item[self.images_result_field] = [x for ok, x in results if ok]
        return item

    def file_path(self, request, response=None, info=None):
        ## start of deprecation warning block (can be removed in the future)
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn('ImagesPipeline.image_key(url) and file_key(url) methods are deprecated, '
                          'please use file_path(request, response=None, info=None) instead',
                          category=ScrapyDeprecationWarning, stacklevel=1)

        # check if called from image_key or file_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        # detect if file_key() or image_key() methods have been overridden
        if not hasattr(self.file_key, '_base'):
            _warn()
            return self.file_key(url)
        elif not hasattr(self.image_key, '_base'):
            _warn()
            return self.image_key(url)
        ## end of deprecation warning block

        image_guid = hashlib.sha1(to_bytes(url)).hexdigest()  # change to request.url after deprecation
        return 'full/%s.jpg' % (image_guid)

    def thumb_path(self, request, thumb_id, response=None, info=None):
        ## start of deprecation warning block (can be removed in the future)
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn('ImagesPipeline.thumb_key(url) method is deprecated, please use '
                          'thumb_path(request, thumb_id, response=None, info=None) instead',
                          category=ScrapyDeprecationWarning, stacklevel=1)

        # check if called from thumb_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        # detect if thumb_key() method has been overridden
        if not hasattr(self.thumb_key, '_base'):
            _warn()
            return self.thumb_key(url, thumb_id)
        ## end of deprecation warning block

        thumb_guid = hashlib.sha1(to_bytes(url)).hexdigest()  # change to request.url after deprecation
        return 'thumbs/%s/%s.jpg' % (thumb_id, thumb_guid)

    # deprecated
    def file_key(self, url):
        return self.image_key(url)
    file_key._base = True

    # deprecated
    def image_key(self, url):
        return self.file_path(url)
    image_key._base = True

    # deprecated
    def thumb_key(self, url, thumb_id):
        return self.thumb_path(url, thumb_id)
    thumb_key._base = True

  • A quick look at the ImagesPipeline code above:
    1. It subclasses FilesPipeline, so it inherits all of the file-download machinery.
    2. DEFAULT_IMAGES_URLS_FIELD = 'image_urls': image URLs are read from the item's image_urls field by default.
    3. DEFAULT_IMAGES_RESULT_FIELD = 'images': download results are written to the item's images field by default.
    4. store_uri = settings['IMAGES_STORE']: the storage location is configured via the IMAGES_STORE setting.
    5. Minimum image size and thumbnails can be configured via settings (see the minimal sketch below; not covered in detail here).
    6. get_media_requests builds a Request for each image URL and hands it to the scheduler, where it waits to be downloaded.
    7. item_completed receives the download results; not every image downloads successfully, so failed links should be filtered out.
    8. file_path decides the stored file name and path by hashing the URL with hashlib's SHA1.
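  • A minimal sketch of the image settings those attributes map to (all are standard Scrapy settings; the values here are only examples):
IMAGES_STORE = './images'      # where images are saved (required)
IMAGES_EXPIRES = 90            # skip re-downloading files fetched within the last 90 days
IMAGES_MIN_WIDTH = 110         # drop images narrower than this (pixels)
IMAGES_MIN_HEIGHT = 110        # drop images shorter than this (pixels)
IMAGES_THUMBS = {              # also generate thumbnails of these sizes
    'small': (50, 50),
    'big': (270, 270),
}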

#360 Photography Image Download

  • Link: http://image.so.com/z?ch=photography
  • Take a quick look at the site to work out an approach.
  • First, look at pagination. Scrolling loads URLs like:
    http://image.so.com/zj?ch=photography&sn=30&listtype=new&temp=1
    http://image.so.com/zj?ch=photography&sn=60&listtype=new&temp=1
    http://image.so.com/zj?ch=photography&sn=90&listtype=new&temp=1
  • The pagination rule is obvious at a glance: the sn parameter advances by 30, i.e. 30 images per page.
  • The image URLs are in the JSON data returned by these requests, as sketched below.
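  • Judging from the fields the spider reads later (list and qhimg_url), each response is roughly of this shape (values are placeholders, other fields omitted):
{
    "list": [
        {"qhimg_url": "http://.../photo_1.jpg", ...},
        {"qhimg_url": "http://.../photo_2.jpg", ...}
    ]
}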

#Create the Project

scrapy startproject images360
scrapy genspider image_downloader images.so.com
  • Modify settings.py:
# Image storage path
IMAGES_STORE = './images'
# Don't obey robots.txt!
ROBOTSTXT_OBEY = False
# Enable the item pipeline
ITEM_PIPELINES = {
   'images360.pipelines.Images360Pipeline': 300,
}
  • Modify items.py to add the field.
  • ImagesPipeline reads image URLs from the image_urls field by default (a remapping sketch follows the item definition below).
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class Images360Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # image URL list (ImagesPipeline's default input field)
    image_urls = scrapy.Field()
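
  • If you prefer a different field name on the item, the pipeline's input and output fields can be remapped in settings.py (a sketch; these field names are made up):
IMAGES_URLS_FIELD = 'my_image_urls'     # field the pipeline reads URLs from
IMAGES_RESULT_FIELD = 'my_images'       # field the pipeline writes results to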

  • The spider file, image_downloader.py:
# -*- coding: utf-8 -*-
import json
import scrapy
from scrapy.http import Request
from images360.items import Images360Item


# Number of pages to crawl
MAX_PAGE = 2

class ImageDownloaderSpider(scrapy.Spider):

    name = 'image_downloader'
    allowed_domains = ['images.so.com']


    def start_requests(self):
        # Build the paginated listing URLs
        for i in range(MAX_PAGE):
            sn = i*30
            start_url = 'http://image.so.com/zj?ch=photography&sn={}&listtype=new&temp=1'.format(sn)
            yield Request(start_url, callback=self.image_crawler)

    def image_crawler(self, response):
        item = Images360Item()
        content = json.loads(response.text)
        item['image_urls'] = [img.get('qhimg_url') for img in content.get('list')]
        yield item

  • The pipelines.py file
  • Override the ImagesPipeline methods:
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.pipelines.images import ImagesPipeline

from scrapy.exceptions import DropItem


class Images360Pipeline(ImagesPipeline):
    '''
    Subclasses ImagesPipeline
    '''
    def file_path(self, request, response=None, info=None):
        '''
        Customize the stored file path and naming

        :param request:
        :param response:
        :param info:
        :return:
        '''
        url = request.url
        file_name = url.split('/')[-1]
        return file_name

    def item_completed(self, results, item, info):
        '''
        Collect the image download results and drop items with no successful downloads
        :param results:
        :param item:
        :param info:
        :return:
        '''
        image_paths = [x['path'] for ok, x in results if ok]

        if not image_paths:
            raise DropItem('Image Download Failed')

        return item

  • You can also use the built-in ImagesPipeline directly instead of subclassing it.
  • In settings.py, register scrapy.pipelines.images.ImagesPipeline in ITEM_PIPELINES.
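  • For example (a minimal sketch; the priority value 300 is arbitrary):
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 300,
}
IMAGES_STORE = './images'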

  • run.py

# -*- coding: utf-8 -*-
from scrapy import cmdline


cmdline.execute("scrapy crawl image_downloader".split())
  • Run it:
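  • With IMAGES_STORE = './images' and the file_path override above, the downloads land directly under ./images/, named after the last segment of each image URL (a rough sketch; the file names are placeholders):
images/
    photo_1.jpg
    photo_2.jpg
    ...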

#Summary

  • To download more pages, just increase MAX_PAGE (set to 2 above).
  • This kind of file-download pipeline doesn't come up that often, and after going a long time without using it, it slowly slips from memory...
