Scrapy 爬虫爬取 Unsplash 图库

本文介绍通过 Scrapy 框架爬取 Unsplash 图库的过程，下面依次给出 Item 定义（items.py）、爬虫（spider）和图片下载管道（pipelines.py）的完整代码：

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
import scrapy

class UnsplashItem(scrapy.Item):
    """Record for a single Unsplash photo, handed from the spider to the pipeline."""

    # Photographer details copied from the feed's ``user`` object.
    author = scrapy.Field()
    author_bio = scrapy.Field()

    # Identifier of the photo; also used as the stored file's base name.
    image_id = scrapy.Field()

    # URLs to download; the spider stores a one-element list here.
    image_urls = scrapy.Field()

    # Not written by the code in this file; presumably reserved for the
    # images pipeline's download results -- TODO confirm.
    images = scrapy.Field()

    # Storage paths of the successfully downloaded files, filled in by the
    # pipeline's ``item_completed``.
    image_paths = scrapy.Field()
# -*- coding: utf-8 -*-
import scrapy
import json
import urllib
from picture.items import UnsplashItem

class UnsplashSpider(scrapy.Spider):
    """Walk the Unsplash home feed and emit one ``UnsplashItem`` per photo.

    Each response is a JSON document. Its ``photos`` list is turned into
    items, and its ``next_page`` link (when present) is followed with the
    same callback until the feed stops providing one.
    """

    name = "unsplash"
    allowed_domains = ["unsplash.com"]
    custom_settings = {
        # Only the uncommented headers are sent; the commented-out entries are
        # kept for reference and have no effect.
        'DEFAULT_REQUEST_HEADERS':{
            #'Accept':'*/*',
            #'Accept-Encoding':'gzip, deflate, sdch, br',
            #'Accept-Language':'zh-CN,zh;q=0.8,en;q=0.6',
            #'accept-version':'v1',
            #'Authorization':'Client-ID d69927c7ea5c770fa2ce9a2f1e3589bd896454f7068f689d8e41a25b54fa6042',
            #'Host':'unsplash.com',
            'Upgrade-Insecure-Requests': '1',
            #'Referer':'https://unsplash.com/?grid=single',
            #'Connection':'keep-alive',
            'x-unsplash-client':'web',
            #'User-Agent':'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36',
        },
        "ITEM_PIPELINES":{
            'picture.pipelines.UnsplashImagesPipeline': 300,
        },
        "IMAGES_STORE":'./images',
        "LOG_FILE":'unsplash.log',
    }

    start_urls = (
        'https://unsplash.com/napi/feeds/home',
    )

    def parse(self, response):
        """Yield an item per photo in the feed page, then request the next page.

        Args:
            response: Response whose body is the JSON feed document.

        Yields:
            ``UnsplashItem`` objects for every entry in ``photos``, followed by
            at most one ``scrapy.Request`` for the next feed page.

        Raises:
            KeyError: If the document has no ``photos`` list or a photo lacks
                one of the required fields.
        """
        # Decode the payload once and reuse it for the photos and the
        # pagination link.
        feed = json.loads(response.body)

        for photo in feed['photos']:
            user = photo['user']
            item = UnsplashItem()
            item['author'] = user['name']
            # The biography is optional metadata; store None when it is absent.
            item['author_bio'] = user.get('bio')
            item['image_id'] = photo['id']
            item['image_urls'] = [photo['urls']['full']]
            yield item

        # Check the raw link BEFORE building the URL: once the napi prefix is
        # prepended the string is always truthy, and slicing a missing/null
        # link would raise instead of ending the crawl.
        next_link = feed.get('next_page')
        if next_link:
            # The first 25 characters of the link are replaced by the napi
            # base URL -- presumably they are 'https://api.unsplash.com/'
            # (25 chars); TODO confirm against a live response.
            next_page = 'https://unsplash.com/napi/' + next_link[25:]
            yield scrapy.Request(next_page, callback=self.parse)
# -*- coding: utf-8 -*-

# Define your item pipelines here
import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem

class UnsplashImagesPipeline(ImagesPipeline):
    """Download every item's images into ``full/<author>/<image_id>.jpg``."""

    def get_media_requests(self, item, info):
        """Yield one download request per URL in ``item['image_urls']``.

        The item is attached to each request's ``meta`` so that ``file_path``
        can still reach it when the framework does not pass it explicitly.
        """
        for image_url in item['image_urls']:
            yield scrapy.Request(image_url, meta={'item': item})

    def item_completed(self, results, item, info):
        """Record the stored paths on the item, dropping items with no image.

        Args:
            results: Iterable of ``(ok, detail)`` pairs, one per request;
                ``detail['path']`` is read for the successful ones.
            item: The item whose downloads have finished.
            info: Unused here.

        Returns:
            The item, with ``image_paths`` set to the successful paths.

        Raises:
            DropItem: If none of the downloads succeeded.
        """
        image_paths = [detail['path'] for ok, detail in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item

    def file_path(self, request, response=None, info=None, item=None):
        """Return the storage path ``full/<author>/<image_id>.jpg`` for a request.

        Args:
            request: The download request; its ``meta['item']`` is used when
                ``item`` is not supplied.
            response: Unused here.
            info: Unused here.
            item: The item being processed, when the caller provides it.

        Returns:
            The relative path under the images store.
        """
        if item is None:
            # Callers that do not pass the item: fall back to the copy that
            # get_media_requests stashed on the request.
            item = request.meta['item']
        # Author and id come from scraped data, so they must not be able to
        # introduce extra path segments or climb out of ``full/``.
        return 'full/{0}/{1}.jpg'.format(
            self._safe_component(item['author']),
            self._safe_component(item['image_id']),
        )

    @staticmethod
    def _safe_component(value, fallback='unknown'):
        """Return ``value`` as text usable as a single path component.

        Path separators and control characters are replaced by ``_``; a value
        that is empty, blank, ``.`` or ``..`` is replaced by ``fallback``.
        Any other value is returned unchanged.
        """
        text = '' if value is None else '%s' % (value,)
        cleaned = ''.join(
            '_' if (ch in '/\\' or ord(ch) < 32) else ch
            for ch in text
        )
        if cleaned.strip() in ('', '.', '..'):
            return fallback
        return cleaned

你可能感兴趣的:(Scrapy 爬虫爬取unsplash图库)