Autohome (汽车之家) image downloader (crawler code)

bmw/spiders/bmw5.py

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

from bmw.items import BmwItem


class Bmw5Spider(CrawlSpider):
    name = 'bmw5'
    allowed_domains = ['car.autohome.com.cn']
    start_urls = ['https://car.autohome.com.cn/pic/series/159.html']

    # All picture pages of this series match the /pic/series/159... pattern, e.g.:
    # https://car.autohome.com.cn/pic/series/159-10.html#pvareaid=2042222
    # https://car.autohome.com.cn/pic/series/159-51-p2.html
    rules = (
        Rule(LinkExtractor(allow="https://car.autohome.com.cn/pic/series/159.+"),
             callback='parse_page', follow=True),
    )

    def parse_page(self, response):
        # Category title (used later as the folder name for the images).
        title = response.xpath("//div[@class='uibox']/div/text()").get()
        # Thumbnail URLs on the page.
        srcs = response.xpath("//div[contains(@class,'uibox-con')]/ul/li//img/@src").getall()
        # Swap the thumbnail size segment for the high-resolution one and
        # resolve the URLs against the page URL.
        urls = list(map(lambda x: response.urljoin(x.replace('240x180_0_q95_c42', '1024x0_1_q95')), srcs))
        yield BmwItem(title=title, image_urls=urls)
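
The spider is normally started with `scrapy crawl bmw5` from the project root. If you prefer launching it from a script, a minimal sketch using Scrapy's CrawlerProcess (assuming the standard bmw project layout above; the file name run.py is illustrative) could look like this:

run.py (sketch, not part of the original post)

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from bmw.spiders.bmw5 import Bmw5Spider

if __name__ == '__main__':
    # Load the project settings (ITEM_PIPELINES, IMAGES_STORE, ...) and run bmw5.
    process = CrawlerProcess(get_project_settings())
    process.crawl(Bmw5Spider)
    process.start()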

bmw/items.py

import scrapy


class BmwItem(scrapy.Item):
    # Category title; the pipeline uses it as the folder name.
    title = scrapy.Field()
    # Image URLs to download (read by ImagesPipeline).
    image_urls = scrapy.Field()
    # Download results (filled in by ImagesPipeline).
    images = scrapy.Field()
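
By default, Scrapy's ImagesPipeline reads the download URLs from the item's image_urls field and, after fetching the files, writes the results (url, path and checksum of each stored file) into the images field, which is why both fields are declared even though the spider only populates image_urls.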

bmw/pipelines.py

import os

from scrapy.pipelines.images import ImagesPipeline

from bmw import settings


class BmwPipelines(ImagesPipeline):
    def get_media_requests(self, item, info):
        # Attach the item to each download request so that file_path()
        # can read the category title later.
        request_objs = super(BmwPipelines, self).get_media_requests(item, info)
        for request_obj in request_objs:
            request_obj.item = item
        return request_objs

    def file_path(self, request, response=None, info=None):
        # The default path looks like "full/<sha1>.jpg"; replace the "full/"
        # prefix with a folder named after the category title.
        path = super(BmwPipelines, self).file_path(request, response, info)
        title = request.item.get('title')
        images_store = settings.IMAGES_STORE
        title_path = os.path.join(images_store, title)
        if not os.path.exists(title_path):
            os.mkdir(title_path)
        image_name = path.replace("full/", "")
        image_path = os.path.join(title_path, image_name)
        # Returning an absolute path means the image is saved directly under
        # IMAGES_STORE/<title>/ instead of IMAGES_STORE/full/.
        return image_path
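
The pipeline reads IMAGES_STORE from the project settings and has to be enabled there. The original post does not include settings.py; a minimal sketch (pipeline priority and storage path are illustrative) could look like this:

bmw/settings.py (sketch, not part of the original post)

import os

BOT_NAME = 'bmw'
SPIDER_MODULES = ['bmw.spiders']
NEWSPIDER_MODULE = 'bmw.spiders'

ROBOTSTXT_OBEY = False

# Enable the custom images pipeline defined in bmw/pipelines.py.
ITEM_PIPELINES = {
    'bmw.pipelines.BmwPipelines': 300,
}

# Root folder for downloaded images; it should exist before crawling,
# since file_path() calls os.mkdir() one level below it.
IMAGES_STORE = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'images')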
