scrapy下载汽车之家宝马5系高清图片

首先打开链接:https://www.autohome.com.cn/202/#levelsource=000000000_0&pvareaid=101594 ,然后点击“图片实拍”:

                      scrapy下载汽车之家宝马5系高清图片_第1张图片

然后新建一个scrapy项目,接下来就进入项目里面写代码了。

bmw5.py:

# -*- coding: utf-8 -*-
import scrapy
from bmw.items import BmwItem

class Bmw5Spider(scrapy.Spider):
    """Spider that emits one BmwItem per image section of the start page.

    Each item carries the section title and the absolute URLs of the
    images listed in that section.
    """

    name = "bmw5"
    allowed_domains = ["car.autohome.com.cn"]
    start_urls = ['https://car.autohome.com.cn/pic/series/202.html#pvareaid=3454438']

    def parse(self, response):
        """Yield a BmwItem for every ``uibox`` section except the first one."""
        sections = response.xpath('//div[@class="uibox"]')
        # The first matched section is deliberately skipped.
        for section in sections[1:]:
            title = section.xpath('.//div[@class="uibox-title"]/a/text()').get()
            # The scraped src values lack the "https:" scheme; urljoin()
            # resolves each one against the page URL to make it absolute.
            absolute_urls = [
                response.urljoin(src)
                for src in section.xpath('.//ul/li/a/img/@src').getall()
            ]
            yield BmwItem(category=title, urls=absolute_urls)

pipelines.py:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import os
from urllib import request

class BmwPipeline(object):
    """Item pipeline that downloads every image URL of an item to disk.

    Images are stored under ``<images root>/<category>/<file name>``.
    """

    def __init__(self):
        # The images root is an "images" directory placed one level above
        # the directory that contains this module.
        self.path = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'images')
        # makedirs(exist_ok=True) avoids the check-then-create race of
        # "if not exists: mkdir" and also creates missing parents.
        os.makedirs(self.path, exist_ok=True)

    def process_item(self, item, spider):
        """Download all images referenced by ``item`` and return the item.

        Args:
            item: mapping with a ``category`` string and a ``urls`` list.
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item``, so later pipelines can keep processing it.
        """
        category = item['category']
        urls = item['urls']
        category_path = os.path.join(self.path, category)
        os.makedirs(category_path, exist_ok=True)
        for url in urls:
            # The file name is whatever follows the last underscore of the URL.
            image_name = url.split('_')[-1]
            request.urlretrieve(url, os.path.join(category_path, image_name))
        return item

items.py:

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class BmwItem(scrapy.Item):
    """Scraped item holding one image category and its image URLs."""

    # Category value set by the spider when it builds the item.
    category = scrapy.Field()
    # Image URLs set by the spider when it builds the item.
    urls = scrapy.Field()

但是这些代码只下载了缩略图,并没有下载高清图,而且图片数量也不够;而我们的目的是下载全部高清图。要拿到高清图,首先需要分析缩略图和高清图的url的区别:对比可知,缩略图的url比高清图多了一个“t_”,去掉它即可得到高清图的url。同时改用CrawlSpider跟进更多页面的链接:

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

from bmw.items import BmwItem


class Bmw5Spider(CrawlSpider):
    """Crawl the picture pages and emit items with full-size image URLs.

    Links matching the rule below are followed recursively and each
    fetched page is handed to ``parse_page``.
    """

    name = "bmw5"
    allowed_domains = ["car.autohome.com.cn"]
    start_urls = ['https://car.autohome.com.cn/pic/series/202.html#pvareaid=3454438']
    rules = (
        # The host was misspelled "autohome.come.cn", so no link ever
        # matched; the dots are escaped so they only match a literal ".".
        # NOTE(review): start_urls points at series 202 while this pattern
        # only follows series 65 links - confirm which series ID is intended.
        Rule(
            LinkExtractor(allow=r'https://car\.autohome\.com\.cn/pic/series/65.+'),
            callback='parse_page',
            follow=True,
        ),
    )

    def parse_page(self, response):
        """Yield one BmwItem with the page's category and image URLs.

        Every "t_" marker is removed from each thumbnail URL to obtain the
        full-size image URL, which is then made absolute against the page URL.
        """
        category = response.xpath('//div[@class="uibox"]/div/text()').get()
        # The container carries several CSS classes, hence contains().
        thumbnails = response.xpath(
            '//div[contains(@class,"uibox-con")]/ul/li//img/@src'
        ).getall()
        urls = [response.urljoin(src.replace("t_", "")) for src in thumbnails]
        # Without this yield the computed URLs were discarded and the
        # pipeline never received anything to download.
        yield BmwItem(category=category, urls=urls)

 

你可能感兴趣的:(爬虫)