scrapy批量下载图片

 

1,spiders 业务处理

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..items import YmxItem
class SbSpider(scrapy.Spider):
    """Crawl mzitu.com galleries and emit one ``YmxItem`` per image page.

    Request flow: ``parse`` (index page -> gallery links) -> ``parse_a``
    (gallery -> one request per numbered page) -> ``scrapy_b`` (page -> item).
    """

    name = 'sb'
    base_url = 'https://www.mzitu.com/'
    start_urls = [base_url]

    def parse(self, response):
        """Request every gallery linked from the index page.

        Yields:
            scrapy.Request: one request per gallery link, handled by
            ``parse_a``.
        """
        for href in response.xpath('//ul[@id="pins"]/li/a/@href').extract():
            # urljoin leaves absolute links untouched and resolves relative ones.
            yield scrapy.Request(url=response.urljoin(href), callback=self.parse_a)

    def parse_a(self, response):
        """Request each numbered page of a gallery.

        The page count is read from the pagination bar. If it is missing or
        not numeric, only the first page is requested instead of crashing.

        Yields:
            scrapy.Request: one request per page (``<gallery>/1`` ..
            ``<gallery>/<count>``), handled by ``scrapy_b``.
        """
        # NOTE(review): a[5] presumably is the "last page" link in the
        # pagination bar; verify against the live markup.
        page_text = response.xpath('//div[@class="pagenavi"]/a[5]/span/text()').extract_first()
        try:
            page_count = int(page_text)
        except (TypeError, ValueError):
            # TypeError: nothing matched (None); ValueError: non-numeric text.
            self.logger.warning(
                'Could not read page count on %s (got %r); fetching first page only',
                response.url, page_text,
            )
            page_count = 1

        # Strip a trailing slash so the joined URL never contains "//".
        gallery_url = response.url.rstrip('/')
        for page_number in range(1, page_count + 1):
            yield scrapy.Request(
                url='{}/{}'.format(gallery_url, page_number),
                callback=self.scrapy_b,
            )

    def scrapy_b(self, response):
        """Extract the image URL and title from a single gallery page.

        Yields:
            YmxItem: with ``img_url`` and ``title`` set. Nothing is yielded
            when the page has no main image, since there is nothing to
            download.
        """
        item = YmxItem()
        # URL of the image displayed on this page.
        item['img_url'] = response.xpath('//div[@class="main-image"]/p/a/img/@src').extract_first()
        item['title'] = response.xpath('//div[@class="main-image"]//a/img/@alt').extract_first()
        if not item['img_url']:
            self.logger.warning('No main image found on %s', response.url)
            return
        self.logger.debug('Scraped %s', item)
        yield item

 

2,容器(Item)+管道(Pipeline)


class YmxItem(scrapy.Item):
    """Scrapy item with two fields: an image URL and a title."""
    # URL of the image.
    img_url = scrapy.Field()
    # Title associated with the image.
    title = scrapy.Field()



import time

import scrapy
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline
class YmxPipeline(ImagesPipeline):
    """Image pipeline that names each stored file after the item's title.

    Stored files are named ``<title>-<source file stem>.<extension>``.
    """

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path (relative to IMAGES_STORE) for a download.

        The name combines the title carried in ``request.meta`` with the stem
        of the source file name. The source stem keeps images that share a
        title apart, and, unlike a timestamp, yields the same path every time
        the pipeline calls this method for the same request.

        Args:
            request: the image download request; ``request.meta['title']``
                must be set (see ``get_media_requests``).
            response: unused.
            info: unused.
            item: unused; accepted because Scrapy passes it as a keyword.

        Returns:
            str: the relative file path.
        """
        source_name = request.url.split('/')[-1]
        stem, _, extension = source_name.rpartition('.')
        return '{}-{}.{}'.format(request.meta['title'], stem, extension)

    def item_completed(self, results, item, info):
        """Drop the item when none of its image downloads succeeded.

        Args:
            results: list of ``(ok, detail)`` tuples, one per media request;
                for successful downloads ``detail`` is a dict such as
                ``{'url': ..., 'path': ...}``.
            item: the item being processed.
            info: unused.

        Returns:
            The unchanged item when at least one download succeeded.

        Raises:
            DropItem: when no download succeeded.
        """
        if not any(ok for ok, _ in results):
            raise DropItem('Image Downloaded Failed')
        return item

    def get_media_requests(self, item, info):
        """Yield the download request for the item's image.

        The URL is read from the ``img_url`` field. The title travels in
        ``meta`` so ``file_path`` can use it. Items without an image URL
        produce no request and are then dropped by ``item_completed``.
        """
        image_url = item.get('img_url')
        if not image_url:
            return
        yield scrapy.Request(image_url, meta={'title': item.get('title')})

 

 

3, settings配置

# Directory where downloaded images are stored. A raw string keeps the Windows
# backslashes literal instead of relying on invalid escape sequences.
IMAGES_STORE = r'D:\爬虫数据中心\图片\图片'

 

4, 中间件配置用户代理 + 代理池

  此处省略,自己去配

你可能感兴趣的:(爬虫)