Crawling images with Scrapy and sorting them into different folders

  • Configure settings first
  • items: the fields to save
  • The spider
  • Pipeline processing in pipelines

Configure settings first

ITEM_PIPELINES = {
    # Custom image-processing pipeline
    'mzitu.pipelines.MzituImagesPipeline': 300,
}
# Root directory for downloaded images; must be set for the images pipeline
IMAGES_STORE = '/Users/paul/Desktop/images'
# Expiration in days: images downloaded within this window are not re-downloaded
IMAGES_EXPIRES = 90
DOWNLOAD_DELAY = 0.5  # delay between requests
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
DEFAULT_REQUEST_HEADERS = {
    'Cookie': 'Hm_lvt_dbc355aef238b6c32b43eacbbf161c3c=1562892240,1563870209; Hm_lpvt_dbc355aef238b6c32b43eacbbf161c3c=1563934170',
    'Referer': 'https://www.mzitu.com/mm/',
    'Upgrade-Insecure-Requests': '1'
}
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
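
For context, the stock ImagesPipeline writes every file under IMAGES_STORE as full/<SHA1-of-URL>.jpg; the custom pipeline below moves files out of that full/ folder into per-title directories. A minimal sketch of how the default name is derived (this only mirrors Scrapy's behavior for illustration; it is not a Scrapy API):

import hashlib

def default_image_path(url):
    # ImagesPipeline names each file by the SHA1 of the request URL,
    # always with a .jpg extension (images are converted to JPEG)
    return 'full/%s.jpg' % hashlib.sha1(url.encode('utf8')).hexdigest()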

items: the fields to save

class MzituItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # Album title; also used as the folder name
    title = scrapy.Field()
    # Image URL
    image_url = scrapy.Field()
    # Referer to add to the request headers
    Referer = scrapy.Field()
    # Final path of the saved image
    image_path = scrapy.Field()
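
Item fields behave like a dict restricted to the declared keys, so a typo in a field name fails fast. A quick illustration with placeholder values:

item = MzituItem()
item['title'] = 'some-album'                     # OK: declared field
item['image_url'] = 'https://example.com/a.jpg'  # OK: declared field
# item['titel'] = 'oops'                         # would raise KeyError: undeclared field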

The spider

# -*- coding: utf-8 -*-
import scrapy

from mzitu.items import MzituItem

class MeizituSpider(scrapy.Spider):
    name = 'meizitu'  # spider name, used when launching the crawl
    allowed_domains = ['mzitu.com']
    # List pages, i.e. the start URLs (pages are numbered from 1)
    start_urls = ['http://www.mzitu.com/xinggan/page/{}/'.format(x) for x in range(1, 154)]

    def parse(self, response):
        li_list = response.xpath("//ul[@id='pins']/li")

        for obj in li_list:
            # Extract the detail-page link
            det_url = obj.xpath("./a/@href").extract_first()
            # Visit each detail page
            yield scrapy.Request(url=det_url, callback=self.second_handler)

    def second_handler(self, response):
        # Second-level (album) page
        # Read the last page number from the pagination bar
        offset = int(response.xpath('//div[@class="pagenavi"]/a/span/text()')[4].extract())
        # Build and visit the URL of every page in the album
        for i in [response.url + "/{}".format(x) for x in range(1, offset + 1)]:
            # Create a fresh item per request: a single shared item would be
            # mutated by concurrent callbacks and its fields overwritten
            item = MzituItem()
            item['Referer'] = i
            # Pass the item along via meta
            yield scrapy.Request(url=i, meta={'item': item}, callback=self.parse_image)

    def parse_image(self, response):
        # Retrieve the item from meta
        item = response.meta['item']
        # Extract the image URL
        item["image_url"] = response.xpath('//div[@class="main-image"]/p/a/img/@src')[0].extract()
        # Extract the title, used as the folder name
        item["title"] = response.xpath('//div[@class="main-image"]/p/a/img/@alt')[0].extract()
        # Hand the item to the pipeline
        yield item
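
With the spider in place, the crawl is launched by its name. A minimal launcher (a hypothetical run.py at the project root, equivalent to running "scrapy crawl meizitu" from the shell):

# run.py
from scrapy.cmdline import execute

# Equivalent to: scrapy crawl meizitu
execute(['scrapy', 'crawl', 'meizitu'])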

Pipeline processing in pipelines

The key step here is using shutil to move each downloaded image into the folder belonging to its title.

# -*- coding: utf-8 -*-
# Define your item pipelines here

# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# shutil is imported to move files
import shutil
import scrapy
# Import the project settings
from scrapy.utils.project import get_project_settings
# Import Scrapy's built-in image download pipeline
from scrapy.pipelines.images import ImagesPipeline
import os


class MzituImagesPipeline(ImagesPipeline):
    # Read the value configured in settings
    IMAGES_STORE = get_project_settings().get("IMAGES_STORE")

    # Override ImagesPipeline's method that issues the image download requests
    def get_media_requests(self, item, info):
        image_url = item["image_url"]
        # The Referer header is the main anti-hotlinking countermeasure here
        headers = {
            'Referer': item['Referer'],
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
        }
        yield scrapy.Request(image_url, headers=headers)

    def item_completed(self, results, item, info):
        # Collect the paths of successfully downloaded files (e.g. "full/<sha1>.jpg")
        image_paths = [x["path"] for ok, x in results if ok]
        if not image_paths:
            # Nothing was downloaded; pass the item through unchanged
            return item
        # Folder for this album, named after the title
        img_dir = os.path.join(self.IMAGES_STORE, item['title'])
        # Create the folder if it does not exist yet
        os.makedirs(img_dir, exist_ok=True)
        # Move the file from the default "full/" location into the title folder
        src = os.path.join(self.IMAGES_STORE, image_paths[0])
        dst = os.path.join(img_dir, os.path.basename(image_paths[0]))
        shutil.move(src, dst)
        item['image_path'] = dst
        return item
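
An alternative to moving files after the fact is to override ImagesPipeline's file_path() so each image is written straight into its title folder, skipping shutil entirely. A sketch of that approach (assuming Scrapy 2.4+, where file_path() receives the item as a keyword argument):

import os
from urllib.parse import urlparse

import scrapy
from scrapy.pipelines.images import ImagesPipeline


class TitleFolderImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # Referer still needed to get past the hotlink protection
        yield scrapy.Request(item['image_url'], headers={'Referer': item['Referer']})

    def file_path(self, request, response=None, info=None, *, item=None):
        # Keep the file name from the URL and nest it under the title folder
        filename = os.path.basename(urlparse(request.url).path)
        return '%s/%s' % (item['title'], filename)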
