Python学习之Scrapy爬取360图片,数据存储到MongoDB

spider 部分最简单,代码如下:

# -*- coding: utf-8 -*-
import scrapy,json
from urllib import parse
from Image360.items import Image360Item

class ImagezzSpider(scrapy.Spider):
    """Spider that requests the image.so.com JSON feed and yields image items.

    Each entry of the response's ``list`` array becomes one ``Image360Item``
    carrying the image id, full-size URL, title and thumbnail URL.
    """

    name = 'Imagezz'
    allowed_domains = ['image.so.com']
    # Number of result pages to request. Defaults to 1, which matches the
    # single page the spider has always fetched; override to crawl more.
    max_page = 1

    def start_requests(self):
        """Yield one request per result page of the 'photography' channel."""
        base_url = 'http://image.so.com/zj?'
        base_params = {'ch': 'photography', 'listtype': 'new'}
        for page in range(1, self.max_page + 1):
            # Build a fresh query per page instead of mutating a shared dict;
            # 'sn' is set to page * 30, as in the original request.
            query = parse.urlencode(dict(base_params, sn=page * 30))
            yield scrapy.Request(base_url + query, callback=self.parse)

    def parse(self, response):
        """Decode the JSON body and yield one item per entry in ``list``."""
        result = json.loads(response.text)
        # 'list' may be absent or null; iterating None would raise TypeError,
        # so treat both cases as "no images on this page".
        for image in result.get('list') or []:
            item = Image360Item()
            item['id'] = image.get('id')
            item['image_urls'] = image.get('qhimg_url')
            item['title'] = image.get('group_title')
            item['thumb'] = image.get('qhimg_thumb_url')
            yield item

重点在 pipelines:这里自定义了图片下载和 MongoDB 存储两个 pipeline。

import pymongo

class MongoPipeline(object):
    """Item pipeline that writes every item it receives into MongoDB.

    The connection target and database name are read from the ``MONGO_URI``
    and ``MONGO_DB`` settings; the collection name is read from the item's
    ``collection`` attribute.
    """

    def __init__(self, mongo_uri, mongo_db):
        """Remember the MongoDB address and the database name to use."""
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        """Alternate constructor: pull the MongoDB parameters from settings.

        Used so the settings values are available before ``__init__`` runs.
        """
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB'),
        )

    def open_spider(self, spider):
        """Open the client connection and select the configured database."""
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        """Insert the item as one document and pass it on unchanged."""
        # Collection.insert() is deprecated in PyMongo 3 and removed in
        # PyMongo 4; insert_one() is the supported single-document call.
        self.db[item.collection].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        """Close the MongoDB client connection."""
        self.client.close()


from scrapy.http import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline

class ImPipeline(ImagesPipeline):
    """Images pipeline that downloads the item's image and names the file
    after the last path segment of its URL."""

    def get_media_requests(self, item, info):
        """Yield the download request for the item's image.

        ``item['image_urls']`` is handed to ``Request`` as-is, i.e. it is
        treated as a single URL rather than a list of URLs.
        """
        url = item['image_urls']
        # Skip items without a URL; with no request issued, item_completed
        # sees no successful download and drops the item.
        if url:
            yield Request(url)

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage file name: the last '/'-separated URL segment.

        ``item`` is accepted (and ignored) because newer Scrapy versions pass
        it as a keyword argument.
        """
        return request.url.split('/')[-1]

    def item_completed(self, results, item, info):
        """Return the item if at least one image was downloaded.

        Raises:
            DropItem: when no download in ``results`` succeeded.
        """
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem('Image Download Failed')
        # Must be `return`, not `yield`: yielding turns this method into a
        # generator, so the check above would never run and the next
        # pipeline would receive a generator object instead of the item.
        return item

settings设置

# Do not obey robots.txt.
ROBOTSTXT_OBEY = False

# Enable the image download pipeline and the MongoDB pipeline.
ITEM_PIPELINES = {
    "Image360.pipelines.ImPipeline": 300,
    "Image360.pipelines.MongoPipeline": 301,
}

# Directory where downloaded images are stored.
IMAGES_STORE = r"E:\Scrapy"

# MongoDB address and database name.
MONGO_URI = "localhost"
MONGO_DB = "images360"

注意:IMAGES_STORE 一定不能写错。否则,日志的 “INFO: Enabled item pipelines:” 一项中不会出现自己编写的 ImPipeline(也就是该 pipeline 没有被启用),而且还不报错——我为此排查了半天!!!

你可能感兴趣的:(MongoDB,Scrapy)