使用Scrapy框架爬取58同城的出租房信息

from scrapy.exceptions import DropItem
from pymongo import MongoClient
from scrapy.conf import settings
from pymongo.errors import DuplicateKeyError
from traceback import format_exc
from .items import City58XiaoQu, City58ItemChuZuInfo
'''
想要学习Python?Python学习交流群:984632579满足你的需求,资料都已经上传群文件,可以自行下载!
'''
class City58Pipeline(object):

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db
        self.client = None
        self.db = None

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGODB_URI'),
            mongo_db=settings.get('MONGODB_DATABASE', 'items')
        )

    def open_spider(self, spider):
        _ = spider
        self.client = MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]
        self.db['city58_info'].ensure_index('id', unique=True)          # 在表 city58_info 中建立索引,并保证索引的唯一性
        self.db['city58_chuzu_info'].ensure_index('url', unique=True)   # 在表 city58_chuzu_info 中建立索引,并保证索引的唯一性

    def close_spider(self, spider):
        _ = spider
        self.client.close()

    def process_item(self, item, spider):
        try:
            if isinstance(item, City58XiaoQu):    # 判断是否是小区的item
                self.db['city58_info'].update({'id': item['id']}, {'$set': item}, upsert=True)   # 通过id判断,有就更新,没有就插入
            elif isinstance(item, City58ItemChuZuInfo): # 判断是否是小区出租信息的item
                try:
                    fangjia = HandleFangjiaPipline.price_per_square_meter_dict[item['id']]  # 把HandleFangjiaPipline管道的字典price_per_square_meter_dict中每平米平均价格赋值给fangjia
                    # del HandleFangjiaPipline.price_per_square_meter_dict[item['id']]
                    item['price_pre'] = fangjia

                    self.db['city58_chuzu_info'].update({'url': item['url']}, {'$set': item}, upsert=True)   # 通过url判断,有就更新,没有就插入
                except Exception as e:
                    print(e)

        except DuplicateKeyError:
            spider.logger.debug(' duplicate key error collection')  # 唯一键冲突报错
        except Exception as e:
            _ = e
            spider.logger.error(format_exc())
        return item


class HandleZuFangPipline(object):

    def process_item(self, item, spider):
        _ = spider, self
        # self.db[self.collection_name].insert_one(dict(item))
        # 判断进来的item是否是City58ItemXiaoChuZuQuInfo,是否含有面积参数
        if isinstance(item, City58ItemChuZuInfo) and 'mianji' in item:
            item['chuzu_price_pre'] = int(item['zu_price']) / int(item['mianji'])   # 租金除以面积得到平均价格
        return item


class HandleFangjiaPipline(object):

    price_per_square_meter_dict = dict()

    def process_item(self, item, spider):
        _ = spider

        # 判断传进来的item是否是个字典,并且是否含有price_list
        if isinstance(item, dict) and 'price_list' in item:
            item['price_list'] = [int(i) for i in item['price_list']]
            if item['price_list']:
                self.price_per_square_meter_dict[item['id']] = sum(item['price_list']) / len(item['price_list'])  # 得到每个小区的平均价格
            else:
                self.price_per_square_meter_dict[item['id']] = 0
            raise DropItem()
        return item

 

你可能感兴趣的:(Python爬虫)