Scrapy Example: A Lianjia Rental-Listings Crawler

Create the crawler project

scrapy startproject lianjia_zf
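The command generates the standard Scrapy skeleton; everything edited below (items.py, pipelines.py, settings.py, and a new spider module under spiders/) lives inside it. The spider file itself can be stubbed out with scrapy genspider lianjia lianjia.com from inside the project directory:

lianjia_zf/
    scrapy.cfg
    lianjia_zf/
        __init__.py
        items.py          # item definitions (next section)
        middlewares.py
        pipelines.py      # MongoDB pipeline (below)
        settings.py       # throttling and MongoDB settings (below)
        spiders/
            __init__.py   # the lianjia spider module is added to this package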

Define the items to scrape (items.py)

import scrapy


class LianjiaZfItem(scrapy.Item):
    title = scrapy.Field()                  # listing title
    update_time = scrapy.Field()            # time the listing was last updated
    price = scrapy.Field()                  # monthly rent
    tags = scrapy.Field()                   # tags
    rent_method = scrapy.Field()            # rental method (whole flat / shared)
    house_type = scrapy.Field()             # house type (layout)
    towards_and_floor = scrapy.Field()      # orientation and floor
    basic_info = scrapy.Field()             # basic house information
    supporting_facilities = scrapy.Field()  # facilities
    description = scrapy.Field()            # listing description
    url = scrapy.Field()                    # detail page URL
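A scrapy.Item behaves like a dict with a fixed set of keys, which catches field-name typos early. A minimal illustrative check (the values are made up):

from lianjia_zf.items import LianjiaZfItem

item = LianjiaZfItem()
item['title'] = '整租·xx小区 2室1厅'  # declared field: OK
item['price'] = '5000元/月'
print(dict(item))                     # {'title': '...', 'price': '...'}
# item['area'] = '60㎡'               # KeyError: LianjiaZfItem does not support field: area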

Write the spider to scrape the data (spiders/lianjia.py)

# -*- coding: utf-8 -*-
import scrapy
from lianjia_zf.items import LianjiaZfItem


class LianjiaSpider(scrapy.Spider):
    name = 'lianjia'
    allowed_domains = ['lianjia.com']
    start_urls = ['https://sz.lianjia.com/zufang/']

    def parse(self, response):
        # every direct child div of the result list is one listing card
        house_items = response.xpath('//*[@id="content"]/div[1]/div[1]/child::div')
        for house_item in house_items:
            href_xpath = './a/@href'
            house_url = house_item.xpath(href_xpath).get()
            # skip cards without a link, as well as ads pointing outside /zufang/
            if house_url and "zufang" in house_url:
                yield scrapy.Request(response.urljoin(house_url), callback=self.parse_details)

        # the 6th pagination anchor is the "next page" link (fragile: tied to the page layout)
        next_page = response.xpath('//*[@id="content"]/div[1]/div[2]/a[6]/@href').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)

    def parse_details(self, response):
        item = LianjiaZfItem()

        item['url'] = response.url

        title_xpath = '//p[@class="content__title"]/text()'
        item['title'] = response.xpath(title_xpath).get('').strip()

        # the subtitle carries the listing's last-update time; keep the part after the colon
        update_time_xpath = '//div[@class="content__subtitle"]/text()'
        item['update_time'] = response.xpath(update_time_xpath).get('').split(':')[-1].strip()

        # the rent figure and its unit sit in different text nodes of the aside title
        price_xpath = '//div[@class="content__aside--title"]/span//text()'
        unit_xpath = '//div[@class="content__aside--title"]/text()'
        item['price'] = response.xpath(price_xpath).get('').strip() + \
                        response.xpath(unit_xpath).getall()[1].split(' ')[0].strip()

        tags_xpath = '//p[@class="content__aside--tags"]//text()'
        item['tags'] = [t.strip() for t in response.xpath(tags_xpath).getall() if t.strip()]

        # the first aside list holds rental method, house type and orientation/floor, in that order
        rent_xpath = '//*[@id="aside"]/ul[1]//li/text()'
        rent_data = response.xpath(rent_xpath).getall()
        item['rent_method'] = rent_data[0].strip()
        item['house_type'] = rent_data[1].strip()
        item['towards_and_floor'] = rent_data[2].strip()

        # basic-info entries come as "key:value" strings; turn them into a dict
        basic_info_xpath = '//*[@id="info"]//li/text()'
        basic_infos = response.xpath(basic_info_xpath).getall()
        basic_info = {}
        for bi in basic_infos:
            if ':' in bi:
                key, _, value = bi.partition(':')
                basic_info[key.strip()] = value.strip()
        item['basic_info'] = basic_info

        # the doubled space in the class attribute matches the page markup verbatim
        supporting_facilities_xpath = '//ul[@class="content__article__info2"]/li[@class="fl oneline  "]/text()'
        supporting_facilities = [f.strip() for f in response.xpath(supporting_facilities_xpath).getall() if f.strip()]
        item['supporting_facilities'] = supporting_facilities

        description_xpath = '//p[@data-el="houseComment"]/@data-desc'
        description = response.xpath(description_xpath).get()
        item['description'] = description.strip() if description else ''

        return item
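Before launching the full crawl, the XPaths can be sanity-checked interactively with scrapy shell; the selectors below mirror the ones in parse (the site's markup may change over time, so treat them as snapshots of the page at the time of writing):

scrapy shell 'https://sz.lianjia.com/zufang/'
>>> len(response.xpath('//*[@id="content"]/div[1]/div[1]/child::div'))   # number of listing cards
>>> response.xpath('//*[@id="content"]/div[1]/div[2]/a[6]/@href').get()  # next-page link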

Write a pipeline to store the data (pipelines.py)

import pymongo


class LianjiaZfPipeline(object):
    def __init__(self, MONGO_HOST, MONGO_PORT, MONGO_DB, MONGO_COL):
        self.MONGO_HOST = MONGO_HOST
        self.MONGO_PORT = MONGO_PORT
        self.MONGO_DB = MONGO_DB
        self.MONGO_COL = MONGO_COL

    @classmethod
    def from_crawler(cls, crawler):
        # read the MongoDB connection settings defined in settings.py
        return cls(MONGO_HOST=crawler.settings.get('MONGO_HOST'),
                   MONGO_PORT=crawler.settings.get('MONGO_PORT'),
                   MONGO_DB=crawler.settings.get('MONGO_DB'),
                   MONGO_COL=crawler.settings.get('MONGO_COL'))

    def open_spider(self, spider):
        # connect once per crawl rather than once per item
        self.client = pymongo.MongoClient(host=self.MONGO_HOST, port=self.MONGO_PORT)
        self.col = self.client[self.MONGO_DB][self.MONGO_COL]

    def process_item(self, item, spider):
        # upsert keyed on the detail-page URL, so re-crawls update rather than duplicate
        find = {'url': item['url']}
        self.col.update_one(find, {'$set': dict(item)}, upsert=True)
        return item

    def close_spider(self, spider):
        self.client.close()
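After a crawl, the stored documents can be spot-checked with a few lines of pymongo. A quick sketch, assuming the same database and collection names as above (the host here is a placeholder):

import pymongo

client = pymongo.MongoClient(host='127.0.0.1', port=27017)
col = client['House']['lianjia_zf']

print(col.count_documents({}))    # how many listings were stored
for doc in col.find().limit(3):   # peek at a few documents
    print(doc['title'], doc['price'], doc['url'])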

Update the settings (settings.py)

# MongoDB connection settings used by the pipeline
MONGO_HOST = '*.*.*.*'
MONGO_PORT = 27017
MONGO_DB = 'House'
MONGO_COL = 'lianjia_zf'

# throttle the crawl to stay polite and reduce the risk of being blocked
DOWNLOAD_DELAY = 1
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 5
AUTOTHROTTLE_MAX_DELAY = 60
AUTOTHROTTLE_TARGET_CONCURRENCY = 2.0
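One more setting is required, or the pipeline never runs: it has to be registered in settings.py. With the project layout created above, the module path is lianjia_zf.pipelines:

ITEM_PIPELINES = {
    'lianjia_zf.pipelines.LianjiaZfPipeline': 300,
}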

Run the crawler

Create a run.py:

from scrapy import cmdline

# equivalent to running "scrapy crawl lianjia" from the project root
cmdline.execute("scrapy crawl lianjia".split())
Then start the crawl with:

python run.py
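run.py is only a convenience for launching the crawl from an IDE or debugger; the same crawl can also be started directly from the project root, optionally exporting a copy of the items to a file in addition to the MongoDB writes:

scrapy crawl lianjia
scrapy crawl lianjia -o houses.json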

Source code:
https://github.com/czk512042/scrapy/tree/master/lianjia_zf/lianjia_zf
