python爬虫:使用scrapy框架对链家租房深度爬取,并存入redis、mysql、mongodb数据库

1.items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class LianjiaItem(scrapy.Item):
    """One rental listing scraped from lianjia.com (zufang section)."""
    # define the fields for your item here like:
    # name = scrapy.Field()

    pic = scrapy.Field()           # cover image URL from the list page
    title = scrapy.Field()         # listing title
    detail_url = scrapy.Field()    # absolute URL of the listing detail page
    price = scrapy.Field()         # rent price text as shown on the list page
    publish_info = scrapy.Field()  # "published N days ago" text from the detail page
    pic_list = scrapy.Field()      # all gallery image URLs from the detail page
    house_code = scrapy.Field()    # listing id parsed from the detail URL
    ucid = scrapy.Field()          # agent id used by the brokers JSON API
    agent_name = scrapy.Field()    # agent contact name from the brokers API
    agent_phone = scrapy.Field()   # agent phone number from the brokers API

2.lianjia.py

# -*- coding: utf-8 -*-
import scrapy
from LianJia.items import LianjiaItem
import re
import json
import requests


class LianjiaSpider(scrapy.Spider):
    """Deep crawl of Lianjia rental listings.

    Flow: national city index -> per-city /zufang/ home -> district ->
    business circle -> paginated result lists -> listing detail -> agent
    JSON API. Partially-filled items travel through ``request.meta`` until
    they are complete and yielded.
    """
    name = 'lianjia'
    allowed_domains = ['lianjia.com']
    start_urls = ['https://www.lianjia.com/city/']

    def parse(self, response):
        """Turn every city homepage link into that city's rental entry URL."""
        city_url_list = response.xpath("//div[@class='city_province']//li/a/@href").extract()
        city_name_list = response.xpath("//div[@class='city_province']//li/a/text()").extract()
        for city_name, city_url in zip(city_name_list, city_url_list):
            # City subdomain, e.g. "bj" out of "https://bj.lianjia.com/".
            # The dot is escaped (\.) so the match stops at the literal "."
            # instead of any character.
            city_alp = re.findall(r"https://(\w+)\.", city_url)[0]
            city_url = "https://{}.lianjia.com/zufang/".format(city_alp)
            yield scrapy.Request(url=city_url, callback=self.get_area_url)

    def get_area_url(self, response):
        """Follow each district link on a city's rental home page."""
        # District hrefs are site-relative (e.g. /zufang/dongcheng/);
        # urljoin resolves them against the current city's host, replacing
        # the fragile regex prefix splicing.
        area_url_list = response.xpath("//li[@data-type='district'][position()>1]/a/@href").extract()
        for area_url in area_url_list:
            yield scrapy.Request(url=response.urljoin(area_url), callback=self.get_business_url)

    def get_business_url(self, response):
        """Follow each business-circle link inside a district."""
        business_url_list = response.xpath("//li[@data-type='bizcircle'][position()>1]/a/@href").extract()
        for business_url in business_url_list:
            yield scrapy.Request(url=response.urljoin(business_url), callback=self.get_page_url)

    def get_page_url(self, response):
        """Fan out one request per result page of a business circle."""
        max_page = response.xpath("//div[@class='content__pg']/@data-totalpage").extract()
        # Missing pager means no listings; max_page = 0 skips the loop.
        max_page = int(max_page[0]) if max_page else 0
        for page in range(max_page):
            page_url = response.url + "pg{}/#contentList".format(page + 1)
            yield scrapy.Request(url=page_url, callback=self.get_page_data)

    def get_page_data(self, response):
        """Extract list-page fields and request each listing's detail page."""
        fang_xml_list = response.xpath("//div[@class='content__list']/div")
        for fang_xml in fang_xml_list:
            # Images are lazy-loaded, so the URL lives in data-src.
            pic = fang_xml.xpath(".//img/@data-src").extract()
            pic = pic[0] if pic else ''

            title = fang_xml.xpath(".//p[@class='content__list--item--title twoline']/a/text()").extract()[0].strip()

            # urljoin keeps the listing on the city currently being crawled;
            # the original hard-coded "https://bj.lianjia.com" and therefore
            # built wrong URLs for every city except Beijing.
            detail_url = fang_xml.xpath(".//p[@class='content__list--item--title twoline']/a/@href").extract()[0]
            detail_url = response.urljoin(detail_url)

            price = fang_xml.xpath(".//em/text()").extract()[0]

            item = LianjiaItem()
            item["pic"] = pic
            item["title"] = title
            item["detail_url"] = detail_url
            item["price"] = price
            # dont_filter: detail pages of different listings can otherwise be
            # dropped by the dupe filter when redirects collapse URLs.
            yield scrapy.Request(url=detail_url, callback=self.get_detail_data, meta={"data": item}, dont_filter=True)

    def get_detail_data(self, response):
        """Extract detail-page fields, then query the agent API if possible."""
        item = response.meta["data"]

        publish_info = response.xpath("//ul/li[contains(text(), '发布')]/text()").extract()
        publish_info = publish_info[0] if publish_info else ''

        pic_list = response.xpath("//ul[@class='content__article__slide__wrapper']/div/img/@data-src").extract()

        # Listing id, e.g. "BJ2244..." from ".../zufang/BJ2244....html".
        # \. escapes the dot so ".html" is matched literally.
        house_code = re.findall(r"/zufang/(.*?)\.html", response.url)[0]

        item["publish_info"] = publish_info
        item["pic_list"] = pic_list
        item["house_code"] = house_code

        ucid = response.xpath("//span[@class='contact__im im__online']/@data-im_id").extract()
        if ucid:
            item["ucid"] = ucid[0]
            # urljoin builds the brokers API on the same city host as the
            # listing; the original hard-coded the Beijing host.
            agent_api = response.urljoin(
                "/zufang/aj/house/brokers?house_codes={}&position=bottom&ucid={}".format(
                    house_code, item["ucid"]))
            yield scrapy.Request(url=agent_api, callback=self.get_agent_data,
                                 meta={"data": item}, dont_filter=True)
        else:
            # No online-IM agent on the page: yield the item with empty agent
            # fields instead of silently dropping it (the original lost these
            # listings entirely).
            item["ucid"] = ''
            item["agent_name"] = ''
            item["agent_phone"] = ''
            yield item

    def get_agent_data(self, response):
        """Parse the brokers JSON API and complete the item."""
        json_data = json.loads(response.body.decode("utf-8"))

        item = response.meta["data"]
        house_code = item.get("house_code")
        # NOTE(review): the API appears to nest the payload under the house
        # code twice (data[code][code]) — observed response shape; verify if
        # Lianjia changes the endpoint.
        agent = json_data.get("data").get(house_code).get(house_code)
        item["agent_name"] = agent.get("contact_name")
        item["agent_phone"] = agent.get("tp_number")
        yield item


3.pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
import time

import pymongo
import pymysql
import redis


# class LianjiaPipeline(object):
#     def __init__(self):
#         self.count = 1
#
#     def process_item(self, item, spider):
#         print(self.count, dict(item))
#         self.count += 1
#         return item


# 插入redis数据库
# 插入redis数据库
class RedisPipeline(object):
    """Push every scraped item onto the Redis list 'lianjia' as JSON."""

    def __init__(self):
        self.count = 1  # running counter, only for console progress output
        # port must be an int; the original passed the string "6379"
        self.r = redis.Redis(host="localhost", port=6379, db=3)

    def process_item(self, item, spider):
        item_dict = dict(item)
        print(self.count, item_dict)
        # ensure_ascii=False keeps Chinese titles readable in Redis instead
        # of \uXXXX escape sequences
        item_str = json.dumps(item_dict, ensure_ascii=False)

        self.r.lpush("lianjia", item_str)
        self.count += 1
        return item


# 插入mongodb
class MongodbPipeline(object):
    """Store each item as a document in MongoDB db 'lianjia', collection 'lianjia'."""

    def __init__(self):
        mongo_client = pymongo.MongoClient("localhost", 27017)
        self.db = mongo_client.lianjia

    def process_item(self, item, spider):
        # insert() is deprecated in pymongo 3.x; insert_one() is the supported
        # API. dict(item) is a copy, so the _id MongoDB adds does not leak
        # into the scrapy item passed to later pipelines.
        self.db.lianjia.insert_one(dict(item))
        return item


# 插入mysql
class MysqlPipeline(object):
    """Insert title/price/agent fields into the `fang` table of MySQL db `lianjia`."""

    def __init__(self):
        # Keyword arguments: positional connect() arguments are deprecated
        # and were removed in PyMySQL 1.0.
        self.conn = pymysql.connect(host="localhost", user="root",
                                    password="123456", database="lianjia")
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # Unix timestamp, recorded for later data-freshness monitoring.
        refresh_time = int(time.time())
        # Parameterized query: the original interpolated values with
        # str.format, which breaks on any quote character in a title and is
        # open to SQL injection.
        sql = ("insert into fang(title, price, agent_name, agent_phone, refresh_time) "
               "values(%s, %s, %s, %s, %s)")
        try:
            self.cursor.execute(sql, (item["title"], item["price"],
                                      item["agent_name"], item["agent_phone"],
                                      refresh_time))
            self.conn.commit()
        except Exception as e:
            # Best-effort store: log and roll back, keep the crawl running.
            print(e)
            self.conn.rollback()
        return item

    def __del__(self):
        self.cursor.close()
        self.conn.close()

4.main.py (项目执行脚本)

from scrapy import cmdline

# Launch the spider from Python; pass argv as a list instead of splitting a
# shell-style string. Drop "--nolog" to see scrapy's log output.
cmdline.execute(["scrapy", "crawl", "lianjia", "--nolog"])
# cmdline.execute(["scrapy", "crawl", "lianjia"])

(图:项目运行结果截图)

你可能感兴趣的:(python爬虫)