Python爬取房天下网站深圳房租信息入库并进行数据分析可视化

概述

  • 请求库:requests
  • HTML 解析:BeautifulSoup
  • 词云:wordcloud
  • 数据可视化:pyecharts
  • 数据库:MongoDB
  • 数据库连接:pymongo

爬虫思路&&页面解析

先爬取房天下深圳各个板块的数据,然后存进 MongoDB 数据库,最后再进行数据分析。

Python爬取房天下网站深圳房租信息入库并进行数据分析可视化_第1张图片

右键网页,查看页面源码,找出我们要爬取的部分

Python爬取房天下网站深圳房租信息入库并进行数据分析可视化_第2张图片

爬虫源代码实现

import requests
from bs4 import BeautifulSoup
import time
from pymongo import MongoClient

class HouseSpider:
    """Crawl Shenzhen rental listings from sz.zu.fang.com into MongoDB.

    Every region has its own listing URL (``urlDir``).  Regions that also have
    an entry in ``_collectionNames`` are stored in a collection of the local
    ``zfdb`` database; the remaining regions can be turned into URLs but
    cannot be stored.
    """

    def __init__(self):
        """Connect to the local MongoDB server and select the ``zfdb`` database."""
        self.client = MongoClient('mongodb://localhost:27017/')
        self.zfdb = self.client.zfdb

    # Class-level HTTP session, shared by every spider instance.
    session = requests.Session()
    baseUrl = "http://sz.zu.fang.com"

    # Seconds to wait for the site before giving up on a page request.
    requestTimeout = 10

    # Region name -> URL path of the first listing page of that region.
    urlDir = {
        "不限": "/house/",
        "宝安": "/house-a089/",
        "龙岗": "/house-a090/",
        "南山": "/house-a087/",
        "福田": "/house-a085/",
        "罗湖": "/house-a086/",
        "盐田": "/house-a088/",
        "龙华区": "/house-a013080/",
        "坪山区": "/house-a013081/",
        "光明新区": "/house-a013079/",
        "大鹏新区": "/house-a013082/",
        "惠州": "/house-a013058/",
        "东莞": "/house-a013057/",
        "深圳周边": "/house-a016375/",
    }

    # Region name -> MongoDB collection name.  The insertion order is also the
    # order returned by getAreaList().
    _collectionNames = {
        "不限": "rent",
        "宝安": "baoan",
        "龙岗": "longgang",
        "南山": "nanshan",
        "福田": "futian",
        "罗湖": "luohu",
        "盐田": "yantian",
        "龙华区": "longhuaqu",
        "坪山区": "pingshanqu",
        "光明新区": "guangmingxinqu",
        "大鹏新区": "dapengxinqu",
    }

    # Defaults used until setRegion() / setPage() are called.
    region = "不限"
    page = 100

    def getRegionUrl(self, name="宝安", page=10):
        """Return the URLs of the first ``page`` listing pages of region ``name``.

        The first page is the bare region URL; page ``k`` (k >= 2) is the
        region URL followed by ``i3<k>/``.

        Raises:
            KeyError: if ``name`` is not a key of ``urlDir``.
        """
        regionUrl = self.baseUrl + self.urlDir[name]
        return [
            regionUrl if index == 0 else regionUrl + "i3" + str(index + 1) + "/"
            for index in range(page)
        ]

    def getRentMsg(self, title, rooms, area, price, address, traffic, region, direction):
        """Build the document that is stored in MongoDB for one listing."""
        return {
            "title": title,  # listing title
            "rooms": rooms,  # layout description (number of rooms)
            "area": area,  # floor area in square metres
            "price": price,  # rent
            "address": address,  # address
            "traffic": traffic,  # transport description
            "region": region,  # district, e.g. 福田 / 南山
            "direction": direction,  # orientation of the flat
        }

    def getCollection(self, name):
        """Return the MongoDB collection for region ``name``.

        Returns ``None`` when no collection is configured for the region.
        """
        collectionName = self._collectionNames.get(name)
        if collectionName is None:
            return None
        return self.zfdb[collectionName]

    def getAreaList(self):
        """Return the regions that have a MongoDB collection, in crawl order."""
        return list(self._collectionNames)

    @staticmethod
    def _leadingNumber(text):
        """Return the number at the start of ``text`` as a float.

        Used to strip the unit suffix from the area and price fields without
        depending on how many characters the suffix has.

        Raises:
            ValueError: if ``text`` does not start with a number.
        """
        import re

        match = re.match(r"\s*(\d+(?:\.\d+)?)", text)
        if match is None:
            raise ValueError("no leading number in " + repr(text))
        return float(match.group(1))

    def _parseListing(self, ps):
        """Turn the ``<p>`` tags of one listing into a MongoDB document.

        Raises:
            IndexError, ValueError: if the listing is incomplete (for example
                an advert or a listing with missing fields).
        """
        # The second paragraph holds "|"-separated fields: index 1 is the
        # layout, index 2 the area and index 3 the orientation.
        roomMsg = ps[1].text.split("|")
        address = ps[2].text.strip()
        return self.getRentMsg(
            ps[0].text.strip(),
            roomMsg[1].strip(),
            int(self._leadingNumber(roomMsg[2])),
            # The last paragraph holds the price followed by its unit.
            int(self._leadingNumber(ps[-1].text)),
            address,
            ps[3].text.strip(),
            address[:2],  # the first two characters of the address name the district
            roomMsg[3],
        )

    def getOnePageData(self, pageUrl, reginon="不限"):
        """Fetch one listing page and store every complete listing on it.

        Args:
            pageUrl: URL of the listing page.
            reginon: region whose collection receives the listings.

        Raises:
            ValueError: if no collection is configured for ``reginon``.
        """
        rent = self.getCollection(reginon)
        if rent is None:
            # Fail before fetching instead of crawling a page that cannot be stored.
            raise ValueError("no MongoDB collection configured for region: " + str(reginon))

        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36'})
        res = self.session.get(pageUrl, timeout=self.requestTimeout)
        soup = BeautifulSoup(res.text, "html.parser")
        # Each listing is a <dd class="info rel"> element.
        divs = soup.find_all("dd", attrs={"class": "info rel"})

        for div in divs:
            ps = div.find_all("p")
            # Echo the raw paragraphs so the crawl can be followed on the console.
            for p in ps:
                print(p.text.strip())
            print("===================================")
            try:
                rentMsg = self._parseListing(ps)
            except (IndexError, ValueError):
                # Incomplete listing or inserted advert: skip it.
                continue
            # Database errors are deliberately not swallowed.
            rent.insert_one(rentMsg)

    def setRegion(self, region):
        """Select the region crawled by startSpicder()."""
        self.region = region

    def setPage(self, page):
        """Select how many pages startSpicder() crawls."""
        self.page = page

    def startSpicder(self):
        """Crawl the configured pages of the configured region, one per second."""
        for url in self.getRegionUrl(self.region, self.page):
            self.getOnePageData(url, self.region)
            print("*" * 30 + "one page 分割线" + "*" * 30)
            time.sleep(1)  # throttle requests to the site


# Crawl the first 10 pages of every region that has a MongoDB collection.
spider = HouseSpider()
spider.setPage(10)  # number of pages to crawl per region
# Iterate the region list directly so the loop cannot fall out of step with
# the number of regions getAreaList() returns.
for area in spider.getAreaList():
    spider.setRegion(area)  # region to crawl
    spider.startSpicder()  # run the crawl for that region

 爬取数据后,数据存储在MongoDB中的zfdb数据库

Python爬取房天下网站深圳房租信息入库并进行数据分析可视化_第3张图片

数据分析实现

对MongoDB中zfdb数据库的数据进行分析,并将租房信息可视化:

from os import path
from wordcloud import WordCloud, ImageColorGenerator
import jieba.analyse
import matplotlib.pyplot as plt
from scipy.misc import imread

# Base URL of the crawled site; nothing in the analysis code visible here reads it.
baseUrl = "http://sz.zu.fang.com"
from pymongo import MongoClient

class Analycis:
    """Query the rental data stored in the ``zfdb`` MongoDB database and chart it."""

    def __init__(self):
        """Connect to the local MongoDB server and select the ``zfdb`` database."""
        self.client = MongoClient('mongodb://localhost:27017/')
        self.zfdb = self.client.zfdb

    # Short district name -> MongoDB collection name.  The city-wide "rent"
    # collection is intentionally not listed here.
    pinyinDir = {
        "宝安": "baoan",
        "龙岗": "longgang",
        "南山": "nanshan",
        "福田": "futian",
        "罗湖": "luohu",
        "盐田": "yantian",
        "龙华": "longhuaqu",
        "坪山": "pingshanqu",
        "光明": "guangmingxinqu",
        "大鹏": "dapengxinqu",
    }

    # Additional names accepted by getCollection(): the city-wide collection
    # and the long district names.
    _collectionAliases = {
        "不限": "rent",
        "龙华区": "longhuaqu",
        "坪山区": "pingshanqu",
        "光明新区": "guangmingxinqu",
        "大鹏新区": "dapengxinqu",
    }

    def getAreaList(self):
        """Return the short district names in the order used by the charts."""
        return ["福田", "南山", "罗湖", "宝安", "龙华", "盐田", "龙岗", "坪山", "光明", "大鹏"]

    def getPinyin(self, region):
        """Return the collection name for the short district name ``region``.

        Raises:
            KeyError: if ``region`` is not a key of ``pinyinDir``.
        """
        try:
            return self.pinyinDir[region]
        except KeyError:
            raise KeyError("no such region pinyin: " + str(region)) from None

    def getCollection(self, name):
        """Return the MongoDB collection for district ``name``.

        Both the short names of ``pinyinDir`` and the long names (plus "不限"
        for the city-wide collection) are accepted.  Returns ``None`` for an
        unknown name.
        """
        collectionName = self.pinyinDir.get(name) or self._collectionAliases.get(name)
        if collectionName is None:
            return None
        return self.zfdb[collectionName]

    def getAvgPrice(self, region):
        """Return the average monthly rent per square metre in ``region``.

        The value is the sum of all prices divided by the sum of all areas in
        the district's collection.  Returns ``0.0`` when the collection is
        empty or its total area is zero.

        Raises:
            KeyError: if ``region`` is not a key of ``pinyinDir``.
        """
        collection = self.zfdb[self.getPinyin(region=region)]
        # A single whole-collection group keeps both totals over the same
        # documents; "_id": None puts every document in one group.
        totals = list(collection.aggregate([
            {'$group': {
                '_id': None,
                'total_price': {'$sum': '$price'},
                'total_area': {'$sum': '$area'},
            }}
        ]))
        if not totals or not totals[0]["total_area"]:
            return 0.0
        return totals[0]["total_price"] / totals[0]["total_area"]

    def getTotalAvgPrice(self):
        """Return ``{"value", "name"}`` dicts with each district's monthly price per square metre."""
        priceEntries = []
        for region in self.getAreaList():
            avgPrice = round(self.getAvgPrice(region), 3)
            priceEntries.append({"value": avgPrice, "name": region + "  " + str(avgPrice)})
        return priceEntries

    def getTotalAvgPricePerDay(self):
        """Return ``(districts, prices)`` with each district's daily price per square metre.

        The daily price is the monthly price divided by 30.
        """
        regions = self.getAreaList()
        perDayPrices = [round(self.getAvgPrice(region) / 30, 3) for region in regions]
        return (regions, perDayPrices)

    def getAnalycisNum(self):
        """Return ``(districts, counts)`` with the number of documents per district."""
        regions = self.getAreaList()
        counts = []
        for region in regions:
            collection = self.zfdb[self.pinyinDir[region]]
            print(region)
            totals = list(collection.aggregate([{'$group': {'_id': '', 'total_num': {'$sum': 1}}}]))
            # $group emits nothing for an empty collection.
            counts.append(totals[0]["total_num"] if totals else 0)
        return (regions, counts)

    def getAreaWeight(self):
        """Return ``(districts, counts)`` of the city-wide listings grouped by district.

        Only districts returned by getAreaList() are kept.
        """
        knownAreas = set(self.getAreaList())
        result = self.zfdb.rent.aggregate([{'$group': {'_id': '$region', 'weight': {'$sum': 1}}}])
        areaName = []
        areaWeight = []
        for item in result:
            if item["_id"] in knownAreas:
                areaWeight.append(item["weight"])
                areaName.append(item["_id"])
                print(item["_id"])
                print(item["weight"])
        return (areaName, areaWeight)

    def getTitle(self):
        """Return the titles of up to 1000 city-wide listings joined into one string.

        The result is the input text for the word cloud.
        """
        collection = self.zfdb["rent"]
        projectionFields = {'_id': False, 'title': True}
        searchRes = collection.find({}, projection=projectionFields).limit(1000)
        titles = []
        for result in searchRes:
            print(result["title"])
            titles.append(result["title"])
        return "".join(titles)

    def getRooms(self):
        """Return ``(layouts, counts)`` of the city-wide listings grouped by layout."""
        results = self.zfdb.rent.aggregate([{'$group': {'_id': '$rooms', 'weight': {'$sum': 1}}}])
        roomList = []
        weightList = []
        for result in results:
            roomList.append(result["_id"])
            weightList.append(result["weight"])
        return (roomList, weightList)

    def getAcreage(self):
        """Return ``(labels, counts)`` of the city-wide listings bucketed by floor area.

        A bucket covers ``low < area <= high``; the last bucket is capped at
        10000.  Buckets without any listing are reported with a count of 0.
        """
        bounds = [0, 30, 60, 90, 120, 200, 300, 400, 10000]
        attr = ["0-30平方米", "30-60平方米", "60-90平方米", "90-120平方米", "120-200平方米", "200-300平方米", "300-400平方米", "400+平方米"]
        value = []
        for low, high in zip(bounds, bounds[1:]):
            matched = list(self.zfdb.rent.aggregate([
                {'$match': {'area': {'$gt': low, '$lte': high}}},
                {'$group': {'_id': '', 'count': {'$sum': 1}}},
            ]))
            # $group emits nothing when $match lets no document through.
            value.append(matched[0]["count"] if matched else 0)
        return (attr, value)

    def showPie(self, title, attr, value):
        """Render a pie chart of ``value`` labelled by ``attr``."""
        from pyecharts import Pie
        pie = Pie(title)
        pie.add("aa", attr, value, is_label_show=True)
        pie.render()

    def showTreeMap(self, title, data):
        """Render a tree map of ``data`` (a list of ``{"value", "name"}`` dicts)."""
        from pyecharts import TreeMap
        treemap = TreeMap(title, width=1200, height=600)
        treemap.add("深圳", data, is_label_show=True, label_pos='inside', label_text_size=19)
        treemap.render()

    def showLine(self, title, attr, value):
        """Render a bar chart of ``value`` labelled by ``attr``."""
        from pyecharts import Bar
        bar = Bar(title)
        bar.add("深圳", attr, value, is_convert=False, is_label_show=True, label_text_size=18, is_random=True,
                legend_text_size=18, label_text_color=["#000"])
        bar.render()

    def showWorkCloud(self, content, image_filename, font_filename, out_filename):
        """Build a word cloud from ``content``, show it and save it to ``out_filename``.

        Args:
            content: text whose keywords are drawn.
            image_filename: image used as the mask and the colour source.
            font_filename: font file passed to WordCloud as ``font_path``.
            out_filename: file the rendered cloud is written to.
        """
        # NOTE(review): dirname() of a module name is '' so the file names end
        # up relative to the working directory; __file__ may have been meant.
        d = path.dirname(__name__)
        # Extract the 100 top keywords (TF-IDF based) without their weights.
        tags = jieba.analyse.extract_tags(content, topK=100, withWeight=False)
        text = " ".join(tags)
        # NOTE(review): imread comes from scipy.misc, which newer SciPy
        # releases presumably no longer provide - confirm the pinned version.
        img = imread(path.join(d, image_filename))
        wc = WordCloud(font_path=font_filename,  # font passed explicitly for the Chinese text
                       background_color='black',
                       mask=img,  # shape of the cloud
                       max_words=400,
                       max_font_size=100,
                       margin=2,
                       prefer_horizontal=0.9  # share of words laid out horizontally
                       )
        wc.generate(text)
        img_color = ImageColorGenerator(img)
        plt.imshow(wc.recolor(color_func=img_color))
        plt.axis("off")
        plt.show()
        wc.to_file(path.join(d, out_filename))

    def showPyechartsWordCloud(self, attr, value):
        """Render a pyecharts word cloud of the words ``attr`` weighted by ``value``."""
        from pyecharts import WordCloud
        wordcloud = WordCloud(width=1300, height=620)
        wordcloud.add("", attr, value, word_size_range=[20, 100])
        wordcloud.render()

analycis = Analycis()

# Word cloud built from the listing titles
# analycis.getTitle()
# analycis.showWorkCloud(analycis.getTitle(), "docker.jpeg", "kh.ttf", out_filename="output.jpeg")

# Floor-area statistics (the one analysis that is enabled)
attr, value = analycis.getAcreage()
analycis.showPie("租房面积统计", attr, value)

# Layout statistics
# (attr, value) = analycis.getRooms()
# analycis.showLine("户型统计", attr, value)

# Monthly rent per square metre
# data = analycis.getTotalAvgPrice()
# print(data)
# analycis.showTreeMap("深圳各区房租单价:平方米/月", data)

# Daily rent per square metre
# (attr, value) = analycis.getTotalAvgPricePerDay()
# print(attr, value)
# analycis.showLine(title="深圳各区房租单价:平方米/日", attr=attr, value=value)

# Number of samples per district
# (attr, value) = analycis.getAnalycisNum()
# print(attr, value)
# analycis.showLine(title="统计样本数量", attr=attr, value=value)

# Distribution of listings across districts
# (attr, value) = analycis.getAreaWeight()
# print(attr, value)
# analycis.showPie("深圳房源分布", attr, value)

租房信息可视化结果:

深圳房源分布:(按区划分)

 Python爬取房天下网站深圳房租信息入库并进行数据分析可视化_第4张图片

房租单价:(每月每平方米单价 – 平均数) 
即是 1 平方米 1 个月的价格。方块越大,代表价格越高。

Python爬取房天下网站深圳房租信息入库并进行数据分析可视化_第5张图片

房租单价:(每日每平方米单价 – 平均数)

即是 1 平方米 1 天的价格。

Python爬取房天下网站深圳房租信息入库并进行数据分析可视化_第6张图片

户型 
Python爬取房天下网站深圳房租信息入库并进行数据分析可视化_第7张图片

租房面积统计 

Python爬取房天下网站深圳房租信息入库并进行数据分析可视化_第8张图片

租房描述词云

其中字体越大,表示出现的次数越多。其中【精装】占据了很大的部分

Python爬取房天下网站深圳房租信息入库并进行数据分析可视化_第9张图片

 样本数量

 Python爬取房天下网站深圳房租信息入库并进行数据分析可视化_第10张图片

 

你可能感兴趣的:(python,爬虫)