先爬取房某下深圳各个板块的数据,然后存进 MongoDB 数据库,最后再进行数据分析。
右键网页,查看页面源码,找出我们要爬取的部分。
import requests
from bs4 import BeautifulSoup
import time
from pymongo import MongoClient
class HouseSpider:
    """Crawl rental listings from sz.zu.fang.com and store them in MongoDB.

    One instance crawls one region at a time: choose the region and the
    number of pages with ``setRegion`` / ``setPage`` and then call
    ``startSpicder``.  Every successfully parsed listing is inserted into the
    collection of the ``zfdb`` database that belongs to the current region.
    """

    baseUrl = "http://sz.zu.fang.com"
    # URL path of every region
    urlDir = {
        "不限": "/house/",
        "宝安": "/house-a089/",
        "龙岗": "/house-a090/",
        "南山": "/house-a087/",
        "福田": "/house-a085/",
        "罗湖": "/house-a086/",
        "盐田": "/house-a088/",
        "龙华区": "/house-a013080/",
        "坪山区": "/house-a013081/",
        "光明新区": "/house-a013079/",
        "大鹏新区": "/house-a013082/",
        "惠州": "/house-a013058/",
        "东莞": "/house-a013057/",
        "深圳周边": "/house-a016375/",
    }
    # Region name -> name of the MongoDB collection its listings are stored in.
    # Regions that appear in urlDir but not here have no collection.
    collectionNames = {
        "不限": "rent",
        "宝安": "baoan",
        "龙岗": "longgang",
        "南山": "nanshan",
        "福田": "futian",
        "罗湖": "luohu",
        "盐田": "yantian",
        "龙华区": "longhuaqu",
        "坪山区": "pingshanqu",
        "光明新区": "guangmingxinqu",
        "大鹏新区": "dapengxinqu",
    }
    userAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36'
    region = "不限"
    page = 100

    def __init__(self):
        """Connect to the local MongoDB server and open an HTTP session."""
        self.client = MongoClient('mongodb://localhost:27017/')
        self.zfdb = self.client.zfdb
        # One session per spider, so instances do not share headers/cookies
        # and nothing is created as a side effect of defining the class.
        self.session = requests.Session()

    def getRegionUrl(self, name="宝安", page=10):
        """Return the URLs of the first ``page`` result pages of a region.

        The first page is the bare region URL; page ``k`` (k >= 2) appends
        ``i3<k>/`` to it.

        Raises:
            KeyError: if ``name`` is not a key of ``urlDir``.
        """
        regionUrl = self.baseUrl + self.urlDir[name]
        return [
            regionUrl if index == 0 else regionUrl + "i3" + str(index + 1) + "/"
            for index in range(page)
        ]

    def getRentMsg(self, title, rooms, area, price, address, traffic, region, direction):
        """Build the document that is stored in MongoDB for one listing."""
        return {
            "title": title,  # listing title
            "rooms": rooms,  # room layout
            "area": area,  # floor area
            "price": price,  # rent
            "address": address,  # address
            "traffic": traffic,  # transport description
            "region": region,  # district
            "direction": direction,  # orientation of the flat
        }

    def getCollection(self, name):
        """Return the MongoDB collection for region ``name``.

        Returns ``None`` when the region has no collection configured.
        """
        collectionName = self.collectionNames.get(name)
        if collectionName is None:
            return None
        return self.zfdb[collectionName]

    def getAreaList(self):
        """Return the names of the regions that are crawled and stored."""
        return ["不限", "宝安", "龙岗", "南山", "福田", "罗湖", "盐田", "龙华区", "坪山区", "光明新区", "大鹏新区"]

    @staticmethod
    def _leadingNumber(text):
        """Return the number that ``text`` starts with, ignoring what follows.

        Raises:
            ValueError: if ``text`` does not start with a number.
        """
        import re

        match = re.match(r"\s*(\d+(?:\.\d+)?)", text)
        if match is None:
            raise ValueError("no leading number in %r" % (text,))
        return float(match.group(1))

    def _parseListing(self, texts):
        """Turn the texts of one listing's ``<p>`` tags into a document.

        Args:
            texts: text of every ``<p>`` tag of the listing, in page order.

        Raises:
            IndexError, ValueError: if the listing is incomplete (a missing
                tag or field, or a value that is not numeric).
        """
        roomMsg = texts[1].split("|")
        # Take the leading number so the unit suffix and any surrounding
        # whitespace cannot leak into (or cut digits from) the value.
        area = self._leadingNumber(roomMsg[2])
        # The last tag holds the price followed by a three-character unit.
        priceText = texts[-1].strip()
        address = texts[2].strip()
        return self.getRentMsg(
            texts[0].strip(),
            roomMsg[1].strip(),
            int(area),
            int(priceText[:-3]),
            address,
            texts[3].strip(),
            address[:2],  # the district is the first two characters of the address
            roomMsg[3].strip(),
        )

    def getOnePageData(self, pageUrl, reginon="不限"):
        """Download one result page and store every listing that parses.

        Args:
            pageUrl: URL of the result page.
            reginon: accepted for backward compatibility and ignored; the
                target collection is chosen from ``self.region``.

        Raises:
            ValueError: if ``self.region`` has no MongoDB collection.
        """
        rent = self.getCollection(self.region)
        if rent is None:
            # Fail before downloading anything instead of dropping every
            # listing of the page.
            raise ValueError("no MongoDB collection for region %r" % (self.region,))
        self.session.headers.update({'User-Agent': self.userAgent})
        res = self.session.get(pageUrl)
        soup = BeautifulSoup(res.text, "html.parser")
        # Every listing lives in one <dd class="info rel"> element.
        for div in soup.find_all("dd", attrs={"class": "info rel"}):
            ps = div.find_all("p")
            for p in ps:
                print(p.text.strip())  # show what was scraped
            print("===================================")
            try:
                # Incomplete listings and inserted adverts lack some tags or
                # fields; skip them.
                rentMsg = self._parseListing([p.text for p in ps])
            except (IndexError, ValueError):
                continue
            # Outside the try block so database errors are not swallowed.
            rent.insert_one(rentMsg)

    def setRegion(self, region):
        """Set the region to crawl."""
        self.region = region

    def setPage(self, page):
        """Set the number of pages to crawl."""
        self.page = page

    def startSpicder(self):
        """Crawl ``self.page`` pages of ``self.region``, pausing 1 s between pages."""
        for url in self.getRegionUrl(self.region, self.page):
            self.getOnePageData(url, self.region)
            print("*" * 30 + "one page 分割线" + "*" * 30)
            time.sleep(1)
# Crawl ten pages of each of the first eleven regions, one region at a time.
spider = HouseSpider()
spider.setPage(10)  # number of pages to crawl per region
for regionIndex in range(11):
    regionName = spider.getAreaList()[regionIndex]
    spider.setRegion(regionName)  # region to crawl
    spider.startSpicder()  # run the crawler
对 MongoDB 中的 zfdb 数据库进行数据分析,并将租房信息可视化:
from os import path
from wordcloud import WordCloud, ImageColorGenerator
import jieba.analyse
import matplotlib.pyplot as plt
from scipy.misc import imread
baseUrl = "http://sz.zu.fang.com"
from pymongo import MongoClient
class Analycis:
    """Aggregate the rental data stored in the ``zfdb`` database and chart it.

    The ``get*`` methods run MongoDB aggregations and return plain Python
    values; the ``show*`` methods render those values as charts.
    """

    # Region name -> name of the MongoDB collection holding its listings
    pinyinDir = {
        # "不限": "rent",
        "宝安": "baoan",
        "龙岗": "longgang",
        "南山": "nanshan",
        "福田": "futian",
        "罗湖": "luohu",
        "盐田": "yantian",
        "龙华": "longhuaqu",
        "坪山": "pingshanqu",
        "光明": "guangmingxinqu",
        "大鹏": "dapengxinqu",
    }
    # Long region names accepted by getCollection -> key of pinyinDir
    regionAliases = {
        "龙华区": "龙华",
        "坪山区": "坪山",
        "光明新区": "光明",
        "大鹏新区": "大鹏",
    }
    # Floor-area buckets as (exclusive lower bound, inclusive upper bound, label)
    acreageBuckets = (
        (0, 30, "0-30平方米"),
        (30, 60, "30-60平方米"),
        (60, 90, "60-90平方米"),
        (90, 120, "90-120平方米"),
        (120, 200, "120-200平方米"),
        (200, 300, "200-300平方米"),
        (300, 400, "300-400平方米"),
        (400, 10000, "400+平方米"),
    )

    def __init__(self):
        """Connect to the local MongoDB server and select the ``zfdb`` database."""
        self.client = MongoClient('mongodb://localhost:27017/')
        self.zfdb = self.client.zfdb

    def getAreaList(self):
        """Return the region names that are analysed."""
        return [  # "不限",
            "福田", "南山", "罗湖", "宝安", "龙华", "盐田", "龙岗", "坪山", "光明", "大鹏"]

    def getPinyin(self, region):
        """Return the collection name (pinyin) of ``region``.

        Raises:
            KeyError: if ``region`` is not a key of ``pinyinDir``.
        """
        try:
            return self.pinyinDir[region]
        except KeyError:
            raise KeyError("no such region pinyin: %r" % (region,))

    def getCollection(self, name):
        """Return the MongoDB collection of region ``name``.

        Accepts the short names used by ``getAreaList``, the long names
        listed in ``regionAliases`` and "不限" (the ``rent`` collection).
        Returns ``None`` for any other name.
        """
        if name == "不限":
            return self.zfdb["rent"]
        collectionName = self.pinyinDir.get(self.regionAliases.get(name, name))
        if collectionName is None:
            return None
        return self.zfdb[collectionName]

    def getAvgPrice(self, region):
        """Return total price divided by total area over a region's collection.

        Returns 0.0 when the collection is empty or its total area is zero.

        Raises:
            KeyError: if ``region`` is not a key of ``pinyinDir``.
        """
        collection = self.zfdb[self.getPinyin(region=region)]
        # A single group over the whole collection (_id None) yields both
        # sums from the same set of documents.
        rows = list(collection.aggregate([
            {'$group': {
                '_id': None,
                'total_price': {'$sum': '$price'},
                'total_area': {'$sum': '$area'},
            }},
        ]))
        if not rows or not rows[0]["total_area"]:
            return 0.0
        return rows[0]["total_price"] / rows[0]["total_area"]

    def getTotalAvgPrice(self):
        """Return one ``{"value", "name"}`` dict per region for the tree map.

        ``value`` is the region's average price per square metre rounded to
        three decimals; ``name`` is the region name followed by that value.
        """
        result = []
        for region in self.getAreaList():
            avgPrice = round(self.getAvgPrice(region), 3)
            result.append({"value": avgPrice, "name": region + " " + str(avgPrice)})
        return result

    def getTotalAvgPricePerDay(self):
        """Return ``(regions, prices)``: average price per square metre per day.

        The per-day price is the value of ``getAvgPrice`` divided by 30.
        """
        regions = self.getAreaList()
        prices = [round(self.getAvgPrice(region) / 30, 3) for region in regions]
        return (regions, prices)

    def getAnalycisNum(self):
        """Return ``(regions, counts)``: number of stored listings per region."""
        analycisList = []
        for region in self.getAreaList():
            collection = self.zfdb[self.pinyinDir[region]]
            print(region)
            rows = list(collection.aggregate([{'$group': {'_id': '', 'total_num': {'$sum': 1}}}]))
            # Grouping an empty collection produces no rows at all.
            analycisList.append(rows[0]["total_num"] if rows else 0)
        return (self.getAreaList(), analycisList)

    def getAreaWeight(self):
        """Return ``(names, counts)``: listings per region in the ``rent`` collection.

        Only regions contained in ``getAreaList`` are reported.
        """
        result = self.zfdb["rent"].aggregate([{'$group': {'_id': '$region', 'weight': {'$sum': 1}}}])
        knownAreas = set(self.getAreaList())
        areaName = []
        areaWeight = []
        for item in result:
            if item["_id"] in knownAreas:
                areaWeight.append(item["weight"])
                areaName.append(item["_id"])
                print(item["_id"])
                print(item["weight"])
        return (areaName, areaWeight)

    def getTitle(self):
        """Return the titles of up to 1000 listings concatenated into one string."""
        cursor = self.zfdb["rent"].find({}, projection={'_id': False, 'title': True}).limit(1000)
        titles = []
        for result in cursor:
            print(result["title"])
            titles.append(result["title"])
        return ''.join(titles)

    def getRooms(self):
        """Return ``(layouts, counts)``: number of listings per room layout."""
        results = self.zfdb["rent"].aggregate([{'$group': {'_id': '$rooms', 'weight': {'$sum': 1}}}])
        roomList = []
        weightList = []
        for result in results:
            roomList.append(result["_id"])
            weightList.append(result["weight"])
        return (roomList, weightList)

    def getAcreage(self):
        """Return ``(labels, counts)``: number of listings per floor-area bucket.

        The buckets are defined by ``acreageBuckets``; an empty bucket is
        reported as 0.
        """
        rent = self.zfdb["rent"]
        attr = []
        value = []
        for lower, upper, label in self.acreageBuckets:
            rows = list(rent.aggregate([
                {'$match': {'area': {'$gt': lower, '$lte': upper}}},
                {'$group': {'_id': '', 'count': {'$sum': 1}}},
            ]))
            attr.append(label)
            # No document in the bucket means no group row.
            value.append(rows[0]["count"] if rows else 0)
        return (attr, value)

    def showPie(self, title, attr, value):
        """Render a pie chart of ``value`` labelled by ``attr``."""
        from pyecharts import Pie
        pie = Pie(title)
        pie.add("aa", attr, value, is_label_show=True)
        pie.render()

    def showTreeMap(self, title, data):
        """Render a tree map from a list of ``{"value", "name"}`` dicts."""
        from pyecharts import TreeMap
        treemap = TreeMap(title, width=1200, height=600)
        treemap.add("深圳", data, is_label_show=True, label_pos='inside', label_text_size=19)
        treemap.render()

    def showLine(self, title, attr, value):
        """Render a bar chart of ``value`` labelled by ``attr``."""
        from pyecharts import Bar
        bar = Bar(title)
        bar.add("深圳", attr, value, is_convert=False, is_label_show=True, label_text_size=18, is_random=True,
                # xaxis_interval=0, xaxis_label_textsize=9,
                legend_text_size=18, label_text_color=["#000"])
        bar.render()

    def showWorkCloud(self, content, image_filename, font_filename, out_filename):
        """Show a word cloud of the keywords in ``content`` and save it.

        Args:
            content: text the keywords are extracted from.
            image_filename: image used as mask and colour source.
            font_filename: font file used to draw the words.
            out_filename: file the finished word cloud is written to.
        """
        # NOTE(review): __name__ is the module name, not a path, so for a
        # top-level module this is '' and the file names resolve against the
        # current working directory; __file__ was probably intended - confirm
        # before changing.
        d = path.dirname(__name__)
        # TF-IDF keyword extraction: the 100 top-ranked keywords, without weights.
        tags = jieba.analyse.extract_tags(content, topK=100, withWeight=False)
        text = " ".join(tags)
        # Background image.
        # NOTE(review): imread is imported from scipy.misc, which recent SciPy
        # releases no longer provide - confirm against the installed version.
        img = imread(path.join(d, image_filename))
        # A font is passed explicitly so the Chinese words are drawn with it.
        wc = WordCloud(font_path=font_filename,
                       background_color='black',
                       # shape of the cloud
                       mask=img,
                       # maximum number of words
                       max_words=400,
                       # largest font size
                       max_font_size=100,
                       # width=600,
                       # height=400,
                       margin=2,
                       # share of words laid out horizontally
                       prefer_horizontal=0.9
                       )
        wc.generate(text)
        img_color = ImageColorGenerator(img)
        plt.imshow(wc.recolor(color_func=img_color))
        plt.axis("off")
        plt.show()
        wc.to_file(path.join(d, out_filename))

    def showPyechartsWordCloud(self, attr, value):
        """Render a pyecharts word cloud of the words ``attr`` weighted by ``value``."""
        from pyecharts import WordCloud
        wordcloud = WordCloud(width=1300, height=620)
        wordcloud.add("", attr, value, word_size_range=[20, 100])
        wordcloud.render()
# Entry point of the analysis: exactly one chart is active, the others are
# kept commented out so they can be switched on as needed.
analycis = Analycis()
# Word cloud
# analycis.getTitle()
# analycis.showWorkCloud(analycis.getTitle(), "docker.jpeg", "kh.ttf", out_filename="output.jpeg")
# Floor-area statistics
attr, value = analycis.getAcreage()
analycis.showPie("租房面积统计", attr, value)
# Room-layout statistics
# (attr, value) = analycis.getRooms()
# analycis.showLine("户型统计", attr, value)
# Price per square metre per month
# data = analycis.getTotalAvgPrice()
# print(data)
# analycis.showTreeMap("深圳各区房租单价:平方米/月", data)
# Price per square metre per day
# (attr, value) = analycis.getTotalAvgPricePerDay()
# print(attr, value)
# analycis.showLine(title="深圳各区房租单价:平方米/日", attr=attr, value=value)
# Sample size per region
# (attr, value) = analycis.getAnalycisNum()
# print(attr, value)
# analycis.showLine(title="统计样本数量", attr=attr, value=value)
# Distribution of listings over the regions
# (attr, value) = analycis.getAreaWeight()
# print(attr, value)
# analycis.showPie("深圳房源分布", attr, value)
深圳房源分布:(按区划分)
房租单价:(每月每平方米单价 – 平均数)
即是 1 平方米 1 个月的价格。方块越大,代表价格越高。
房租单价:(每日每平方米单价 – 平均数)
即是 1 平方米 1 天的价格。
租房面积统计
租房描述词云
其中字体越大,表示出现的次数越多。其中【精装】占据了很大的部分。