python采集链家二手房源数据并做数据可视化展示
win + R 输入cmd 输入安装命令 pip install 模块名
response.text (服务器返回响应文本数据)
[静态网页]
爬虫: 对于网页上面的数据内容进行采集程序
import pprint # 格式化输出模块 内置模块
import requests # 数据请求模块 第三方模块 pip install requests
import parsel # 数据解析模块 第三方模块 pip install parsel
import csv # csv模块 内置模块 不需要安装
"""
68-102: 文件创建 69行: 文件创建 76行文件配置 102行写入表头 fieldnames 里面的数据是字典里面键
"""
f = open('房源.csv', mode='a', encoding='utf-8', newline='')
"""
如何实现快速替换:
1. 选择需要替换的内容
2. 按 ctrl + R
3. 输入正则表达式语法 进行替换 第一行写正则语法(匹配数据) 第二行写替换的内容
4. 点击全部替换
"""
csv_writer = csv.DictWriter(f, fieldnames=[
'上次交易',
'交易权属',
'产权所属',
'单价',
'售价',
'套内面积',
'建筑类型',
'建筑结构',
'建筑面积',
'户型结构',
'房屋年限',
'房屋户型',
'房屋朝向',
'房屋用途',
'房本备件',
'房源核验码',
'所在楼层',
'抵押信息',
'挂牌时间',
'标题',
'梯户比例',
'装修情况',
'详情页',
'配备电梯',
])
csv_writer.writeheader() # 写入表头
for page in range(1, 11):
print(f'===================正在爬取第{page}页数据内容===================')
# 1. 发送请求
url = f'网址' # 确定请求的url地址
# 模拟浏览器发送请求 需要对于python代码进行伪装
# headers:请求头 字典数据类型 键值对形式
# header作用: 伪装
# 加那些参数: cookie User-Agent(UA) referer host
# 参数意思是什么
# cookie: 用户信息, 常用于检测是否登陆账号
# User-Agent: 浏览器基本身份标识(用户代理)
# referer: 防盗链 告诉服务器我们发送请求的url地址是从哪里跳转过来 (动态网页)
# host: 域名
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36'
}
response = requests.get(url=url, headers=headers)
# 2. 获取数据, 获取网页源代码 response.text
# print(response.text) # 返回数据字符串数据类型
# 3. 解析数据 解析方式: css xpath re
# css选择器: 根据标签属性内容提取数据
selector = parsel.Selector(response.text) # 返回对象
# attr 属性选择器 getall() 获取所有 返回数据列表 人家语法就是这样 a::attr(href) 取a标签里面href属性
href = selector.css('.sellListContent li .title a::attr(href)').getall()
# print(href)
# 列表 数据容器(一个箱子) 'https://cs.lianjia.com/ershoufang/104107916240.html' 元素(箱子里面东西)
# for 循环 遍历就是从这个箱子里面一个一个拿东西出来
for link in href:
# 4. 发送请求, 对于房源详情页url地址发送请求
# 5.获取数据, 获取网页源代码response.text
response_1 = requests.get(url=link, headers=headers)
# 6. 解析数据提取我们想要数据
# print(response_1.text)
selector_1 = parsel.Selector(response_1.text) # 返回对象
# get() 取一个
title = selector_1.css('.title .main::text').get() # 标题
price = selector_1.css('.price .total::text').get() # 售价
price_1 = selector_1.css('.unitPriceValue::text').get() # 单价
attr_list = selector_1.css('.base .content li .label::text').getall()
attr_list_1 = selector_1.css('.transaction .content li .label::text').getall()
content_list = selector_1.css('.base .content li::text').getall()
content_list_1 = selector_1.css('.transaction .content li span::text').getall()
# 两个列表 如何创建成一个字典 attr_list 做键 content_list 做值
# print(attr_list)
# print(content_list)
# 保存csv文件表格
# 需要创建一个字典
dit = {
"详情页": link,
"标题": title,
"售价": price,
"单价": price_1,
# "区域": price_1,
}
dit_1 = dict(zip(attr_list, content_list))
dit_2 = dict(zip(attr_list_1, content_list_1))
dit.update(dit_1)
dit.update(dit_2)
# print(title, price, price_1)
csv_writer.writerow(dit) # 写入数据
pprint.pprint(dit) # 格式化输出模块
#%% md
## 导入模块
#%%
import pandas as pd
from pyecharts.charts import Map
from pyecharts.charts import Bar
from pyecharts.charts import Line
from pyecharts.charts import Grid
from pyecharts.charts import Pie
from pyecharts.charts import Scatter
from pyecharts import options as opts
#%% md
## 读取数据
#%%
df = pd.read_csv('data.csv', encoding = 'utf-8')
df.head()
#%%
df.describe()
#%%
df.isnull().sum()
#%%
df['电梯'].unique()
#%%
df['电梯'].fillna('未知', inplace=True)
df.isnull().sum()
#%%
df['电梯'].unique()
#%%
df['朝向'].unique()
#%%
df['朝向'] = df['朝向'].str.replace('南西','西南')
df['朝向'].unique()
#%%
g = df.groupby('市区')
df_region = g.count()['小区']
region = df_region.index.tolist()
count = df_region.values.tolist()
df_region
#%%
# 各城区二手房数量北京市地图
new = [x + '区' for x in region]
m = (
Map()
.add('', [list(z) for z in zip(new, count)], '北京')
.set_global_opts(
title_opts=opts.TitleOpts(title='北京市二手房各区分布'),
visualmap_opts=opts.VisualMapOpts(max_=3000),
)
)
m.render_notebook()
#%%
df_price = g.mean()['价格(万元)']
df_price
#%%
df_price.values.tolist()
price = [round(x,2) for x in df_price.values.tolist()]
bar = (
Bar()
.add_xaxis(region)
.add_yaxis('数量', count,
label_opts=opts.LabelOpts(is_show=True))
.extend_axis(
yaxis=opts.AxisOpts(
name="价格(万元)",
type_="value",
min_=200,
max_=900,
interval=100,
axislabel_opts=opts.LabelOpts(formatter="{value}"),
)
)
.set_global_opts(
title_opts=opts.TitleOpts(title='各城区二手房数量-平均价格柱状图'),
tooltip_opts=opts.TooltipOpts(
is_show=True, trigger="axis", axis_pointer_type="cross"
),
xaxis_opts=opts.AxisOpts(
type_="category",
axispointer_opts=opts.AxisPointerOpts(is_show=True, type_="shadow"),
),
yaxis_opts=opts.AxisOpts(name='数量',
axistick_opts=opts.AxisTickOpts(is_show=True),
splitline_opts=opts.SplitLineOpts(is_show=False),)
)
)
line2 = (
Line()
.add_xaxis(xaxis_data=region)
.add_yaxis(
series_name="价格",
yaxis_index=1,
y_axis=price,
label_opts=opts.LabelOpts(is_show=True),
z=10
)
)
bar.overlap(line2)
grid = Grid()
grid.add(bar, opts.GridOpts(pos_left="5%", pos_right="20%"), is_control_axis_index=True)
grid.render_notebook()
#%%
top_price = df.sort_values(by="价格(万元)",ascending=False)[:15]
top_price
#%%
area0 = top_price['小区'].values.tolist()
count = top_price['价格(万元)'].values.tolist()
bar = (
Bar()
.add_xaxis(area0)
.add_yaxis('数量', count,category_gap = '50%')
.set_global_opts(
yaxis_opts=opts.AxisOpts(name='价格(万元)'),
xaxis_opts=opts.AxisOpts(name='数量'),
)
)
bar.render_notebook()
#%%
s = (
Scatter()
.add_xaxis(df['面积(㎡)'].values.tolist())
.add_yaxis('',df['价格(万元)'].values.tolist())
.set_global_opts(xaxis_opts=opts.AxisOpts(type_='value'))
)
s.render_notebook()
#%%
g = df.groupby('朝向')
g.count()['小区']
#%%
df_direction = g.count()['小区']
df_direction
#%%
directions = df_direction.index.tolist()
count = df_direction.values.tolist()
c1 = (
Pie(init_opts=opts.InitOpts(
width='800px', height='600px',
)
)
.add(
'',
[list(z) for z in zip(directions, count)],
radius=['20%', '60%'],
center=['40%', '50%'],
# rosetype="radius",
label_opts=opts.LabelOpts(is_show=True),
)
.set_global_opts(title_opts=opts.TitleOpts(title='房屋朝向占比',pos_left='33%',pos_top="5%"),
legend_opts=opts.LegendOpts(type_="scroll", pos_left="80%",pos_top="25%",orient="vertical")
)
.set_series_opts(label_opts=opts.LabelOpts(formatter='{b}:{c} ({d}%)'),position="outside")
)
c1.render_notebook()
#%%
g1 = df.groupby('装修情况')
g1.count()['小区']
g2 = df.groupby('电梯')
g2.count()['小区']
#%%
df_fitment = g1.count()['小区']
df_direction = g2.count()['小区']
df_fitment
#%%
fitment = df_fitment.index.tolist()
count1 = df_fitment.values.tolist()
directions = df_direction.index.tolist()
count2 = df_direction.values.tolist()
bar = (
Bar()
.add_xaxis(fitment)
.add_yaxis('', count1, category_gap = '50%')
.reversal_axis()
.set_series_opts(label_opts=opts.LabelOpts(position='right'))
.set_global_opts(
xaxis_opts=opts.AxisOpts(name='数量'),
title_opts=opts.TitleOpts(title='装修情况/有无电梯玫瑰图(组合图)',pos_left='33%',pos_top="5%"),
legend_opts=opts.LegendOpts(type_="scroll", pos_left="90%",pos_top="58%",orient="vertical")
)
)
c2 = (
Pie(init_opts=opts.InitOpts(
width='800px', height='600px',
)
)
.add(
'',
[list(z) for z in zip(directions, count2)],
radius=['10%', '30%'],
center=['75%', '65%'],
rosetype="radius",
label_opts=opts.LabelOpts(is_show=True),
)
.set_global_opts(title_opts=opts.TitleOpts(title='有/无电梯',pos_left='33%',pos_top="5%"),
legend_opts=opts.LegendOpts(type_="scroll", pos_left="90%",pos_top="15%",orient="vertical")
)
.set_series_opts(label_opts=opts.LabelOpts(formatter='{b}:{c} \n ({d}%)'),position="outside")
)
bar.overlap(c2)
bar.render_notebook()
#%%
g = df.groupby('楼层')
df_floor = g.count()['小区']
df_floor
#%%
floor = df_floor.index.tolist()
count = df_floor.values.tolist()
bar = (
Bar()
.add_xaxis(floor)
.add_yaxis('数量', count)
.set_global_opts(
title_opts=opts.TitleOpts(title='二手房楼层分布柱状缩放图'),
yaxis_opts=opts.AxisOpts(name='数量'),
xaxis_opts=opts.AxisOpts(name='楼层'),
datazoom_opts=opts.DataZoomOpts(type_='slider')
)
)
bar.render_notebook()
#%%
area_level = [0, 50, 100, 150, 200, 250, 300, 350, 400, 1500]
label_level = ['小于50', '50-100', '100-150', '150-200', '200-250', '250-300', '300-350', '350-400', '大于400']
jzmj_cut = pd.cut(df['面积(㎡)'], area_level, labels=label_level)
df_area = jzmj_cut.value_counts()
df_area
#%%
area = df_area.index.tolist()
count = df_area.values.tolist()
bar = (
Bar()
.add_xaxis(area)
.add_yaxis('数量', count)
.reversal_axis()
.set_series_opts(label_opts=opts.LabelOpts(position="right"))
.set_global_opts(
title_opts=opts.TitleOpts(title='房屋面积分布纵向柱状图'),
yaxis_opts=opts.AxisOpts(name='面积(㎡)'),
xaxis_opts=opts.AxisOpts(name='数量'),
)
)
bar.render_notebook()
#%%
好了,我的这篇文章写到这里就结束啦!
有更多建议或问题可以评论区或私信我哦!一起加油努力叭(ง •_•)ง
喜欢就关注一下博主,或点赞收藏评论一下我的文章叭!!!