楼房信息爬取及用matplotlib做可视化图表 2019-05-30

image.png

今天主要学习两大块内容，一个是爬取深圳楼房信息，一个是如何对已有的数据做可视化。
此次爬取新增的知识点有两个：网页编码问题，以及按指定分隔符拆分字符串的问题。

import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np
# Address of the development listing page that is crawled first.
url = 'http://sz.jiwu.com/loupan/'
# Request headers: fill in your own before running.
headers = {}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'lxml')
# Locate the listing container first, then take every anchor inside it.
listing_div = soup.find('div', class_='loupan-list mt10 clearfix')
urls = listing_div.find_all('a')

# Collect the href of every anchor found on the listing page.
# The iteration variable is deliberately not named `url`, so the listing-page
# address stored in `url` is no longer overwritten by a tag object.
url_lst = [anchor['href'] for anchor in urls]
# Notebook-style echoes: they display the list and its length in an
# interactive session and have no effect when run as a script.
url_lst
len(url_lst)

# Keep only links that contain the listing prefix and do not contain 'page'.
url_lst1 = []
for u in url_lst:
    if 'page' in u or 'http://sz.jiwu.com/loupan/' not in u:
        continue
    url_lst1.append(u)

# Keep every second remaining link (positions 0, 2, 4, ...).
url_lst = url_lst1[0::2]

获取每一个网址的信息

def _split_fragments(node):
    """Return the text of *node* split into fragments, or [] when *node* is None.

    The pattern is a character class, so the text is cut at every newline,
    carriage return and tab, and also at every literal comma.
    """
    if node is None:
        return []
    return re.split('[\n,\r,\t]', node.text)


def datacrawler(urli):
    """Fetch one development page and extract price, opening time and details.

    Args:
        urli: URL of a single development page.

    Returns:
        A dict with the keys '参考总价', '开盘时间' and '房屋信息'. Each value
        is a list of text fragments; the list is empty when the matching
        element is not present on the page (previously this raised
        AttributeError and aborted the whole crawl).

    Raises:
        requests.exceptions.RequestException: if the request fails or does
            not complete within the timeout.
    """
    headers = {
        # Fill in your own request headers.
    }
    # A timeout keeps one unresponsive page from hanging the whole crawl.
    ri = requests.get(url=urli, headers=headers, timeout=30)
    # Decode the body as UTF-8 instead of relying on the detected encoding.
    ri.encoding = 'utf-8'
    soupi = BeautifulSoup(ri.text, 'lxml')

    # Price: drop empty fragments and fragments that contain a space.
    price = [
        item
        for item in _split_fragments(soupi.find('strong', class_='lpm-s6-3 fnone'))
        if item != '' and ' ' not in item
    ]

    # Opening time: keep only the fragments that contain the character '盘'.
    open_time = [
        item
        for item in _split_fragments(soupi.find('p', class_='fl lpm-s8-a1'))
        if '盘' in item
    ]

    # General information: same clean-up rule as for the price.
    house_info = [
        item
        for item in _split_fragments(
            soupi.find('div', class_='lpm-section4 mt30 clearfix'))
        if item != '' and ' ' not in item
    ]

    # Disabled experiments kept from the original:
    #plan_ares = soupi.find('td',class_='lpm-table-2').text
    #Plot_ratio = soupi.find('span',class_='fl').text
    #larg_dist = soupi.find('td',class_='lpm-table-2').text
    #prop_comp = soupi.find('td',class_='lpm-table-2').text
    #dic1 = dict(zip(house_info[::2],house_info[1::2]))
    dic = {'参考总价': price, '开盘时间': open_time, '房屋信息': house_info}
    return dic

获取网址及每个网址对应的信息

# Crawl every development page and collect one result dict per URL.
data = []
n = 0  # stays 0 when there is nothing to crawl
for n, urli in enumerate(url_lst, start=1):
    print('采集网址：', urli)
    data.append(datacrawler(urli))
    # '\n' (not '/n') so a blank line separates the per-page log entries.
    print('采集网页成功，总采集%i条网址\n' % n)

# One row per crawled page; columns are the keys returned by datacrawler.
tb = pd.DataFrame(data)
# Text after the first ':' of the stringified price list.
tb['参考售价'] = tb['参考总价'].astype(str).str.split(':').str[1]
# The original also read tb['建筑形式'] before that column existed, which
# raises KeyError; the value was overwritten on the next line anyway, so the
# statement is dropped.
house_info_parts = tb['房屋信息'].astype(str).str.split(',')
tb['建筑形式'] = house_info_parts.str[1]
tb['物业类型'] = house_info_parts.str[2]
# `encoding` is no longer accepted by DataFrame.to_excel in pandas 2.x and
# was unused before that, so it is omitted.
tb.to_excel('D:/深圳新房楼盘信息.xlsx')

此次爬取没什么难点，但是数据清洗方面还有很多欠缺之处，需要继续完善。由于爬取到的数据数字信息太少，故采用另一份已有的数据进行可视化操作，如下：

# Load the data set used for the charts below. This rebinds `data`, which
# held the list of crawl results earlier in the file.
data = pd.read_csv(r'C:\Users\Administrator\Desktop\资料02_深圳罗湖二手房信息.csv',engine = 'python')
# Notebook-style echo of the loaded table (no effect when run as a script).
data

做气泡图，横轴为经度，纵轴为纬度，圆圈的大小表示单价的高低，圆圈的深浅表示总价的高低

# Bubble chart: the '经度' column on the x axis, the '纬度' column on the y axis.
# NOTE(review): `plt` is never imported in the visible source; presumably
# `import matplotlib.pyplot as plt` is expected. Confirm and add it.
plt.scatter(data['经度'],data['纬度'],
            s = data['房屋单价']/300,# marker size
            c = data['参考总价'],# colour intensity
            alpha = 0.6,cmap = 'Reds'# transparency, colormap
           )
plt.grid()# add grid lines

image.png

直方图--查看参考首付的分布

# Histogram of the '参考首付' column using the default binning.
data['参考首付'].hist()
#data['参考首付'].hist(bins=20)# alternative: split the range into 20 bins

image.png

def _as_percent(ratio):
    """Format a ratio such as 0.1234 as the string '12.34%'."""
    return '{:.2f}%'.format(ratio * 100)


# Cut the '参考总价' column into 10 left-closed intervals.
gcut = pd.cut(data['参考总价'], bins=10, right=False)
# Count the rows per interval, keeping the intervals in their natural order.
gcut_count = gcut.value_counts(sort=False)

# Frequency table: count, frequency and cumulative frequency per interval,
# plus percentage-formatted copies of the two frequency columns.
r_zj = gcut_count.to_frame().rename(columns={gcut_count.name: '频数'})
r_zj['频率'] = r_zj['频数'].div(r_zj['频数'].sum())
r_zj['累计频率'] = r_zj['频率'].cumsum()
r_zj['频率%'] = r_zj['频率'].map(_as_percent)
r_zj['累计频率%'] = r_zj['累计频率'].map(_as_percent)
r_zj

绘制频率直方图

# Bar chart of the per-interval frequency.
#r_zj['频率'].plot(kind = 'bar')
# The colour is lowercase 'k': uppercase single-letter colours are rejected
# by current Matplotlib releases, and the labels below already use 'k'.
r_zj['频率'].plot(kind='bar', width=0.8, figsize=(12, 2), rot=0,
                color='k', grid=True, alpha=0.5)
# Label each bar with its count, placed slightly above the bar top.
for i, (j, k) in enumerate(zip(r_zj['频率'], r_zj['频数'])):
    plt.text(i - 0.1, j + 0.01, '%i' % k, color='k')

做密度图

import seaborn as sns
# Import seaborn
sns.set_context("paper")# select seaborn's "paper" context (author's note: the smallest output scale)
# Histogram - distplot()
# NOTE(review): distplot is reported as deprecated in recent seaborn releases; confirm against the installed version.
rs = np.random.RandomState(10)  # random generator seeded with 10
s = pd.Series(rs.randn(100) * 100)
# The two lines above correspond to s = pd.Series(np.random.randn(100) * 100), but with a fixed seed
sns.distplot(s,bins = 10,hist = True,kde = True,norm_hist=False,
            rug = True,vertical = False,
            color = 'y',label = 'distplot',axlabel = 'x')
plt.legend()
plt.grid()
# bins → number of bins
# hist, kde → whether to draw the histogram bars / the density curve
# norm_hist → whether the histogram is drawn as a density
# rug → whether to mark the individual observations
# vertical → whether the plot is drawn vertically instead of horizontally
# color → colour
# label → legend label
# axlabel → x-axis label

image.png

# Combined scatter plot (joint figure) - jointplot()
# scatter plot + distribution plots
# density plot

df = pd.DataFrame(np.random.randn(300,2),columns = ['A','B'])
# Build the sample data: 300 rows of random values in columns 'A' and 'B'

g = sns.jointplot(x=df['A'], y=df['B'],data = df,
                  kind="kde", color="k")
# kind : { "scatter" | "reg" | "resid" | "kde" | "hex" }
# Draw the joint density plot

g.plot_joint(plt.scatter,c="w", s=30, linewidth=1, marker="+")
plt.grid(alpha = 0.3)
# Overlay the individual points as white '+' markers

image.png

# Indicators that are compared with each other should be of similar magnitude.
# (1) comparison with a line chart
# (2) comparison with a multi-series bar chart

# Sample data: one row per day of June 2017, one random sales column for
# product A and one for product B.
data = pd.DataFrame(
    1000 * np.random.rand(30, 2),
    index=pd.period_range('20170601', '20170630'),
    columns=['A_sale', 'B_sale'],
)
print(data)

# (1) Line chart of both series.
data.plot.line(
    style='--.',
    alpha=0.8,
    figsize=(10, 3),
    title='AB产品销量对比-折线图',
)

# (2) Multi-series bar chart of both series.
data.plot.bar(
    width=0.8,
    alpha=0.8,
    figsize=(10, 3),
    title='AB产品销量对比-柱状图',
)

image.png

（3）柱状图堆叠图+差值折线图比较

# One figure holding two subplots stacked vertically (2 rows x 1 column).
fig3 = plt.figure(figsize=(10, 6))
plt.subplots_adjust(hspace=0.3)  # vertical gap between the subplots

# Shared x positions and the two bar series; B is negated so that its bars
# point downwards.
x = range(len(data))
y1 = data['A_sale']
y2 = -data['B_sale']

# Upper subplot: A bars above zero, B bars below zero.
ax1 = fig3.add_subplot(211)
ax1.bar(x, y1, width=1, facecolor='yellowgreen')
ax1.bar(x, y2, width=1, facecolor='lightskyblue')
ax1.set_title('AB产品销量对比-堆叠图')
ax1.grid()
ax1.set_xticks(range(0, 30, 6))  # one tick every 6 positions
ax1.set_xticklabels(data.index[::6])

# Lower subplot: difference A - B as a dashed green line with circle markers.
ax2 = fig3.add_subplot(212)
y3 = data['A_sale'] - data['B_sale']
ax2.plot(x, y3, '--go')
#plt.axhline(0,hold=None,color='r',linestyle="--",alpha=0.8)  # optional horizontal reference line at 0
ax2.grid()
ax2.set_title('AB产品销量对比-差值折线')
ax2.set_xticks(range(0, 30, 6))
ax2.set_xticklabels(data.index[::6])

image.png

楼房信息爬取及用matplotlib做可视化图表 2019-05-30

你可能感兴趣的:(楼房信息爬取及用matplotlib做可视化图表 2019-05-30)