数据集链接:爱彼迎短租数据集
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif']=['SimHei'] #用来正常显示中文标签
plt.rcParams['axes.unicode_minus']=False #用来正常显示负号
%matplotlib inline
# Basic listing information: listing, host, location, room type, price, review counts, availability, etc.
listings=pd.read_csv('listings.csv')
listings.head()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
id | name | host_id | host_name | neighbourhood_group | neighbourhood | latitude | longitude | room_type | price | minimum_nights | number_of_reviews | last_review | reviews_per_month | calculated_host_listings_count | availability_365 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 44054 | Modern and Comfortable Living in CBD | 192875 | East Apartments | NaN | 朝阳区 / Chaoyang | 39.89503 | 116.45163 | Entire home/apt | 792 | 1 | 89 | 2019-03-04 | 0.85 | 9 | 341 |
1 | 100213 | The Great Wall Box Deluxe Suite A团园长城小院东院套房 | 527062 | Joe | NaN | 密云县 / Miyun | 40.68434 | 117.17231 | Private room | 1201 | 1 | 2 | 2017-10-08 | 0.10 | 4 | 0 |
2 | 128496 | Heart of Beijing: House with View 2 | 467520 | Cindy | NaN | 东城区 | 39.93213 | 116.42200 | Entire home/apt | 389 | 3 | 259 | 2019-02-05 | 2.70 | 1 | 93 |
3 | 161902 | cozy studio in center of Beijing | 707535 | Robert | NaN | 东城区 | 39.93357 | 116.43577 | Entire home/apt | 376 | 1 | 26 | 2016-12-03 | 0.28 | 5 | 290 |
4 | 162144 | nice studio near subway, sleep 4 | 707535 | Robert | NaN | 朝阳区 / Chaoyang | 39.93668 | 116.43798 | Entire home/apt | 537 | 1 | 37 | 2018-08-01 | 0.40 | 5 | 352 |
# Per-listing calendar: listing, date, availability flag, price and night limits.
calendar=pd.read_csv('calendar_detail.csv')
calendar.head()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
listing_id | date | available | price | adjusted_price | minimum_nights | maximum_nights | |
---|---|---|---|---|---|---|---|
0 | 1165040 | 2019-04-17 | f | $511.00 | $511.00 | 1.0 | 1125.0 |
1 | 1165040 | 2019-04-18 | t | $511.00 | $511.00 | 1.0 | 1125.0 |
2 | 1165040 | 2019-04-19 | t | $511.00 | $511.00 | 1.0 | 1125.0 |
3 | 1165040 | 2019-04-20 | t | $511.00 | $511.00 | 1.0 | 1125.0 |
4 | 1165040 | 2019-04-21 | t | $511.00 | $511.00 | 1.0 | 1125.0 |
# Administrative districts of Beijing.
neighbour=pd.read_csv('neighbourhoods.csv')
neighbour.head()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
neighbourhood_group | neighbourhood | |
---|---|---|
0 | NaN | 东城区 |
1 | NaN | 丰台区 / Fengtai |
2 | NaN | 大兴区 / Daxing |
3 | NaN | 密云县 / Miyun |
4 | NaN | 平谷区 / Pinggu |
# Review records: listing_id, review date, review text and reviewer information.
reviews=pd.read_csv('reviews_detail.csv')
reviews.head()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
listing_id | id | date | reviewer_id | reviewer_name | comments | |
---|---|---|---|---|---|---|
0 | 44054 | 84748 | 2010-08-25 | 207019 | Jarrod | Sev was very helpful. Sev showed us where to ... |
1 | 44054 | 118384 | 2010-10-13 | 218723 | Kimberly | We arrived in Beijing very early in the mornin... |
2 | 44054 | 436978 | 2011-08-11 | 609177 | Emma | It is a really massive apartment and really co... |
3 | 44054 | 1118657 | 2012-04-12 | 1787536 | Andreyna | Sev was incredibly helpful, showed us around t... |
4 | 44054 | 2140650 | 2012-08-30 | 1179565 | Frances | The appartment was ideal for our party of 6 ad... |
listings.head()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
id | name | host_id | host_name | neighbourhood_group | neighbourhood | latitude | longitude | room_type | price | minimum_nights | number_of_reviews | last_review | reviews_per_month | calculated_host_listings_count | availability_365 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 44054 | Modern and Comfortable Living in CBD | 192875 | East Apartments | NaN | 朝阳区 / Chaoyang | 39.89503 | 116.45163 | Entire home/apt | 792 | 1 | 89 | 2019-03-04 | 0.85 | 9 | 341 |
1 | 100213 | The Great Wall Box Deluxe Suite A团园长城小院东院套房 | 527062 | Joe | NaN | 密云县 / Miyun | 40.68434 | 117.17231 | Private room | 1201 | 1 | 2 | 2017-10-08 | 0.10 | 4 | 0 |
2 | 128496 | Heart of Beijing: House with View 2 | 467520 | Cindy | NaN | 东城区 | 39.93213 | 116.42200 | Entire home/apt | 389 | 3 | 259 | 2019-02-05 | 2.70 | 1 | 93 |
3 | 161902 | cozy studio in center of Beijing | 707535 | Robert | NaN | 东城区 | 39.93357 | 116.43577 | Entire home/apt | 376 | 1 | 26 | 2016-12-03 | 0.28 | 5 | 290 |
4 | 162144 | nice studio near subway, sleep 4 | 707535 | Robert | NaN | 朝阳区 / Chaoyang | 39.93668 | 116.43798 | Entire home/apt | 537 | 1 | 37 | 2018-08-01 | 0.40 | 5 | 352 |
#对listings表分析
listings.info()
RangeIndex: 28452 entries, 0 to 28451
Data columns (total 16 columns):
id 28452 non-null int64
name 28451 non-null object
host_id 28452 non-null int64
host_name 28452 non-null object
neighbourhood_group 0 non-null float64
neighbourhood 28452 non-null object
latitude 28452 non-null float64
longitude 28452 non-null float64
room_type 28452 non-null object
price 28452 non-null int64
minimum_nights 28452 non-null int64
number_of_reviews 28452 non-null int64
last_review 17294 non-null object
reviews_per_month 17294 non-null float64
calculated_host_listings_count 28452 non-null int64
availability_365 28452 non-null int64
dtypes: float64(4), int64(7), object(5)
memory usage: 3.5+ MB
# 观察发现listings表有几个问题:
# 1.neighbourhood_group列存在很多空值,查看统计信息
# 2.neighbourhood列中英文混杂(如“朝阳区 / Chaoyang”),决定删掉“/ 英文名”部分,仅保留neighbourhood列的中文部分
# 3.查看经纬度是否有异常值
# 4.查看房屋类型有多少种
# 5.查看价格是否存在异常值
# 6.查看最小入住天数是否有异常值
# 7.查看评论数前10的id
# 8.查看每月评论数前十的id
# 9.查看365天中天数是否有异常值
# 10.name,last_review和reviews_per_month中都存在空值,不过影响不大
# neighbourhood_group has many missing values; look at its summary statistics.
listings['neighbourhood_group'].describe()
count 0.0
mean NaN
std NaN
min NaN
25% NaN
50% NaN
75% NaN
max NaN
Name: neighbourhood_group, dtype: float64
由以上neighbourhood_group列的统计信息可知,neighbourhood_group列全为空值,无意义,
所以决定在listings表中删除neighbourhood_group列。
# Remove the neighbourhood_group column, then preview the table.
listings = listings.drop(columns=['neighbourhood_group'])
listings.head()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
id | name | host_id | host_name | neighbourhood | latitude | longitude | room_type | price | minimum_nights | number_of_reviews | last_review | reviews_per_month | calculated_host_listings_count | availability_365 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 44054 | Modern and Comfortable Living in CBD | 192875 | East Apartments | 朝阳区 / Chaoyang | 39.89503 | 116.45163 | Entire home/apt | 792 | 1 | 89 | 2019-03-04 | 0.85 | 9 | 341 |
1 | 100213 | The Great Wall Box Deluxe Suite A团园长城小院东院套房 | 527062 | Joe | 密云县 / Miyun | 40.68434 | 117.17231 | Private room | 1201 | 1 | 2 | 2017-10-08 | 0.10 | 4 | 0 |
2 | 128496 | Heart of Beijing: House with View 2 | 467520 | Cindy | 东城区 | 39.93213 | 116.42200 | Entire home/apt | 389 | 3 | 259 | 2019-02-05 | 2.70 | 1 | 93 |
3 | 161902 | cozy studio in center of Beijing | 707535 | Robert | 东城区 | 39.93357 | 116.43577 | Entire home/apt | 376 | 1 | 26 | 2016-12-03 | 0.28 | 5 | 290 |
4 | 162144 | nice studio near subway, sleep 4 | 707535 | Robert | 朝阳区 / Chaoyang | 39.93668 | 116.43798 | Entire home/apt | 537 | 1 | 37 | 2018-08-01 | 0.40 | 5 | 352 |
# List the distinct values in the neighbourhood column.
listings['neighbourhood'].unique()
array(['朝阳区 / Chaoyang', '密云县 / Miyun', '东城区', '西城区', '海淀区',
'顺义区 / Shunyi', '房山区', '怀柔区 / Huairou', '昌平区', '通州区 / Tongzhou',
'丰台区 / Fengtai', '大兴区 / Daxing', '延庆县 / Yanqing', '石景山区',
'门头沟区 / Mentougou', '平谷区 / Pinggu'], dtype=object)
# Keep only the Chinese district name. Values look like '朝阳区 / Chaoyang':
# take the text before the first '/' and trim surrounding whitespace.
# Vectorised string methods replace the original per-row loop, which used
# chained indexing (listings['neighbourhood'][i]) plus a .loc write per row.
listings['neighbourhood'] = (
    listings['neighbourhood'].str.split('/').str[0].str.strip()
)
listings['neighbourhood'].unique()
array(['朝阳区', '密云县', '东城区', '西城区', '海淀区', '顺义区', '房山区', '怀柔区', '昌平区',
'通州区', '丰台区', '大兴区', '延庆县', '石景山区', '门头沟区', '平谷区'], dtype=object)
# Check the coordinates for outliers (longitude summary statistics).
listings['longitude'].describe()
count 28452.000000
mean 116.442000
std 0.204796
min 115.473390
25% 116.355283
50% 116.434665
75% 116.491122
max 117.495270
Name: longitude, dtype: float64
# Side-by-side box plots: longitude on the left, latitude on the right.
fig, (lon_ax, lat_ax) = plt.subplots(1, 2)
listings.boxplot(column='longitude', ax=lon_ax)
listings.boxplot(column='latitude', ax=lat_ax)
plt.show()
[外链图片转存失败(img-lm4nPlmK-1569078306325)(output_18_0.png)]
#从经纬度的箱型图看,考虑到北京城区面积大,异常点的误差都在1°内,
# 所以把所有经纬度数据认为是正常值范围。
# List the distinct room types.
# Per the output there are three: Entire home/apt, Private room, Shared room.
listings['room_type'].unique()
array(['Entire home/apt', 'Private room', 'Shared room'], dtype=object)
# 查看价格是否存在异常值
listings['price'].describe()
count 28452.000000
mean 611.203325
std 1623.535077
min 0.000000
25% 235.000000
50% 389.000000
75% 577.000000
max 68983.000000
Name: price, dtype: float64
# One price box plot per room type, laid out in a single row of three axes.
room_types = ['Entire home/apt', 'Private room', 'Shared room']
fig, axes = plt.subplots(1, len(room_types))
for ax, kind in zip(axes, room_types):
    listings[listings.room_type == kind].boxplot(column='price', ax=ax)
    ax.set_title(kind)
plt.show()
[外链图片转存失败(img-2PSdDcwj-1569078306326)(output_24_0.png)]
# Show the listings whose price is 0.
listings[listings['price']==0]
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
id | name | host_id | host_name | neighbourhood | latitude | longitude | room_type | price | minimum_nights | number_of_reviews | last_review | reviews_per_month | calculated_host_listings_count | availability_365 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
5085 | 20670843 | 【胡同老宅~轻语竹林房】旅游绝佳地段】步行即到雍和宫、近故宫天安门、南锣鼓巷、美食簋街 | 129840905 | Jing | 东城区 | 39.94292 | 116.41323 | Entire home/apt | 0 | 2 | 81 | 2019-03-31 | 4.09 | 8 | 27 |
5806 | 21246510 | 限时 北京二环四合院别墅拍摄聚会 商务会议 娱乐同仁堂老宅 近簋街、雍和宫、东直门、南锣鼓巷... | 83233661 | Eva | 东城区 | 39.93677 | 116.42076 | Entire home/apt | 0 | 1 | 0 | NaN | NaN | 6 | 167 |
28234 | 33895187 | 测试房源mm2 | 185140389 | Ning Host | 朝阳区 | 39.98147 | 116.47109 | Private room | 0 | 1 | 0 | NaN | NaN | 2 | 359 |
# Judging by the listing names, a price of 0 cannot be genuine,
# so those prices are replaced with NaN (the column becomes float).
listings['price'] = listings['price'].mask(listings['price'] == 0)
listings['price'].describe()
count 28449.000000
mean 611.267777
std 1623.608547
min 27.000000
25% 235.000000
50% 389.000000
75% 577.000000
max 68983.000000
Name: price, dtype: float64
# Inspect the listings priced above 50000.
# Author's judgement: Beijing has many courtyard houses (siheyuan), so such prices may exist;
# prices above 50000 are therefore not treated as outliers here.
listings[listings['price']>50000]
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
id | name | host_id | host_name | neighbourhood | latitude | longitude | room_type | price | minimum_nights | number_of_reviews | last_review | reviews_per_month | calculated_host_listings_count | availability_365 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1067 | 12689987 | Artistic apartment with culture | 68973377 | 晨斌 | 朝阳区 | 39.92300 | 116.57996 | Entire home/apt | 67104.0 | 1 | 2 | 2016-06-03 | 0.06 | 2 | 365 |
2012 | 15488817 | Hotel apartment close to huge Mall | 68973377 | 晨斌 | 朝阳区 | 39.91962 | 116.59173 | Entire home/apt | 63346.0 | 1 | 16 | 2017-04-17 | 0.55 | 2 | 180 |
5167 | 20748712 | 大望路/九龙山大床房 | 141070198 | 洋 | 朝阳区 | 39.88798 | 116.47667 | Entire home/apt | 59997.0 | 1 | 1 | 2017-09-05 | 0.05 | 3 | 91 |
6612 | 21942314 | 【温馨小窝窝】近地铁一号线五棵松/万寿路,距离北京西站3站地,15分钟。 | 48178909 | Qing | 海淀区 | 39.89523 | 116.28252 | Shared room | 59997.0 | 1 | 4 | 2018-03-23 | 0.24 | 1 | 180 |
10170 | 24994830 | 良乡大学城两室温馨小屋 | 188806180 | 王 | 房山区 | 39.72157 | 116.15182 | Entire home/apt | 68828.0 | 1 | 1 | 2018-09-28 | 0.15 | 1 | 181 |
13668 | 27587044 | 房源已下架 | 208158466 | 晶 | 昌平区 | 40.08912 | 116.29895 | Private room | 66667.0 | 30 | 0 | NaN | NaN | 1 | 91 |
14697 | 28134193 | 此房不能租,不要询问了 | 212328505 | 陈 | 海淀区 | 39.94947 | 116.36246 | Entire home/apt | 68983.0 | 1 | 1 | 2018-09-10 | 0.14 | 1 | 90 |
16207 | 28803519 | 【北京站地铁3分钟.故宫周边最优惠.王府井商圈】溪流到家静雅民宿 | 216392612 | 容 | 东城区 | 39.90583 | 116.42199 | Entire home/apt | 65970.0 | 1 | 1 | 2018-10-29 | 0.18 | 1 | 0 |
17083 | 29138170 | 全A小筑 | 74938348 | 洋 | 朝阳区 | 39.89685 | 116.45925 | Entire home/apt | 59997.0 | 1 | 0 | NaN | NaN | 1 | 0 |
21809 | 31535043 | 水立方,鸟巢附近六人间男神床位 | 236331220 | 王林 | 昌平区 | 40.07817 | 116.42163 | Shared room | 67909.0 | 1 | 0 | NaN | NaN | 2 | 180 |
listings['minimum_nights'].describe()
count 28452.000000
mean 2.729685
std 17.920932
min 1.000000
25% 1.000000
50% 1.000000
75% 1.000000
max 1125.000000
Name: minimum_nights, dtype: float64
# Box plot of the minimum-nights column.
listings.boxplot(column='minimum_nights')
plt.show()
[外链图片转存失败(img-NiB0o4PZ-1569078306327)(output_30_0.png)]
# Inspect listings whose minimum stay exceeds 400 nights.
# Author's note: given location and room type, these values were judged plausible.
listings[listings['minimum_nights']>400]
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
id | name | host_id | host_name | neighbourhood | latitude | longitude | room_type | price | minimum_nights | number_of_reviews | last_review | reviews_per_month | calculated_host_listings_count | availability_365 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1175 | 13183350 | 鸟巢旁欧式罗曼蒂克两居 | 56183837 | Jack | 朝阳区 | 39.99724 | 116.40110 | Entire home/apt | 463.0 | 1000 | 2 | 2016-07-25 | 0.06 | 1 | 0 |
5609 | 21098578 | 6号14号线金台路青旅燕儿窝上下铺国贸CBD | 129486966 | 磊 | 朝阳区 | 39.91924 | 116.48459 | Shared room | 121.0 | 1124 | 4 | 2018-08-03 | 0.33 | 6 | 362 |
6505 | 21841908 | 鸟巢水立方朝南大主卧 | 159278266 | 小梦 | 海淀区 | 40.03241 | 116.36671 | Private room | 255.0 | 1000 | 0 | NaN | NaN | 1 | 0 |
19983 | 30752723 | 中关村新东方北大清华五道口颐和园圆明园魏公村 | 220133868 | 未知 | 海淀区 | 39.98541 | 116.31650 | Entire home/apt | 9998.0 | 1125 | 1 | 2019-01-29 | 0.38 | 2 | 0 |
25664 | 33297102 | 姚家园西里小区 | 250785867 | 子豪 | 朝阳区 | 39.94690 | 116.51349 | Shared room | 2798.0 | 500 | 0 | NaN | NaN | 1 | 365 |
# Inspect the availability_365 column.
# Per the summary statistics the maximum is 365 and the minimum is 0, both plausible,
# so the author treats the column as having no outliers.
listings['availability_365'].describe()
count 28452.000000
mean 220.342120
std 138.430677
min 0.000000
25% 87.000000
50% 209.000000
75% 361.000000
max 365.000000
Name: availability_365, dtype: float64
# Plot every listing by longitude/latitude and mark the mean coordinate in red.
longitude_mean = listings['longitude'].mean()
latitude_mean = listings['latitude'].mean()
plt.scatter(x=listings['longitude'], y=listings['latitude'], alpha=0.1)
plt.scatter(x=longitude_mean, y=latitude_mean, c='r')
# The annotation is the tuple of both coordinates formatted to three decimals,
# drawn 0.1 below the mean point.
mean_label = ('{:1.3f}'.format(longitude_mean), '{:1.3f}'.format(latitude_mean))
plt.text(longitude_mean, latitude_mean - 0.1, mean_label,
         ha='center', va='bottom', fontsize=12)
plt.title('北京房源的地理位置分布情况')
plt.xlabel('经度')
plt.ylabel('纬度')
plt.savefig('北京房源的地理位置分布散点图.png', dpi=500, bbox_inches='tight')
plt.show()
[外链图片转存失败(img-wnd30ozn-1569078306327)(output_34_0.png)]
由散点图可看出北京房源的地理位置分布情况
其中经纬度的均值点为(116.442,39.983),该点为北京市朝阳区西坝河路附近
北京朝阳区的房源密度最高
# Bar chart of the number of listings per district.
# The counts are computed once and reused (the original recomputed value_counts() four times).
district_counts = listings['neighbourhood'].value_counts()
plt.bar(district_counts.index, district_counts)
plt.title('北京各城区房源数量')
# rotation must be a number: current matplotlib rejects the string '45'.
plt.xticks(rotation=45)
plt.savefig('北京各城区房源数量.png', dpi=500, bbox_inches='tight')
plt.show()
[外链图片转存失败(img-TNNfZQMX-1569078306328)(output_36_0.png)]
listings['neighbourhood'].value_counts()
朝阳区 10810
东城区 3346
海淀区 3197
丰台区 1758
西城区 1701
通州区 1290
昌平区 1034
密云县 935
顺义区 920
怀柔区 833
大兴区 823
延庆县 718
房山区 579
石景山区 213
门头沟区 152
平谷区 143
Name: neighbourhood, dtype: int64
由北京各城区房源数量条形图可看出,北京朝阳区的房源数量最多,超过10000套,远远高于其他行政区,北京平谷区房源最少.
房源数量排名前三位的分别是朝阳区,东城区和海淀区.
listings['neighbourhood'].value_counts()
朝阳区 10810
东城区 3346
海淀区 3197
丰台区 1758
西城区 1701
通州区 1290
昌平区 1034
密云县 935
顺义区 920
怀柔区 833
大兴区 823
延庆县 718
房山区 579
石景山区 213
门头沟区 152
平谷区 143
Name: neighbourhood, dtype: int64
# Listing count per room type (bars, left axis) and mean price per room type (red line, right axis).
fig, ax1 = plt.subplots()
data1 = listings['room_type'].value_counts()
t = data1.index
# Look the mean prices up by room type in the same order as the bars, so each
# price is paired with its own room type whatever order value_counts() returns.
data2 = listings.groupby('room_type')['price'].mean().reindex(t).tolist()
ax1.bar(t, data1, width=0.3)
ax1.set_ylabel('房源数量')
# Put the count above each bar.
for pos, count in enumerate(data1):
    ax1.text(pos, count + 200, count, ha='center')
ax2 = ax1.twinx()
# Mean price per room type on the secondary axis.
ax2.plot(t, data2, c='r')
ax2.set_ylabel('平均价格')
plt.show()
[外链图片转存失败(img-yWhxoEAv-1569078306328)(output_40_0.png)]
# Grouped bar chart: number of listings per district, split by room type.
# Districts are ordered by total listing count, largest first.
labels = listings['neighbourhood'].value_counts().index.tolist()
room_types = ['Entire home/apt', 'Private room', 'Shared room']
# Cross-tabulate and align every room type to the SAME district order.
# The original took value_counts().values per room type, each sorted by its own
# counts, so bars did not line up with the district labels underneath them.
# Missing district/room-type combinations are plotted as 0.
counts = pd.crosstab(listings['neighbourhood'], listings['room_type'])
counts = counts.reindex(index=labels, columns=room_types, fill_value=0)
x = np.arange(len(labels))
for offset, kind in enumerate(room_types):
    plt.bar(x + 0.25 * offset, counts[kind].values, width=0.25, label=kind)
plt.title('北京不同地区不同房型的房源数量')
# rotation must be numeric; current matplotlib rejects the string '60'.
plt.xticks(x, labels, rotation=60)
plt.legend()
plt.savefig('北京各城区不同房型的房源数量.png', dpi=500, bbox_inches='tight')
plt.show()
[外链图片转存失败(img-CB99ABrO-1569078306328)(output_41_0.png)]
# Line chart: mean price per district for each room type.
room_types = ['Entire home/apt', 'Private room', 'Shared room']
mm = listings.groupby(['neighbourhood', 'room_type'])['price'].mean()
# Pivot to one row per district and one column per room type, then select by
# label. The original indexed the MultiIndex Series by position (mm[3*i+k]),
# which is only right when every district has all three room types.
mean_table = mm.unstack('room_type').reindex(columns=room_types)
labels = mean_table.index.tolist()
xx = np.arange(len(labels))
y11 = mean_table['Entire home/apt'].tolist()
y22 = mean_table['Private room'].tolist()
y33 = mean_table['Shared room'].tolist()
plt.plot(xx, y11, label='Entire home/apt')
plt.plot(xx, y22, label='Private room')
plt.plot(xx, y33, label='Shared room')
plt.legend()
# rotation must be numeric; current matplotlib rejects the string '60'.
plt.xticks(xx, labels, rotation=60)
plt.show()
[外链图片转存失败(img-nnLBtSZJ-1569078306329)(output_42_0.png)]
# Bar chart: mean price per district for each room type.
# Author's observation: in some districts the Shared room mean exceeds the
# Entire home/apt mean, which looks unreasonable.
# NOTE(review): the original note proposed averaging only the 25%-75% price
# range per room type, but this cell still plots the plain mean.
room_types = ['Entire home/apt', 'Private room', 'Shared room']
mm = listings.groupby(['neighbourhood', 'room_type'])['price'].mean()
# Select by label after pivoting. The original positional mm[3*i+k] lookup
# assumed every district has all three room types.
mean_table = mm.unstack('room_type').reindex(columns=room_types)
labels = mean_table.index.tolist()
xx = np.arange(len(labels))
y11 = mean_table['Entire home/apt'].tolist()
y22 = mean_table['Private room'].tolist()
y33 = mean_table['Shared room'].tolist()
plt.bar(xx, y11, width=0.1, label='Entire home/apt')
plt.bar(xx + 0.1, y22, width=0.1, label='Private room')
plt.bar(xx + 0.2, y33, width=0.1, label='Shared room')
plt.legend()
# rotation must be numeric; current matplotlib rejects the string '60'.
plt.xticks(xx, labels, rotation=60)
plt.show()
[外链图片转存失败(img-nWVXDVbX-1569078306329)(output_44_0.png)]
考虑到这个图有部分城区的Shared room比Entire home/apt的平均价格还高,显然是不合理的,
所以我从实际出发,取箱型图的中位数作为各城区不同房型的价格参考标准。
# Price box plots for each room type, saved to a PNG.
room_types = ['Entire home/apt', 'Private room', 'Shared room']
fig, axes = plt.subplots(1, len(room_types))
for ax, kind in zip(axes, room_types):
    listings[listings.room_type == kind].boxplot(column='price', ax=ax)
    ax.set_title(kind)
plt.savefig('不同房型价格分布箱型图.png', dpi=500, bbox_inches='tight')
plt.show()
[外链图片转存失败(img-yV66TVHx-1569078306329)(output_46_0.png)]
# Median Entire home/apt price per district (the '50%' row of describe()).
entire_home = listings[listings.room_type == 'Entire home/apt']
entire_home.groupby('neighbourhood')['price'].median().rename('50%')
neighbourhood
东城区 530.0
丰台区 396.0
大兴区 379.0
密云县 799.0
平谷区 819.0
延庆县 1000.0
怀柔区 1678.0
房山区 282.0
昌平区 537.0
朝阳区 470.0
海淀区 490.0
石景山区 429.0
西城区 497.0
通州区 336.0
门头沟区 289.0
顺义区 396.0
Name: 50%, dtype: float64
# Median Private room price per district (the '50%' row of describe()).
private_room = listings[listings.room_type == 'Private room']
private_room.groupby('neighbourhood')['price'].median().rename('50%')
neighbourhood
东城区 336.0
丰台区 188.0
大兴区 177.5
密云县 356.0
平谷区 382.0
延庆县 497.0
怀柔区 537.0
房山区 201.0
昌平区 201.0
朝阳区 215.0
海淀区 242.0
石景山区 251.5
西城区 302.0
通州区 188.0
门头沟区 899.0
顺义区 255.0
Name: 50%, dtype: float64
# Median Shared room price per district (the '50%' row of describe()).
shared_room = listings[listings.room_type == 'Shared room']
shared_room.groupby('neighbourhood')['price'].median().rename('50%')
neighbourhood
东城区 107.0
丰台区 127.0
大兴区 148.0
密云县 148.0
平谷区 101.0
延庆县 1188.0
怀柔区 886.0
房山区 174.0
昌平区 107.0
朝阳区 101.0
海淀区 101.0
石景山区 140.5
西城区 107.0
通州区 94.0
门头沟区 94.0
顺义区 157.5
Name: 50%, dtype: float64
# Line chart: median price per district for each room type
# (the author found this chart unattractive and discarded it).
room_types = ['Entire home/apt', 'Private room', 'Shared room']
# A single grouped median pivoted by room type keeps all three series aligned
# on the same district index. The original ran three separate groupbys and
# matched them to a hard-coded label list purely by position.
median_table = (
    listings.groupby(['neighbourhood', 'room_type'])['price']
    .median()
    .unstack('room_type')
    .reindex(columns=room_types)
)
labels = median_table.index.tolist()
xx = np.arange(len(labels))
y11 = median_table['Entire home/apt'].values
y22 = median_table['Private room'].values
y33 = median_table['Shared room'].values
plt.plot(xx, y11, label='Entire home/apt')
plt.plot(xx, y22, label='Private room')
plt.plot(xx, y33, label='Shared room')
plt.legend()
# rotation must be numeric; current matplotlib rejects the string '60'.
plt.xticks(xx, labels, rotation=60)
plt.show()
[外链图片转存失败(img-xiRIpoJM-1569078306330)(output_50_0.png)]
# Grouped bar chart: median price per district for each room type.
room_types = ['Entire home/apt', 'Private room', 'Shared room']
# A single grouped median pivoted by room type keeps all three series aligned
# on the same district index. The original ran three separate groupbys and
# matched them to a hard-coded label list purely by position.
median_table = (
    listings.groupby(['neighbourhood', 'room_type'])['price']
    .median()
    .unstack('room_type')
    .reindex(columns=room_types)
)
labels = median_table.index.tolist()
xx = np.arange(len(labels))
y11 = median_table['Entire home/apt'].values
y22 = median_table['Private room'].values
y33 = median_table['Shared room'].values
plt.bar(xx, y11, width=0.2, label='Entire home/apt')
plt.bar(xx + 0.2, y22, width=0.2, label='Private room')
plt.bar(xx + 0.4, y33, width=0.2, label='Shared room')
plt.legend()
# rotation must be numeric; current matplotlib rejects the string '60'.
plt.xticks(xx, labels, rotation=60)
plt.grid(axis='y', alpha=0.2)
plt.title('北京地区不同城区不同房型中位数价格分布')
plt.savefig('北京地区各城区不同房型中位数价格分布.png', dpi=500, bbox_inches='tight')
plt.show()
[外链图片转存失败(img-tqB1XZ1n-1569078306330)(output_51_0.png)]
怀柔区,延庆县,平谷区,密云县的Entire home/apt的中位数价格远高于北京主城区,可能是因为这些行政区内有多处度假村。
延庆县,怀柔区的Shared room的价格高于Private room,可能是因为他们Shared room的样本数据太少,均只有三组数据。
门头沟区的Private room价格远高于Entire home/apt,可能是因为门头沟区的Private room的样本数据较少,只有39组,且提供的数据中高租金价格占比较多。
len(listings[(listings.neighbourhood=='密云县') & (listings.room_type=='Entire home/apt')]['price'])
496
# Median price for every (district, room type) pair, exported as the price reference table.
grouped_price = listings.groupby(['neighbourhood', 'room_type'])['price']
jiagecanzhaobiao = grouped_price.median().rename('50%')
jiagecanzhaobiao.to_excel('价格参照表.xlsx')
len(listings[(listings.neighbourhood=='密云县') & (listings.room_type=='Shared room')]['price'])
5
# Rank hosts by number of listings and keep the 20 largest (the "key account" hosts),
# then export them to Excel.
host_listing_counts = listings.groupby(['host_id', 'host_name'])[['id']].count()
dakehufangdong = host_listing_counts.sort_values(by='id', ascending=False).head(20)
dakehufangdong.to_excel('大客户房东信息表.xlsx')
# District distribution of the listings owned by the #1 host ("美婷" per the original note).
listings[listings.host_id==209669028]['neighbourhood'].value_counts()
朝阳区 178
东城区 44
Name: neighbourhood, dtype: int64
# District distribution of the listings owned by the #2 host ("兴伟" per the original note).
listings[listings.host_id==54436429]['neighbourhood'].value_counts()
海淀区 137
朝阳区 57
东城区 15
丰台区 1
Name: neighbourhood, dtype: int64
# District distribution of the listings owned by the #3 host ("海梅" per the original note).
listings[listings.host_id==156249912]['neighbourhood'].value_counts()
朝阳区 113
海淀区 2
Name: neighbourhood, dtype: int64
# District distribution of the listings owned by the #4 host ("Cathy" per the original note).
# listings[listings.host_id==17619297]
listings[listings.host_id==17619297]['neighbourhood'].value_counts()
海淀区 47
朝阳区 45
西城区 4
昌平区 1
东城区 1
Name: neighbourhood, dtype: int64
# District distribution of the listings owned by the #5 host ("金桔精品民宿" per the original note;
# the original comment said "third", but this host is plotted fifth in the chart that follows).
# listings[listings.host_id==156143513]
listings[listings.host_id==156143513]['neighbourhood'].value_counts()
通州区 54
顺义区 19
朝阳区 4
Name: neighbourhood, dtype: int64
# Grouped bar chart: listings per district for the five largest hosts.
# The counts are derived from `listings` rather than hand-copied from earlier
# cell outputs, so the chart cannot drift from the data.
# Host ids and display names, in rank order.
top_hosts = [
    (209669028, '美婷'),
    (54436429, '兴伟'),
    (156249912, '海梅'),
    (17619297, 'Cathy'),
    (156143513, '金桔精品民宿'),
]
host_ids = [host_id for host_id, _ in top_hosts]
labels = [host_name for _, host_name in top_hosts]
# Districts in the order the bars are drawn within each host group.
districts = ['海淀区', '朝阳区', '东城区', '丰台区', '昌平区', '西城区', '通州区', '顺义区']
owned = listings[listings.host_id.isin(host_ids)]
# Rows are hosts, columns are districts; absent combinations become 0.
counts = pd.crosstab(owned['host_id'], owned['neighbourhood'])
counts = counts.reindex(index=host_ids, columns=districts, fill_value=0)
x = np.arange(len(host_ids))
for offset, district in enumerate(districts):
    plt.bar(x + 0.1 * offset, counts[district].values, width=0.1, label=district)
plt.xticks(x, labels)
plt.legend()
plt.title('前五名大房东的房源分布图')
plt.savefig('前五名大房东的房源分布图.png', dpi=500, bbox_inches='tight')
plt.show()
[外链图片转存失败(img-UlqCKdXy-1569078306330)(output_63_0.png)]
由上图可以看出,房源数量排名前五的大房东中朝阳区和海淀区的房源数量最多,而且这些大房东的房源分布通常在两个到三个行政区.
calendar.head()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
listing_id | date | available | price | adjusted_price | minimum_nights | maximum_nights | |
---|---|---|---|---|---|---|---|
0 | 1165040 | 2019-04-17 | f | $511.00 | $511.00 | 1.0 | 1125.0 |
1 | 1165040 | 2019-04-18 | t | $511.00 | $511.00 | 1.0 | 1125.0 |
2 | 1165040 | 2019-04-19 | t | $511.00 | $511.00 | 1.0 | 1125.0 |
3 | 1165040 | 2019-04-20 | t | $511.00 | $511.00 | 1.0 | 1125.0 |
4 | 1165040 | 2019-04-21 | t | $511.00 | $511.00 | 1.0 | 1125.0 |
# Convert the calendar price columns from strings such as '$1,511.00' to floats:
# strip the dollar sign, remove thousands separators, then cast.
# Vectorised string methods replace the original per-row Python loop, which
# wrote through Series.values (read-only under pandas Copy-on-Write) and left
# the columns with object dtype.
for price_col in ('price', 'adjusted_price'):
    calendar[price_col] = (
        calendar[price_col]
        .str.strip('$')
        .str.replace(',', '', regex=False)
        .astype(float)
    )
#查看修改后的calendar表
calendar.head()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
listing_id | date | available | price | adjusted_price | minimum_nights | maximum_nights | |
---|---|---|---|---|---|---|---|
0 | 1165040 | 2019-04-17 | f | 511 | 511 | 1.0 | 1125.0 |
1 | 1165040 | 2019-04-18 | t | 511 | 511 | 1.0 | 1125.0 |
2 | 1165040 | 2019-04-19 | t | 511 | 511 | 1.0 | 1125.0 |
3 | 1165040 | 2019-04-20 | t | 511 | 511 | 1.0 | 1125.0 |
4 | 1165040 | 2019-04-21 | t | 511 | 511 | 1.0 | 1125.0 |
# Scenario: an ordinary traveller bringing their mother, looking for a private room
# in Chaoyang, priced roughly 300-1000, staying three nights
# (check in 2019-10-01, check out 2019-10-04).
# `want` holds the listings that match (688 in the author's run).
is_private = listings.room_type == 'Private room'
in_budget = (listings.price > 300) & (listings.price < 1000)
has_availability = listings.availability_365 > 0
short_min_stay = listings.minimum_nights < 4
in_chaoyang = listings.neighbourhood == '朝阳区'
want = listings[is_private & in_budget & has_availability & short_min_stay & in_chaoyang]
len(want)
688
# Calendar rows for the candidate listings that are available on every day from
# 2019-10-01 to 2019-10-04. Each day only considers listings that passed the previous day.
is_open = calendar.available == 't'
eligible_ids = want['id'].values
daily_matches = []
for day in ('2019-10-01', '2019-10-02', '2019-10-03', '2019-10-04'):
    on_day = calendar[(calendar.date == day) & is_open
                      & calendar['listing_id'].isin(eligible_ids)]
    daily_matches.append(on_day)
    eligible_ids = on_day['listing_id'].values
new1, new2, new3, new4 = daily_matches
# Summary statistics of the night limits in new4 (built from calendar).
new4[['minimum_nights','maximum_nights']].describe()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
minimum_nights | maximum_nights | |
---|---|---|
count | 513.000000 | 513.000000 |
mean | 1.222222 | 939.087719 |
std | 1.168154 | 391.935275 |
min | 1.000000 | 1.000000 |
25% | 1.000000 | 1125.000000 |
50% | 1.000000 | 1125.000000 |
75% | 1.000000 | 1125.000000 |
max | 24.000000 | 1125.000000 |
# Keep only rows whose night limits admit a three-night stay:
# the minimum must be at most 3 and the maximum at least 3.
# The original used minimum_nights<=4, which let through listings that require four nights.
new5 = new4[(new4.minimum_nights <= 3) & (new4.maximum_nights >= 3)]
# new5 is the calendar table of the listings that satisfy the requirements.
reviews.head()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
listing_id | id | date | reviewer_id | reviewer_name | comments | |
---|---|---|---|---|---|---|
0 | 44054 | 84748 | 2010-08-25 | 207019 | Jarrod | Sev was very helpful. Sev showed us where to ... |
1 | 44054 | 118384 | 2010-10-13 | 218723 | Kimberly | We arrived in Beijing very early in the mornin... |
2 | 44054 | 436978 | 2011-08-11 | 609177 | Emma | It is a really massive apartment and really co... |
3 | 44054 | 1118657 | 2012-04-12 | 1787536 | Andreyna | Sev was incredibly helpful, showed us around t... |
4 | 44054 | 2140650 | 2012-08-30 | 1179565 | Frances | The appartment was ideal for our party of 6 ad... |
reviews.info()
RangeIndex: 202099 entries, 0 to 202098
Data columns (total 6 columns):
listing_id 202099 non-null int64
id 202099 non-null int64
date 202099 non-null object
reviewer_id 202099 non-null int64
reviewer_name 202093 non-null object
comments 201983 non-null object
dtypes: int64(3), object(3)
memory usage: 9.3+ MB
# 虽然reviewer_name有部分缺失,但是reviewer_id没有缺失,所以没有关系
# comments缺失也是可以接受的
# The 20 reviewer_ids with the most reviews, most active first.
reviews_per_reviewer = reviews['reviewer_id'].value_counts()
top20_reviewers = reviews_per_reviewer.head(20)
top20_reviewers
186684246 43
21067785 35
158695647 34
99325050 32
149769588 26
140955472 26
213893643 24
6532783 23
229832388 23
196283240 23
28903457 23
117241519 21
104082034 21
3671922 21
165536239 20
16660997 20
10684339 20
228835331 20
43905550 19
50995265 19
Name: reviewer_id, dtype: int64
# Build `topreviewer`: id, name and total review count of the most active
# reviewers listed in top20_reviewers.
# One name per reviewer_id: the first non-null name seen in `reviews`.
# The original assigned the whole unique() array to a single cell, which
# misbehaves when a reviewer appears under several names, and it scanned
# `reviews` once per reviewer.
reviewer_names = (
    reviews.dropna(subset=['reviewer_name'])
    .drop_duplicates('reviewer_id')
    .set_index('reviewer_id')['reviewer_name']
)
topreviewer = pd.DataFrame({
    'top_reviewer_id': top20_reviewers.index,
    # Aligned by reviewer_id; NaN when no name is recorded.
    'top_reviewer_name': reviewer_names.reindex(top20_reviewers.index).values,
    'sum_reviews': top20_reviewers.values,
})
topreviewer
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
top_reviewer_id | top_reviewer_name | sum_reviews | |
---|---|---|---|
0 | 186684246 | Tomm | 43 |
1 | 21067785 | Jasmine | 35 |
2 | 158695647 | 未 | 34 |
3 | 99325050 | 新月 | 32 |
4 | 149769588 | 金龙 | 26 |
5 | 140955472 | Marines | 26 |
6 | 213893643 | 赛亚 | 24 |
7 | 6532783 | Dee | 23 |
8 | 229832388 | 星河 | 23 |
9 | 196283240 | 羊阳 | 23 |
10 | 28903457 | Yan | 23 |
11 | 117241519 | 兰兰 | 21 |
12 | 104082034 | Jonmiae | 21 |
13 | 3671922 | Kum Hong | 21 |
14 | 165536239 | Holm | 20 |
15 | 16660997 | Tao | 20 |
16 | 10684339 | Mia | 20 |
17 | 228835331 | Y | 20 |
18 | 43905550 | Salome | 19 |
19 | 50995265 | Bitong | 19 |
# Save topreviewer as an Excel sheet, without the index column.
topreviewer.to_excel('topreviewer.xlsx',index = False)
# Write every review comment to comments1.txt with embedded '\r\n' removed.
# - `with` closes the file even if a write fails.
# - Missing comments are skipped instead of being written as the text 'nan'.
# - Comments are separated by a newline so the end of one comment does not
#   fuse with the start of the next.
with open('comments1.txt', 'w', encoding='utf-8') as file:
    cleaned = (str(text).replace('\r\n', '') for text in reviews['comments'].dropna())
    file.write('\n'.join(cleaned))
#导入词云相关的库
from PIL import Image
from wordcloud import WordCloud, ImageColorGenerator
import numpy as np
import jieba
import wordcloud
# Draw a word cloud of the review comments, shaped and coloured by a map image.
# Read the text inside `with` so the file is closed (the original never closed it).
with open('comments1.txt', 'r', encoding='utf-8') as file:
    data = file.read()
path_img = 'beijingmap.jpg'
# The image is used both as the mask and as the colour source.
background_image = np.array(Image.open(path_img))
w = wordcloud.WordCloud(font_path='./fonts/simhei.ttf', max_words=100,
                        background_color="white",
                        mask=background_image).generate(data)
image_colors = ImageColorGenerator(background_image)
# Show the recoloured cloud without axes, then save it.
plt.imshow(w.recolor(color_func=image_colors))
plt.axis("off")
plt.show()
w.to_file('comments.png')
[外链图片转存失败(img-vIn6IWtk-1569078306331)(output_87_0.png)]
从评论的词云图里可看出,旅客最看重交通便利,房东热情,房屋干净这三点。
此外部分旅客还会关注设施齐全,床舒服,离地铁站近,有家的感觉等。
reviews.head()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
listing_id | id | date | reviewer_id | reviewer_name | comments | |
---|---|---|---|---|---|---|
0 | 44054 | 84748 | 2010-08-25 | 207019 | Jarrod | Sev was very helpful. Sev showed us where to ... |
1 | 44054 | 118384 | 2010-10-13 | 218723 | Kimberly | We arrived in Beijing very early in the mornin... |
2 | 44054 | 436978 | 2011-08-11 | 609177 | Emma | It is a really massive apartment and really co... |
3 | 44054 | 1118657 | 2012-04-12 | 1787536 | Andreyna | Sev was incredibly helpful, showed us around t... |
4 | 44054 | 2140650 | 2012-08-30 | 1179565 | Frances | The appartment was ideal for our party of 6 ad... |
len(reviews)
202099
# Listing ids ordered by how often they appear in the reviews table.
# np.array converts the index into an array.
np.array(reviews['listing_id'].value_counts().index)
array([ 6622351, 6596814, 11911698, ..., 33781069, 28482595, 33261981],
dtype=int64)
# The author's "top 5%" cut: the first 865 listing ids by review count.
review_counts = reviews['listing_id'].value_counts()
haofangzi_id = review_counts.index.values[:865]
listings.head()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
id | name | host_id | host_name | neighbourhood | latitude | longitude | room_type | price | minimum_nights | number_of_reviews | last_review | reviews_per_month | calculated_host_listings_count | availability_365 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 44054 | Modern and Comfortable Living in CBD | 192875 | East Apartments | 朝阳区 | 39.89503 | 116.45163 | Entire home/apt | 792.0 | 1 | 89 | 2019-03-04 | 0.85 | 9 | 341 |
1 | 100213 | The Great Wall Box Deluxe Suite A团园长城小院东院套房 | 527062 | Joe | 密云县 | 40.68434 | 117.17231 | Private room | 1201.0 | 1 | 2 | 2017-10-08 | 0.10 | 4 | 0 |
2 | 128496 | Heart of Beijing: House with View 2 | 467520 | Cindy | 东城区 | 39.93213 | 116.42200 | Entire home/apt | 389.0 | 3 | 259 | 2019-02-05 | 2.70 | 1 | 93 |
3 | 161902 | cozy studio in center of Beijing | 707535 | Robert | 东城区 | 39.93357 | 116.43577 | Entire home/apt | 376.0 | 1 | 26 | 2016-12-03 | 0.28 | 5 | 290 |
4 | 162144 | nice studio near subway, sleep 4 | 707535 | Robert | 朝阳区 | 39.93668 | 116.43798 | Entire home/apt | 537.0 | 1 | 37 | 2018-08-01 | 0.40 | 5 | 352 |
# District and room type of each most-reviewed listing, in haofangzi_id order.
# One indexed lookup replaces two full boolean scans of `listings` per id.
# drop_duplicates keeps the first row per id, matching the original .values[0].
top_listings = listings.drop_duplicates('id').set_index('id').loc[haofangzi_id]
aneighbour = top_listings['neighbourhood'].tolist()
broomtype = top_listings['room_type'].tolist()
# Count how often each district / room type occurs among the top listings.
# np.unique(..., return_counts=True) returns (unique values, counts);
# collections.Counter would be an alternative.
aaneighbour = np.unique(aneighbour, return_counts=True)
bbroomtype = np.unique(broomtype, return_counts=True)
plt.bar(aaneighbour[0], aaneighbour[1])
# rotation must be numeric; current matplotlib rejects the string '60'.
plt.xticks(rotation=60)
plt.grid(axis='y', alpha=0.2)
plt.title('入住次数前5%的房源地区分布')
plt.savefig('入住次数前5%的房源地区分布.png', dpi=500, bbox_inches='tight')
plt.show()
[外链图片转存失败(img-BSjHOqar-1569078306332)(output_97_0.png)]
# 从上图可以看出,朝阳区和东城区的民宿入住需求最高。
# Bar chart of room types among the top listings.
room_names, room_totals = bbroomtype[0], bbroomtype[1]
plt.bar(room_names, room_totals, width=0.25)
plt.grid(axis='y', alpha=0.2)
plt.title('入住次数前5%的房型分布')
plt.savefig('入住次数前5%的房型分布.png', dpi=500, bbox_inches='tight')
plt.show()
[外链图片转存失败(img-74gelpwG-1569078306332)(output_99_0.png)]
# Pie chart of room types among the top listings.
plt.figure(figsize=(5, 5))
values = bbroomtype[1].tolist()
labels = bbroomtype[0].tolist()
# Offset every wedge slightly from the centre. The list is sized from the data
# because plt.pie requires one explode entry per wedge.
explode = [0.01] * len(values)
plt.pie(values, explode=explode, labels=labels, autopct='%1.1f%%', startangle=261)
plt.title('入住次数前5%的房型分布饼图')
plt.savefig('入住次数前5%的房型分布饼图', dpi=500, bbox_inches='tight')
plt.show()
[外链图片转存失败(img-mEcm9b8L-1569078306332)(output_100_0.png)]
# Price distribution of the most-reviewed listings, as a box plot.
# One indexed lookup replaces a full scan of `listings` per id.
# drop_duplicates keeps the first row per id, matching the original .values[0].
pprice = (
    listings.drop_duplicates('id')
    .set_index('id')
    .loc[haofangzi_id, 'price']
    .tolist()
)
dandan = pd.DataFrame(pprice)
dandan.plot.box(title="入住次数前5%的价格分布")
plt.grid(linestyle="--", alpha=0.3)
plt.savefig('入住次数前5%的价格分布饼图', dpi=500, bbox_inches='tight')
plt.show()
[外链图片转存失败(img-urzMcN8y-1569078306333)(output_103_0.png)]
# Scatter plot of the top listings' prices: price on x, position in pprice on y.
plt.scatter(x=pprice,y=np.arange(len(pprice)))
plt.show()
[外链图片转存失败(img-YLRZC1YC-1569078306333)(output_104_0.png)]
dandan.describe()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
0 | |
---|---|
count | 864.000000 |
mean | 383.391204 |
std | 259.259195 |
min | 67.000000 |
25% | 201.000000 |
50% | 329.000000 |
75% | 483.000000 |
max | 2221.000000 |