数据分析项目——北京二手房价数据分析
整个项目的代码和数据集获取:
https://github.com/Proberen/Data-analysis___Beijing__Houseprice
通过上述分析,可以发现户型与房屋总价具有关联性,由于该列的数据存在中文,且户型多样,因此需要进行处理。
共添加7列,包括别墅列、车位列、室列、厅列等。分解户型列的代码如下:
1、初始化添加列
# Initialise the seven derived columns (four villa flags, parking flag,
# room count, hall count) to zero before they are filled in below.
for _col in ('BigHouse0', 'BigHouse1', 'BigHouse2', 'BigHouse3',
             'car', 'room', 'office'):
    data.loc[:, _col] = 0
2、别墅列、车位列赋值
# Work on a defensive copy, then replace the five special HouseType
# labels (villas and parking spots) with integer codes 0-4 and raise
# the matching one-hot indicator column on each affected row.
# NOTE: the labels carry the surrounding spaces exactly as scraped.
data = data.copy()
_SPECIAL_LABELS = {
    ' 叠拼别墅 ': (0, 'BigHouse0'),
    ' 联排别墅 ': (1, 'BigHouse1'),
    ' 独栋别墅 ': (2, 'BigHouse2'),
    ' 双拼别墅 ': (3, 'BigHouse3'),
    ' 车位 ': (4, 'car'),
}
for _row in range(len(data)):
    _label = data.at[_row, 'HouseType']
    if _label in _SPECIAL_LABELS:
        _code, _flag_col = _SPECIAL_LABELS[_label]
        data.at[_row, 'HouseType'] = _code
        data.at[_row, _flag_col] = 1
3、定义分解室、厅函数
# --- decompose the HouseType string into room / hall counts ---
def apart_room(x):
    """Return the number of rooms (室) encoded in a HouseType value.

    Villas and parking spots were recoded above to the integer codes
    0-4; those carry no room count and map to 0.  Any other value is
    expected to look like '2室1厅', and the digits before '室' are
    returned as an int.
    """
    # One membership test replaces the original five duplicated
    # `if x == k: return 0` branches.
    if x in (0, 1, 2, 3, 4):
        return 0
    return int(x.split('室')[0])
def apart_hall(x):
    """Return the number of halls (厅) encoded in a HouseType value.

    Integer codes 0-4 (villas / parking, assigned earlier) carry no
    hall count and map to 0.  Any other value is expected to look like
    '2室1厅': the text between '室' and '厅' is returned as an int.
    """
    # One membership test replaces the original five duplicated
    # `if x == k: return 0` branches.
    if x in (0, 1, 2, 3, 4):
        return 0
    return int(x.split('厅')[0].split('室')[1])
4、室、厅列赋值
# Fill the room / office columns by decomposing every HouseType value.
data['room'] = data['HouseType'].apply(apart_room)
data['office'] = data['HouseType'].apply(apart_hall)
在进行建模前,需要根据数据可视化分析的结果进行特征的选取,在本阶段,选取面积、户型、总价、城区、发布时间为特征,删除多余特征列。
# Keep only the modelling features: drop HouseType (now represented by
# the derived villa/room/hall columns), unit price, description,
# compound name, coordinates and popularity counters.
_unused_cols = ['HouseType', 'Price', 'Introduction', 'Quarters', 'Lotitude',
                'Longitude', 'Latitude', 'PeopleNumber', 'WatchNumber']
newdata = data.drop(columns=_unused_cols)
由于城区列中含有中文,且城区为分类数据,因此根据下列代码将城区转换为数字。
# Encode the district names as integers in order of first appearance.
_district_code = {name: idx
                  for idx, name in enumerate(newdata['Dist'].unique())}
newdata['Dist'] = newdata['Dist'].map(_district_code)
在建模前,对数据特征进行相关性分析,构建相关性矩阵热力图如下图所示,通过观察下图可以发现,面积、户型等特征与房屋总价的相关性都大于0.6,为重要的特征因素。
# Correlation heat map of all remaining features.
_corr = newdata.corr()
plt.figure(figsize=(10, 10))
sns.heatmap(
    _corr,
    cmap="YlGnBu",
    annot=True,
    square=True,
    vmax=1.0,
    linewidths=0.1,
    linecolor='white',
)
plt.title('特征相关性热力图', size=20)
plt.show()
将数据集中的80%作为训练集,20%作为测试集(与代码中 test_size=0.2 一致),划分数据集的代码如下:
# --- model-selection imports and train/test split ---
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import tree
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import PolynomialFeatures
import graphviz

# Features are every column except the target (TotalPrice).
features = newdata.loc[:, newdata.columns != 'TotalPrice']
target = newdata.loc[:, newdata.columns == 'TotalPrice']

# Hold out 20% of the rows as the test set (fixed seed for
# reproducibility).
features_train, features_test, target_train, target_test = train_test_split(
    features, target, test_size=0.2, random_state=25)

# Rebuild a clean 0..n-1 index on each split.
for _part in (features_train, features_test, target_train, target_test):
    _part.index = range(_part.shape[0])
# Decision-tree learning curve: train / test / 10-fold cross-validation
# R^2 scores for max_depth = 1..N.
tr = []  # training-set scores
te = []  # test-set scores
tc = []  # cross-validation scores
N = 10
for depth in range(1, N + 1):  # use N so the loop and the plot agree
    # The squared-error criterion is the regressor's default, so it is
    # not passed explicitly (the old alias 'mse' was removed in
    # scikit-learn 1.2 and now raises).
    clf = DecisionTreeRegressor(random_state=25, max_depth=depth)
    clf = clf.fit(features_train, target_train)
    tr.append(clf.score(features_train, target_train))
    te.append(clf.score(features_test, target_test))
    # Ravel the one-column target frame, consistent with the other
    # cross_val_score calls in this file.
    tc.append(cross_val_score(clf, features, target.values.ravel(), cv=10).mean())
print(max(te))
print(max(tc))
plt.figure(figsize=(20, 10))
plt.plot(range(1, N + 1), tr, color='red', label='train')
plt.plot(range(1, N + 1), te, color='blue', label='test')
plt.plot(range(1, N + 1), tc, color='green', label='cross')
plt.xticks(range(1, N + 1))
plt.legend()
plt.xlabel("max_depth", size=20)
plt.ylabel("score", size=20)
plt.title('决策树学习曲线', size=20)
plt.show()
决策树的学习曲线如下图所示,在max_depth为7时交叉验证的得分最高,得分最高为0.59
根据学习曲线,构建决策树模型,决策树的测试集得分为0.74,交叉验证得分为0.59,说明该模型的拟合度较高
# Final decision tree at the depth chosen from the learning curve.
dt = DecisionTreeRegressor(max_depth=7, random_state=25)
dt = dt.fit(features_train, target_train)
print(f'训练集得分:{round(dt.score(features_train,target_train),5)}')
print(f'测试集得分:{round(dt.score(features_test,target_test),5)}')
print(f'交叉验证得分:{round(cross_val_score(dt,features,target.values.ravel(),cv=10).mean(),5)}')

# Export the fitted tree for visual inspection with graphviz.
feature_name = ['面积','发布时间','区域','别墅0','别墅1','别墅2','别墅3','车库','室','厅']
import graphviz
tree_dot = tree.export_graphviz(
    dt,
    out_file=None,
    feature_names=feature_name,
    # NOTE(review): class_names only applies to classifiers and looks
    # to be ignored for a regressor — confirm before relying on it.
    class_names=['a','b','c'],
    filled=True,
    rounded=True,
)
graph = graphviz.Source(tree_dot)
graph.save('./graph/tree.dot')
graph
# Random-forest learning curve: train / test / 10-fold cross-validation
# R^2 scores for max_depth = 1..N.
tr = []  # training-set scores
te = []  # test-set scores
tc = []  # cross-validation scores
N = 10
for depth in range(1, N + 1):  # use N so the loop and the plot agree
    # The squared-error criterion is the regressor's default, so it is
    # not passed explicitly (the old alias 'mse' was removed in
    # scikit-learn 1.2 and now raises).
    clf = RandomForestRegressor(random_state=25, max_depth=depth)
    clf = clf.fit(features_train, target_train.values.ravel())
    tr.append(clf.score(features_train, target_train))
    te.append(clf.score(features_test, target_test))
    tc.append(cross_val_score(clf, features, target.values.ravel(), cv=10).mean())
print(max(te))
print(max(tc))
plt.figure(figsize=(20, 10))
plt.plot(range(1, N + 1), tr, color='red', label='train')
plt.plot(range(1, N + 1), te, color='blue', label='test')
plt.plot(range(1, N + 1), tc, color='green', label='cross')
plt.xticks(range(1, N + 1))
plt.legend()
plt.xlabel("max_depth", size=20)
plt.ylabel("score", size=20)
plt.title('随机森林学习曲线', size=20)
plt.show()
随机森林的学习曲线如下图所示,在max_depth为10时交叉验证的得分最高,得分最高为0.68
# Final random forest at the depth chosen from the learning curve.
# random_state pins the bootstrap sampling so the reported scores are
# reproducible, consistent with the seed (25) used by every other model
# in this file.
rf = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=25)
rf.fit(features_train, target_train.values.ravel())
print(f'训练集得分:{round(rf.score(features_train,target_train),5)}')
print(f'测试集得分:{round(rf.score(features_test,target_test),5)}')
print(f'交叉验证得分:{round(cross_val_score(rf,features,target.values.ravel(),cv=10).mean(),5)}')
构建该组数据的k-近邻模型如下所示,测试集得分表现较差,仅为0.57。准确率不如决策树和随机森林。
本次测试选取测试用例为30天前发布的面积为100平米西城区2室2厅的二手房
# Test case: a 100 m², 2-room-2-hall flat in Xicheng (district code 1)
# posted 30 days ago; the four villa flags and the parking flag are 0.
# Feature order matches the training columns.
sample = np.array([[100, 30, 1, 0, 0, 0, 0, 0, 2, 2]])
print('------------总价预测结果-------------')
print(f'随机森林回归:{rf.predict(sample)[0]}万元')
print(f'决策树回归:{dt.predict(sample)[0]}万元')
# NOTE(review): `kn` (the k-NN model) is defined elsewhere in the file.
print(f'K近邻回归:{kn.predict(sample)[0][0]}万元')
print('------------综合预测结果-------------')
print(((rf.predict(sample)+dt.predict(sample)+kn.predict(sample))/3.0)[0][0],'万元')