Scrapy Crawling and Machine Learning, Part 3: Predicting House Listing Prices
Building on the 2,453 second-hand listings previously scraped from a real-estate agency for one district, this post uses machine-learning regression models to predict the listing price of a friend's apartment. Comparing several models on the absolute-error metric MAE, the best performer was the Gradient Boosting Regressor, with MAE = 1,837 yuan/m². Feeding the friend's apartment information into this model gives a predicted listing price of 75,064.67 yuan/m², roughly 10% below last year's market listing price.
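Before the full notebook, here is a minimal sketch of the workflow: read the scraped listings, one-hot encode the categorical columns, hold out a test split, and compare regressors on held-out absolute error. It assumes the CSV sits in the working directory and uses the same column names (totalprice, unitprice, floorsize) as the code below; it is illustrative only, not the exact code behind the reported numbers.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import median_absolute_error
df = pd.read_csv('ajk0115.csv', encoding='gbk')        # scraped second-hand listings (path assumed)
df = pd.get_dummies(df.drop('totalprice', axis=1))     # keep unit price as target, encode categoricals
X = df.drop('unitprice', axis=1)
y = df['unitprice']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
for name, model in [('Ridge', Ridge()), ('GradientBoosting', GradientBoostingRegressor(random_state=42))]:
    model.fit(X_train, y_train)
    err = median_absolute_error(y_test, model.predict(X_test))
    print('{}: median absolute error = {:.0f} yuan/m2'.format(name, err))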
Code:
#import some necessary libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt # Matlab-style plotting
import seaborn as sns
color = sns.color_palette()
sns.set_style('darkgrid')
import warnings
def ignore_warn(*args, **kwargs):
    pass
warnings.warn = ignore_warn #ignore annoying warnings (from sklearn and seaborn)
from scipy import stats
from scipy.stats import norm, skew #for some statistics
pd.set_option('display.float_format', lambda x: '{:.3f}'.format(x)) #Limiting floats output to 3 decimal points
from subprocess import check_output
# read in the data file
dftrain = pd.read_csv(r'C:\Users\Guoli\Desktop\scrapyfolder\new_env\ajk\ajk0115.csv',encoding="gbk")
# drop the total-price column (unit price is the target)
dftrain=dftrain.drop('totalprice',axis=1)
dftrain.head()
print("keys of df dataset:\n{}".format(dftrain.keys()))
from sklearn.model_selection import train_test_split
print("type of data:{}".format(type(dftrain['unitprice'])))
fig, ax = plt.subplots()
ax.scatter(x = dftrain['floorsize'], y = dftrain['unitprice'])
plt.ylabel('unitprice', fontsize=13)
plt.xlabel('floorsize', fontsize=13)
plt.show()
# drop outliers: floor size above 400 m²
dftrain = dftrain.drop(dftrain[(dftrain['floorsize']>400)].index)
#Check the graphic again
fig, ax = plt.subplots()
ax.scatter(dftrain['floorsize'], dftrain['unitprice'])
plt.ylabel('unitprice', fontsize=13)
plt.xlabel('floorsize', fontsize=13)
plt.show()
# drop rows with unit price above 100,000 yuan/m²
dftrain = dftrain.drop(dftrain[(dftrain['unitprice']>100000)].index)
# plot floor size vs. unit price
#Check the graphic again
fig, ax = plt.subplots()
ax.scatter(dftrain['floorsize'], dftrain['unitprice'])
plt.ylabel('unitprice', fontsize=13)
plt.xlabel('floorsize', fontsize=13)
plt.show()
# examine the distribution of unit price
sns.distplot(dftrain['unitprice'] , fit=norm);
# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(dftrain['unitprice'])
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))
#Now plot the distribution
plt.legend(['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
loc='best')
plt.ylabel('Frequency')
plt.title('unitprice distribution')
#Get also the QQ-plot
fig = plt.figure()
res = stats.probplot(dftrain['unitprice'], plot=plt)
plt.show()
# correlation coefficients
#Correlation map to see how features are correlated with unitprice
corrmat = dftrain.corr()
plt.subplots(figsize=(12,9))
sns.heatmap(corrmat, vmax=0.9, square=True)
dftrain.corr()#correlations of features
# one-hot encode categorical features
dftraindummy = pd.get_dummies(dftrain)
# bin floorsize into quantiles
bins = [0,0.1,0.5,0.9,1]
groupnames = ['floorsize1','floorsize2','floorsize3','floorsize4']
floorsize_binned = pd.get_dummies(pd.qcut(dftraindummy['floorsize'],bins,labels=groupnames))
dftrainbined = pd.concat([dftraindummy,floorsize_binned],axis=1)
#Correlation map to see how features are correlated with unitprice
corrmat = dftrainbined.corr()
plt.subplots(figsize=(12,9))
sns.heatmap(corrmat, vmax=0.9, square=True)
# separate features (X) and target (y); the train/test split comes later
X = dftrainbined.iloc[:,dftrainbined.columns!='unitprice']
y = dftrainbined['unitprice'].values
# predictive modeling
import numpy as np
import matplotlib.pyplot as plt
from sklearn import ensemble
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error
# linear regression model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42)# with random_state=42 the test score is 0.79
lr = LinearRegression().fit(X_train, y_train)
# print("Training set score:{:.2f}".format(lr.score(X_train,y_train)))
# print('Test train score:{:.2f}'.format(grid_search.score(X_train,y_train)))
print("Train set score:{:.2f}".format(lr.score(X_train,y_train)))
print("Test set score:{:.2f}".format(lr.score(X_test,y_test)))
mse = mean_squared_error(y_test, lr.predict(X_test))
print("MSE: %.4f" % mse)
# Ridge model
from sklearn.linear_model import Ridge
ridge = Ridge().fit(X_train, y_train)#alpha=1.0 by default
print("Ridge Training set score:{:.2f}".format(ridge.score(X_train, y_train)))
print("Ridge Test set score:{:.2f}".format(ridge.score(X_test, y_test)))
from sklearn.model_selection import cross_val_score
# Ridge tuned with cross-validated grid search
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42)# with random_state=42 the test score is 0.79
ridge = Ridge()
param_grid={'alpha':[0.001,0.01,0.1,1]}
grid_search = GridSearchCV(ridge, param_grid, cv=5)
grid_search.fit(X_train,y_train)
print('Test set score:{:.2f}'.format(grid_search.score(X_test,y_test)))
print("Best parameters:{}".format(grid_search.best_params_))
print('Train set score:{:.2f}'.format(grid_search.score(X_train,y_train)))
print('Best cross-validation score:{:.2f}'.format(grid_search.best_score_))
# Lasso model
# L1 regularization, some coefficients are exactly zero.
from sklearn.linear_model import Lasso
lasso = Lasso(max_iter=1000000)
from sklearn.model_selection import GridSearchCV
param_grid={'alpha':[0.001,0.01,0.1,1]}
grid_search = GridSearchCV(lasso, param_grid,cv=5)
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42)
grid_search.fit(X_train,y_train)
print('Test set score:{:.2f}'.format(grid_search.score(X_test,y_test)))
print("Best parameters:{}".format(grid_search.best_params_))
print('Train set score:{:.2f}'.format(grid_search.score(X_train,y_train)))
print('Best cross-validation score:{:.2f}'.format(grid_search.best_score_))
from sklearn.metrics import median_absolute_error
lasso = grid_search.fit(X_train,y_train)
actuals = y_test
predictions = lasso.predict(X_test)
maelasso = median_absolute_error(actuals, predictions)
print("MAE: %.4f" % maelasso)# median absolute error
#Lasso neglects some features (drives their coefficients to zero); its score is not as good as Ridge, so let's try another combination
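To make the sparsity claim concrete, a quick check (not part of the original notebook) can count the coefficients the best Lasso zeroed out; it assumes the grid_search object fitted just above.
best_lasso = grid_search.best_estimator_               # Lasso refitted with the best alpha
n_zero = int(np.sum(best_lasso.coef_ == 0))            # number of features dropped by L1 regularization
print('Lasso zeroed out {} of {} features'.format(n_zero, best_lasso.coef_.size))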
# ElasticNet model
from sklearn.linear_model import ElasticNet, Lasso, BayesianRidge, LassoLarsIC
ENet = ElasticNet(l1_ratio=.9,max_iter=1000000, random_state=42)
param_grid={'alpha':[0.001,0.01,0.1,1]}
grid_search = GridSearchCV(ENet, param_grid,cv=5)
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42)
grid_search.fit(X_train,y_train)
print('Train set score:{:.2f}'.format(grid_search.score(X_train,y_train)))
print('Test set score:{:.2f}'.format(grid_search.score(X_test,y_test)))
enet = grid_search.fit(X_train,y_train)
actuals = y_test
predictions = enet.predict(X_test)
maeenet = median_absolute_error(actuals, predictions)
print("MAE: %.4f" % maeenet)# median absolute error
# Gradient Boosting model
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
kfold = KFold(n_splits=5)
GBoost = GradientBoostingRegressor(n_estimators=500,learning_rate=0.1,max_depth=10, max_features='sqrt',
min_samples_leaf=5, min_samples_split=5,
loss='huber', random_state =42)
# param_grid={'n_estimator':[100,500,1000,2000,30000]}
# grid_search = GridSearchCV(GBoost, param_grid,cv=5)
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42)
GBoost.fit(X_train,y_train)
print("cross-validation train scores:\n{}".format(np.mean(cross_val_score(GBoost,X_train,y_train,cv=kfold))))
print("cross-validation test scores:\n{}".format(np.mean(cross_val_score(GBoost,X_test,y_test,cv=kfold))))
mse = mean_squared_error(y_test, GBoost.predict(X_test))
print("MSE: %.4f" % mse)
from sklearn.metrics import mean_squared_log_error
msle = mean_squared_log_error(y_test, GBoost.predict(X_test))
print("MSLE: %.4f" % msle)
from sklearn.metrics import median_absolute_error
mae = median_absolute_error(y_test, GBoost.predict(X_test))
print("MAE: %.4f" % mae)#平均绝对差
# GradientBoostingRegressor hyperparameter tuning
After tuning, the GradientBoostingRegressor reaches an MAE of 1795.62 yuan/m², the best result among all the models tried.
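The parameters below were set by hand; as a sketch, the same tuning could be automated with GridSearchCV. The grid values here are illustrative only and are not the exact settings behind the 1795.62 figure.
from sklearn.model_selection import GridSearchCV
gbr = ensemble.GradientBoostingRegressor(random_state=42)
param_grid = {'n_estimators': [100, 500],      # illustrative grid, not the one used for the reported results
              'max_depth': [3, 5, 10],
              'learning_rate': [0.05, 0.1]}
gbr_search = GridSearchCV(gbr, param_grid, cv=5)
gbr_search.fit(X_train, y_train)
print("Best parameters:{}".format(gbr_search.best_params_))
print('Best cross-validation score:{:.2f}'.format(gbr_search.best_score_))
The hand-tuned configuration actually used for the reported results follows.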
params = {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 5,
'learning_rate': 0.1, 'loss': 'ls'}
clfr = ensemble.GradientBoostingRegressor(**params)
clfr.fit(X_train, y_train)
print("cross-validation train scores:\n{}".format(np.mean(cross_val_score(clfr,X_train,y_train,cv=kfold))))
print("cross-validation test scores:\n{}".format(np.mean(cross_val_score(clfr,X_test,y_test,cv=kfold))))
mse = mean_squared_error(y_test, clfr.predict(X_test))
print("MSE: %.4f" % mse)
mae = median_absolute_error(y_test, clfr.predict(X_test))
print("MAE: %.4f" % mae)#平均绝对差
# compute test set deviance
test_score = np.zeros((params['n_estimators'],), dtype=np.float64)
for i, y_pred in enumerate(clfr.staged_predict(X_test)):
    test_score[i] = clfr.loss_(y_test, y_pred)
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.title('Deviance')
plt.plot(np.arange(params['n_estimators']) + 1, clfr.train_score_, 'b-',
label='Training Set Deviance')
plt.plot(np.arange(params['n_estimators']) + 1, test_score, 'r-',
label='Test Set Deviance')
plt.legend(loc='upper right')
plt.xlabel('Boosting Iterations')
plt.ylabel('Deviance')
# #############################################################################
# Plot feature importance
feature_importance = clfr.feature_importances_
# make importances relative to max importance
feature_importance = 100.0 * (feature_importance / feature_importance.max())
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5
plt.subplot(1, 2, 2)
plt.barh(pos, feature_importance[sorted_idx], align='center')
plt.yticks(pos, X.columns[sorted_idx])# label with the feature columns the model was trained on
plt.xlabel('Relative Importance')
plt.title('Variable Importance')
plt.show()
#
The predicted listing price for the apartment is 76,293 yuan/m², with an error of about 1,795.62 yuan/m².
import numpy as np
# feature vector for the friend's apartment, in the same column order as X
X2 = np.array([3,2004,155,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0])
X3 = X2.reshape(1, -1)
pred_clfr = clfr.predict(X3)
print("Predicted unit price: %.2f" % pred_clfr[0])
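The hard-coded array above has to match the column order of X exactly, which is fragile. A safer (hypothetical) variant builds a named one-row DataFrame so the ordering cannot slip; only floorsize is filled in below as an example, and in practice every feature of the apartment would be set the same way.
new_home = pd.DataFrame(0, index=[0], columns=X.columns)   # all features default to 0
new_home.loc[0, 'floorsize'] = 155                          # set each known feature by column name
pred_named = clfr.predict(new_home)
print("Predicted unit price (named-column variant): %.2f" % pred_named[0])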
# LightGBM model
import lightgbm as lgb
gbm1 = lgb.LGBMRegressor(objective='regression',
num_leaves=31,
learning_rate=0.05,
n_estimators=1000)
gbm1.fit(X_train, y_train,
eval_set=[(X_test, y_test)],
eval_metric='l1',
early_stopping_rounds=5)
print('Start predicting...')
y_pred = gbm1.predict(X_test)
# eval
#print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)
maelgbm = median_absolute_error(y_test, y_pred)
print("MAE: %.4f" % maelgbm)#平均绝对差
print("cross-validation test scores:\n{}".format(np.mean(cross_val_score(gbm1,X_test,y_test,cv=5))))
# XGBoost model
import pickle
import xgboost as xgb
import numpy as np
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, mean_squared_error
kf = KFold(n_splits=2, shuffle=True, random_state=42)
for train_index, test_index in kf.split(X):
    # fit on each training fold and evaluate on the held-out fold
    xgb_model = xgb.XGBRegressor().fit(X.iloc[train_index], y[train_index])
    predictions = xgb_model.predict(X.iloc[test_index])
    actuals = y[test_index]
    print(mean_squared_error(actuals, predictions))
print("cross-validation test scores:\n{}".format(np.mean(cross_val_score(xgb_model,X_test,y_test,cv=kfold))))
maelxgb_model = median_absolute_error(actuals, predictions)
print("MAE: %.4f" % maelxgb_model)#平均绝对差
xgb_model2 = xgb.XGBRegressor(n_estimators= 1000,early_stopping_rounds=5,learning_rate=0.05,n_jobs=-1)
clf = xgb_model2.fit(X_train,y_train)
predictions = clf.predict(X_test)
actuals = y_test
print(mean_squared_error(actuals, predictions))
print("cross-validation test scores:\n{}".format(np.mean(cross_val_score(clf,X_test,y_test,cv=kfold))))
maeclf = median_absolute_error(actuals, predictions)
print("MAE: %.4f" % maeclf)#平均绝对差
xgb_model3 = xgb.XGBRegressor(n_estimators= 2000,learning_rate=0.01,max_depth=2,n_jobs=-1)# a smaller learning rate gives a higher score here
clf2 = xgb_model3.fit(X_train,y_train)
predictions = clf2.predict(X_test)
actuals = y_test
print(mean_squared_error(actuals, predictions))
print("cross-validation test scores:\n{}".format(np.mean(cross_val_score(clf2,X_test,y_test,cv=kfold))))
maeclf2 = median_absolute_error(actuals, predictions)
print("MAE: %.4f" % maeclf2)#平均绝对差
xgb_model4 = xgb.XGBRegressor(n_estimators= 1000,learning_rate=0.1,max_depth=3,gamma=0.01,n_jobs=-1)
clf3 = xgb_model4.fit(X_train,y_train)
predictions = clf3.predict(X_test)
actuals = y_test
print(mean_squared_error(actuals, predictions))
print("cross-validation test scores:\n{}".format(np.mean(cross_val_score(clf3,X_test,y_test,cv=5))))
maeclf3 = median_absolute_error(actuals, predictions)
print("MAE: %.4f" % maeclf3)#平均绝对差
#
# weighted blend of the GradientBoosting and LightGBM predictions for the apartment
pred_gbm1 = gbm1.predict(X3)
ensemble3 = pred_clfr*0.3 + pred_gbm1*0.7
ensemble3
# save the best-performing GBM model
from sklearn.externals import joblib
model1=clfr
filename = 'GBMclfr_finalized_model0115.sav'
joblib.dump(model1, filename)
# some time later...
# load the model from disk
# loaded_model = joblib.load(filename)
# result = loaded_model.score(X_test, Y_test)
# print(result)
# #The sklearn API models are picklable
# print("Pickling sklearn API models")
# # must open in binary format to pickle
# pickle.dump(clf3, open("XGB0918NYCTaxi.pkl", "wb"))
# # clf2 = pickle.load(open("best_boston.pkl", "rb"))
# # print(np.allclose(clf.predict(X), clf2.predict(X)))