2019-01-15

Scrapy Crawler and Machine Learning, Part 3: Predicting House Listing Prices

Building on the 2,453 second-hand listings previously scraped from a real-estate agency for one district, this post uses several machine-learning regression models to predict the asking price of a friend's soon-to-be-listed apartment. Comparing the models on the absolute-error metric (MAE), the best performer was the Gradient Boosting Regressor, with MAE = 1,837 yuan/m². Feeding the friend's apartment features into this model gives a predicted listing price of 75,064.67 yuan/m², roughly 10% below last year's market asking prices.
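Throughout the post, models are compared on a held-out test set using scikit-learn's median_absolute_error (the absolute-error metric the code below reports as "MAE"). Here is a minimal sketch of that comparison loop, assuming estimators already fitted on a common train/test split; the helper name compare_models is illustrative, not part of the original notebook:

# A minimal comparison sketch, assuming each model has been fitted on the
# same X_train/y_train split used later in this post. compare_models is an
# illustrative helper, not part of the original notebook.
from sklearn.metrics import median_absolute_error

def compare_models(models, X_test, y_test):
    for name, model in models.items():
        mae = median_absolute_error(y_test, model.predict(X_test))
        print("{}: MAE = {:.2f} yuan/m2".format(name, mae))

# Usage, once the models below have been trained:
# compare_models({'lasso': lasso, 'enet': enet, 'gboost': GBoost,
#                 'lightgbm': gbm1, 'xgboost': clf}, X_test, y_test)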


Code:


# Import the necessary libraries
import numpy as np   # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

get_ipython().run_line_magic('matplotlib', 'inline')

import matplotlib.pyplot as plt  # Matlab-style plotting

import seaborn as sns

color = sns.color_palette()

sns.set_style('darkgrid')

import warnings

def ignore_warn(*args, **kwargs):

    pass

warnings.warn = ignore_warn  # silence noisy warnings from sklearn and seaborn

from scipy import stats

from scipy.stats import norm, skew #for some statistics

pd.set_option('display.float_format', lambda x: '{:.3f}'.format(x)) #Limiting floats output to 3 decimal points


# Read in the scraped data file

dftrain = pd.read_csv(r'C:\Users\Guoli\Desktop\scrapyfolder\new_env\ajk\ajk0115.csv',encoding="gbk")

# Drop the total-price column; we model the unit price

dftrain=dftrain.drop('totalprice',axis=1)

dftrain.head()

print("keys of df dataset:\n{}".format(dftrain.keys()))


print("type of data:{}".format(type(dftrain['unitprice'])))

fig, ax = plt.subplots()

ax.scatter(x = dftrain['floorsize'], y = dftrain['unitprice'])

plt.ylabel('unitprice', fontsize=13)

plt.xlabel('floorsize', fontsize=13)

plt.show()

# Drop outliers: floor size above 400 m²

dftrain = dftrain.drop(dftrain[(dftrain['floorsize']>400)].index)

#Check the graphic again

fig, ax = plt.subplots()

ax.scatter(dftrain['floorsize'], dftrain['unitprice'])

plt.ylabel('unitprice', fontsize=13)

plt.xlabel('floorsize', fontsize=13)

plt.show()

# Drop rows with unit price above 100,000 yuan/m²

dftrain = dftrain.drop(dftrain[(dftrain['unitprice']>100000)].index)

# Plot floor size vs. unit price again

fig, ax = plt.subplots()

ax.scatter(dftrain['floorsize'], dftrain['unitprice'])

plt.ylabel('unitprice', fontsize=13)

plt.xlabel('floorsize', fontsize=13)

plt.show()


# Inspect the distribution of unit prices


sns.distplot(dftrain['unitprice'], fit=norm)

# Get the fitted parameters used by the function

(mu, sigma) = norm.fit(dftrain['unitprice'])

print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

#Now plot the distribution

plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)], loc='best')

plt.ylabel('Frequency')

plt.title('unitprice distribution')

#Get also the QQ-plot

fig = plt.figure()

res = stats.probplot(dftrain['unitprice'], plot=plt)

plt.show()

# Correlation map to see how features are correlated with unitprice

corrmat = dftrain.corr()

plt.subplots(figsize=(12,9))

sns.heatmap(corrmat, vmax=0.9, square=True)

dftrain.corr()#correlations of features

# One-hot encode the categorical features

dftraindummy = pd.get_dummies(dftrain)

# Bin floorsize into quantile-based dummy variables

bins = [0,0.1,0.5,0.9,1]

groupnames = ['floorsize1','floorsize2','floorsize3','floorsize4']

floorsize_binned = pd.get_dummies(pd.qcut(dftraindummy['floorsize'],bins,labels=groupnames))

dftrainbined = pd.concat([dftraindummy,floorsize_binned],axis=1)
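As a quick sanity check (not part of the original pipeline), pd.qcut can also return the concrete bin edges it derives from the 0/10/50/90/100th percentiles:

# Inspect the quantile cut points qcut derived for floorsize (sanity check).
_, edges = pd.qcut(dftraindummy['floorsize'], bins, retbins=True)
print("floorsize bin edges:", edges)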

# Correlation map again, now including the binned features

corrmat = dftrainbined.corr()

plt.subplots(figsize=(12,9))

sns.heatmap(corrmat, vmax=0.9, square=True)

# Separate the features (X) from the target (y)

X = dftrainbined.drop('unitprice', axis=1)

y = dftrainbined['unitprice'].values

# Predictive modeling

import numpy as np

import matplotlib.pyplot as plt

from sklearn import ensemble

from sklearn.utils import shuffle

from sklearn.metrics import mean_squared_error, median_absolute_error

# Linear regression model

from sklearn.linear_model import LinearRegression

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)  # with random_state=42 the test score is about 0.79

lr = LinearRegression().fit(X_train, y_train)

# print("Training set score:{:.2f}".format(lr.score(X_train,y_train)))

# print('Test train score:{:.2f}'.format(grid_search.score(X_train,y_train)))

print("Train set score:{:.2f}".format(lr.score(X_train,y_train)))

print("Test set score:{:.2f}".format(lr.score(X_test,y_test)))

mse = mean_squared_error(y_test, lr.predict(X_test))

print("MSE: %.4f" % mse)

# Ridge model

from sklearn.linear_model import Ridge

ridge = Ridge().fit(X_train, y_train)#alpha=1.0 by default

print("Ridge Training set score:{:.2f}".format(ridge.score(X_train, y_train)))

print("Ridge Test set score:{:.2f}".format(ridge.score(X_test, y_test)))

from sklearn.model_selection import cross_val_score

# Tune Ridge's alpha with a cross-validated grid search
from sklearn.model_selection import GridSearchCV

ridge = Ridge()

param_grid={'alpha':[0.001,0.01,0.1,1]}

grid_search = GridSearchCV(ridge, param_grid, cv=5)

grid_search.fit(X_train,y_train)

print('Test set score:{:.2f}'.format(grid_search.score(X_test,y_test)))

print("Best parameters:{}".format(grid_search.best_params_))

print('Train set score:{:.2f}'.format(grid_search.score(X_train,y_train)))

print('Best cross-validation score:{:.2f}'.format(grid_search.best_score_))

# Lasso model: with L1 regularization, some coefficients become exactly zero.

from sklearn.linear_model import Lasso

lasso = Lasso(max_iter=1000000)

param_grid={'alpha':[0.001,0.01,0.1,1]}
grid_search = GridSearchCV(lasso, param_grid, cv=5)

grid_search.fit(X_train,y_train)

print('Test set score:{:.2f}'.format(grid_search.score(X_test,y_test)))

print("Best parameters:{}".format(grid_search.best_params_))

print('Train set score:{:.2f}'.format(grid_search.score(X_train,y_train)))

print('Best cross-validation score:{:.2f}'.format(grid_search.best_score_))

lasso = grid_search.fit(X_train, y_train)
actuals = y_test
predictions = lasso.predict(X_test)
maelasso = median_absolute_error(actuals, predictions)
print("MAE: %.4f" % maelasso)  # median absolute error

# Lasso zeroes out some features and does not score as well as Ridge; ElasticNet below combines the two penalties.
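To see which features Lasso actually zeroed out, one can inspect the refitted estimator inside the grid search (a quick check, relying on GridSearchCV's default refit=True):

# List the coefficients Lasso set exactly to zero (a quick check; the
# best_estimator_ attribute is the Lasso refitted by the grid search above).
best_lasso = grid_search.best_estimator_
zeroed = X.columns[best_lasso.coef_ == 0]
print("Lasso zeroed {} of {} features: {}".format(len(zeroed), X.shape[1], list(zeroed)))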

# ElasticNet model
from sklearn.linear_model import ElasticNet
ENet = ElasticNet(l1_ratio=.9, max_iter=1000000, random_state=42)

param_grid={'alpha':[0.001,0.01,0.1,1]}

grid_search = GridSearchCV(ENet, param_grid,cv=5)


grid_search.fit(X_train,y_train)

print('Train set score:{:.2f}'.format(grid_search.score(X_train,y_train)))

print('Test set score:{:.2f}'.format(grid_search.score(X_test,y_test)))

enet = grid_search.fit(X_train, y_train)
actuals = y_test
predictions = enet.predict(X_test)
maeenet = median_absolute_error(actuals, predictions)
print("MAE: %.4f" % maeenet)  # median absolute error

# Gradient Boosting model
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import KFold

from sklearn.model_selection import cross_val_score

kfold = KFold(n_splits=5)

GBoost = GradientBoostingRegressor(n_estimators=500,learning_rate=0.1,max_depth=10, max_features='sqrt',

                                  min_samples_leaf=5, min_samples_split=5,

                                  loss='huber', random_state =42)


GBoost.fit(X_train,y_train)

print("cross-validation train scores:\n{}".format(np.mean(cross_val_score(GBoost,X_train,y_train,cv=kfold))))

print("cross-validation test scores:\n{}".format(np.mean(cross_val_score(GBoost,X_test,y_test,cv=kfold))))

mse = mean_squared_error(y_test, GBoost.predict(X_test))

print("MSE: %.4f" % mse)

from sklearn.metrics import mean_squared_log_error
msle = mean_squared_log_error(y_test, GBoost.predict(X_test))
print("MSLE: %.4f" % msle)
mae = median_absolute_error(y_test, GBoost.predict(X_test))
print("MAE: %.4f" % mae)  # median absolute error

# GradientBoostingRegressor hyperparameter tuning
# After tuning, the GradientBoostingRegressor reaches MAE ≈ 1,795.62 yuan/m², the best of all the models tried.

params = {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 5,

          'learning_rate': 0.1, 'loss': 'ls'}

clfr = ensemble.GradientBoostingRegressor(**params)

clfr.fit(X_train, y_train)

print("cross-validation train scores:\n{}".format(np.mean(cross_val_score(clfr,X_train,y_train,cv=kfold))))

print("cross-validation test scores:\n{}".format(np.mean(cross_val_score(clfr,X_test,y_test,cv=kfold))))

mse = mean_squared_error(y_test, clfr.predict(X_test))

print("MSE: %.4f" % mse)

mae = median_absolute_error(y_test, clfr.predict(X_test))

print("MAE: %.4f" % mae)#平均绝对差

# compute test set deviance

test_score = np.zeros((params['n_estimators'],), dtype=np.float64)

for i, y_pred in enumerate(clfr.staged_predict(X_test)):

    test_score[i] = clfr.loss_(y_test, y_pred)

plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)

plt.title('Deviance')

plt.plot(np.arange(params['n_estimators']) + 1, clfr.train_score_, 'b-',

        label='Training Set Deviance')

plt.plot(np.arange(params['n_estimators']) + 1, test_score, 'r-',

        label='Test Set Deviance')

plt.legend(loc='upper right')

plt.xlabel('Boosting Iterations')

plt.ylabel('Deviance')

# #############################################################################

# Plot feature importance

feature_importance = clfr.feature_importances_

# make importances relative to max importance

feature_importance = 100.0 * (feature_importance / feature_importance.max())

sorted_idx = np.argsort(feature_importance)

pos = np.arange(sorted_idx.shape[0]) + .5

plt.subplot(1, 2, 2)

plt.barh(pos, feature_importance[sorted_idx], align='center')

plt.yticks(pos, X.columns[sorted_idx])  # use X's columns: dftrainbined still contains unitprice, so its column order is misaligned with the importances

plt.xlabel('Relative Importance')

plt.title('Variable Importance')

plt.show()

The predicted listing price for the apartment comes out at 76,293 yuan/m², with an error of about 1,795.62 yuan/m².


# Feature vector for the friend's apartment, in the same column order as X
# (presumably rooms, year built and floor size, followed by the dummies).
X2 = np.array([3,2004,155,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0])
X3 = X2.reshape(1, -1)
pred_clfr = clfr.predict(X3)
print(pred_clfr)
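Passing a bare array depends on remembering the exact column order of X. A safer variant (a sketch, assuming X2 matches X's 27 columns) builds the query row as a one-row DataFrame with the training columns:

# A safer sketch: reuse X's column names so the feature order cannot drift.
# Assumes len(X2) equals the number of columns in X.
query = pd.DataFrame([X2], columns=X.columns)
print(clfr.predict(query))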


# LightGBM model

import lightgbm as lgb

gbm1 = lgb.LGBMRegressor(objective='regression',

                        num_leaves=31,

                        learning_rate=0.05,

                        n_estimators=1000)

gbm1.fit(X_train, y_train,

        eval_set=[(X_test, y_test)],

        eval_metric='l1',

        early_stopping_rounds=5)

print('Start predicting...')

y_pred = gbm1.predict(X_test)

# eval

#print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)

maelgbm = median_absolute_error(y_test, y_pred)

print("MAE: %.4f" % maelgbm)#平均绝对差

print("cross-validation test scores:\n{}".format(np.mean(cross_val_score(gbm1,X_test,y_test,cv=5))))

# XGBoost model

import pickle

import xgboost as xgb

import numpy as np

from sklearn.model_selection import KFold
kf = KFold(n_splits=2, shuffle=True, random_state=42)

for train_index, test_index in kf.split(X):
    # Fit on each fold's training rows, score on its held-out rows
    xgb_model = xgb.XGBRegressor().fit(X.iloc[train_index], y[train_index])
    predictions = xgb_model.predict(X.iloc[test_index])
    actuals = y[test_index]
    print(mean_squared_error(actuals, predictions))

print("cross-validation test scores:\n{}".format(np.mean(cross_val_score(xgb_model,X_test,y_test,cv=kfold))))

maelxgb_model = median_absolute_error(actuals, predictions)

print("MAE: %.4f" % maelxgb_model)#平均绝对差

xgb_model2 = xgb.XGBRegressor(n_estimators=1000, learning_rate=0.05, n_jobs=-1)
# early_stopping_rounds belongs in fit() with an eval_set, not in the constructor
clf = xgb_model2.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5)

predictions = clf.predict(X_test)

actuals = y_test

print(mean_squared_error(actuals, predictions))

print("cross-validation test scores:\n{}".format(np.mean(cross_val_score(clf,X_test,y_test,cv=kfold))))

maeclf = median_absolute_error(actuals, predictions)

print("MAE: %.4f" % maeclf)#平均绝对差

xgb_model3 = xgb.XGBRegressor(n_estimators=2000, learning_rate=0.01, max_depth=2, n_jobs=-1)  # the smaller the learning rate, the higher the score

clf2 = xgb_model3.fit(X_train,y_train)

predictions = clf2.predict(X_test)

actuals = y_test

print(mean_squared_error(actuals, predictions))

print("cross-validation test scores:\n{}".format(np.mean(cross_val_score(clf2,X_test,y_test,cv=kfold))))

maeclf2 = median_absolute_error(actuals, predictions)

print("MAE: %.4f" % maeclf2)#平均绝对差

xgb_model4 = xgb.XGBRegressor(n_estimators= 1000,learning_rate=0.1,max_depth=3,gamma=0.01,n_jobs=-1)

clf3 = xgb_model4.fit(X_train,y_train)

predictions = clf3.predict(X_test)

actuals = y_test

print(mean_squared_error(actuals, predictions))

print("cross-validation test scores:\n{}".format(np.mean(cross_val_score(clf3,X_test,y_test,cv=5))))

maeclf3 = median_absolute_error(actuals, predictions)

print("MAE: %.4f" % maeclf3)#平均绝对差

# Blend the two models' predictions for the friend's apartment, with the
# author's 0.3/0.7 weights; assumes the intended inputs are pred_clfr and
# gbm1's prediction on X3.
pred_gbm1 = gbm1.predict(X3)
ensemble3 = pred_clfr*0.3 + pred_gbm1*0.7
ensemble3

# Save the best-performing GBM model

from sklearn.externals import joblib

model1=clfr

filename = 'GBMclfr_finalized_model0115.sav'

joblib.dump(model1, filename)

# some time later...

# load the model from disk

# loaded_model = joblib.load(filename)

# result = loaded_model.score(X_test, Y_test)

# print(result)

