This post follows material from the July Online (七月在线) course. All data comes from the Kaggle website.
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
import matplotlib.pyplot as plt
from pylab import mpl
mpl.rcParams['font.sans-serif'] = ['SimHei'] # set the default font (needed for Chinese text in matplotlib)
from sklearn.linear_model import Ridge
from sklearn.ensemble import BaggingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor
# Read the data
train_df = pd.read_csv('G:/KNNtest/HousePrice/train.csv',index_col=0)
test_df = pd.read_csv('G:/KNNtest/HousePrice/test.csv',index_col=0)
'''
1. Since Kaggle provides both train_df and test_df, they can be concatenated and preprocessed
together. The label column, SalePrice, must first be popped out of train_df.
'''
# log1p, i.e. log(x + 1), smooths the label distribution
prices = pd.DataFrame({"price":train_df["SalePrice"], "log(price + 1)":np.log1p(train_df["SalePrice"])})
# plt.ylabel(u"Count")
# plt.title(u"House sale prices")
# prices.hist()
# plt.show()
y_train = np.log1p(train_df.pop('SalePrice'))
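# Quick sanity check (my own addition; the values are illustrative): np.expm1 exactly inverts
# np.log1p, which is why the final predictions can be mapped back to prices with np.expm1 below.
_check = np.array([100000.0, 200000.0])
assert np.allclose(np.expm1(np.log1p(_check)), _check)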
all_df = pd.concat((train_df,test_df),axis=0)
'''
2. Feature processing: convert variables whose raw encoding is inconvenient or inconsistent.
'''
# NOTE: MSSubClass is really a category, so it must first be converted to string
all_df['MSSubClass'] = all_df['MSSubClass'].astype(str)
# print(all_df['MSSubClass'].value_counts())
# print(pd.get_dummies(all_df['MSSubClass'],prefix='MSSubClass').head(5))
# One-hot encode all categorical columns
all_dummy_df = pd.get_dummies(all_df)
# all_dummy_df.to_csv('G:/KNNtest/HousePrice/One_hot.csv') # save the one-hot encoded data
# print(all_dummy_df.isnull().sum().sort_values(ascending=False).head(20)) # count the missing values in every feature
# Fill the gaps with the column means
mean_cols = all_dummy_df.mean()
all_dummy_df = all_dummy_df.fillna(mean_cols)
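# Optional sanity check (my own addition): no missing values should remain after the fill.
assert all_dummy_df.isnull().sum().sum() == 0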
# Standardize the numeric data: regression models generally expect features on a standard scale, (x - mean) / std
numeric_cols = all_df.columns[all_df.dtypes != 'object'] # find the numeric columns
numeric_cols_means = all_dummy_df.loc[:, numeric_cols].mean()
numeric_cols_std = all_dummy_df.loc[:, numeric_cols].std() # standard deviation s
all_dummy_df.loc[:, numeric_cols] = (all_dummy_df.loc[:, numeric_cols] - numeric_cols_means) / numeric_cols_std
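# Optional check (my own addition, commented out like the other diagnostics here): after
# standardization the numeric columns should have mean ~ 0 and std ~ 1, up to floating-point error.
# print(all_dummy_df.loc[:, numeric_cols].mean().abs().max())
# print((all_dummy_df.loc[:, numeric_cols].std() - 1).abs().max())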
'''
3. Build the models
Bagging trains many small learners, each on a random subset of the training data, and combines
their outputs (majority vote for classifiers; for regressors the predictions are averaged).
(1) First test a plain Ridge model: the CV error bottoms out around 0.135 at alpha = 15.
(2) Bagging 25 Ridge models brings the error below 0.133.
(3) Boosting is, in theory, a step up from Bagging. It also assembles a batch of learners, but
arranges them sequentially: each learner puts extra weight on the samples the previous one
handled poorly, so the next learner studies those parts more "deeply".
(4) The XGBoost regressor. Works remarkably well!
'''
# Split the data back into the training and test sets
dummy_train_df = all_dummy_df.loc[train_df.index]
dummy_test_df = all_dummy_df.loc[test_df.index]
X_train = dummy_train_df.values
X_test = dummy_test_df.values
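# The same 10-fold CV pattern repeats for every model below; a small helper would remove the
# duplication. This is a sketch of my own (the name rmse_cv is not in the original code, and
# the blocks below keep the original inline form instead of calling it):
def rmse_cv(model):
    # RMSE of the log-price, averaged over 10 folds
    return np.sqrt(-cross_val_score(model, X_train, y_train, cv=10,
                                    scoring='neg_mean_squared_error')).mean()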
'''
(1) Ridge model test
'''
# alphas = np.logspace(-3, 2, 50) # -3 and 2 are the start and end exponents (powers of 10); 50 is the number of points
# test_scores = []
# for alpha in alphas:
#     clf = Ridge(alpha)
#     test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error'))
#     test_scores.append(np.mean(test_score))
# plt.plot(alphas, test_scores)
# plt.title('Alpha vs CV Error')
# plt.show() # the plot shows the CV error bottoms out around alpha = 15, at roughly 0.135
'''
(2) Bagging test
'''
# ridge = Ridge(15) # alpha = 15, from the Ridge test above
# params = [1, 10, 15, 20, 25, 30, 40, 60]
# test_scores = []
# for param in params:
#     clf = BaggingRegressor(n_estimators=param, base_estimator=ridge) # base model is the Ridge above
#     test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error'))
#     test_scores.append(np.mean(test_score))
# plt.plot(params, test_scores)
# plt.title("n_estimators vs CV Error")
# plt.show()
'''
(3) Boosting test *it actually performed poorly here; the error grows as param increases*
'''
# params = [10, 15, 20, 25, 30, 35, 40, 45, 50]
# test_scores = []
# for param in params:
#     clf = AdaBoostRegressor(n_estimators=param)
#     test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error'))
#     test_scores.append(np.mean(test_score))
# plt.plot(params, test_scores)
# plt.title("n_estimators vs CV Error")
# plt.show()
'''
(4) XGBoost: at max_depth = 5 the CV error drops to about 0.127, though it runs somewhat slower
'''
params = [1,2,3,4,5,6]
test_scores = []
for param in params:
    clf = XGBRegressor(max_depth=param)
    test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))
plt.plot(params, test_scores)
plt.title("max_depth vs CV Error")
plt.show()
The final choice is therefore the XGBoost model:
clf = XGBRegressor(max_depth=5)
clf.fit(X_train,y_train)
price_predict = np.expm1(clf.predict(X_test))
submission_df = pd.DataFrame(data={'ID':test_df.index,'SalePrice':price_predict})
print(submission_df.head(10))
Sample output:
ID SalePrice
0 1461 121317.992188
1 1462 160514.984375
2 1463 184872.671875
3 1464 188044.953125
4 1465 187002.328125
5 1466 176525.343750
6 1467 176312.000000
7 1468 163346.546875
8 1469 190007.890625
9 1470 122269.976562
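To actually submit to Kaggle, the predictions must be written to a CSV file. A minimal sketch
(the output path is my own assumption; note that the competition's sample submission names the
index column 'Id' rather than 'ID'):
submission_df.to_csv('G:/KNNtest/HousePrice/submission.csv', index=False) # path is an assumption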