Kaggle House Price Prediction ----- Python

This post is based on material from 七月在线 (July Online). All data comes from the Kaggle website.

import numpy as np
import pandas as pd
from pandas import Series,DataFrame
import matplotlib.pyplot as plt
from pylab import mpl
mpl.rcParams['font.sans-serif'] = ['SimHei']        #specify the default font
from sklearn.linear_model import Ridge
from sklearn.ensemble import BaggingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor



#Load the data
train_df = pd.read_csv('G:/KNNtest/HousePrice/train.csv',index_col=0)
test_df = pd.read_csv('G:/KNNtest/HousePrice/test.csv',index_col=0)

'''
    1. Since the Kaggle project provides both train_df and test_df, they can be concatenated and
    preprocessed together; the label column SalePrice must be popped out of train_df first.
'''
#log1p, i.e. log(x + 1), smooths out the label distribution
prices = pd.DataFrame({"price":train_df["SalePrice"], "log(price + 1)":np.log1p(train_df["SalePrice"])})

# plt.ylabel(u"count")
# plt.title(u"house sale price")
# prices.hist()
# plt.show()
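Since the predictions are mapped back to prices with np.expm1 at the very end, it helps to remember that expm1 is the exact inverse of log1p. A quick sanity check (the example prices below are made up and not part of the original script):

# expm1(log1p(x)) == x, so the final predictions can be converted back
# to the original price scale (the example prices are made up).
sample_prices = np.array([50000.0, 129500.0, 755000.0])
assert np.allclose(np.expm1(np.log1p(sample_prices)), sample_prices)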

y_train = np.log1p(train_df.pop('SalePrice'))
all_df = pd.concat((train_df,test_df),axis=0)

'''
    2. Feature engineering: convert variables so that inconveniently typed or inconsistent data becomes usable.
'''
#!!!!! MSSubClass is really a categorical feature, so convert it to string first
all_df['MSSubClass'] = all_df['MSSubClass'].astype(str)
# print(all_df['MSSubClass'].value_counts())
# print(pd.get_dummies(all_df['MSSubClass'],prefix='MSSubClass').head(5))

#One-hot encode all categorical features
all_dummy_df = pd.get_dummies(all_df)
# all_dummy_df.to_csv('G:/KNNtest/HousePrice/One_hot.csv')      #save the data after one-hot encoding all categorical features

# print(all_dummy_df.isnull().sum().sort_values(ascending=False).head(20))      #count the missing values in every feature
#Fill the missing values with the column means
mean_cols = all_dummy_df.mean()
all_dummy_df = all_dummy_df.fillna(mean_cols)

#Standardize the numeric columns; regression models generally expect features on a standard scale: (x - mean) / std
numeric_cols = all_df.columns[all_df.dtypes != 'object']     #find the numeric columns
numeric_cols_means = all_dummy_df.loc[:,numeric_cols].mean()
numeric_cols_std = all_dummy_df.loc[:,numeric_cols].std()      #standard deviation s
all_dummy_df.loc[:,numeric_cols] = (all_dummy_df.loc[:,numeric_cols] - numeric_cols_means)/numeric_cols_std
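An optional check, in the same spirit as the commented-out inspection above, that the scaling worked: the numeric columns should now have mean close to 0 and standard deviation close to 1.

# print(all_dummy_df.loc[:, numeric_cols].mean().abs().max())      #should be close to 0
# print(all_dummy_df.loc[:, numeric_cols].std().mean())            #should be close to 1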


'''
    3. Build the models
    Bagging puts many small learners together, trains each on a random subset of the data, and then
    combines their outputs (majority voting for classification, averaging for regression).

    (1) First test a Ridge model: the CV error is lowest, about 0.135, at alpha = 15.
    (2) Bagging 25 Ridge models brings the CV error below 0.133.
    (3) Boosting is, in theory, a step beyond Bagging. It also assembles a collection of learners, but arranges
    them sequentially: each learner puts more weight on the samples the previous one handled poorly, so it
    learns those regions more "deeply".
    (4) The XGBoost regressor works remarkably well!
'''
#Split the data back into the training set and the test set
dummy_train_df = all_dummy_df.loc[train_df.index]
dummy_test_df = all_dummy_df.loc[test_df.index]

X_train = dummy_train_df.values
X_test = dummy_test_df.values

'''
(1) Ridge model test
'''
# alphas = np.logspace(-3, 2, 50) #the endpoints are powers of 10 (10^-3 to 10^2), 50 is the number of values
# test_scores = []
# for alpha in alphas:
#     clf = Ridge(alpha)
#     test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error'))
#     test_scores.append(np.mean(test_score))
# plt.plot(alphas,test_scores)
# plt.title('Alpha VS CV Error')
# plt.show()                                 #the plot shows the CV error reaches its minimum, about 0.135, around alpha = 15
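An alternative to reading alpha off the plot is sklearn's RidgeCV, which searches the same grid of alphas in one call. A minimal optional sketch, not part of the original write-up, kept commented out like the exploratory block above:

# from sklearn.linear_model import RidgeCV
# ridge_cv = RidgeCV(alphas=np.logspace(-3, 2, 50), scoring='neg_mean_squared_error', cv=10)
# ridge_cv.fit(X_train, y_train)
# print(ridge_cv.alpha_)                     #alpha chosen by cross-validation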


'''
(2) Bagging test
'''
# ridge = Ridge(15)                           #alpha taken from the Ridge test above
# params = [1, 10, 15, 20, 25, 30, 40,60]
# test_scores = []
# for param in params:
#     clf = BaggingRegressor(n_estimators=param, base_estimator=ridge)            #the base estimator is Ridge
#     test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error'))
#     test_scores.append(np.mean(test_score))
# plt.plot(params, test_scores)
# plt.title("n_estimator VS CV Error")
# plt.show()


'''
(3) Boosting test *in practice it did not help: the CV error actually grows as param increases*
'''

# params = [10,15,20,25,30,35,40,45,50]
# test_scores = []
# for param in params:
#     clf = AdaBoostRegressor(n_estimators=param)
#     test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error'))
#     test_scores.append(np.mean(test_score))
# plt.plot(params, test_scores)
# plt.title("n_estimator VS CV Error")
# plt.show()

'''
(4) XGBoost: with max_depth = 5 the CV error drops to about 0.127, though it runs somewhat slowly
'''
params = [1,2,3,4,5,6]
test_scores = []
for param in params:
    clf = XGBRegressor(max_depth=param)
    test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))
plt.plot(params, test_scores)
plt.title("max_depth VS CV Error")
plt.show()
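The best depth can also be read directly from the CV scores instead of off the plot; a small optional addition that reuses the params and test_scores lists filled in by the loop above:

best_depth = params[int(np.argmin(test_scores))]      #depth with the lowest mean CV error
print('best max_depth:', best_depth, 'CV error:', min(test_scores))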

The final choice is therefore the XGBoost model:

clf = XGBRegressor(max_depth=5)
clf.fit(X_train,y_train)
Price_Predict = np.expm1(clf.predict(X_test))
submission_df = pd.DataFrame(data={'ID':test_df.index,'SalePrice':Price_Predict})
print(submission_df.head(10))

Sample output:

     ID      SalePrice
0  1461  121317.992188
1  1462  160514.984375
2  1463  184872.671875
3  1464  188044.953125
4  1465  187002.328125
5  1466  176525.343750
6  1467  176312.000000
7  1468  163346.546875
8  1469  190007.890625
9  1470  122269.976562

Process finished with exit code 0
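To actually submit to Kaggle, the predictions still need to be written to a CSV file. A minimal sketch, assuming the same directory as the input data (the file name is illustrative); note that the competition's sample_submission.csv uses the column name "Id", so the 'ID' column above may need to be renamed accordingly:

# Write the submission file (path and file name are illustrative).
# Kaggle's sample_submission.csv uses the columns "Id" and "SalePrice".
submission_df.to_csv('G:/KNNtest/HousePrice/submission.csv', index=False)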
