1.数据集
链接:https://pan.baidu.com/s/1GPpCp04MPhCRjmoXr59kiQ
提取码:8ko6
2.数据集读取
# Load the training and test splits from CSV files in the working directory.
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv")
)

# Report the size of each split; the original author recorded
# (1460, 81) for train and (1459, 80) for test.
print("train_data.shape:", train_data.shape)
print("test_data.shape:", test_data.shape)
# Correlation analysis of the training data.
# Restrict to numeric columns before calling corr(): with pandas >= 2.0,
# DataFrame.corr() no longer drops non-numeric columns silently and raises
# on object (string) columns instead.
corrmat = train_data.select_dtypes(include="number").corr()
plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=0.9, square=True)

# Heatmap of the k columns with the largest correlation to 'SalePrice'
# (nlargest includes 'SalePrice' itself, whose self-correlation is 1).
k = 5
# Start a new figure so the second heatmap does not draw over the first.
plt.figure(figsize=(12, 9))
cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index
# np.corrcoef treats each row as one variable, hence the transpose.
cm = np.corrcoef(train_data[cols].values.T)
sns.set(font_scale=1.25)
hm = sns.heatmap(
    cm,
    cbar=True,
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size': 10},
    yticklabels=cols.values,
    xticklabels=cols.values,
)
plt.show()
3.数据集预处理
# Preprocessing: standardize numeric features and one-hot encode the rest.
# The first column of both frames is skipped (presumably an Id column --
# confirm against the CSV) and the last training column is the label, so it
# is excluded from the features. Train and test are stacked so that both
# receive exactly the same transformation and the same dummy columns.
all_features = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))

# Column labels of all non-object (numeric) features.
numeric_features_idx = all_features.dtypes[all_features.dtypes != 'object'].index

# Standardize every numeric column to zero mean and unit standard deviation.
all_features[numeric_features_idx] = all_features[numeric_features_idx].apply(
    lambda x: (x - x.mean()) / x.std()
)
# After standardization the column mean is 0, so filling with 0 is mean
# imputation. Only numeric columns are filled: filling the categorical
# columns too would turn their missing values into a bogus "0" category and
# leave the dummy_na indicator columns below permanently empty.
all_features[numeric_features_idx] = all_features[numeric_features_idx].fillna(0)

# One-hot encode categorical features; dummy_na=True adds an indicator
# column for missing values in each categorical feature.
all_features = pd.get_dummies(all_features, dummy_na=True)

num_train = len(train_data)
# Use the builtin float: the np.float alias was removed in NumPy 1.24.
train_datasets = np.array(all_features[:num_train].values, dtype=float)
test_datasets = np.array(all_features[num_train:].values, dtype=float)
# Read the label column directly so it is a numeric array rather than a
# slice of the mixed-type object array produced by train_data.values.
train_label = train_data.iloc[:, -1].to_numpy(dtype=float)

print("是否保存数据集:填True 或 No")
save = input()
# Only the exact answer "True" triggers saving; any other input skips it.
if save == "True":
    # Training file layout: all feature columns followed by the label column.
    save_train_data = np.column_stack((train_datasets, train_label))
    pd.DataFrame(save_train_data).to_csv(
        'house_price_train.csv', index=False, header=False, float_format='%.6f'
    )
    pd.DataFrame(test_datasets).to_csv(
        'house_price_test.csv', index=False, header=False, float_format='%.6f'
    )
    # NOTE(review): the original indentation was lost; both messages are
    # assumed to belong to the save branch.
    print("保存成功")
    print("查看文件吧.....")
4.模型构建
首先导入所需的包(包括 xgboost)。
如果导入时报错(例如提示找不到 xgboost 模块),说明该包尚未安装。
解决方法:pip install xgboost -i https://pypi.tuna.tsinghua.edu.cn/simple
模型运行结果有点问题,完善后补充代码
5.模型评价方法
def rmse_cv(model, X, y, cv=5):
    """Return the cross-validated root-mean-squared error of *model*.

    Args:
        model: Estimator passed through to ``cross_val_score``.
        X: Feature matrix.
        y: Target values.
        cv: Cross-validation setting forwarded to ``cross_val_score``;
            defaults to 5, the value that was previously hard-coded.

    Returns:
        The square root of the negated ``neg_mean_squared_error`` scores
        returned by ``cross_val_score`` (presumably scikit-learn's, which
        yields one score per fold -- confirm against the file's imports).
    """
    # The scorer reports negative MSE (higher is better), so flip the sign
    # before taking the square root.
    rmse = np.sqrt(
        -cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=cv)
    )
    return rmse