import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Print DataFrames without truncation: a limit of None means "show every
# column / every row".
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
# Use the SimHei font so that Chinese labels are rendered correctly in plots.
plt.rcParams["font.sans-serif"] = ["SimHei"]
# Needed so that the minus sign is displayed correctly with that font.
plt.rcParams["axes.unicode_minus"] = False
# Load the training and test sets. index_col=0 turns the first CSV column
# into the DataFrame index instead of keeping it as a regular column.
train_df = pd.read_csv(
    "C:/Users/13369/Desktop/kaggle/kaggle/house price/input/train.csv",
    index_col=0,
)
test_df = pd.read_csv(
    "C:/Users/13369/Desktop/kaggle/kaggle/house price/input/test.csv",
    index_col=0,
)

# Quick sanity check of what was loaded: first rows and (rows, columns).
print(train_df.head())
print(train_df.shape)
# Put the raw sale price next to its log1p transform so both distributions
# can be compared in one histogram figure.
prices = pd.DataFrame(
    {
        "price": train_df["SalePrice"],
        "log(price+1)": np.log1p(train_df["SalePrice"]),
    }
)

# Author's notes (translated):
# - The raw label is not smooth, so it is first "smoothed" (made closer to a
#   normal distribution) to help the model learn more accurately. Skipping
#   this step is a common reason for results falling short.
# - log1p, i.e. log(x + 1), is used for the transform; it stays well-defined
#   at x = 0.
# - Because the model is trained on the transformed label, predictions must be
#   transformed back at the end: log1p() is undone with expm1(), log() with
#   exp(), and so on.
prices.hist()
# Remove the target from the training features and keep it in log1p space;
# predictions made from it have to be mapped back with expm1.
y_train = np.log1p(train_df.pop("SalePrice"))

# Stack train and test rows so both receive exactly the same preprocessing.
all_df = pd.concat((train_df, test_df), axis=0)

# MSSubClass holds numeric codes; convert them to strings so the column is
# one-hot encoded below instead of being treated as a number.
all_df["MSSubClass"] = all_df["MSSubClass"].astype(str)
print(all_df["MSSubClass"].value_counts())  # occurrences of each code

# One-hot encode every non-numeric column.
all_dummy_df = pd.get_dummies(all_df)

# Fill the remaining missing values with the mean of their column.
mean_cols = all_dummy_df.mean()
all_dummy_df = all_dummy_df.fillna(mean_cols)

# Standardize the columns that were numeric before one-hot encoding.
# select_dtypes is used rather than comparing dtypes with "object" so that
# string columns are excluded whichever dtype pandas stores them with.
numeric_cols = all_df.select_dtypes(include="number").columns

# Mean and standard deviation needed for the standardization.
numeric_cols_means = all_dummy_df.loc[:, numeric_cols].mean()
numeric_cols_std = all_dummy_df.loc[:, numeric_cols].std()

# Replace the columns outright (instead of writing into them in place) so
# integer columns can take the floating-point standardized values.
all_dummy_df[numeric_cols] = (
    all_dummy_df[numeric_cols] - numeric_cols_means
) / numeric_cols_std

# Split the combined frame back into training and test rows by index label.
dummy_train_df = all_dummy_df.loc[train_df.index]
dummy_test_df = all_dummy_df.loc[test_df.index]
# Ridge regression: for a data set with many features, this model makes it
# easy to feed in every variable without manual selection.
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

# Convert to plain float arrays; a frame mixing boolean dummy columns with
# float columns could otherwise come out as an object array.
x_train = dummy_train_df.to_numpy(dtype=float)
x_test = dummy_test_df.to_numpy(dtype=float)

# Try 50 regularization strengths, log-spaced from 1e-3 to 1e2, and record
# the mean 10-fold cross-validated RMSE (on the log1p target) for each.
alphas = np.logspace(-3, 2, 50)
test_scores = []
for alpha in alphas:
    clf = Ridge(alpha=alpha)
    # The scorer returns negated MSE, so negate before taking the root.
    test_score = np.sqrt(
        -cross_val_score(
            clf, x_train, y_train, cv=10, scoring="neg_mean_squared_error"
        )
    )
    test_scores.append(np.mean(test_score))
# NOTE(review): printed once after the loop; the original indentation was
# lost, so confirm this was not meant to print on every iteration.
print(test_scores)

plt.figure()
plt.plot(alphas, test_scores)
plt.title("alpha vs cv error")
plt.show()  # author's note: alpha around 15 gave the best result
# Random forest: scan the fraction of features considered at each split and
# record the mean 5-fold cross-validated RMSE (on the log1p target) for each.
from sklearn.ensemble import RandomForestRegressor

max_features = [.1, .3, .5, .7, .9, .99]
test_scores_rf = []
for max_feature in max_features:
    clf_rf = RandomForestRegressor(n_estimators=200, max_features=max_feature)
    # The scorer returns negated MSE, so negate before taking the root.
    test_score_rf = np.sqrt(
        -cross_val_score(
            clf_rf, x_train, y_train, cv=5, scoring="neg_mean_squared_error"
        )
    )
    test_scores_rf.append(np.mean(test_score_rf))

plt.figure()
plt.plot(max_features, test_scores_rf)
plt.title("Max Features vs CV Error")
plt.show()  # author's note: max_features around 0.3 gave the best result
# Fit both models on the full training set, using the hyper-parameters the
# author picked from the cross-validation scans.
ridge = Ridge(alpha=15)
rf = RandomForestRegressor(n_estimators=500, max_features=.3)
ridge.fit(x_train, y_train)
rf.fit(x_train, y_train)

# The models were trained on log1p(SalePrice), so undo that with expm1 to get
# predictions in the original price scale. Each model predicts separately.
y_ridge = np.expm1(ridge.predict(x_test))
y_rf = np.expm1(rf.predict(x_test))

# Ensemble: plain average of the two models' test-set predictions.
y_final = (y_ridge + y_rf) / 2

# Submission table: one row per test-set index value with its predicted price.
submission_df = pd.DataFrame(data={"Id": test_df.index, "SalePrice": y_final})