import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
df_train = pd.read_csv(r'data\train.csv')
print(df_train.shape)
df_train['SalePrice'].describe()
print('Skewness: %f' % df_train['SalePrice'].skew())
print('Kurtosis: %f' % df_train['SalePrice'].kurt())
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.distplot(df_train['SalePrice'])
# Recorded output:
# Skewness: 1.882876
# Kurtosis: 6.536282
# Scatter plots of SalePrice against two area features:
#   GrLivArea   - living area in square feet
#   TotalBsmtSF - basement area in square feet
for area_col in ('GrLivArea', 'TotalBsmtSF'):
    data = pd.concat([df_train['SalePrice'], df_train[area_col]], axis=1)
    data.plot.scatter(x=area_col, y='SalePrice')

# Box plots of SalePrice grouped by a categorical feature:
#   OverallQual  - overall material and finish quality
#   Neighborhood - drawn on a wider figure
for columns, x_col, fig_size in (
    (['SalePrice', 'OverallQual'], 'OverallQual', (8, 6)),
    (['Neighborhood', 'SalePrice'], 'Neighborhood', (10, 6)),
):
    data = df_train[columns]
    plt.subplots(figsize=fig_size)
    sns.boxplot(x=x_col, y='SalePrice', data=data)
# Rotate the tick labels of the last (Neighborhood) plot.
plt.xticks(rotation=60);
# Pairwise correlations between the numeric columns. The frame is restricted
# to numeric dtypes first: on pandas >= 2.0 DataFrame.corr() no longer drops
# string columns on its own and raises on them instead.
corrmat = df_train.select_dtypes(include=[np.number]).corr()
plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, square=True, cmap='Greens');

# The ten columns with the largest correlation to SalePrice.
cols = corrmat.nlargest(10, 'SalePrice')['SalePrice'].index
# Correlation matrix of just those columns (rows of the transposed values
# array are the variables).
cm = np.corrcoef(df_train[cols].values.T)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', cmap='Blues',
            annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)

# Pairwise scatter plots for a hand-picked subset of columns.
cols = ['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt']
# NOTE(review): newer seaborn releases call this parameter `height` rather
# than `size` -- confirm against the installed seaborn version.
sns.pairplot(df_train[cols], size=3);
# Per-column count and fraction of missing values, largest first.
null_counts = df_train.isnull().sum()
total_missing = null_counts.sort_values(ascending=False)
percent = (null_counts / len(df_train)).sort_values(ascending=False).round(3)
missing_data = pd.concat(
    [total_missing, percent],
    axis=1,
    keys=['Total', 'Percent'],
)
missing_data.head()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Load both data sets. Forward-slash paths are used because the previous
# backslash literals only resolve on Windows; forward slashes are accepted by
# Python's file APIs on every platform.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
print('The train data size before dropping Id feature is: {}'.format(train.shape))
print('The test data size before dropping Id feature is: {}'.format(test.shape))
# Keep the Id columns aside for now; they are not used below.
train_ID = train['Id']
test_ID = test['Id']
# Remove Id from both frames in place.
train.drop('Id', axis=1, inplace=True)
test.drop('Id', axis=1, inplace=True)
def _scatter_area_vs_price(frame):
    """Draw SalePrice against GrLivArea for ``frame`` on a new 8x6 figure."""
    plt.figure(figsize=(8, 6))
    plt.scatter(x=frame['GrLivArea'], y=frame['SalePrice'])
    plt.xlabel('GrLivArea', fontsize=12)
    plt.ylabel('SalePrice', fontsize=12)


# Look for outliers.
_scatter_area_vs_price(train)
# Drop rows with GrLivArea above 4000 and SalePrice below 300000.
outlier_rows = train[(train['GrLivArea'] > 4000) & (train['SalePrice'] < 300000)]
train = train.drop(outlier_rows.index)
# Same plot again after the removal.
_scatter_area_vs_price(train)
from scipy.stats import norm
from scipy import stats
# Distribution of the raw target with a fitted normal curve.
plt.figure(figsize=(8, 6))
sns.distplot(train['SalePrice'], fit=norm)
(mu, sigma) = norm.fit(train['SalePrice'])
print('mu = {:.2f} and sigma = {:.2f}'.format(mu, sigma))
# Raw string: the label contains LaTeX backslashes (\mu, \sigma), which are
# invalid escape sequences in an ordinary string literal. The resulting text
# is unchanged.
plt.legend([r'Normal dist. ($\mu = $ {:.2f} and $\sigma = $ {:.2f})'.format(mu, sigma)], loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')
# Probability plot of the raw target.
fig = plt.figure(figsize=(8, 6))
stats.probplot(train['SalePrice'], plot=plt);

# Log transform log(1 + x) of the target.
train['SalePrice'] = np.log1p(train['SalePrice'])
(mu, sigma) = norm.fit(train['SalePrice'])
print('mu = {:.2f} and sigma = {:.2f}'.format(mu, sigma))
# Same two plots for the transformed target.
plt.figure(figsize=(8, 6))
sns.distplot(train['SalePrice'], fit=norm)
# Raw string for the same reason as above; text is unchanged.
plt.legend([r'Normal dist. ($\mu = $ {:.2f} and $\sigma$ = {:.2f})'.format(mu, sigma)], loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')
plt.figure(figsize=(8, 6))
stats.probplot(train['SalePrice'], plot=plt);
# Row counts, so the stacked frame can be split back apart later.
ntrain, ntest = len(train), len(test)
# Target values taken from the training frame.
y_train = train['SalePrice'].values
# Stack train on top of test with a fresh 0..n-1 index, then remove the
# target column from the combined frame.
all_data = pd.concat((train, test)).reset_index(drop=True)
all_data = all_data.drop(['SalePrice'], axis=1)
print('all_data size is: {}'.format(all_data.shape))
# Percentage of missing values per column.
all_data_na = (all_data.isnull().sum() / len(all_data)) * 100
# Keep only columns that actually have missing values; top 20, largest first.
all_data_na = all_data_na[all_data_na != 0].sort_values(ascending=False)[:20]
missing_data = pd.DataFrame({'Missing Ratio': all_data_na})
missing_data
# Bar chart of those percentages.
plt.figure(figsize=(12, 8))
sns.barplot(x=all_data_na.index, y=all_data_na)
plt.xlabel('Features')
plt.ylabel('Percent of missing values')
plt.xticks(rotation=90)
# Title spelling corrected (was "minssing").
plt.title('Percent missing data by feature');
# Columns whose missing entries are replaced by the string 'None'
# (presumably NaN means the house lacks the feature -- confirm against the
# data description): pool, misc feature, alley access, fence, fireplace,
# garage, basement, masonry veneer type, building class.
none_fill_cols = (
    'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'MasVnrType', 'MSSubClass',
)
# Columns whose missing entries are replaced by 0 (garage, basement and
# masonry veneer measurements).
zero_fill_cols = (
    'GarageYrBlt', 'GarageArea', 'GarageCars',
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    'BsmtFullBath', 'BsmtHalfBath',
    'MasVnrArea',
)
# Columns whose missing entries are replaced by the column's most frequent
# value: zoning, home functionality, electrical system, kitchen quality,
# exterior covering, sale type.
mode_fill_cols = (
    'MSZoning', 'Functional', 'Electrical', 'KitchenQual',
    'Exterior1st', 'Exterior2nd', 'SaleType',
)

for col in none_fill_cols:
    all_data[col] = all_data[col].fillna('None')
for col in zero_fill_cols:
    all_data[col] = all_data[col].fillna(0)
for col in mode_fill_cols:
    most_common = all_data[col].mode()[0]
    all_data[col] = all_data[col].fillna(most_common)

# LotFrontage (distance to the street): fill with the median of the same
# Neighborhood group.
frontage_by_area = all_data.groupby('Neighborhood')['LotFrontage']
all_data['LotFrontage'] = frontage_by_area.transform(lambda grp: grp.fillna(grp.median()))

# Utilities is dropped from the feature set entirely.
all_data = all_data.drop('Utilities', axis=1)

# Re-check: percentage of missing values for columns that still have any.
all_data_na = (all_data.isnull().sum() / len(all_data)) * 100
all_data_na = all_data_na[all_data_na != 0].sort_values(ascending=False)
missing_data = pd.DataFrame({'Missing Ratio': all_data_na})
missing_data.head()
from sklearn.preprocessing import LabelEncoder
# Columns to replace with integer codes produced by LabelEncoder.
cols = [
    'FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond',
    'ExterQual', 'ExterCond', 'HeatingQC', 'PoolQC', 'KitchenQual',
    'BsmtFinType1', 'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure',
    'GarageFinish', 'LandSlope', 'LotShape', 'PavedDrive', 'Street',
    'Alley', 'CentralAir', 'MSSubClass', 'OverallCond', 'YrSold', 'MoSold',
]
for c in cols:
    # A fresh encoder per column, fitted and applied on that column's values.
    encode = LabelEncoder()
    all_data[c] = encode.fit_transform(list(all_data[c].values))

# Add a new feature: total area (basement + first floor + second floor).
all_data['TotalSF'] = all_data['TotalBsmtSF'].add(all_data['1stFlrSF']).add(all_data['2ndFlrSF'])
from scipy.stats import norm, skew
# Skewness of every non-object column, largest first (NaNs dropped per column).
numeric_feats = all_data.dtypes[all_data.dtypes != 'object'].index
skewed_feats = all_data[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
print('Skew in numerical features:')
skewness = pd.DataFrame({'Skew': skewed_feats})
skewness.head(10)
# Keep only the features whose absolute skew exceeds 0.75. The mask must be a
# Series built from the 'Skew' column: indexing the frame with a boolean
# DataFrame (the previous form) only blanks the failing cells to NaN and
# keeps every row, so every numeric feature was counted and transformed.
skewness = skewness[skewness['Skew'].abs() > 0.75]
print('There are {} skewed numerical features to Box Cox transform'.format(skewness.shape[0]))
# The key point is choosing a suitable lambda; 0.15 is used here as an
# empirical value.
from scipy.special import boxcox1p
skewed_features = skewness.index
lam = 0.15
# Apply the Box-Cox transform of 1 + x to each selected feature.
for feat in skewed_features:
    all_data[feat] = boxcox1p(all_data[feat], lam)
all_data.head()
# One-hot encode the remaining categorical columns.
all_data = pd.get_dummies(all_data)
print(all_data.shape)
# Split the combined frame back: the first ntrain rows are the training set,
# the rest the test set.
train, test = all_data.iloc[:ntrain], all_data.iloc[ntrain:]
# Recorded output:
# (2917, 220)
from sklearn.model_selection import KFold, cross_val_score
def rmse_cv(n_folds, model):
    """Return the per-fold cross-validated RMSE of ``model`` on the training data.

    Uses the module-level ``train`` frame and ``y_train`` target.

    Parameters
    ----------
    n_folds : int
        Number of cross-validation folds.
    model : estimator
        Estimator passed to ``cross_val_score``.

    Returns
    -------
    numpy.ndarray
        One RMSE value per fold.
    """
    # Pass the splitter itself as ``cv``. Calling ``.get_n_splits(...)`` on it
    # (the previous form) returns just the integer fold count, so the shuffle
    # and random_state were silently discarded.
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    neg_mse = cross_val_score(model, train.values, y_train,
                              scoring='neg_mean_squared_error', cv=kf)
    # The scorer yields negated MSE; flip the sign before taking the root.
    return np.sqrt(-neg_mse)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import Lasso, ElasticNet
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb

# make_pipeline chains steps together; RobustScaler is used because it is
# better suited to data with outliers.
lasso = make_pipeline(
    RobustScaler(),
    Lasso(alpha=0.0005, random_state=42),
)

# ElasticNet uses both the l1 and l2 penalties.
ENet = make_pipeline(
    RobustScaler(),
    ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=42),
)

# KernelRidge: ridge regression with a kernel function.
KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)

# Gradient boosting regressor.
gboost_params = {
    'n_estimators': 3000,
    'learning_rate': 0.05,
    'max_depth': 4,
    'max_features': 'sqrt',
    'min_samples_leaf': 15,
    'loss': 'huber',
    'min_samples_split': 10,
    'random_state': 42,
}
GBoost = GradientBoostingRegressor(**gboost_params)

# XGBoost regressor.
xgb_params = {
    'colsample_bytree': 0.4603,
    'gamma': 0.0468,
    'learning_rate': 0.05,
    'max_depth': 3,
    'min_child_weight': 1.7817,
    'n_estimators': 2200,
    'reg_alpha': 0.4640,
    'reg_lambda': 0.8571,
    'subsample': 0.5213,
    'silent': 1,
    'nthread': -1,
}
model_xgb = xgb.XGBRegressor(**xgb_params)
# Score each base model with 5-fold CV and print mean (std) of the fold RMSEs.
for report_fmt, estimator in (
    ('Lasso score: {:.4f} ({:.4f})', lasso),
    ('ElasticNet score: {:.4f} ({:.4f})', ENet),
    ('Kernel Ridge score: {:.4f} ({:.4f})', KRR),
    ('Gradient Boosting score: {:.4f} ({:.4f})', GBoost),
    ('Xgboost score: {:.4f} ({:.4f})', model_xgb),
):
    score = rmse_cv(5, estimator)
    print(report_fmt.format(score.mean(), score.std()))
# Recorded output:
# Lasso score: 0.1116 (0.0072)
# ElasticNet score: 0.1116 (0.0072)
# Kernel Ridge score: 0.1153 (0.0071)
# Gradient Boosting score: 0.1161 (0.0067)
# Xgboost score: 0.1168 (0.0072)
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Regressor whose prediction is the plain mean of its base models' predictions."""

    def __init__(self, models):
        # Unfitted prototype estimators; they are cloned in fit().
        self.models = models

    def fit(self, X, y):
        """Clone every prototype, fit each clone on (X, y), and return self."""
        self.models_ = []
        for prototype in self.models:
            member = clone(prototype)
            self.models_.append(member)
            member.fit(X, y)
        return self

    def predict(self, X):
        """Return the row-wise mean of the fitted clones' predictions for X."""
        per_model = [member.predict(X) for member in self.models_]
        # One column per model; average across the columns.
        return np.column_stack(per_model).mean(axis=1)
# Average of four base models, scored with 5-fold CV.
averaged_models = AveragingModels(models=(ENet, GBoost, KRR, lasso))
score = rmse_cv(n_folds=5, model=averaged_models)
mean_rmse, std_rmse = score.mean(), score.std()
print('Averaged base models score: {:.4f} ({:.4f})'.format(mean_rmse, std_rmse))
# Recorded output:
# Averaged base models score: 0.1083 (0.0069)
class StackingAverageModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Stacked regressor: a meta model trained on out-of-fold base-model predictions."""

    def __init__(self, base_models, meta_model, n_folds=5):
        # Unfitted prototypes; clones are created in fit().
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds

    def fit(self, X, y):
        """Fit per-fold clones of each base model, then the meta model; return self.

        X and y are indexed with integer index arrays, so they are expected to
        support that (e.g. numpy arrays).
        """
        self.base_models_ = [[] for _ in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        splitter = KFold(n_splits=self.n_folds, shuffle=True, random_state=42)

        # One column of out-of-fold predictions per base model.
        n_rows, n_models = X.shape[0], len(self.base_models)
        out_of_fold_predictions = np.zeros((n_rows, n_models))

        for col, prototype in enumerate(self.base_models):
            for fit_idx, held_idx in splitter.split(X, y):
                # Train a fresh clone on the fold's training part and keep it
                # for use at predict time.
                fold_model = clone(prototype)
                self.base_models_[col].append(fold_model)
                fold_model.fit(X[fit_idx], y[fit_idx])
                # Its predictions on the held-out part become meta features.
                out_of_fold_predictions[held_idx, col] = fold_model.predict(X[held_idx])

        self.meta_model_.fit(out_of_fold_predictions, y)
        return self

    def predict(self, X):
        """Predict with the meta model on fold-averaged base-model predictions."""
        averaged_columns = []
        for fold_models in self.base_models_:
            # Predictions of every fold clone of one base model, averaged.
            fold_preds = np.column_stack([fitted.predict(X) for fitted in fold_models])
            averaged_columns.append(fold_preds.mean(axis=1))
        meta_features = np.column_stack(averaged_columns)
        return self.meta_model_.predict(meta_features)
# Stack ENet, GBoost and KRR under a lasso meta model; score with 5-fold CV.
stacked_averaged_models = StackingAverageModels(
    base_models=(ENet, GBoost, KRR),
    meta_model=lasso,
)
score = rmse_cv(n_folds=5, model=stacked_averaged_models)
mean_rmse, std_rmse = score.mean(), score.std()
print('Stacking Averaged models score: {:.4f} ({:.4f})'.format(mean_rmse, std_rmse))
# Recorded output:
# Stacking Averaged models score: 0.1079 (0.0072)