刚刚接触数据处理,目前只能先学习大佬们写的 kernel。这位印度老哥的数据处理写得很详细,所以写了这篇总结帖,以供分享和日后学习。
[Top 5% on Leaderboard]
# Plot GrLivArea against SalePrice to inspect how the two are distributed.
sns.scatterplot(x='GrLivArea', y='SalePrice', data=train)
# Per the plot, the author observed two outliers (very large living area but a
# low price) that could distort the model, so those rows are removed.
outlier_rows = train.index[(train['GrLivArea'] > 4000) & (train['SalePrice'] < 200000)]
# `index=` is used instead of a positional axis argument: pandas 2.0 no longer
# accepts `drop(labels, 0)`.
train = train.drop(index=outlier_rows)
# Renumber the rows 0..n-1 so later positional logic is not affected by the gap.
train.reset_index(drop=True, inplace=True)
# Correlation of every numeric feature with the others, ordered by how strongly
# each one correlates with SalePrice (strongest first), rounded to 2 decimals.
# `Id` is dropped by keyword (a positional axis is rejected by pandas >= 2.0) and
# non-numeric columns are filtered out explicitly, because `DataFrame.corr()` no
# longer skips them silently in pandas >= 2.0.
numeric_train = train.drop(columns='Id').select_dtypes(include=['number', 'bool'])
corr = numeric_train.corr().sort_values(by='SalePrice', ascending=False).round(2)
print(corr['SalePrice'])

# Heatmap restricted to the 10 rows at the top of that ranking (SalePrice itself
# is part of the ranking).
cols = corr['SalePrice'].head(10).index
cm = np.corrcoef(train[cols].values.T)
sns.set(font_scale=1)
hm = sns.heatmap(cm, annot=True, yticklabels=cols.values, xticklabels=cols.values)
plt.show()

# Pairwise scatter plots of the same 10 columns.
sns.pairplot(train[cols])
# Count the missing entries per column, largest count first.
_null_counts = data.isnull().sum()
_ranked = _null_counts.sort_values(ascending=False)
# Discard the columns that have no missing entries at all.
_complete_columns = _ranked[_ranked == 0].index
missing = _ranked.drop(_complete_columns)
# Bare expression: shows the result when run as a notebook cell.
missing
PoolQC 2908
MiscFeature 2812
Alley 2719
Fence 2346
FireplaceQu 1420
LotFrontage 486
GarageCond 159
GarageQual 159
GarageYrBlt 159
GarageFinish 159
GarageType 157
BsmtCond 82
BsmtExposure 82
BsmtQual 81
BsmtFinType2 80
BsmtFinType1 79
MasVnrType 24
MasVnrArea 23
MSZoning 4
BsmtHalfBath 2
Utilities 2
Functional 2
BsmtFullBath 2
BsmtFinSF2 1
BsmtFinSF1 1
Exterior2nd 1
BsmtUnfSF 1
TotalBsmtSF 1
Exterior1st 1
SaleType 1
Electrical 1
KitchenQual 1
GarageArea 1
GarageCars 1
dtype: int64
# Replace missing entries with the literal category 'NA' in each of these columns.
for _col in ('PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu'):
    data[_col] = data[_col].fillna('NA')

# Inspect the distinct values each column holds after the fill
# (bare expressions: displayed when run as notebook cells).
data['PoolQC'].unique()
data['MiscFeature'].unique()
data['Alley'].unique()
data['Fence'].unique()
data['FireplaceQu'].unique()
# GarageCond, GarageQual, GarageFinish: missing entries become the category 'NA'.
for _col in ('GarageCond', 'GarageQual', 'GarageFinish'):
    data[_col] = data[_col].fillna('NA')

# Inspect the distinct values each column holds after the fill
# (bare expressions: displayed when run as notebook cells).
data['GarageCond'].unique()
data['GarageQual'].unique()
data['GarageFinish'].unique()
# BsmtExposure, BsmtCond, BsmtQual, BsmtFinType2, BsmtFinType1 and MasVnrType:
# missing entries become the category 'NA'.
_na_fill_columns = (
    'BsmtExposure',
    'BsmtCond',
    'BsmtQual',
    'BsmtFinType2',
    'BsmtFinType1',
    'MasVnrType',
)
for _col in _na_fill_columns:
    data[_col] = data[_col].fillna('NA')
# LotFrontage: the original author's reasoning is that every house has some
# street frontage, so a missing value is replaced by the column mean.
# The mean is taken over all non-missing rows currently in `data`.
_lot_frontage_mean = data['LotFrontage'].dropna().mean()
data['LotFrontage'] = data['LotFrontage'].fillna(_lot_frontage_mean)
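此处的平均值是用训练集与测试集合并后的数据(data)一起计算的。严格来说这属于 data leakage:填充值用到了测试集的统计信息。更稳妥的做法是只用训练集部分计算均值(例如 data['LotFrontage'].iloc[:trainrow].mean()),与后文 StandardScaler 只在 x_train 上 fit 的做法保持一致。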
# Columns whose missing entries are replaced with 0.
# The original repeated the BsmtFinSF1/BsmtFinSF2 lines twice, which left
# BsmtUnfSF and TotalBsmtSF (each reported with one missing value) unfilled;
# TotalBsmtSF is later raised to powers, so its NaN would spread into the
# derived TotalBsmtSF_2/_3/_4 columns.
# NOTE(review): GarageType is filled with 0 here as in the original, while
# GarageCond/GarageQual/GarageFinish are filled with 'NA' - confirm that a
# numeric 0 is really what is wanted for this column.
_zero_fill_columns = (
    'GarageYrBlt',
    'GarageType',
    'GarageArea',
    'GarageCars',
    'BsmtFinSF1',
    'BsmtFinSF2',
    'BsmtUnfSF',
    'TotalBsmtSF',
    'MasVnrArea',
)
for _col in _zero_fill_columns:
    data[_col] = data[_col].fillna(0)
# MSZoning, Utilities, KitchenQual: fill the few missing entries with the most
# frequent value of the column.
# The original used `.dropna().sort_values().index[0]`, which yields the row
# index label of the alphabetically smallest entry (a row label, not a category
# value), so the gaps were filled with a meaningless value. `mode()` ignores
# NaN and returns the most common value(s); the first one is taken.
for _col in ('MSZoning', 'Utilities', 'KitchenQual'):
    _most_common = data[_col].mode().iloc[0]
    data[_col] = data[_col].fillna(_most_common)
# Some features correlate strongly with SalePrice, so add their 2nd, 3rd and
# 4th powers as extra features named '<feature>_<power>'.
_poly_bases = ('GrLivArea', 'TotalBsmtSF', 'GarageCars', '1stFlrSF', 'GarageArea')
for _base in _poly_bases:
    for _power in (2, 3, 4):
        data['{}_{}'.format(_base, _power)] = data[_base] ** _power
# Combine first- and second-floor area into one feature, Floorfeet, and drop
# the two source columns. Columns are dropped by keyword because pandas >= 2.0
# rejects a positional axis argument.
data['Floorfeet'] = data['1stFlrSF'] + data['2ndFlrSF']
data = data.drop(columns=['1stFlrSF', '2ndFlrSF'])

# Combine all bathroom counts into one feature, Bath; half baths count as 0.5.
# BsmtFullBath and BsmtHalfBath are each reported with 2 missing values and are
# not imputed anywhere before this point, so a missing count is treated as 0
# here (the same convention used for the other missing basement numerics);
# otherwise Bath would be NaN for those rows.
_bath_columns = ['BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath']
_baths = data[_bath_columns].fillna(0)
data['Bath'] = (
    _baths['BsmtFullBath']
    + _baths['BsmtHalfBath'] * .5
    + _baths['FullBath']
    + _baths['HalfBath'] * .5
)
data = data.drop(columns=_bath_columns)
# One-hot encode MSSubClass, MSZoning and the location / building-style
# columns in a single pass. Each group of dummy columns is prefixed with its
# source column name and appended after the untouched columns, in the order
# listed here.
_one_hot_columns = [
    'MSSubClass',
    'MSZoning',
    'Street',
    'Alley',
    'LotShape',
    'LandContour',
    'Utilities',
    'LotConfig',
    'LandSlope',
    'Neighborhood',
    'Condition1',
    'Condition2',
    'BldgType',
    'HouseStyle',
]
data = pd.get_dummies(data=data, columns=_one_hot_columns)
# Bare expression: previews the first rows when run as a notebook cell.
data.head()
#lets import StandardScaler from sklearn for feature scalling
from sklearn.preprocessing import StandardScaler
# Split `data` back into its first `trainrow` rows (training part) and the
# remaining rows (test part).
x_train = data.iloc[:trainrow]
x_test = data.iloc[trainrow:]

# Fit the scaler on the training part only, then apply the same scaling to
# both parts.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)