文章目录
- 准备工作
- 一、特征构造
- 1.1 分类型变量—重分组
- 1.2 分类型变量—one-hot编码
- 2.1 连续型变量—非线性衍生
- 2.2 连续型变量—简单组合
- 2.3 连续型变量—正态转换
- 2.4 连续型变量—无量纲化转换
- 整合处理
开发环境 jupyter notebook
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
from scipy import stats
from scipy.stats import norm
from scipy.stats import skew
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
# Show the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
准备工作
# Execute the data-cleaning notebook before anything else. `train` is never
# defined in this file, so it presumably comes from that notebook - verify.
%run ".基于Kaggle的经典AI项目二—数据清洗.ipynb"
def anova(train, categorical, y):
    """Rank categorical features by a one-way ANOVA against a target column.

    For each feature in ``categorical`` the rows of ``train`` are split by the
    feature's distinct non-null values, and ``scipy.stats.f_oneway`` is run on
    the ``y`` values of those groups.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame holding both the categorical columns and the target column.
    categorical : list of str
        Names of the categorical columns to test.
    y : str
        Name of the target column.

    Returns
    -------
    pandas.DataFrame
        Indexed by feature name, with columns ``feature``, ``pval`` (the
        ANOVA p-value) and ``disparity`` (``log(20 / pval) / log(20)``),
        sorted by ascending ``pval``. A feature with fewer than two distinct
        non-null levels gets ``NaN`` in both columns and sorts last.
    """
    anv = pd.DataFrame(index=categorical)
    anv['feature'] = categorical
    pvals = []
    for c in categorical:
        samples = [
            train.loc[train[c] == cls, y].values
            for cls in train[c].dropna().unique()
        ]
        # ANOVA needs at least two groups; f_oneway rejects fewer, so record
        # "no result" instead of aborting the whole ranking.
        if len(samples) < 2:
            pvals.append(np.nan)
            continue
        pvals.append(stats.f_oneway(*samples)[1])
    anv['pval'] = np.asarray(pvals, dtype=float)
    # A p-value that underflows to 0 yields an infinite disparity; that is the
    # intended ranking, so silence the divide-by-zero warning.
    with np.errstate(divide='ignore'):
        anv['disparity'] = np.log(20 * 1. / anv['pval'].values) / np.log(20)
    return anv.sort_values('pval')
# Columns stored with dtype ``object`` are treated as the categorical features.
cate_feature = [column for column in train.columns
                if train.dtypes[column] == 'object']
y = 'SalePrice'
corr_cate = anova(train, cate_feature, y)

# One row per column of ``train``: dtype, number of distinct values,
# correlation with SalePrice (numeric columns) and ANOVA disparity
# (categorical columns).
train_ana = pd.DataFrame()
train_ana['feature_type'] = train.dtypes
train_ana['cate_cnt'] = train.apply(lambda x: x.nunique())
# Correlate numeric columns only: DataFrame.corr() no longer drops
# non-numeric columns silently and raises on object columns in pandas >= 2.0.
train_ana['conti_corr'] = train.select_dtypes(include=[np.number])\
    .corr()['SalePrice']
train_ana['cate_corr'] = corr_cate['disparity']
一、特征构造
1.1 分类型变量—重分组
# Neighborhoods ordered by their median SalePrice, cheapest first.
# Only SalePrice is aggregated: taking the median of every column raises on
# object columns in pandas >= 2.0, and only SalePrice is needed here anyway.
neighborhood_order = train.groupby('Neighborhood')['SalePrice']\
    .median()\
    .sort_values()\
    .index

# Price distribution per neighborhood.
plt.figure(figsize=(14, 4))
sns.boxplot(x='Neighborhood', y='SalePrice',
            data=train, order=neighborhood_order)
plt.xticks(rotation=90)

# Number of samples per neighborhood, in the same order.
plt.figure(figsize=(14, 4))
sns.countplot(x='Neighborhood', data=train,
              order=neighborhood_order)
plt.xticks(rotation=90)
# Merge pairs of neighborhoods into a single level each; every other
# neighborhood keeps its original label.
neighborhood_merge_map = {
    'IDOTRR': 'IDOTRR-BrDale',
    'BrDale': 'IDOTRR-BrDale',
    'Blueste': 'Blueste-SWISU',
    'SWISU': 'Blueste-SWISU',
    'NPkVill': 'NPkVill-Mitchel',
    'Mitchel': 'NPkVill-Mitchel',
}
train["SimpleNeighborhood"] = train["Neighborhood"].replace(neighborhood_merge_map)

# Compare the ANOVA ranking of the original and the regrouped feature.
anova(train, ['Neighborhood', 'SimpleNeighborhood'], y)
| ~ | feature | pval | disparity |
|---|---|---|---|
| SimpleNeighborhood | SimpleNeighborhood | 1.725211e-243 | 187.593052 |
| Neighborhood | Neighborhood | 1.019383e-240 | 185.462820 |
1.2 分类型变量—one-hot编码
# Select the string-typed columns. The dtype is given as 'object' because the
# ``np.object`` alias was removed in NumPy 1.24.
featurego_cat_column = train.select_dtypes(include=['object']).columns
# One indicator column per (feature, level) pair.
train_onehot = pd.get_dummies(train[featurego_cat_column])
train_onehot.columns
2.1 连续型变量—非线性衍生
# Non-object features ranked by their correlation with SalePrice.
is_continuous = train_ana['feature_type'] != 'object'
train_ana[is_continuous].sort_values(by='conti_corr', ascending=False)
train_ana.head()
| ~ | feature_type | cate_cnt | conti_corr | cate_corr |
|---|---|---|---|---|
| Id | int64 | 1458 | -0.027300 | NaN |
| MSSubClass | object | 15 | NaN | 65.931200 |
| MSZoning | object | 5 | NaN | 29.838963 |
| LotFrontage | float64 | 111 | 0.209700 | NaN |
| LotArea | float64 | 1039 | 0.421355 | NaN |
# Square, cube and square-root versions of two base features. The suffixes
# differ per base column, so they are spelled out explicitly.
nonlinear_suffixes = {
    "OverallQual": ("-s2", "-s3", "-Sq"),
    "GrLivArea": ("-2", "-3", "-Sq"),
}
for base_col, (sq_sfx, cube_sfx, root_sfx) in nonlinear_suffixes.items():
    train[base_col + sq_sfx] = train[base_col] ** 2
    train[base_col + cube_sfx] = train[base_col] ** 3
    train[base_col + root_sfx] = np.sqrt(train[base_col])

# Correlation of each base and derived feature with SalePrice.
nonlinear_check_cols = [
    'SalePrice',
    'OverallQual', 'OverallQual-s2', 'OverallQual-s3', 'OverallQual-Sq',
    'GrLivArea', 'GrLivArea-2', 'GrLivArea-3', 'GrLivArea-Sq',
]
train[nonlinear_check_cols].corr()['SalePrice']
2.2 连续型变量—简单组合
# Total bathrooms, with each half bath counted as 0.5.
train["TotalBath"] = (
    train["BsmtFullBath"]
    + (0.5 * train["BsmtHalfBath"])
    + train["FullBath"]
    + (0.5 * train["HalfBath"])
)
# Above-ground living area plus basement area.
train["AllSF"] = train["GrLivArea"] + train["TotalBsmtSF"]
# First-floor plus second-floor area.
train["AllFlrsSF"] = train["1stFlrSF"] + train["2ndFlrSF"]
# Sum of the four porch area columns.
train["AllPorchSF"] = (
    train["OpenPorchSF"]
    + train["EnclosedPorch"]
    + train["3SsnPorch"]
    + train["ScreenPorch"]
)

# Correlation of the combined features with SalePrice.
combined_check_cols = ['TotalBath', 'AllSF', 'AllFlrsSF', 'AllPorchSF', 'SalePrice']
train[combined_check_cols].corr()['SalePrice']
2.3 连续型变量—正态转换
# Distribution of the raw target against a fitted normal curve.
sns.distplot(train['SalePrice'], fit=norm)
# log(1 + x) transform of the target.
train['SalePrice_log'] = np.log1p(train['SalePrice'])
# Probability plot of the raw SalePrice.
stats.probplot(train['SalePrice'], plot=plt)
# Features most correlated with the log target. Only numeric columns are
# correlated: DataFrame.corr() raises on object columns in pandas >= 2.0.
train.select_dtypes(include=[np.number]).corr()\
    .sort_values('SalePrice_log', ascending=False)\
    [['SalePrice', 'SalePrice_log']].head()
# Numeric predictors only; both target columns are excluded.
train_num = train.select_dtypes(include=[np.number])\
    .drop(columns=['SalePrice', 'SalePrice_log'])
# Sample skewness of each numeric column, ignoring missing values.
skewness = train_num.apply(lambda col: skew(col.dropna()))
# Keep the columns whose absolute skewness exceeds 0.75.
skewness = skewness[skewness.abs() > 0.75]
skewness
# Apply log(1 + x) to those columns in place.
featurego_skewed_features = skewness.index
train[featurego_skewed_features] = np.log1p(train[featurego_skewed_features])
2.4 连续型变量—无量纲化转换
from sklearn.preprocessing import MinMaxScaler

# Numeric predictors only; both target columns are excluded.
train_num = train.select_dtypes(include=[np.number]).drop(['SalePrice', 'SalePrice_log'], axis=1)
featurego_scaler_numcolumn = train_num.columns

# Fit the scaler once. The previous fit_transform() result was thrown away
# and the same data was then transformed a second time below.
featurego_min_max_scaler = MinMaxScaler()
featurego_min_max_scaler.fit(train_num)

# Scaled values, rounded to two decimals, stored under '<column>_minmax'.
train_num_minmax = np.round(featurego_min_max_scaler
                            .transform(train[featurego_scaler_numcolumn]), 2)
train_num_minmax = pd.DataFrame(train_num_minmax,
                                columns=featurego_scaler_numcolumn + '_minmax',
                                index=train.index)
整合处理
# Append the one-hot and min-max columns side by side, then drop the raw
# SalePrice column.
train = pd.concat(
    [train, train_onehot, train_num_minmax],
    axis=1,
).drop(columns=['SalePrice'])