1. Load the data
2. Categorical feature frequency statistics
3. Numerical feature visualization
4. Feature expansion
5. Outlier detection for numerical features
6. Normality testing
7. Log-transform comparison plots
8. Log-normal transformation
9. Outlier detection for numerical features (after the normal transformation)
10. Feature selection (by data distribution)
11. Feature selection (ridge regression)
12. Feature selection (stepwise regression)
13. Feature selection (xgboost)
EDA - Exploratory Data Analysis
Implementation approach, following the steps in the table of contents:
Step 1: load the data for the Tianchi used-car price prediction competition.
Step 2: inspect the data types; the features split roughly into numerical and categorical. To get a first impression of the distributions, run grouped statistics on all columns: for continuous columns this gives quantitative summaries (max/min, variance, kurtosis, and so on), and for categorical columns it shows the categories and their frequencies.
Step 3: visualize the numerical features to inspect their distributions.
Step 4: expand the features. The expansion here focuses on the registration date: the year and the month are extracted as separate features, and invalid month values are filled with the mode.
Step 5: detect outliers in the numerical features with the box-plot (IQR) rule; here the detection targets the price feature.
Step 6: run a normality test on the continuous numerical features.
Step 7: apply a log transform and compare the plots before and after. Some features become close to normal after the log transform, while for others the transform barely helps.
Step 8: apply the normal transformation to the features where it does help, then re-run the outlier detection to check its effect.
Step 9: select features. Direct observation comes first, dropping features whose sample distribution is obviously imbalanced; ridge regression, stepwise regression, and xgboost are then used for model-based selection. A minimal end-to-end driver sketch follows.
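Before the individual steps, here is a minimal driver sketch showing how the methods chain together. It assumes the methods below are collected in a hypothetical class named UsedCarEDA; the file path and column lists are illustrative, not the full competition schema.

eda = UsedCarEDA()  # hypothetical class that groups the methods shown below
train_df = eda.load_data('used_car_train_20200313.csv')            # step 1
eda.categorial_statistus(train_df, ['brand', 'gearbox'])           # step 2
eda.plot_nemurical(train_df, ['power', 'kilometer', 'price'])      # step 3
train_df = eda.categorial_extend(train_df)                         # step 4
outlier_rows = eda.detect_outliers(train_df, 0, ['price'])         # step 5
eda.normal_test(train_df)                                          # step 6
eda.log_plot(train_df)                                             # step 7
train_df = eda.change_to_nomal(train_df)                           # step 8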
1. Load the data
def load_data(self, train_data_path):
    # Initial statistics on the training data
    train_data_df = pd.read_csv(train_data_path, sep=' ')
    pd.set_option('display.max_columns', None)
    # Optionally append the test set for joint statistics:
    # test_data_path = r"C:\Users\ccs\Documents\dataWhale\used_car_testA_20200313\used_car_testA_20200313.csv"
    # test_data_df = pd.read_csv(test_data_path, sep=' ')
    # train_data_df = train_data_df.append(test_data_df)
    # print("Row count before deduplication:\n", train_data_df.count())
    # print("Row count after deduplication:\n", train_data_df.drop_duplicates().count())
    # describe() returns a DataFrame; print it so the summary also shows up in scripts
    print(train_data_df.describe(include='all'))
    # Per-column null counts
    print(train_data_df.isnull().sum())
    print(train_data_df.columns)
    return train_data_df
2. Categorical feature frequency statistics
def categorial_statistus(self, train_data_df, category_columns):
    """
    Frequency statistics for categorical (or arbitrary) features:
    for each given column, count how often every category occurs.
    """
    print(train_data_df.columns)
    for i in category_columns:
        total = pd.DataFrame({'count': train_data_df.groupby(i).size()})
        total = total.sort_values(['count'], ascending=False)
        print(total, '\n', total.count())
    return train_data_df
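A usage sketch; the column list is an illustrative subset of the categorical columns in this dataset, not an exhaustive one.

# Illustrative categorical columns; adjust to the actual schema
category_columns = ['brand', 'bodyType', 'fuelType', 'gearbox', 'notRepairedDamage']
eda.categorial_statistus(train_df, category_columns)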
3. Numerical feature visualization
# Numerical feature visualization
def plot_nemurical(self, train_data_df, numerical_columns):
    ## 3) Distribution of each continuous numerical feature
    ## (character-type columns are excluded)
    # numerical_columns = ['regDate', 'power', 'kilometer', 'creatDate', 'price', 'v_0', 'v_1', 'v_2', 'v_3',
    #                      'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12',
    #                      'v_13', 'v_14']
    f = pd.melt(train_data_df, value_vars=numerical_columns)
    g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False)
    # sns.distplot is deprecated; histplot(kde=True) is the modern equivalent
    g = g.map(sns.histplot, "value", kde=True)
    ## 4) Pairwise relationships between selected numerical features
    sns.set()
    columns = ['price', 'v_12', 'v_8', 'v_0', 'power', 'v_5', 'v_2', 'v_6', 'v_1', 'v_14']
    # pairplot's 'size' argument was renamed to 'height' in seaborn 0.9
    sns.pairplot(train_data_df[columns], height=2, kind='scatter', diag_kind='kde')
    plt.show()
4. Feature expansion
def categorial_extend(self, train_data_df):
    """
    Feature expansion based on the registration date.
    """
    def fun(x):
        # An invalid month of '00' is replaced with '03', the mode of the month values
        if str(x)[4:6] == '00':
            return str(x)[0:4] + '03' + str(x)[6:]
        else:
            return str(x)
    train_data_df['regDate'] = train_data_df['regDate'].apply(lambda x: fun(x))
    # Extract the year and the month as standalone features
    train_data_df["year_regDate"] = train_data_df['regDate'].astype("str").str[0:4]
    train_data_df["month_regDate"] = train_data_df['regDate'].astype("str").str[4:6]
    return train_data_df
5. Outlier detection for numerical features
# Outlier detection (box-plot / IQR rule)
def detect_outliers(self, df, n, features):
    """
    Return the indices of rows that have more than n outlying values across
    the given features, using the 1.5 * IQR rule.
    Requires: from collections import Counter
    """
    outlier_indices = []
    # iterate over features (columns)
    for col in features:
        # 1st quartile (25%)
        Q1 = np.percentile(df[col], 25)
        # 3rd quartile (75%)
        Q3 = np.percentile(df[col], 75)
        # interquartile range (IQR)
        IQR = Q3 - Q1
        # outlier step
        outlier_step = 1.5 * IQR
        # indices of outliers in feature col
        outlier_list_col = df[(df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step)].index
        # collect the outlier indices found for col
        outlier_indices.extend(outlier_list_col)
    # select observations containing more than n outliers
    outlier_indices = Counter(outlier_indices)
    print("outlier_indices is ", outlier_indices)
    print("outlier_indices length is ", len(outlier_indices))
    multiple_outliers = list(k for k, v in outlier_indices.items() if v > n)
    return multiple_outliers
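A usage sketch, reusing the driver objects from the overview; rows flagged on the price column are simply dropped, which is one common way to handle them.

# n=0: a row is dropped if it is an outlier on at least one of the given features
outlier_rows = eda.detect_outliers(train_df, 0, ['price'])
train_df = train_df.drop(outlier_rows, axis=0).reset_index(drop=True)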
6. Normality testing
def normal_test(self, train_data_df):
    # Normality test for the continuous columns.
    # A small p-value means the sample is unlikely to come from a normal distribution.
    # On this dataset none of the columns pass, so the important power and
    # kilometer features are candidates for transformation.
    numerical_columns = ['regDate', 'power', 'kilometer', 'creatDate', 'price', 'v_0', 'v_1', 'v_2', 'v_3',
                         'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12',
                         'v_13', 'v_14']
    train_data_df['regDate'] = train_data_df['regDate'].astype('int')
    # Do not combine assignment with inplace=True: replace(..., inplace=True)
    # returns None and would wipe out the column.
    train_data_df['notRepairedDamage'] = train_data_df['notRepairedDamage'].replace('-', np.nan)
    train_data_df = train_data_df.fillna(0)
    train_data_df.info()
    print('p-value per column; a small value means the column is unlikely to be normally distributed:',
          list(map(lambda x: scipy.stats.normaltest(train_data_df[x])[1], numerical_columns)))
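To make the decision rule explicit, a small sketch that applies the conventional 0.05 significance level (an assumed threshold, not one fixed by the original), reusing the names from the function above:

import scipy.stats

alpha = 0.05  # assumed significance level
for col in numerical_columns:
    p_value = scipy.stats.normaltest(train_data_df[col])[1]
    if p_value < alpha:
        print(f"{col}: p = {p_value:.3g} -> reject normality")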
7. Log-transform comparison plots
# Plot each feature before and after the log transform
def log_plot(self, train_data_df):
    ## 3) Raw frequency histograms of the target and of kilometer
    plt.subplot(2, 2, 1)
    plt.hist(train_data_df['price'], orientation='vertical', histtype='bar', color='red', label='price')
    plt.subplot(2, 2, 2)
    plt.hist(train_data_df['kilometer'], orientation='vertical', histtype='bar', color='green', label='kilometer')
    # plt.subplot(2, 3, 3)
    # plt.hist(train_data_df['power'], orientation='vertical', histtype='bar', color='yellow', label='power')
    # After the log transform the price distribution is much more even, so predicting
    # log(price) is reasonable -- a common trick for regression targets.
    plt.subplot(2, 2, 3)
    plt.hist(np.log(train_data_df['price']), orientation='vertical', histtype='bar', color='red', label='price')
    plt.subplot(2, 2, 4)
    # For a discrete feature such as kilometer, the log transform barely improves normality:
    # plt.hist(np.log(train_data_df['kilometer']), orientation='vertical', histtype='bar', color='red', label='kilometer')
    # power contains zeros, so np.log yields -inf and plt.hist raises:
    # ValueError: supplied range of [-inf, 9.868481943337313] is not finite
    # plt.subplot(2, 3, 6)
    # plt.hist(np.log(train_data_df['power']), orientation='vertical', histtype='bar', color='red', label='power')
    plt.show()
8. Log-normal transformation
def change_to_nomal(self, train_data_df):
    """
    Transform the target towards a normal distribution.
    """
    # np.log requires strictly positive values; np.log1p is a safer choice if zeros can occur
    train_data_df['price'] = train_data_df['price'].apply(lambda x: np.log(x))
    # train_data_df['log_kilometer'] = train_data_df['kilometer'].apply(lambda x: np.log(x))
    # train_data_df['log_power'] = train_data_df['power'].apply(lambda x: np.log(x))
    # train_data_df['log_model'] = train_data_df['model'].apply(lambda x: np.log(x))
    return train_data_df
9. Outlier detection for numerical features (after the normal transformation)
After the transformation in step 8, the box-plot detection from step 5 is applied again to check whether the transformed features flag fewer outliers.
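A minimal re-check sketch, assuming price was already log-transformed by change_to_nomal in step 8:

# Re-run the IQR rule from step 5 on the transformed target; a distribution
# closer to normal should flag noticeably fewer rows than before.
outliers_after = eda.detect_outliers(train_df, 0, ['price'])
print("rows flagged after the log transform:", len(outliers_after))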
10. Feature selection (by data distribution)
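This section carries no code in the original; per the overview, the idea is to drop features whose sample distribution is obviously imbalanced. A hedged sketch follows, where the 99% dominance threshold is an assumption:

# Keep only columns where no single value dominates; a near-constant column
# (e.g. one category covering 99%+ of rows) carries almost no signal.
def select_by_distribution(df, columns, dominance_threshold=0.99):  # threshold is an assumed cut-off
    kept = []
    for col in columns:
        top_ratio = df[col].value_counts(normalize=True, dropna=False).iloc[0]
        if top_ratio < dominance_threshold:
            kept.append(col)
        else:
            print(f"drop {col}: top value covers {top_ratio:.1%} of rows")
    return kept

In this dataset, columns such as seller and offerType are reported to be nearly constant across the training set, making them typical candidates for removal.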
11. Feature selection (ridge regression)
def ridge_cv(self, train_data_df, feature_columns):
    """
    Note: price is assumed to be log-transformed (approximately normal) here.
    """
    # Ridge regression handles collinearity between features
    from sklearn import linear_model
    # train_data_df = train_data_df.fillna(0)
    data = train_data_df[feature_columns]
    # Initialize a ridge regression with built-in cross-validation
    clf = linear_model.RidgeCV(fit_intercept=False)
    # Fit the ridge model
    clf.fit(data, train_data_df['price'])
    print('chosen alpha:', clf.alpha_)
    # Pair each feature with the absolute value of its coefficient, then sort by magnitude.
    # (Sorting the formatted strings on their own would break the feature/coefficient pairing.)
    coef_by_feature = sorted(zip(feature_columns, map(abs, clf.coef_)), key=lambda kv: kv[1], reverse=True)
    print(len(coef_by_feature), len(feature_columns))
    print('coefficient per feature:', {k: '{:.5f}'.format(v) for k, v in coef_by_feature})
12. Feature selection (stepwise regression)
def stepwise_selection(self, X, y,
                       initial_list=[],
                       threshold_in=0.01,
                       threshold_out=0.05,
                       verbose=True):
    """
    Stepwise regression for feature selection.
    Requires: import statsmodels.api as sm
    """
    included = list(initial_list)
    while True:
        changed = False
        # forward step: try adding the excluded feature with the smallest p-value
        excluded = list(set(X.columns) - set(included))
        new_pval = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included + [new_column]]))).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()
        if best_pval < threshold_in:
            best_feature = new_pval.idxmin()  # idxmin returns the label; argmin would return a position
            included.append(best_feature)
            changed = True
            if verbose:
                print('Add  {:30} with p-value {:.6}'.format(best_feature, best_pval))
        # backward step: drop the included feature with the largest p-value
        model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
        # use all coefficients except the intercept
        pvalues = model.pvalues.iloc[1:]
        worst_pval = pvalues.max()  # NaN if pvalues is empty
        if worst_pval > threshold_out:
            changed = True
            worst_feature = pvalues.idxmax()
            included.remove(worst_feature)
            if verbose:
                print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))
        if not changed:
            break
    return included
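A usage sketch; feature_columns is assumed to hold the candidate predictor columns from the earlier steps:

import statsmodels.api as sm  # required by stepwise_selection

X = train_df[feature_columns].fillna(0)
y = train_df['price']
selected = eda.stepwise_selection(X, y)
print('selected features:', selected)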
13. Feature selection (xgboost)
def xgb_model_fit(self, train_data_df, predictors, alg, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    # Requires: import xgboost as xgb; from xgboost import plot_importance;
    #           from sklearn.metrics import mean_absolute_error
    if useTrainCV:
        # Use xgb.cv with early stopping to pick the best number of boosting rounds
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(train_data_df[predictors], label=train_data_df['price'])
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
                          metrics='mae', early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])
    # Fit the algorithm on the data
    alg.fit(train_data_df[predictors], train_data_df['price'], eval_metric='mae')
    # Predict on the training set
    train_data_df_predictions = alg.predict(train_data_df[predictors])
    # Regression evaluation metric
    print("mean_absolute_error is :")
    print(mean_absolute_error(train_data_df['price'], train_data_df_predictions))
    # Alternative: plot importances from the booster directly (the newer API is alg.get_booster()):
    # feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)
    # feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance')
    plot_importance(alg)
    plt.show()
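A usage sketch; the XGBRegressor hyperparameters below are illustrative placeholders, not tuned values:

import xgboost as xgb
from xgboost import XGBRegressor

# Illustrative model; parameters are placeholders, not tuned values
model = XGBRegressor(n_estimators=500, learning_rate=0.1, max_depth=6,
                     objective='reg:squarederror')
predictors = [c for c in feature_columns if c != 'price']  # assumes feature_columns from earlier steps
eda.xgb_model_fit(train_df, predictors, model)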