Understanding various feature engineering and analysis methods
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
import warnings
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
warnings.filterwarnings('ignore')
data_train = pd.read_csv('Dataset/train_new.csv')
data_testA = pd.read_csv('Dataset/testA_new.csv')
numerical_fea = data_train.select_dtypes(exclude=['object']).columns.to_list()
category_fea = list(filter(lambda x: x not in numerical_fea,
data_train.columns.to_list()))
label = 'isDefault'
numerical_fea.remove(label)
Missing-value imputation is a consequential step in data preprocessing. There are many ways to handle missing values; you can try several imputation strategies and compare which performs best, but such experiments should rest on a correct understanding of the business logic.
Imputation methods:
1. DataFrame's fillna() method.
1.1 Fill missing values with a specified value.
Example 1:
data_train = data_train.fillna(0)
1.2 Fill forward (replace a missing value with the nearest preceding non-missing value) or backward (with the nearest following non-missing value), along rows or columns, optionally capping how many consecutive values are filled.
Example 2:
data_train = data_train.fillna(axis=0, method='ffill', limit=2)
2. DataFrame's interpolate() method.
Besides covering fillna()'s functionality, it can also interpolate missing values, e.g. linearly.
Usage:
DataFrame.interpolate(method='linear', axis=0, limit=None, inplace=False, limit_direction=None, limit_area=None, downcast=None, **kwargs)
method: the interpolation method; common options include 'linear', 'time', 'index', 'nearest', 'polynomial' and 'spline'.
For details see the official documentation: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.interpolate.html
Example 3:
s = pd.Series([0, 1, np.nan, 3])
s
0 0.0
1 1.0
2 NaN
3 3.0
dtype: float64
s.interpolate()
0 0.0
1 1.0
2 2.0
3 3.0
dtype: float64
Example 4:
s = pd.Series([0, 2, np.nan, 8])
s.interpolate(method='polynomial', order=2)
0 0.000000
1 2.000000
2 4.666667
3 8.000000
dtype: float64
# Inspect missing values
data_train.isnull().sum()
id 0
loanAmnt 0
term 0
interestRate 0
installment 0
grade 0
subGrade 0
employmentTitle 1
employmentLength 46799
homeOwnership 0
annualIncome 0
verificationStatus 0
issueDate 0
isDefault 0
purpose 0
postCode 1
regionCode 0
dti 239
delinquency_2years 0
ficoRangeLow 0
ficoRangeHigh 0
openAcc 0
pubRec 0
pubRecBankruptcies 405
revolBal 0
revolUtil 531
totalAcc 0
initialListStatus 0
applicationType 0
earliesCreditLine 0
title 1
policyCode 0
n0 40270
n1 40270
n2 40270
n3 40270
n4 33239
n5 40270
n6 40270
n7 40270
n8 40271
n9 40270
n10 33239
n11 69752
n12 40270
n13 40270
n14 40270
dtype: int64
# Fill numeric features with the training-set median
data_train[numerical_fea] = data_train[numerical_fea].fillna(data_train[numerical_fea].median())
data_testA[numerical_fea] = data_testA[numerical_fea].fillna(data_train[numerical_fea].median())
# Fill categorical features with the training-set mode
# (why the mode? categorical data has no mean or median, and the most
#  frequent level is the least-distorting single-value fill)
data_train[category_fea] = data_train[category_fea].fillna(data_train[category_fea].mode())
data_testA[category_fea] = data_testA[category_fea].fillna(data_train[category_fea].mode())
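Note: DataFrame.mode() returns a DataFrame (a column can have several modes), so the fillna() calls above align on row index and in practice fill almost nothing; that is why employmentLength still shows 46799 missing values below. A minimal sketch of the usual fix, if a true mode fill is wanted (this notebook instead keeps the NaN and handles employmentLength separately later):
# .mode().iloc[0] takes the first mode of each column, giving a Series
# that fillna() can align on column names
mode_values = data_train[category_fea].mode().iloc[0]
data_train[category_fea] = data_train[category_fea].fillna(mode_values)
data_testA[category_fea] = data_testA[category_fea].fillna(mode_values)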
data_train.isnull().sum()
id 0
loanAmnt 0
term 0
interestRate 0
installment 0
grade 0
subGrade 0
employmentTitle 0
employmentLength 46799
homeOwnership 0
annualIncome 0
verificationStatus 0
issueDate 0
isDefault 0
purpose 0
postCode 0
regionCode 0
dti 0
delinquency_2years 0
ficoRangeLow 0
ficoRangeHigh 0
openAcc 0
pubRec 0
pubRecBankruptcies 0
revolBal 0
revolUtil 0
totalAcc 0
initialListStatus 0
applicationType 0
earliesCreditLine 0
title 0
policyCode 0
n0 0
n1 0
n2 0
n3 0
n4 0
n5 0
n6 0
n7 0
n8 0
n9 0
n10 0
n11 0
n12 0
n13 0
n14 0
dtype: int64
category_fea
['grade', 'subGrade', 'employmentLength', 'issueDate', 'earliesCreditLine']
for data in [data_train, data_testA]:
data['issueDate'] = pd.to_datetime(data['issueDate'], format='%Y-%m-%d')
startdate = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')
# Construct a time feature: days elapsed since the reference date
data['issueDateDT'] = data['issueDate'].apply(lambda x: x-startdate).dt.days
data_train['employmentLength'].value_counts(dropna=False).sort_index()
1 year 52489
10+ years 262753
2 years 72358
3 years 64152
4 years 47985
5 years 50102
6 years 37254
7 years 35407
8 years 36192
9 years 30272
< 1 year 64237
NaN 46799
Name: employmentLength, dtype: int64
data_train['earliesCreditLine'].sample(5)
296825 Mar-1995
119736 Nov-1980
452511 Mar-2000
542871 Oct-2000
248791 Oct-1998
Name: earliesCreditLine, dtype: object
for data in [data_train, data_testA]:
data['earliesCreditLine'] = data['earliesCreditLine'].apply(lambda x: int(x[-4:]))
data_train['earliesCreditLine'].sample(5)
630921 2011
634448 2009
135912 2008
73773 2003
606984 1995
Name: earliesCreditLine, dtype: int64
The month is discarded here. To keep it, use year + month/12, as in the following code:
# Run this instead of (not after) the previous cell: it needs the raw 'Mar-1995'-style strings
for data in [data_train, data_testA]:
    dt = pd.to_datetime(data['earliesCreditLine'], format='%b-%Y')
    data['earliesCreditLine'] = dt.dt.year + dt.dt.month / 12
data_train['earliesCreditLine'].sample(5)
281248 2005.083333
297228 2005.500000
512734 2005.666667
48758 2003.416667
783447 2005.000000
Name: earliesCreditLine, dtype: float64
def employmentLength_to_int(data):
if pd.isnull(data):
return data
else:
return np.int8(data.split()[0])
for data in [data_train, data_testA]:
data['employmentLength'].replace(to_replace='10+ years', value='10 years', inplace=True)
data['employmentLength'].replace('< 1 year', '0 years', inplace=True)
data['employmentLength'] = data['employmentLength'].apply(employmentLength_to_int)
data_train['employmentLength'].value_counts(dropna=False).sort_index()
0.0 64237
1.0 52489
2.0 72358
3.0 64152
4.0 47985
5.0 50102
6.0 37254
7.0 35407
8.0 36192
9.0 30272
10.0 262753
NaN 46799
Name: employmentLength, dtype: int64
# A subset of the categorical features
cate_features = ['grade', 'subGrade', 'employmentTitle', 'homeOwnership', 'verificationStatus',
'purpose', 'postCode', 'regionCode', 'applicationType', 'initialListStatus',
'title', 'policyCode']
# note: `data` here still points at data_testA, left over from the loop above
for fea in cate_features:
    print(fea, 'unique categories:', data[fea].nunique())
grade unique categories: 7
subGrade unique categories: 35
employmentTitle unique categories: 79282
homeOwnership unique categories: 6
verificationStatus unique categories: 3
purpose unique categories: 14
postCode unique categories: 889
regionCode unique categories: 51
applicationType unique categories: 2
initialListStatus unique categories: 2
title unique categories: 12058
policyCode unique categories: 1
Ordinal categorical features like these can be handled with label encoding or a hand-written mapping.
# sorted() keeps the natural ordinal order A < B < C < ...
grade_code = {v: i for i, v in enumerate(sorted(data_train['grade'].unique()))}
for data in [data_train, data_testA]:
data['grade'] = data['grade'].map(grade_code)
# Purely nominal features with two or more levels that are not high-dimensional and sparse
# Pitfall: `data = pd.get_dummies(data, ...)` only rebinds the loop variable, so
# data_train/data_testA are actually left unchanged here (which is why subGrade is
# still available for the encodings below); to really apply one-hot encoding, reassign
# the originals, e.g. data_train = pd.get_dummies(data_train, columns=..., drop_first=True)
for data in [data_train, data_testA]:
    data = pd.get_dummies(data, columns=['subGrade', 'homeOwnership', 'verificationStatus',
                                         'purpose', 'regionCode'], drop_first=True)
This relates to the properties of the normal distribution.
It follows from the idea of "small-probability events" and hypothesis testing: a small-probability event usually means one with probability below 5%, treated as practically impossible in a single trial. For a normally distributed variable X, the probability of falling outside (μ-3σ, μ+3σ) is under 0.3%, so in practice (μ-3σ, μ+3σ) can be taken as the range of values X can realistically take. This is the "3σ rule".
There are many other statistical tests for outliers, e.g. Grubbs' test, Dixon's test, the skewness-kurtosis test, the Pauta (3σ) criterion, and Nair's test. Each has its own applicable range, strengths, and weaknesses.
Every such test can commit Type I and Type II errors, but it has been reported that among these methods Grubbs' test has the smallest probability of both.
def find_outliers_by_3sigma(data, fea):
    data_std = np.std(data[fea])
    data_mean = np.mean(data[fea])
    outliers_cut_off = data_std * 3
    lower_rule = data_mean - outliers_cut_off
    upper_rule = data_mean + outliers_cut_off
    # Label each sample: '异常值' = outlier, '正常值' = normal
    data[fea+'_outliers'] = data[fea].apply(lambda x: '异常值'
                                            if x > upper_rule or x < lower_rule
                                            else '正常值')
    return data
Next, analyze how feature outliers relate to the target variable.
data_train = data_train.copy()
for fea in numerical_fea:
    data_train = find_outliers_by_3sigma(data_train, fea)
    print(data_train[fea+'_outliers'].value_counts())
    print(data_train.groupby(fea+'_outliers')['isDefault'].sum())
    print('*'*30)
正常值 800000
Name: id_outliers, dtype: int64
id_outliers
正常值 159610
Name: isDefault, dtype: int64
******************************
正常值 800000
Name: loanAmnt_outliers, dtype: int64
loanAmnt_outliers
正常值 159610
Name: isDefault, dtype: int64
******************************
正常值 800000
Name: term_outliers, dtype: int64
term_outliers
正常值 159610
Name: isDefault, dtype: int64
******************************
正常值 794259
异常值 5741
Name: interestRate_outliers, dtype: int64
interestRate_outliers
异常值 2916
正常值 156694
Name: isDefault, dtype: int64
******************************
正常值 792046
异常值 7954
Name: installment_outliers, dtype: int64
installment_outliers
异常值 2152
正常值 157458
Name: isDefault, dtype: int64
******************************
正常值 800000
Name: employmentTitle_outliers, dtype: int64
employmentTitle_outliers
正常值 159610
Name: isDefault, dtype: int64
******************************
正常值 799701
异常值 299
Name: homeOwnership_outliers, dtype: int64
homeOwnership_outliers
异常值 62
正常值 159548
Name: isDefault, dtype: int64
******************************
正常值 793973
异常值 6027
Name: annualIncome_outliers, dtype: int64
annualIncome_outliers
异常值 756
正常值 158854
Name: isDefault, dtype: int64
******************************
正常值 800000
Name: verificationStatus_outliers, dtype: int64
verificationStatus_outliers
正常值 159610
Name: isDefault, dtype: int64
******************************
正常值 783003
异常值 16997
Name: purpose_outliers, dtype: int64
purpose_outliers
异常值 3635
正常值 155975
Name: isDefault, dtype: int64
******************************
正常值 798931
异常值 1069
Name: postCode_outliers, dtype: int64
postCode_outliers
异常值 221
正常值 159389
Name: isDefault, dtype: int64
******************************
正常值 799994
异常值 6
Name: regionCode_outliers, dtype: int64
regionCode_outliers
异常值 1
正常值 159609
Name: isDefault, dtype: int64
******************************
正常值 798440
异常值 1560
Name: dti_outliers, dtype: int64
dti_outliers
异常值 466
正常值 159144
Name: isDefault, dtype: int64
******************************
正常值 778245
异常值 21755
Name: delinquency_2years_outliers, dtype: int64
delinquency_2years_outliers
异常值 5089
正常值 154521
Name: isDefault, dtype: int64
******************************
正常值 788261
异常值 11739
Name: ficoRangeLow_outliers, dtype: int64
ficoRangeLow_outliers
异常值 778
正常值 158832
Name: isDefault, dtype: int64
******************************
正常值 788261
异常值 11739
Name: ficoRangeHigh_outliers, dtype: int64
ficoRangeHigh_outliers
异常值 778
正常值 158832
Name: isDefault, dtype: int64
******************************
正常值 790889
异常值 9111
Name: openAcc_outliers, dtype: int64
openAcc_outliers
异常值 2195
正常值 157415
Name: isDefault, dtype: int64
******************************
正常值 792471
异常值 7529
Name: pubRec_outliers, dtype: int64
pubRec_outliers
异常值 1701
正常值 157909
Name: isDefault, dtype: int64
******************************
正常值 794120
异常值 5880
Name: pubRecBankruptcies_outliers, dtype: int64
pubRecBankruptcies_outliers
异常值 1423
正常值 158187
Name: isDefault, dtype: int64
******************************
正常值 790001
异常值 9999
Name: revolBal_outliers, dtype: int64
revolBal_outliers
异常值 1359
正常值 158251
Name: isDefault, dtype: int64
******************************
正常值 799948
异常值 52
Name: revolUtil_outliers, dtype: int64
revolUtil_outliers
异常值 23
正常值 159587
Name: isDefault, dtype: int64
******************************
正常值 791663
异常值 8337
Name: totalAcc_outliers, dtype: int64
totalAcc_outliers
异常值 1668
正常值 157942
Name: isDefault, dtype: int64
******************************
正常值 800000
Name: initialListStatus_outliers, dtype: int64
initialListStatus_outliers
正常值 159610
Name: isDefault, dtype: int64
******************************
正常值 784586
异常值 15414
Name: applicationType_outliers, dtype: int64
applicationType_outliers
异常值 3875
正常值 155735
Name: isDefault, dtype: int64
******************************
正常值 775134
异常值 24866
Name: title_outliers, dtype: int64
title_outliers
异常值 3900
正常值 155710
Name: isDefault, dtype: int64
******************************
正常值 800000
Name: policyCode_outliers, dtype: int64
policyCode_outliers
正常值 159610
Name: isDefault, dtype: int64
******************************
正常值 782773
异常值 17227
Name: n0_outliers, dtype: int64
n0_outliers
异常值 3485
正常值 156125
Name: isDefault, dtype: int64
******************************
正常值 790500
异常值 9500
Name: n1_outliers, dtype: int64
n1_outliers
异常值 2491
正常值 157119
Name: isDefault, dtype: int64
******************************
正常值 789067
异常值 10933
Name: n2_outliers, dtype: int64
n2_outliers
异常值 3205
正常值 156405
Name: isDefault, dtype: int64
******************************
正常值 789067
异常值 10933
Name: n3_outliers, dtype: int64
n3_outliers
异常值 3205
正常值 156405
Name: isDefault, dtype: int64
******************************
正常值 788660
异常值 11340
Name: n4_outliers, dtype: int64
n4_outliers
异常值 2476
正常值 157134
Name: isDefault, dtype: int64
******************************
正常值 790355
异常值 9645
Name: n5_outliers, dtype: int64
n5_outliers
异常值 1858
正常值 157752
Name: isDefault, dtype: int64
******************************
正常值 786006
异常值 13994
Name: n6_outliers, dtype: int64
n6_outliers
异常值 3182
正常值 156428
Name: isDefault, dtype: int64
******************************
正常值 788430
异常值 11570
Name: n7_outliers, dtype: int64
n7_outliers
异常值 2746
正常值 156864
Name: isDefault, dtype: int64
******************************
正常值 789625
异常值 10375
Name: n8_outliers, dtype: int64
n8_outliers
异常值 2131
正常值 157479
Name: isDefault, dtype: int64
******************************
正常值 786384
异常值 13616
Name: n9_outliers, dtype: int64
n9_outliers
异常值 3953
正常值 155657
Name: isDefault, dtype: int64
******************************
正常值 788979
异常值 11021
Name: n10_outliers, dtype: int64
n10_outliers
异常值 2639
正常值 156971
Name: isDefault, dtype: int64
******************************
正常值 799434
异常值 566
Name: n11_outliers, dtype: int64
n11_outliers
异常值 112
正常值 159498
Name: isDefault, dtype: int64
******************************
正常值 797585
异常值 2415
Name: n12_outliers, dtype: int64
n12_outliers
异常值 545
正常值 159065
Name: isDefault, dtype: int64
******************************
正常值 788907
异常值 11093
Name: n13_outliers, dtype: int64
n13_outliers
异常值 2482
正常值 157128
Name: isDefault, dtype: int64
******************************
正常值 788884
异常值 11116
Name: n14_outliers, dtype: int64
n14_outliers
异常值 3364
正常值 156246
Name: isDefault, dtype: int64
******************************
Directions worth analyzing:
Does the distribution of outliers over the target variable match the overall distribution?
If the outliers fall only in particular classes of the target variable, what would that imply?
# Drop the outliers
for fea in numerical_fea:
data_train = data_train[data_train[fea+'_outliers']=='正常值']
data_train = data_train.reset_index(drop=True)
It relates to the quartiles.
In one sentence: the quartiles split the data at three cut points into four intervals; IQR = Q3 - Q1, lower whisker = Q1 - 1.5 x IQR, upper whisker = Q3 + 1.5 x IQR. Points beyond the whiskers are flagged as outliers; a sketch of the corresponding detector follows.
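A minimal sketch of IQR-based detection, mirroring the 3-sigma function above (it labels samples with the same '异常值'/'正常值' strings):
def find_outliers_by_iqr(data, fea):
    # the quartiles and the interquartile range
    q1 = data[fea].quantile(0.25)
    q3 = data[fea].quantile(0.75)
    iqr = q3 - q1
    lower_rule = q1 - 1.5 * iqr   # lower whisker
    upper_rule = q3 + 1.5 * iqr   # upper whisker
    data[fea + '_outliers'] = data[fea].apply(
        lambda x: '异常值' if x < lower_rule or x > upper_rule else '正常值')
    return data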
From a modeling standpoint, binning mainly reduces variable complexity and the influence of noise, and raises the correlation between the independent and dependent variables, making the model more stable.
It discretizes continuous variables.
It merges many-valued discrete variables into fewer states.
When feature value ranges differ greatly, algorithms that measure similarity by Euclidean distance, such as k-means clustering, let large-scale features dominate small-scale ones, whether used in a supervised or unsupervised setting.
Bucketing the data (binning) quantizes feature values into intervals and alleviates this problem.
Handling missing values: when the data source contains missing values, null can be given its own bin.
Handling outliers: discretizing via binning makes a variable more robust to extreme points; for example, an anomalous age of 200 can be placed in the "age > 60" bin to contain its influence.
Business interpretability: when a variable relates non-linearly to the target, it can be passed through a WOE transform.
Rules of thumb for a good binning (a sanity-check sketch follows the binning code below):
(1) the smallest bin holds no less than 5% of the samples
(2) no bin contains only good customers
(3) consecutive bins are monotone
# Fixed-width binning by integer division: each loanAmnt bin spans 1000
data['loanAmnt_bin1'] = np.floor_divide(data['loanAmnt'], 1000)
# Exponential-width binning via a log transform
data['loanAmnt_bin2'] = np.floor(np.log10(data['loanAmnt']))
# Quantile binning: ten bins with roughly equal sample counts
data['loanAmnt_bin3'] = pd.qcut(data['loanAmnt'], 10, labels=False)
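A quick check of the three rules above against the quantile bins (a sketch; it assumes `data` is the training set, so the isDefault label is available):
# sample share per bin (rule 1) and default rate per bin (rules 2-3)
bin_stats = data.groupby('loanAmnt_bin3')['isDefault'].agg(['count', 'mean'])
bin_stats['share'] = bin_stats['count'] / len(data)
print(bin_stats)
print('min share >= 5%:', (bin_stats['share'] >= 0.05).all())
print('monotone default rate:', bin_stats['mean'].is_monotonic_increasing
      or bin_stats['mean'].is_monotonic_decreasing)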
There are further methods such as tree-model-based binning; I do not know them well enough to discuss here.
Once binning is done, the result should be evaluated.
Scorecard models most often use WOE and IV; note that both are defined only for binary targets. For bin i, WOE_i = ln((good_i / good_total) / (bad_i / bad_total)), and IV = Σ_i (good_i / good_total - bad_i / bad_total) × WOE_i, where "good" is class 0 (no default) and "bad" is class 1.
Reference: https://zhuanlan.zhihu.com/p/80134853
What follows is my own understanding and application of the binning ideas above; corrections are welcome.
def WOE_evaluation(data, cats, col='isDefault'):
    """
    data: the dataset being binned
    cats: the bin assigned to each sample
    col: the target variable
    return: the WOE value of each bin, and the IV value of the whole binning
    """
    # rows = bins, columns = target classes (0 = good, 1 = bad)
    encode_set = data.groupby([cats, data[col]]).size().unstack().fillna(0)
    WOE_values = np.log((encode_set[0] / encode_set.sum()[0]) / (encode_set[1] / encode_set.sum()[1]))
    IV_value = np.sum((encode_set[0] / encode_set.sum()[0] - encode_set[1] / encode_set.sum()[1]) * WOE_values)
    print('IV value: ', IV_value)
    plt.figure(figsize=(10, 4))
    x = range(encode_set.shape[0])
    sns.lineplot(x=list(x), y=WOE_values.values)
    # label the ticks with the bin intervals, in the same order as the WOE values
    plt.xticks(x, encode_set.index.astype(str), rotation=45)
    plt.show()
    return WOE_values, IV_value
# The 'dti' feature contains a -1, an anomalous value that must be removed
data_train = data_train[data_train['dti'] >= 0].copy()
data_train.reset_index(drop=True, inplace=True)
bins = [i for i in np.arange(0, 41, 5)]  # fixed-width bins of width 5
bins.append(int(data_train['dti'].max()) + 1)  # the skewed tail goes into a single bin
cats = pd.cut(data_train['dti'], bins, right=False)
WOE, IV = WOE_evaluation(data_train, cats)
WOE
IV value: 0.0712144251530253
dti
[0, 5) 0.359301
[5, 10) 0.346582
[10, 15) 0.226937
[15, 20) 0.056710
[20, 25) -0.102537
[25, 30) -0.282429
[30, 35) -0.466837
[35, 40) -0.587163
[40, 1000) -0.607468
dtype: float64
Although the bins show a monotone decreasing WOE trend, the IV is only 0.07, so this feature's contribution to prediction is weak.
Interaction features are very simple to construct but costly to use (in time and space).
If a linear model includes pairwise interaction features, its training and scoring time grows from O(n) to O(n²), where n is the number of base features.
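For reference, a minimal sketch of explicit pairwise interactions with scikit-learn (the three column names are an illustrative choice; interaction_only keeps just the cross terms):
from sklearn.preprocessing import PolynomialFeatures

# with n input features this produces O(n^2) columns, hence the cost noted above
pf = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
inter = pf.fit_transform(data_train[['loanAmnt', 'interestRate', 'dti']])
print(pf.get_feature_names_out())  # requires scikit-learn >= 1.0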
for col in ['grade', 'subGrade']:
temp_dict = data_train.groupby([col])['isDefault'].agg(['mean']).reset_index().rename(columns={'mean': col+'_target_mean'})
temp_dict.index = temp_dict[col].values
temp_dict = temp_dict[col+'_target_mean'].to_dict()
data_train[col+'_target_mean'] = data_train[col].map(temp_dict)
data_testA[col+'_target_mean'] = data_testA[col].map(temp_dict)
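One caveat: the target means above are computed on the full training set, so each row's own label leaks into its encoded value. A common remedy, not applied in this notebook, is out-of-fold encoding; a minimal sketch:
# each row is encoded using target means computed on the other folds only
kf_enc = KFold(n_splits=5, shuffle=True, random_state=2020)
for col in ['grade', 'subGrade']:
    enc = pd.Series(np.nan, index=data_train.index)
    for trn_idx, val_idx in kf_enc.split(data_train):
        fold_means = data_train.iloc[trn_idx].groupby(col)['isDefault'].mean()
        enc.iloc[val_idx] = data_train[col].iloc[val_idx].map(fold_means).values
    data_train[col + '_target_mean_oof'] = enc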
# Other derived variables: each sample's grade relative to its group's mean and std
for df in [data_train, data_testA]:
for item in ['n'+str(i) for i in range(15)]:
df['grade_to_mean_' + item] = df['grade'] / df.groupby([item])['grade'].transform('mean')
df['grade_to_std_' + item] = df['grade'] / df.groupby([item])['grade'].transform('std')
Those are the basic ideas of feature interaction, but there are many more ways, and algorithms, for deriving new features from pairs of features.
for col in tqdm(['employmentTitle', 'postCode', 'title', 'subGrade']):
le = LabelEncoder()
le.fit(data_train[col].astype(str).to_list() + data_testA[col].astype(str).to_list())
data_train[col] = le.transform(data_train[col].astype(str).to_list())
data_testA[col] = le.transform(data_testA[col].astype(str).to_list())
print('Label Encoding done.')
100%|███████████████████████████████████████████████████████████████████████████| 4/4 [00:06<00:00, 1.73s/it]
Label Encoding done.
Feature normalization is needed, and highly correlated features should be removed.
# Pseudocode
for fea in [features_to_normalize]:
    data[fea] = (data[fea] - np.min(data[fea])) / (np.max(data[fea]) - np.min(data[fea]))
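Equivalently with the MinMaxScaler imported at the top; fitting on the training split only means the test set is scaled with the same min/max (the column list here is illustrative):
cols_to_scale = ['loanAmnt', 'annualIncome', 'revolBal']  # illustrative choice
scaler = MinMaxScaler()
data_train[cols_to_scale] = scaler.fit_transform(data_train[cols_to_scale])
data_testA[cols_to_scale] = scaler.transform(data_testA[cols_to_scale])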
Feature selection prunes useless features to reduce the final model's complexity, yielding a parsimonious model that computes faster with little or no loss of predictive accuracy.
Feature selection is not about reducing training time (some techniques actually increase total training time) but about reducing model scoring time.
Feature selection methods:
1. Filter
Variance threshold
Correlation coefficient (Pearson)
Chi-squared test
Mutual information
2. Wrapper (RFE)
Recursive feature elimination
3. Embedded
Penalty-based feature selection
Tree-model-based feature selection
The following filters select based on relationships between the features and the target:
from sklearn.feature_selection import VarianceThreshold
# threshold is the variance cutoff; the method is unsupervised, so no target is needed
VarianceThreshold(threshold=3).fit_transform(train)
from sklearn.feature_selection import SelectKBest
from scipy.stats import pearsonr
# Select the K best features and return the reduced data.
# The first argument is a scoring function taking the feature matrix and target vector
# and returning per-feature (score, p-value) arrays; here it computes the
# Pearson correlation coefficient. k is the number of features to keep.
SelectKBest(lambda X, Y: tuple(np.array(list(map(lambda x: pearsonr(x, Y), X.T))).T),
            k=5).fit_transform(train, target_train)
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# Chi-squared test (requires non-negative features); k is the number of features to keep
SelectKBest(chi2, k=5).fit_transform(train, target_train)
# !pip install minepy
from sklearn.feature_selection import SelectKBest
from minepy import MINE
# MINE's API is not functional in style, so the mic() wrapper below adapts it;
# it returns a 2-tuple whose second item (the p-value) is fixed at 0.5
def mic(x, y):
    m = MINE()
    m.compute_score(x, y)
    return (m.mic(), 0.5)
# k is the number of features to keep
SelectKBest(lambda X, Y: tuple(np.array(list(map(lambda x: mic(x, Y), X.T))).T),
            k=2).fit_transform(train, target_train)
Recursive feature elimination trains a base model over multiple rounds; after each round it drops the features with the smallest weights and retrains on the remaining features.
The RFE class in sklearn.feature_selection implements this. An example with logistic regression:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
# Recursive feature elimination; returns the data with the selected features.
# estimator is the base model; n_features_to_select is the number of features to keep.
RFE(estimator=LogisticRegression(), n_features_to_select=2).fit_transform(train, target_train)
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
# Feature selection using L1-penalized logistic regression as the base model
# (liblinear is one of the solvers that supports the L1 penalty)
SelectFromModel(LogisticRegression(penalty='l1', C=0.1, solver='liblinear')).fit_transform(train, target_train)
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingClassifier
# Feature selection using GBDT as the base model
SelectFromModel(GradientBoostingClassifier()).fit_transform(train, target_train)
Feature selection on this dataset using the correlation coefficient
# Drop columns that are no longer needed
for data in [data_train, data_testA]:
data.drop(['issueDate', 'id'], axis=1, inplace=True)
# Forward-fill each column: replace a missing value with the value above it
data_train = data_train.fillna(axis=0, method='ffill')
x_train = data_train.drop(['isDefault'], axis=1)
# Compute each feature's correlation with the target
data_corr = x_train.corrwith(data_train.isDefault)
data_corr = data_corr.reset_index()
data_corr.columns = ['features', 'corr']
# Visualize feature correlations
data_numeric = data_train[numerical_fea[1:]]
correlation = data_numeric.corr()
plt.figure(figsize=(7, 7))
plt.title('Correlation of Numeric Features', y=1, size=16)
sns.heatmap(correlation, square=True, vmax=0.8)
plt.show()
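The opener of this part also calls for removing highly correlated features, while the heatmap only visualizes the matrix. A minimal sketch of acting on it (the 0.95 threshold is an arbitrary choice; ficoRangeLow/ficoRangeHigh, for instance, are near-duplicates):
# keep the upper triangle of |corr| and flag any column correlated > 0.95 with an earlier one
corr_abs = correlation.abs()
upper = corr_abs.where(np.triu(np.ones(corr_abs.shape), k=1).astype(bool))
to_drop = [col for col in upper.columns if (upper[col] > 0.95).any()]
print('highly correlated, candidate drops:', to_drop)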
features = [col for col in data_train.columns
            if col not in ['id', 'issueDate', 'isDefault'] and '_outliers' not in col]
x_train = data_train[features]
x_test = data_testA[features]
y_train = data_train['isDefault']
Save the data for later use in model tuning and model ensembling.
x_train.to_csv('Dataset/data_for_model.csv', index=False)
y_train.to_csv('Dataset/label_for_model.csv', index=False)
x_test.to_csv('Dataset/testA_With_FeatureEngineering.csv', index=False)
def cv_model(clf, train_x, train_y, test_x, clf_name):
folds = 5
seed = 2020
kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
train = np.zeros(train_x.shape[0])
test = np.zeros(test_x.shape[0])
cv_scores = []
for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
print('*'*30,str(i+1),'*'*30)
trn_x, trn_y, val_x, val_y = (train_x.iloc[train_index], train_y[train_index],
train_x.iloc[valid_index], train_y[valid_index])
if clf_name == 'lgb':
train_matrix = clf.Dataset(trn_x, label=trn_y)
valid_matrix = clf.Dataset(val_x, label=val_y)
params = {
'boosting_type': 'gbdt',
'objective': 'binary',
'metric': 'auc',
'min_child_weight': 5,
'num_leaves': 2**5,
'lambda_l2': 10,
'feature_fraction': 0.8,
'bagging_fraction': 0.8,
'bagging_freq': 4,
'learning_rate': 0.1,
'seed': seed,
'nthread': 28,
'n_jobs': 24,
'silent': True,
'verbose': -1
}
model = clf.train(params, train_matrix, 500000, valid_sets=[train_matrix, valid_matrix],
verbose_eval=200, early_stopping_rounds=200)
val_pred = model.predict(val_x, num_iteration=model.best_iteration)
test_pred = model.predict(test_x, num_iteration=model.best_iteration)
if clf_name == 'xgb':
train_matrix = clf.DMatrix(trn_x, label=trn_y)
valid_matrix = clf.DMatrix(val_x, label=val_y)
test_matrix = clf.DMatrix(test_x)
params = {
'booster': 'gbtree',
'objective': 'binary:logistic',
'eval_metric': 'auc',
'gamma': 1,
'min_child_weight': 1.5,
'max_depth': 5,
'lambda': 10,
'subsample': 0.7,
'colsample_bytree': 0.7,
'colsample_bylevel': 0.7,
'eta': 0.04,
'tree_method': 'exact',
'seed': seed,
'nthread': 36,
'silent': True,
# 'tree_method': 'gpu_hist'
}
watchlist = [(train_matrix, 'train'), (valid_matrix, 'eval')]
model = clf.train(params, train_matrix, num_boost_round=50000, evals=watchlist,
verbose_eval=200, early_stopping_rounds=200)
val_pred = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
test_pred = model.predict(test_matrix, ntree_limit=model.best_ntree_limit)
if clf_name == 'cat':
params = {
'learning_rate': 0.05,
'depth': 5,
'l2_leaf_reg': 10,
'bootstrap_type': 'Bernoulli',
'od_type': 'Iter',
'od_wait': 50,
'random_seed': 11,
'allow_writing_files': False
}
model = clf(iterations=20000, **params)
model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
cat_features=[], use_best_model=True, verbose=500)
val_pred = model.predict(val_x)
test_pred = model.predict(test_x)
train[valid_index] = val_pred
test += test_pred / kf.n_splits
cv_scores.append(roc_auc_score(val_y, val_pred))
print(cv_scores)
print('%s_score_list: ' % clf_name, cv_scores)
print('%s_score_mean: ' % clf_name, np.mean(cv_scores))
print('%s_score_std: ' % clf_name, np.std(cv_scores))
return train, test
def lgb_model(x_train, y_train, x_test):
lgb_train, lgb_test = cv_model(lgb, x_train, y_train, x_test, 'lgb')
return lgb_train, lgb_test
def xgb_model(x_train, y_train, x_test):
xgb_train, xgb_test = cv_model(xgb, x_train, y_train, x_test, 'xgb')
return xgb_train, xgb_test
def cat_model(x_train, y_train, x_test):
cat_train, cat_test = cv_model(CatBoostRegressor, x_train, y_train, x_test, 'cat')
return cat_train, cat_test
lgb_train, lgb_test = lgb_model(x_train, y_train, x_test)
****************************** 1 ******************************
Training until validation scores don't improve for 200 rounds
[200] training's auc: 0.75152 valid_1's auc: 0.728813
[400] training's auc: 0.770784 valid_1's auc: 0.729129
Early stopping, best iteration is:
[323] training's auc: 0.763994 valid_1's auc: 0.729426
[0.7294259254463246]
****************************** 2 ******************************
Training until validation scores don't improve for 200 rounds
[200] training's auc: 0.751572 valid_1's auc: 0.730902
[400] training's auc: 0.770774 valid_1's auc: 0.731029
Early stopping, best iteration is:
[339] training's auc: 0.765356 valid_1's auc: 0.731286
[0.7294259254463246, 0.7312857834764355]
****************************** 3 ******************************
Training until validation scores don't improve for 200 rounds
[200] training's auc: 0.750642 valid_1's auc: 0.731674
[400] training's auc: 0.769801 valid_1's auc: 0.732319
[600] training's auc: 0.786078 valid_1's auc: 0.732263
Early stopping, best iteration is:
[582] training's auc: 0.784663 valid_1's auc: 0.732347
[0.7294259254463246, 0.7312857834764355, 0.7323471951456343]
****************************** 4 ******************************
Training until validation scores don't improve for 200 rounds
[200] training's auc: 0.751853 valid_1's auc: 0.727029
[400] training's auc: 0.770701 valid_1's auc: 0.727655
Early stopping, best iteration is:
[307] training's auc: 0.76223 valid_1's auc: 0.72777
[0.7294259254463246, 0.7312857834764355, 0.7323471951456343, 0.7277698679698775]
****************************** 5 ******************************
Training until validation scores don't improve for 200 rounds
[200] training's auc: 0.750566 valid_1's auc: 0.732136
[400] training's auc: 0.7692 valid_1's auc: 0.732618
Early stopping, best iteration is:
[373] training's auc: 0.766763 valid_1's auc: 0.732762
[0.7294259254463246, 0.7312857834764355, 0.7323471951456343, 0.7277698679698775, 0.732762205282248]
lgb_score_list: [0.7294259254463246, 0.7312857834764355, 0.7323471951456343, 0.7277698679698775, 0.732762205282248]
lgb_score_mean: 0.730718195464104
lgb_score_std: 0.0018717034520938764
cat_train, cat_test = cat_model(x_train, y_train, x_test)
****************************** 1 ******************************
0: learn: 0.3943984 test: 0.3964334 best: 0.3964334 (0) total: 230ms remaining: 1h 16m 42s
500: learn: 0.3728079 test: 0.3756736 best: 0.3756736 (500) total: 1m 12s remaining: 46m 44s
1000: learn: 0.3712148 test: 0.3750919 best: 0.3750911 (998) total: 2m 28s remaining: 46m 50s
1500: learn: 0.3699919 test: 0.3748420 best: 0.3748420 (1500) total: 3m 43s remaining: 45m 58s
2000: learn: 0.3688915 test: 0.3746858 best: 0.3746842 (1990) total: 4m 49s remaining: 43m 28s
2500: learn: 0.3678739 test: 0.3745857 best: 0.3745830 (2480) total: 5m 52s remaining: 41m 7s
Stopped by overfitting detector (50 iterations wait)
bestTest = 0.3745742218
bestIteration = 2598
Shrink model to first 2599 iterations.
[0.7309683524594143]
****************************** 2 ******************************
0: learn: 0.3947297 test: 0.3950963 best: 0.3950963 (0) total: 169ms remaining: 56m 28s
500: learn: 0.3731510 test: 0.3743435 best: 0.3743435 (500) total: 1m 12s remaining: 47m 18s
1000: learn: 0.3715005 test: 0.3738019 best: 0.3738019 (1000) total: 2m 20s remaining: 44m 33s
1500: learn: 0.3702694 test: 0.3735912 best: 0.3735912 (1500) total: 3m 36s remaining: 44m 23s
2000: learn: 0.3691661 test: 0.3734402 best: 0.3734392 (1997) total: 4m 44s remaining: 42m 35s
2500: learn: 0.3681716 test: 0.3733360 best: 0.3733354 (2486) total: 5m 59s remaining: 41m 55s
3000: learn: 0.3672144 test: 0.3732713 best: 0.3732689 (2996) total: 7m 17s remaining: 41m 19s
Stopped by overfitting detector (50 iterations wait)
bestTest = 0.3732575734
bestIteration = 3088
Shrink model to first 3089 iterations.
[0.7309683524594143, 0.7326572146779085]
****************************** 3 ******************************
0: learn: 0.3950712 test: 0.3937140 best: 0.3937140 (0) total: 168ms remaining: 55m 54s
500: learn: 0.3734495 test: 0.3730873 best: 0.3730873 (500) total: 1m 25s remaining: 55m 22s
1000: learn: 0.3718337 test: 0.3724441 best: 0.3724438 (999) total: 2m 45s remaining: 52m 28s
1500: learn: 0.3706096 test: 0.3721854 best: 0.3721854 (1497) total: 4m 6s remaining: 50m 43s
2000: learn: 0.3695474 test: 0.3720525 best: 0.3720515 (1997) total: 5m 21s remaining: 48m 13s
2500: learn: 0.3685663 test: 0.3719417 best: 0.3719415 (2494) total: 6m 36s remaining: 46m 11s
Stopped by overfitting detector (50 iterations wait)
bestTest = 0.3719013034
bestIteration = 2838
Shrink model to first 2839 iterations.
[0.7309683524594143, 0.7326572146779085, 0.7340265229814408]
****************************** 4 ******************************
0: learn: 0.3949297 test: 0.3943073 best: 0.3943073 (0) total: 201ms remaining: 1h 6m 58s
500: learn: 0.3731882 test: 0.3740583 best: 0.3740583 (500) total: 1m 24s remaining: 54m 59s
1000: learn: 0.3715885 test: 0.3735140 best: 0.3735129 (998) total: 2m 37s remaining: 49m 41s
1500: learn: 0.3703487 test: 0.3732792 best: 0.3732792 (1500) total: 3m 48s remaining: 46m 56s
2000: learn: 0.3692554 test: 0.3731344 best: 0.3731331 (1984) total: 5m 9s remaining: 46m 19s
Stopped by overfitting detector (50 iterations wait)
bestTest = 0.3731228401
bestIteration = 2114
Shrink model to first 2115 iterations.
[0.7309683524594143, 0.7326572146779085, 0.7340265229814408, 0.7289299541394646]
****************************** 5 ******************************
0: learn: 0.3948878 test: 0.3944737 best: 0.3944737 (0) total: 338ms remaining: 1h 52m 44s
500: learn: 0.3733459 test: 0.3734735 best: 0.3734735 (500) total: 1m 23s remaining: 53m 56s
1000: learn: 0.3716805 test: 0.3729340 best: 0.3729336 (999) total: 2m 35s remaining: 49m 17s
1500: learn: 0.3704593 test: 0.3727339 best: 0.3727339 (1499) total: 3m 53s remaining: 47m 51s
2000: learn: 0.3693569 test: 0.3726087 best: 0.3726087 (2000) total: 5m 11s remaining: 46m 42s
Stopped by overfitting detector (50 iterations wait)
bestTest = 0.3725807976
bestIteration = 2216
Shrink model to first 2217 iterations.
[0.7309683524594143, 0.7326572146779085, 0.7340265229814408, 0.7289299541394646, 0.7336854992714498]
cat_score_list: [0.7309683524594143, 0.7326572146779085, 0.7340265229814408, 0.7289299541394646, 0.7336854992714498]
cat_score_mean: 0.7320535087059357
cat_score_std: 0.001889698211194384
from sklearn.naive_bayes import BernoulliNB
folds = 5
seed = 2020
kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
train = np.zeros(x_train.shape[0])
test = np.zeros(x_test.shape[0])
cv_scores = []
for i, (train_index, valid_index) in enumerate(kf.split(x_train, y_train)):
print('*'*30,str(i+1),'*'*30)
trn_x, trn_y, val_x, val_y = (x_train.iloc[train_index], y_train[train_index],
x_train.iloc[valid_index], y_train[valid_index])
ber_nb = BernoulliNB()
ber_nb.fit(trn_x, trn_y)
val_pred = ber_nb.predict(val_x)
test_pred = ber_nb.predict(x_test)
train[valid_index] = val_pred
test += test_pred / kf.n_splits  # accumulate fold predictions, not overwrite
score = roc_auc_score(val_y, val_pred)
print(score)
cv_scores.append(score)
Use the average of the two models' predictions as the final prediction.
rh_test = lgb_test*0.5 + cat_test*0.5
# The id column was dropped earlier, so re-read it here
data_testA = pd.read_csv('Dataset/testA_new.csv', usecols=['id'])
data_testA['isDefault'] = rh_test
data_testA[['id', 'isDefault']].to_csv('AfterFeatureEngineering.csv', index=False)
The submission scores 0.7325.