import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
import warnings
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
# Load the training set and the "A" test set.
data_train = pd.read_csv('Dataset/train_new.csv')
data_testA = pd.read_csv('Dataset/testA_new.csv')

# Name of the target column.
label = 'isDefault'

# Split columns by dtype: every non-object column is treated as numerical,
# the remaining columns as categorical.
numerical_fea = data_train.select_dtypes(exclude=['object']).columns.to_list()
category_fea = [col for col in data_train.columns if col not in numerical_fea]

# Keep the target out of the feature list: the same list is later used to
# index data_testA, which presumably has no target column -- TODO confirm.
if label in numerical_fea:
    numerical_fea.remove(label)
1.1 可以对缺失值填充指定数值。
data_train = data_train.fillna(0)
1.2 也可以按行或者列进行前向填充(把缺失值替换成前面最近的非缺失值)或者后向填充(把缺失值替换成后面最近的非缺失值),并且可以限制填充的个数。
data_train = data_train.fillna(axis=0, method='ffill', limit=2)
DataFrame.interpolate(method='linear', axis=0, limit=None, inplace=False, limit_direction=None, limit_area=None, downcast=None,**kwargs)
s = pd.Series([0, 1, np.nan, 3])
0 0.0
1 1.0
2 NaN
3 3.0
dtype: float64
0 0.0
1 1.0
2 2.0
3 3.0
dtype: float64
s = pd.Series([0, 2, np.nan, 8])
s.interpolate(method='polynomial', order=2)
0 0.000000
1 2.000000
2 4.666667
3 8.000000
dtype: float64
# 查看缺失值
id 0
loanAmnt 0
term 0
interestRate 0
installment 0
grade 0
subGrade 0
employmentTitle 1
employmentLength 46799
homeOwnership 0
annualIncome 0
verificationStatus 0
issueDate 0
isDefault 0
purpose 0
postCode 1
regionCode 0
dti 239
delinquency_2years 0
ficoRangeLow 0
ficoRangeHigh 0
openAcc 0
pubRec 0
pubRecBankruptcies 405
revolBal 0
revolUtil 531
totalAcc 0
initialListStatus 0
applicationType 0
earliesCreditLine 0
title 1
policyCode 0
n0 40270
n1 40270
n2 40270
n3 40270
n4 33239
n5 40270
n6 40270
n7 40270
n8 40271
n9 40270
n10 33239
n11 69752
n12 40270
n13 40270
n14 40270
dtype: int64
# Fill numerical features with the training-set median (the test set is
# filled with the training statistics as well).
train_medians = data_train[numerical_fea].median()
data_train[numerical_fea] = data_train[numerical_fea].fillna(train_medians)
data_testA[numerical_fea] = data_testA[numerical_fea].fillna(train_medians)

# Fill categorical features with the training-set mode.
# DataFrame.mode() returns a DataFrame (one row per tied mode).  Passing that
# to fillna aligns on the row index, so only row 0 could ever be filled.
# Taking the first row yields a Series keyed by column name, which fillna
# applies column by column.
train_modes = data_train[category_fea].mode().iloc[0]
data_train[category_fea] = data_train[category_fea].fillna(train_modes)
data_testA[category_fea] = data_testA[category_fea].fillna(train_modes)
id 0
loanAmnt 0
term 0
interestRate 0
installment 0
grade 0
subGrade 0
employmentTitle 0
employmentLength 46799
homeOwnership 0
annualIncome 0
verificationStatus 0
issueDate 0
isDefault 0
purpose 0
postCode 0
regionCode 0
dti 0
delinquency_2years 0
ficoRangeLow 0
ficoRangeHigh 0
openAcc 0
pubRec 0
pubRecBankruptcies 0
revolBal 0
revolUtil 0
totalAcc 0
initialListStatus 0
applicationType 0
earliesCreditLine 0
title 0
policyCode 0
n0 0
n1 0
n2 0
n3 0
n4 0
n5 0
n6 0
n7 0
n8 0
n9 0
n10 0
n11 0
n12 0
n13 0
n14 0
dtype: int64
['grade', 'subGrade', 'employmentLength', 'issueDate', 'earliesCreditLine']
# Reference date against which the issue date is measured.
startdate = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')
for data in [data_train, data_testA]:
    data['issueDate'] = pd.to_datetime(data['issueDate'], format='%Y-%m-%d')
    # Time feature: whole days elapsed between the reference date and issue date.
    data['issueDateDT'] = (data['issueDate'] - startdate).dt.days
1 year 52489
10+ years 262753
2 years 72358
3 years 64152
4 years 47985
5 years 50102
6 years 37254
7 years 35407
8 years 36192
9 years 30272
< 1 year 64237
NaN 46799
Name: employmentLength, dtype: int64
296825 Mar-1995
119736 Nov-1980
452511 Mar-2000
542871 Oct-2000
248791 Oct-1998
Name: earliesCreditLine, dtype: object
for data in [data_train, data_testA]:
    # Keep only the trailing 4-digit year of strings such as 'Mar-1995'.
    data['earliesCreditLine'] = data['earliesCreditLine'].map(lambda raw: int(raw[-4:]))
630921 2011
634448 2009
135912 2008
73773 2003
606984 1995
Name: earliesCreditLine, dtype: int64
这里把月份丢弃了,如果要加上月份,可以用 “年份+月份/12”的方法。代码如下:
# Alternative encoding that keeps the month: year + month / 12.
for data in [data_train, data_testA]:
    credit_line = data['earliesCreditLine']
    # This only makes sense while the column still holds the raw 'Mon-YYYY'
    # strings.  Once the year-only conversion has run, the month is gone and
    # slicing the integers would raise TypeError, so leave the column as is.
    if pd.api.types.is_numeric_dtype(credit_line):
        continue
    # Parse once and derive both parts from the same datetime values.
    parsed = pd.to_datetime(credit_line, format='%b-%Y')
    data['earliesCreditLine'] = parsed.dt.year + parsed.dt.month / 12
281248 2005.083333
297228 2005.500000
512734 2005.666667
48758 2003.416667
783447 2005.000000
Name: earliesCreditLine, dtype: float64
def employmentLength_to_int(data):
    """Convert an employment-length value such as ``'3 years'`` to an integer.

    Args:
        data: A string whose first whitespace-separated token is the number
            of years (e.g. ``'10 years'``), an already-numeric value, or a
            missing value.

    Returns:
        ``data`` unchanged when it is null, otherwise the year count as
        ``np.int8``.
    """
    if pd.isnull(data):
        return data
    if isinstance(data, str):
        return np.int8(data.split()[0])
    # Already numeric (e.g. the conversion was applied twice): nothing to split.
    return np.int8(data)
for data in [data_train, data_testA]:
    # Normalise the two non-numeric spellings, then convert to integers.
    # The result is assigned back to the column: calling
    # Series.replace(inplace=True) on a column selection is a chained in-place
    # update and is not guaranteed to write through to the DataFrame.
    data['employmentLength'] = (
        data['employmentLength']
        .replace({'10+ years': '10 years', '< 1 year': '0 years'})
        .apply(employmentLength_to_int)
    )
0.0 64237
1.0 52489
2.0 72358
3.0 64152
4.0 47985
5.0 50102
6.0 37254
7.0 35407
8.0 36192
9.0 30272
10.0 262753
NaN 46799
Name: employmentLength, dtype: int64
# Subset of the categorical features whose cardinality is inspected below.
cate_features = [
    'grade', 'subGrade', 'employmentTitle', 'homeOwnership',
    'verificationStatus', 'purpose', 'postCode', 'regionCode',
    'applicationType', 'initialListStatus', 'title', 'policyCode',
]
# NOTE(review): `data` is whichever DataFrame the most recent
# `for data in [...]` loop left bound -- confirm which frame is meant.
for fea in cate_features:
    print(fea, '类型数: ', data[fea].nunique())
grade 类型数: 7
subGrade 类型数: 35
employmentTitle 类型数: 79282
homeOwnership 类型数: 6
verificationStatus 类型数: 3
purpose 类型数: 14
postCode 类型数: 889
regionCode 类型数: 51
applicationType 类型数: 2
initialListStatus 类型数: 2
title 类型数: 12058
policyCode 类型数: 1
这种具有优先级的类别特征,可以用 labelencode或者自映射。
# `grade` is an ordered category, so its integer code should follow the grade
# order.  Series.unique() returns values in order of first appearance, which
# would make the codes arbitrary; sort the labels before enumerating them.
# Assumes lexical order of the labels matches their priority -- TODO confirm.
grade_code = {
    grade: code
    for code, grade in enumerate(sorted(data_train['grade'].dropna().unique()))
}
for data in [data_train, data_testA]:
    data['grade'] = data['grade'].map(grade_code)
# One-hot encode purely nominal features that have at least two categories
# and are not high-dimensional sparse.
# NOTE(review): the encoded frame is bound only to the loop variable `data`;
# data_train / data_testA themselves are not modified here, and later steps
# still read the raw 'subGrade' column.  Confirm whether the encoding is
# meant to be kept before changing this.
one_hot_columns = ['subGrade', 'homeOwnership', 'verificationStatus',
                   'purpose', 'regionCode']
for data in [data_train, data_testA]:
    data = pd.get_dummies(data, columns=one_hot_columns, drop_first=True)
由于“小概率事件”和假设检验的基本思想 “小概率事件”通常指发生的概率小于5%的事件,认为在一次试验中该事件是几乎不可能发生的。由此可见,服从正态分布的随机变量X,落在(μ-3σ,μ+3σ)以外的概率小于千分之三,在实际问题中常认为相应的事件是不会发生的,基本上可以把区间(μ-3σ,μ+3σ)看作是随机变量X实际可能的取值区间,这称之为正态分布的“3σ”原则。
def find_outliers_by_3segama(data, fea):
    """Flag values of ``data[fea]`` that lie more than 3 std from the mean.

    A new column ``fea + '_outliers'`` is added to ``data`` in place. It holds
    '异常值' for values above ``mean + 3 * std`` or below ``mean - 3 * std``
    and '正常值' for everything else.

    Args:
        data: DataFrame containing the column ``fea``.
        fea: Name of the numerical column to inspect.

    Returns:
        The same ``data`` object, with the flag column added.
    """
    values = data[fea]
    data_std = np.std(values)
    data_mean = np.mean(values)
    outliers_cut_off = data_std * 3
    lower_rule = data_mean - outliers_cut_off
    upper_rule = data_mean + outliers_cut_off
    # Vectorised comparison instead of a per-row Python lambda.  NaN compares
    # False on both sides, so missing values are labelled as normal.
    is_outlier = (values > upper_rule) | (values < lower_rule)
    data[fea + '_outliers'] = np.where(is_outlier, '异常值', '正常值')
    return data
# Work on a copy, then add one '<feature>_outliers' flag column per
# numerical feature.
data_train = data_train.copy()
for fea in numerical_fea:
    data_train = find_outliers_by_3segama(data=data_train, fea=fea)
正常值 800000
Name: id_outliers, dtype: int64
正常值 159610
Name: isDefault, dtype: int64
正常值 800000
Name: loanAmnt_outliers, dtype: int64
正常值 159610
Name: isDefault, dtype: int64
正常值 800000
Name: term_outliers, dtype: int64
正常值 159610
Name: isDefault, dtype: int64
正常值 794259
异常值 5741
Name: interestRate_outliers, dtype: int64
异常值 2916
正常值 156694
Name: isDefault, dtype: int64
正常值 792046
异常值 7954
Name: installment_outliers, dtype: int64
异常值 2152
正常值 157458
Name: isDefault, dtype: int64
正常值 800000
Name: employmentTitle_outliers, dtype: int64
正常值 159610
Name: isDefault, dtype: int64
正常值 799701
异常值 299
Name: homeOwnership_outliers, dtype: int64
异常值 62
正常值 159548
Name: isDefault, dtype: int64
正常值 793973
异常值 6027
Name: annualIncome_outliers, dtype: int64
异常值 756
正常值 158854
Name: isDefault, dtype: int64
正常值 800000
Name: verificationStatus_outliers, dtype: int64
正常值 159610
Name: isDefault, dtype: int64
正常值 783003
异常值 16997
Name: purpose_outliers, dtype: int64
异常值 3635
正常值 155975
Name: isDefault, dtype: int64
正常值 798931
异常值 1069
Name: postCode_outliers, dtype: int64
异常值 221
正常值 159389
Name: isDefault, dtype: int64
正常值 799994
异常值 6
Name: regionCode_outliers, dtype: int64
异常值 1
正常值 159609
Name: isDefault, dtype: int64
正常值 798440
异常值 1560
Name: dti_outliers, dtype: int64
异常值 466
正常值 159144
Name: isDefault, dtype: int64
正常值 778245
异常值 21755
Name: delinquency_2years_outliers, dtype: int64
异常值 5089
正常值 154521
Name: isDefault, dtype: int64
正常值 788261
异常值 11739
Name: ficoRangeLow_outliers, dtype: int64
异常值 778
正常值 158832
Name: isDefault, dtype: int64
正常值 788261
异常值 11739
Name: ficoRangeHigh_outliers, dtype: int64
异常值 778
正常值 158832
Name: isDefault, dtype: int64
正常值 790889
异常值 9111
Name: openAcc_outliers, dtype: int64
异常值 2195
正常值 157415
Name: isDefault, dtype: int64
正常值 792471
异常值 7529
Name: pubRec_outliers, dtype: int64
异常值 1701
正常值 157909
Name: isDefault, dtype: int64
正常值 794120
异常值 5880
Name: pubRecBankruptcies_outliers, dtype: int64
异常值 1423
正常值 158187
Name: isDefault, dtype: int64
正常值 790001
异常值 9999
Name: revolBal_outliers, dtype: int64
异常值 1359
正常值 158251
Name: isDefault, dtype: int64
正常值 799948
异常值 52
Name: revolUtil_outliers, dtype: int64
异常值 23
正常值 159587
Name: isDefault, dtype: int64
正常值 791663
异常值 8337
Name: totalAcc_outliers, dtype: int64
异常值 1668
正常值 157942
Name: isDefault, dtype: int64
正常值 800000
Name: initialListStatus_outliers, dtype: int64
正常值 159610
Name: isDefault, dtype: int64
正常值 784586
异常值 15414
Name: applicationType_outliers, dtype: int64
异常值 3875
正常值 155735
Name: isDefault, dtype: int64
正常值 775134
异常值 24866
Name: title_outliers, dtype: int64
异常值 3900
正常值 155710
Name: isDefault, dtype: int64
正常值 800000
Name: policyCode_outliers, dtype: int64
正常值 159610
Name: isDefault, dtype: int64
正常值 782773
异常值 17227
Name: n0_outliers, dtype: int64
异常值 3485
正常值 156125
Name: isDefault, dtype: int64
正常值 790500
异常值 9500
Name: n1_outliers, dtype: int64
异常值 2491
正常值 157119
Name: isDefault, dtype: int64
正常值 789067
异常值 10933
Name: n2_outliers, dtype: int64
异常值 3205
正常值 156405
Name: isDefault, dtype: int64
正常值 789067
异常值 10933
Name: n3_outliers, dtype: int64
异常值 3205
正常值 156405
Name: isDefault, dtype: int64
正常值 788660
异常值 11340
Name: n4_outliers, dtype: int64
异常值 2476
正常值 157134
Name: isDefault, dtype: int64
正常值 790355
异常值 9645
Name: n5_outliers, dtype: int64
异常值 1858
正常值 157752
Name: isDefault, dtype: int64
正常值 786006
异常值 13994
Name: n6_outliers, dtype: int64
异常值 3182
正常值 156428
Name: isDefault, dtype: int64
正常值 788430
异常值 11570
Name: n7_outliers, dtype: int64
异常值 2746
正常值 156864
Name: isDefault, dtype: int64
正常值 789625
异常值 10375
Name: n8_outliers, dtype: int64
异常值 2131
正常值 157479
Name: isDefault, dtype: int64
正常值 786384
异常值 13616
Name: n9_outliers, dtype: int64
异常值 3953
正常值 155657
Name: isDefault, dtype: int64
正常值 788979
异常值 11021
Name: n10_outliers, dtype: int64
异常值 2639
正常值 156971
Name: isDefault, dtype: int64
正常值 799434
异常值 566
Name: n11_outliers, dtype: int64
异常值 112
正常值 159498
Name: isDefault, dtype: int64
正常值 797585
异常值 2415
Name: n12_outliers, dtype: int64
异常值 545
正常值 159065
Name: isDefault, dtype: int64
正常值 788907
异常值 11093
Name: n13_outliers, dtype: int64
异常值 2482
正常值 157128
Name: isDefault, dtype: int64
正常值 788884
异常值 11116
Name: n14_outliers, dtype: int64
异常值 3364
正常值 156246
Name: isDefault, dtype: int64
# Remove every row flagged as an outlier for any numerical feature, then
# renumber the remaining rows from zero.
outlier_flags = data_train[[name + '_outliers' for name in numerical_fea]]
data_train = data_train[(outlier_flags == '正常值').all(axis=1)]
data_train = data_train.reset_index(drop=True)
总结一句话:四分位数用三个分位点把数据分为四个区间,$IQR = Q_3 - Q_1$,下触须 $= Q_1 - 1.5 \times IQR$,上触须 $= Q_3 + 1.5 \times IQR$。
处理异常值:当数据中存在离群点时,可以将其通过分箱离散化处理,从而提高变量的鲁棒性(抗干扰能力)。例如,age特征出现200这种异常值时,可分入"age > 60"这个分箱里,排除影响。
# NOTE(review): `data` is whichever frame the most recent `for data in [...]`
# loop left bound -- confirm the intended target frame.
loan_amount = data['loanAmnt']
# Fixed-width bins: integer division by 1000 maps each amount to its bin.
data['loanAmnt_bin1'] = np.floor_divide(loan_amount, 1000)
# Exponential-width bins: floor of the base-10 logarithm.
data['loanAmnt_bin2'] = np.floor(np.log10(loan_amount))
# Quantile bins: 10 bins, returned as integer bin labels.
data['loanAmnt_bin3'] = pd.qcut(loan_amount, 10, labels=False)
def WOE_evaluation(data, cats, col='isDefault'):
    """Compute the per-bin Weight of Evidence and the Information Value.

    Also prints the IV and draws the WOE curve on a new matplotlib figure.

    Args:
        data: DataFrame that was binned; must contain the target column.
        cats: Series giving the bin each row of ``data`` belongs to.
        col: Name of the target column; the code reads its classes 0 and 1.

    Returns:
        Tuple ``(WOE_values, IV_value)``: the WOE of each bin (a Series
        indexed by bin) and the summed Information Value.
    """
    # Rows are bins, columns are target classes, cells are sample counts.
    encode_set = data.groupby([cats, data[col]]).size().unstack().fillna(0)
    class_totals = encode_set.sum()
    share_0 = encode_set[0] / class_totals[0]
    share_1 = encode_set[1] / class_totals[1]
    WOE_values = np.log(share_0 / share_1)
    IV_value = np.sum((share_0 - share_1) * WOE_values)
    print('IV value: ', IV_value)

    plt.figure(figsize=(10, 4))
    x = list(range(encode_set.shape[0]))
    # Keyword arguments: positional x/y are not accepted by recent seaborn.
    sns.lineplot(x=x, y=WOE_values.values)
    # Label ticks from the grouped index so they follow the plotted bin order
    # (cats.unique() is in order of first appearance, not bin order).
    plt.xticks(x, encode_set.index, rotation=45)
    return WOE_values, IV_value
# 'dti' contains a -1 value, which is treated as an outlier: keep only the
# rows with a non-negative dti and renumber them.
data_train = data_train[data_train['dti'] >= 0].copy()
data_train.reset_index(drop=True, inplace=True)

# Fixed-width bin edges 0, 5, ..., 40 ...
bins = list(np.arange(0, 41, 5))
# ... plus one final edge so the skewed tail falls into a single bin.
bins.append(int(data_train['dti'].max()) + 1)
cats = pd.cut(data_train['dti'], bins, right=False)
WOE, IV = WOE_evaluation(data_train, cats)
IV value: 0.0712144251530253
[0, 5) 0.359301
[5, 10) 0.346582
[10, 15) 0.226937
[15, 20) 0.056710
[20, 25) -0.102537
[25, 30) -0.282429
[30, 35) -0.466837
[35, 40) -0.587163
[40, 1000) -0.607468
dtype: float64
如果线性模型中包含有交互特征对,那它的训练时间和评分时间就会从 $O(n)$ 增加到 $O(n^2)$,其中 $n$ 是单一特征的数量。
for col in ['grade', 'subGrade']:
    # Mean of the target per category, computed on the training set and
    # applied to both the training and the test frame.
    temp_dict = data_train.groupby(col)['isDefault'].mean().to_dict()
    target_mean_col = col + '_target_mean'
    data_train[target_mean_col] = data_train[col].map(temp_dict)
    data_testA[target_mean_col] = data_testA[col].map(temp_dict)
# Further derived features: grade relative to the mean / std of grade within
# each value of the anonymous features n0..n14.
for df in [data_train, data_testA]:
    for item in ['n' + str(i) for i in range(15)]:
        grade_by_item = df.groupby([item])['grade']
        df['grade_to_mean_' + item] = df['grade'] / grade_by_item.transform('mean')
        df['grade_to_std_' + item] = df['grade'] / grade_by_item.transform('std')
for col in tqdm(['employmentTitle', 'postCode', 'title', 'subGrade']):
    # Fit on the union of train and test values so both can be transformed.
    train_labels = data_train[col].astype(str).to_list()
    test_labels = data_testA[col].astype(str).to_list()
    le = LabelEncoder()
    le.fit(train_labels + test_labels)
    data_train[col] = le.transform(train_labels)
    data_testA[col] = le.transform(test_labels)
print('Label Encoding 完成。')
100%|███████████████████████████████████████████████████████████████████████████| 4/4 [00:06<00:00, 1.73s/it]
Label Encoding 完成。
# Min-max normalisation template: (x - min) / (max - min).
# `要归一化的特征列表` is a placeholder for the features to normalise.
# NOTE(review): a constant column makes max - min zero, giving 0/0 (NaN).
for fea in [要归一化的特征列表]:
    lowest = np.min(data[fea])
    highest = np.max(data[fea])
    data[fea] = ((data[fea] - lowest) / (highest - lowest))
# --- Filter methods (illustrative; `train` / `target_train` are placeholders
# for a feature matrix and its target vector) ---

from sklearn.feature_selection import VarianceThreshold
# `threshold` is the variance threshold below which a feature is removed.
VarianceThreshold(threshold=3).fit_transform(train, target_train)

from sklearn.feature_selection import SelectKBest
from scipy.stats import pearsonr
# Select the K best features and return the reduced data.
# SelectKBest's first argument is the scoring function: it receives the
# feature matrix and the target vector and returns (scores, p-values) with
# one entry per feature.  No scoring function is passed here, so the
# library default is used -- NOTE(review): pearsonr is imported but unused.
# `k` is the number of features to keep.
SelectKBest(k=5).fit_transform(train, target_train)

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# Chi-squared scoring; `k` is the number of features to keep.
SelectKBest(chi2, k=5).fit_transform(train, target_train)
# !pip install minepy
from sklearn.feature_selection import SelectKBest
from minepy import MINE


def mic(x, y):
    """Return ``(MIC score, 0.5)`` for one feature vector ``x`` against ``y``.

    MINE is not designed as a plain function, so this wrapper adapts it.
    The second item is a fixed placeholder p-value of 0.5.
    """
    m = MINE()
    m.compute_score(x, y)
    return (m.mic(), 0.5)


# `k` is the number of features to keep.  The scoring callable evaluates
# mic() once per feature column and returns a (scores, p-values) tuple.
# A list comprehension is used because np.array() cannot consume the lazy
# iterator that map() returns under Python 3.
SelectKBest(lambda X, Y: tuple(np.array([mic(x, Y) for x in X.T]).T),
            k=2).fit_transform(train, target_train)
# --- Wrapper / embedded methods (illustrative; `train` / `target_train` are
# placeholders for a feature matrix and its target vector) ---

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
# Recursive feature elimination; returns the data after feature selection.
# `estimator` is the base model.
# `n_features_to_select` is the number of features to keep.
RFE(estimator=LogisticRegression(), n_features_to_select=2).fit_transform(train, target_train)

from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
# Feature selection with an L1-penalised logistic regression as base model.
# The solver is set explicitly because the L1 penalty needs a solver that
# supports it.
SelectFromModel(LogisticRegression(penalty='l1', C=0.1, solver='liblinear')).fit_transform(train, target_train)

from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingClassifier
# Feature selection with GBDT as base model.
SelectFromModel(GradientBoostingClassifier()).fit_transform(train, target_train)
# Drop the columns that are no longer needed from both frames, in place.
for data in [data_train, data_testA]:
    data.drop(columns=['issueDate', 'id'], inplace=True)
# Forward-fill down the rows: each missing value takes the value above it.
# DataFrame.ffill replaces the deprecated fillna(method='ffill').
data_train = data_train.ffill(axis=0)
x_train = data_train.drop(['isDefault'], axis=1)

# Correlation of every feature with the target.  Only numeric columns are
# passed on: the frame also carries string columns (e.g. the '*_outliers'
# flags) for which a correlation is undefined.
data_corr = x_train.select_dtypes(include='number').corrwith(data_train.isDefault)
data_corr = data_corr.reset_index()
data_corr.columns = ['features', 'corr']
# Visualise the pairwise correlation between the numerical features.
# Select by membership rather than by position (numerical_fea[1:]) so the
# plot does not depend on the dropped 'id' column being first in the list.
numeric_columns = [fea for fea in numerical_fea if fea in data_train.columns]
data_numeric = data_train[numeric_columns]
correlation = data_numeric.corr()
plt.figure(figsize=(7, 7))
# The heatmap shows feature-to-feature correlation; there is no price column.
plt.title('Correlation of Numeric Features', y=1, size=16)
sns.heatmap(correlation, square=True, vmax=0.8)
# Model inputs: every training column except identifiers, the raw issue
# date, the target and the helper '*_outliers' flag columns.
# ('issueDate' was misspelled as 'issuDate' in the exclusion list.)
excluded = {'id', 'issueDate', 'isDefault'}
features = [col for col in data_train.columns
            if col not in excluded and '_outliers' not in col]
x_train = data_train[features]
x_test = data_testA[features]
y_train = data_train['isDefault']

# Persist the engineered data sets for the modelling step.
x_train.to_csv('Dataset/data_for_model.csv', index=False)
y_train.to_csv('Dataset/label_for_model.csv', index=False)
x_test.to_csv('Dataset/testA_With_FeatureEngineering.csv', index=False)
def cv_model(clf, train_x, train_y, test_x, clf_name):
    """Train one boosting backend with 5-fold CV and collect its predictions.

    Args:
        clf: Backend object matching ``clf_name``: a module exposing
            ``Dataset``/``train`` for 'lgb', a module exposing
            ``DMatrix``/``train`` for 'xgb', or an estimator class for 'cat'.
        train_x: Training features (DataFrame; rows are selected with iloc).
        train_y: Training labels, aligned row by row with ``train_x``.
        test_x: Test features.
        clf_name: One of 'lgb', 'xgb' or 'cat'.

    Returns:
        Tuple ``(train, test)``: out-of-fold predictions for every training
        row, and the test predictions averaged over the fold models.

    Raises:
        ValueError: If ``clf_name`` is not one of the supported names.
    """
    folds = 5
    seed = 2020
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

    train = np.zeros(train_x.shape[0])
    test = np.zeros(test_x.shape[0])
    # KFold yields positional indices, so index the labels positionally too;
    # label-based Series indexing is only correct for a 0..n-1 index.
    labels = np.asarray(train_y)

    cv_scores = []

    for train_index, valid_index in kf.split(train_x, train_y):
        trn_x, trn_y = train_x.iloc[train_index], labels[train_index]
        val_x, val_y = train_x.iloc[valid_index], labels[valid_index]

        if clf_name == 'lgb':
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)

            # NOTE(review): 'nthread' and 'n_jobs' look like two spellings of
            # the same thread-count setting -- confirm which one is intended.
            params = {
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'auc',
                'min_child_weight': 5,
                'num_leaves': 2**5,
                # Was 'lambda_12' (digits one-two), not an L2 parameter name.
                'lambda_l2': 10,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.1,
                'seed': seed,
                'nthread': 28,
                'n_jobs': 24,
                'silent': True,
                'verbose': -1,
            }

            model = clf.train(params, train_matrix, 500000,
                              valid_sets=[train_matrix, valid_matrix],
                              verbose_eval=200, early_stopping_rounds=200)
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)

        elif clf_name == 'xgb':
            train_matrix = clf.DMatrix(trn_x, label=trn_y)
            valid_matrix = clf.DMatrix(val_x, label=val_y)
            test_matrix = clf.DMatrix(test_x)

            params = {
                'booster': 'gbtree',
                'objective': 'binary:logistic',
                'eval_metric': 'auc',
                'gamma': 1,
                'min_child_weight': 1.5,
                'max_depth': 5,
                'lambda': 10,
                'subsample': 0.7,
                'colsample_bytree': 0.7,
                'colsample_bylevel': 0.7,
                'eta': 0.04,
                'tree_method': 'exact',
                'seed': seed,
                'nthread': 36,
                'silent': True,
                # 'tree_method': 'gpu_hist'
            }

            watchlist = [(train_matrix, 'train'), (valid_matrix, 'eval')]

            model = clf.train(params, train_matrix, num_boost_round=50000,
                              evals=watchlist,
                              verbose_eval=200, early_stopping_rounds=200)
            val_pred = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
            test_pred = model.predict(test_matrix, ntree_limit=model.best_ntree_limit)

        elif clf_name == 'cat':
            params = {
                'learning_rate': 0.05,
                'depth': 5,
                'l2_leaf_reg': 10,
                'bootstrap_type': 'Bernoulli',
                'od_type': 'Iter',
                'od_wait': 50,
                'random_seed': 11,
                'allow_writing_files': False,
            }

            model = clf(iterations=20000, **params)
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      cat_features=[], use_best_model=True, verbose=500)

            val_pred = model.predict(val_x)
            test_pred = model.predict(test_x)

        else:
            # Without this branch an unknown name surfaced later as a
            # NameError on val_pred.
            raise ValueError('unsupported clf_name: %r' % (clf_name,))

        # Out-of-fold predictions, and a running mean of test predictions.
        train[valid_index] = val_pred
        test += test_pred / kf.n_splits
        cv_scores.append(roc_auc_score(val_y, val_pred))

    print('%s_scotrainre_lsit: ' % clf_name, cv_scores)
    print('%s_score_mean: ' % clf_name, np.mean(cv_scores))
    print('%s_score_std: ' % clf_name, np.std(cv_scores))
    return train, test
def lgb_model(x_train, y_train, x_test):
    """Run cv_model with the LightGBM backend.

    Returns the pair produced by cv_model: predictions for the training rows
    and predictions for the test rows.
    """
    return cv_model(lgb, x_train, y_train, x_test, 'lgb')
def xgb_model(x_train, y_train, x_test):
    """Run cv_model with the XGBoost backend.

    Returns the pair produced by cv_model: predictions for the training rows
    and predictions for the test rows.
    """
    return cv_model(xgb, x_train, y_train, x_test, 'xgb')
def cat_model(x_train, y_train, x_test):
    """Run cv_model with the CatBoostRegressor backend.

    Returns the pair produced by cv_model: predictions for the training rows
    and predictions for the test rows.
    """
    return cv_model(CatBoostRegressor, x_train, y_train, x_test, 'cat')
# Cross-validate the LightGBM model; keeps its training-row and test-row
# predictions for the blending step.
lgb_train, lgb_test = lgb_model(x_train, y_train, x_test)
****************************** 1 ******************************
Training until validation scores don't improve for 200 rounds
[200] training's auc: 0.75152 valid_1's auc: 0.728813
[400] training's auc: 0.770784 valid_1's auc: 0.729129
Early stopping, best iteration is:
[323] training's auc: 0.763994 valid_1's auc: 0.729426
****************************** 2 ******************************
Training until validation scores don't improve for 200 rounds
[200] training's auc: 0.751572 valid_1's auc: 0.730902
[400] training's auc: 0.770774 valid_1's auc: 0.731029
Early stopping, best iteration is:
[339] training's auc: 0.765356 valid_1's auc: 0.731286
[0.7294259254463246, 0.7312857834764355]
****************************** 3 ******************************
Training until validation scores don't improve for 200 rounds
[200] training's auc: 0.750642 valid_1's auc: 0.731674
[400] training's auc: 0.769801 valid_1's auc: 0.732319
[600] training's auc: 0.786078 valid_1's auc: 0.732263
Early stopping, best iteration is:
[582] training's auc: 0.784663 valid_1's auc: 0.732347
[0.7294259254463246, 0.7312857834764355, 0.7323471951456343]
****************************** 4 ******************************
Training until validation scores don't improve for 200 rounds
[200] training's auc: 0.751853 valid_1's auc: 0.727029
[400] training's auc: 0.770701 valid_1's auc: 0.727655
Early stopping, best iteration is:
[307] training's auc: 0.76223 valid_1's auc: 0.72777
[0.7294259254463246, 0.7312857834764355, 0.7323471951456343, 0.7277698679698775]
****************************** 5 ******************************
Training until validation scores don't improve for 200 rounds
[200] training's auc: 0.750566 valid_1's auc: 0.732136
[400] training's auc: 0.7692 valid_1's auc: 0.732618
Early stopping, best iteration is:
[373] training's auc: 0.766763 valid_1's auc: 0.732762
[0.7294259254463246, 0.7312857834764355, 0.7323471951456343, 0.7277698679698775, 0.732762205282248]
lgb_scotrainre_lsit: [0.7294259254463246, 0.7312857834764355, 0.7323471951456343, 0.7277698679698775, 0.732762205282248]
lgb_score_mean: 0.730718195464104
lgb_score_std: 0.0018717034520938764
# Cross-validate the CatBoost model; keeps its training-row and test-row
# predictions for the blending step.
cat_train, cat_test = cat_model(x_train, y_train, x_test)
****************************** 1 ******************************
0: learn: 0.3943984 test: 0.3964334 best: 0.3964334 (0) total: 230ms remaining: 1h 16m 42s
500: learn: 0.3728079 test: 0.3756736 best: 0.3756736 (500) total: 1m 12s remaining: 46m 44s
1000: learn: 0.3712148 test: 0.3750919 best: 0.3750911 (998) total: 2m 28s remaining: 46m 50s
1500: learn: 0.3699919 test: 0.3748420 best: 0.3748420 (1500) total: 3m 43s remaining: 45m 58s
2000: learn: 0.3688915 test: 0.3746858 best: 0.3746842 (1990) total: 4m 49s remaining: 43m 28s
2500: learn: 0.3678739 test: 0.3745857 best: 0.3745830 (2480) total: 5m 52s remaining: 41m 7s
Stopped by overfitting detector (50 iterations wait)
bestTest = 0.3745742218
bestIteration = 2598
Shrink model to first 2599 iterations.
****************************** 2 ******************************
0: learn: 0.3947297 test: 0.3950963 best: 0.3950963 (0) total: 169ms remaining: 56m 28s
500: learn: 0.3731510 test: 0.3743435 best: 0.3743435 (500) total: 1m 12s remaining: 47m 18s
1000: learn: 0.3715005 test: 0.3738019 best: 0.3738019 (1000) total: 2m 20s remaining: 44m 33s
1500: learn: 0.3702694 test: 0.3735912 best: 0.3735912 (1500) total: 3m 36s remaining: 44m 23s
2000: learn: 0.3691661 test: 0.3734402 best: 0.3734392 (1997) total: 4m 44s remaining: 42m 35s
2500: learn: 0.3681716 test: 0.3733360 best: 0.3733354 (2486) total: 5m 59s remaining: 41m 55s
3000: learn: 0.3672144 test: 0.3732713 best: 0.3732689 (2996) total: 7m 17s remaining: 41m 19s
Stopped by overfitting detector (50 iterations wait)
bestTest = 0.3732575734
bestIteration = 3088
Shrink model to first 3089 iterations.
[0.7309683524594143, 0.7326572146779085]
****************************** 3 ******************************
0: learn: 0.3950712 test: 0.3937140 best: 0.3937140 (0) total: 168ms remaining: 55m 54s
500: learn: 0.3734495 test: 0.3730873 best: 0.3730873 (500) total: 1m 25s remaining: 55m 22s
1000: learn: 0.3718337 test: 0.3724441 best: 0.3724438 (999) total: 2m 45s remaining: 52m 28s
1500: learn: 0.3706096 test: 0.3721854 best: 0.3721854 (1497) total: 4m 6s remaining: 50m 43s
2000: learn: 0.3695474 test: 0.3720525 best: 0.3720515 (1997) total: 5m 21s remaining: 48m 13s
2500: learn: 0.3685663 test: 0.3719417 best: 0.3719415 (2494) total: 6m 36s remaining: 46m 11s
Stopped by overfitting detector (50 iterations wait)
bestTest = 0.3719013034
bestIteration = 2838
Shrink model to first 2839 iterations.
[0.7309683524594143, 0.7326572146779085, 0.7340265229814408]
****************************** 4 ******************************
0: learn: 0.3949297 test: 0.3943073 best: 0.3943073 (0) total: 201ms remaining: 1h 6m 58s
500: learn: 0.3731882 test: 0.3740583 best: 0.3740583 (500) total: 1m 24s remaining: 54m 59s
1000: learn: 0.3715885 test: 0.3735140 best: 0.3735129 (998) total: 2m 37s remaining: 49m 41s
1500: learn: 0.3703487 test: 0.3732792 best: 0.3732792 (1500) total: 3m 48s remaining: 46m 56s
2000: learn: 0.3692554 test: 0.3731344 best: 0.3731331 (1984) total: 5m 9s remaining: 46m 19s
Stopped by overfitting detector (50 iterations wait)
bestTest = 0.3731228401
bestIteration = 2114
Shrink model to first 2115 iterations.
[0.7309683524594143, 0.7326572146779085, 0.7340265229814408, 0.7289299541394646]
****************************** 5 ******************************
0: learn: 0.3948878 test: 0.3944737 best: 0.3944737 (0) total: 338ms remaining: 1h 52m 44s
500: learn: 0.3733459 test: 0.3734735 best: 0.3734735 (500) total: 1m 23s remaining: 53m 56s
1000: learn: 0.3716805 test: 0.3729340 best: 0.3729336 (999) total: 2m 35s remaining: 49m 17s
1500: learn: 0.3704593 test: 0.3727339 best: 0.3727339 (1499) total: 3m 53s remaining: 47m 51s
2000: learn: 0.3693569 test: 0.3726087 best: 0.3726087 (2000) total: 5m 11s remaining: 46m 42s
Stopped by overfitting detector (50 iterations wait)
bestTest = 0.3725807976
bestIteration = 2216
Shrink model to first 2217 iterations.
[0.7309683524594143, 0.7326572146779085, 0.7340265229814408, 0.7289299541394646, 0.7336854992714498]
cat_scotrainre_lsit: [0.7309683524594143, 0.7326572146779085, 0.7340265229814408, 0.7289299541394646, 0.7336854992714498]
cat_score_mean: 0.7320535087059357
cat_score_std: 0.001889698211194384
from sklearn.naive_bayes import BernoulliNB

# 5-fold cross-validation of a Bernoulli naive Bayes baseline.
folds = 5
seed = 2020
kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

train = np.zeros(x_train.shape[0])
test = np.zeros(x_test.shape[0])

cv_scores = []

for train_index, valid_index in kf.split(x_train, y_train):
    # KFold yields positional indices, so select rows and labels with iloc.
    trn_x, val_x = x_train.iloc[train_index], x_train.iloc[valid_index]
    trn_y, val_y = y_train.iloc[train_index], y_train.iloc[valid_index]

    ber_nb = BernoulliNB()
    ber_nb.fit(trn_x, trn_y)
    # NOTE(review): predict() returns class labels, so the AUC below is
    # computed on hard labels; predict_proba may be what is wanted -- confirm.
    val_pred = ber_nb.predict(val_x)
    test_pred = ber_nb.predict(x_test)

    train[valid_index] = val_pred
    # Accumulate the fold average.  Plain assignment overwrote `test` on
    # every fold, leaving only the last fold's prediction divided by 5.
    test += test_pred / kf.n_splits

    score = roc_auc_score(val_y, val_pred)
    # Keep every fold's score; the list was created but never filled.
    cv_scores.append(score)
# Blend: equal-weight average of the LightGBM and CatBoost test predictions.
rh_test = 0.5 * lgb_test + 0.5 * cat_test

# The 'id' column was dropped earlier, so read it back from the raw file and
# attach the blended prediction as the target column.
data_testA = pd.read_csv('Dataset/testA_new.csv', usecols=['id'])
data_testA = data_testA.assign(isDefault=rh_test)
data_testA.to_csv('AfterFeatureEngineering.csv',
                  columns=['id', 'isDefault'], index=False)