import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
# Load the bureau records and the target labels from the local data/ folder.
bureau = pd.read_csv('data/bureau.csv')
target = pd.read_csv('data/TARGET.csv')
# Preview the first rows of the bureau table.
bureau.head()
SK_ID_CURR | SK_ID_BUREAU | CREDIT_ACTIVE | CREDIT_CURRENCY | DAYS_CREDIT | CREDIT_DAY_OVERDUE | DAYS_CREDIT_ENDDATE | DAYS_ENDDATE_FACT | AMT_CREDIT_MAX_OVERDUE | CNT_CREDIT_PROLONG | AMT_CREDIT_SUM | AMT_CREDIT_SUM_DEBT | AMT_CREDIT_SUM_LIMIT | AMT_CREDIT_SUM_OVERDUE | CREDIT_TYPE | DAYS_CREDIT_UPDATE | AMT_ANNUITY | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 215354 | 5714462 | Closed | currency 1 | -497 | 0 | -153.0 | -153.0 | NaN | 0 | 91323.0 | 0.0 | NaN | 0.0 | Consumer credit | -131 | NaN |
1 | 215354 | 5714463 | Active | currency 1 | -208 | 0 | 1075.0 | NaN | NaN | 0 | 225000.0 | 171342.0 | NaN | 0.0 | Credit card | -20 | NaN |
2 | 215354 | 5714464 | Active | currency 1 | -203 | 0 | 528.0 | NaN | NaN | 0 | 464323.5 | NaN | NaN | 0.0 | Consumer credit | -16 | NaN |
3 | 215354 | 5714465 | Active | currency 1 | -203 | 0 | NaN | NaN | NaN | 0 | 90000.0 | NaN | NaN | 0.0 | Credit card | -16 | NaN |
4 | 215354 | 5714466 | Active | currency 1 | -629 | 0 | 1197.0 | NaN | 77674.5 | 0 | 2700000.0 | NaN | NaN | 0.0 | Consumer credit | -21 | NaN |
TARGET
# Preview the first five rows of the target table (SK_ID_CURR, TARGET).
target.head(5)
SK_ID_CURR | TARGET | |
---|---|---|
0 | 100002 | 1 |
1 | 100003 | 0 |
2 | 100004 | 0 |
3 | 100006 | 0 |
4 | 100007 | 0 |
# Blank out implausible day offsets (anything below -40000) in the date-like
# columns so they are treated as missing by the aggregations further down.
#
# The assignment goes through the column itself instead of the chained form
# `bureau[col][mask] = ...`: chained assignment writes into an intermediate
# object and does not reliably update `bureau` (it is a silent no-op under
# pandas Copy-on-Write, and the warning is hidden by the global
# warnings filter at the top of the notebook).
for column in ('DAYS_CREDIT_ENDDATE', 'DAYS_CREDIT_UPDATE', 'DAYS_ENDDATE_FACT'):
    # Series.mask replaces the flagged entries with NaN, upcasting integer
    # columns to float when needed.
    bureau[column] = bureau[column].mask(bureau[column] < -40000, np.nan)
# NOTE(review): filling missing values with NaN looks like a no-op for data
# loaded via read_csv; kept so `bureau` is rebound exactly as before.
bureau = bureau.fillna(np.nan)
# Feature engineering: row-level 0/1 flags (their per-client means are
# computed later in the notebook).
# 1 when CREDIT_ACTIVE is anything other than 'Closed', else 0.
not_closed = bureau['CREDIT_ACTIVE'].ne('Closed')
bureau['bureau_credit_active_binary'] = not_closed.astype(int)
# 1 when DAYS_CREDIT_ENDDATE is strictly positive; NaN compares as False,
# so rows with a missing end date get 0.
positive_enddate = bureau['DAYS_CREDIT_ENDDATE'].gt(0)
bureau['bureau_credit_enddate_binary'] = positive_enddate.astype(int)
根据 SK_ID_CURR分组(每个客户有多条信用记录)
# One group per client id; reused by every aggregation below.
groupby_sk_id_curr = bureau.groupby(['SK_ID_CURR'])
# Skeleton of the feature table: one row per distinct SK_ID_CURR, in order of
# first appearance in `bureau`.
distinct_client_ids = bureau['SK_ID_CURR'].unique()
features = pd.DataFrame({'SK_ID_CURR': distinct_client_ids})
features.head(4)
SK_ID_CURR | |
---|---|
0 | 215354 |
1 | 162297 |
2 | 402440 |
3 | 238881 |
# Row/column count of the feature skeleton (one row per distinct SK_ID_CURR).
features.shape
(305811, 1)
之前借款记录
# Number of non-null DAYS_CREDIT entries per client, returned as a two-column
# frame (SK_ID_CURR, DAYS_CREDIT).
loans_per_client = groupby_sk_id_curr['DAYS_CREDIT'].count()
group_object = loans_per_client.reset_index()
group_object.head(4)
SK_ID_CURR | DAYS_CREDIT | |
---|---|---|
0 | 100001 | 7 |
1 | 100002 | 8 |
2 | 100003 | 4 |
3 | 100004 | 2 |
列名变更
# Give the count column its feature name. `index=str` additionally converts
# the row labels to strings; the subsequent merge joins on the SK_ID_CURR
# column, not on the index.
group_object = group_object.rename(
    index=str,
    columns={'DAYS_CREDIT': 'bureau_count_past_loans'},
)
group_object.head(4)
SK_ID_CURR | bureau_count_past_loans | |
---|---|---|
0 | 100001 | 7 |
1 | 100002 | 8 |
2 | 100003 | 4 |
3 | 100004 | 2 |
两表合并(根据 SK_ID_CURR)
# Left join keeps every row (and the row order) of `features` and attaches the
# columns of `group_object` by SK_ID_CURR.
features = pd.merge(features, group_object, how='left', on=['SK_ID_CURR'])
features.head(4)
SK_ID_CURR | bureau_count_past_loans | |
---|---|---|
0 | 215354 | 11 |
1 | 162297 | 6 |
2 | 402440 | 1 |
3 | 238881 | 8 |
类型
# Number of distinct CREDIT_TYPE values per client.
distinct_types = groupby_sk_id_curr['CREDIT_TYPE'].nunique()
group_object = distinct_types.reset_index()
group_object.head(4)
# Rename the aggregated column (index=str also casts the row labels to str).
group_object = group_object.rename(
    index=str,
    columns={'CREDIT_TYPE': 'num_loan_types'},
)
group_object.head(4)
# Attach to the feature table by SK_ID_CURR.
features = pd.merge(features, group_object, how='left', on=['SK_ID_CURR'])
features.head(4)
SK_ID_CURR | bureau_count_past_loans | num_loan_types | |
---|---|---|---|
0 | 215354 | 11 | 3 |
1 | 162297 | 6 | 3 |
2 | 402440 | 1 | 1 |
3 | 238881 | 8 | 2 |
feature:每种类型平均次数
# Average number of loans per distinct credit type for each client.
loan_counts = features['bureau_count_past_loans']
type_counts = features['num_loan_types']
features['avg_loans_per_type'] = loan_counts.div(type_counts)
features.head(4)
SK_ID_CURR | bureau_count_past_loans | num_loan_types | avg_loans_per_type | |
---|---|---|---|---|
0 | 215354 | 11 | 3 | 3.666667 |
1 | 162297 | 6 | 3 | 2.000000 |
2 | 402440 | 1 | 1 | 1.000000 |
3 | 238881 | 8 | 2 | 4.000000 |
信贷机构存活率(每个客户曾经有过合作关系的)
# Per-client mean of the 0/1 "not closed" flag, i.e. the share of the client's
# bureau records whose CREDIT_ACTIVE is not 'Closed'. The column keeps the
# flag's name, bureau_credit_active_binary.
active_share = groupby_sk_id_curr['bureau_credit_active_binary'].mean()
group_object = active_share.reset_index()
features = pd.merge(features, group_object, how='left', on=['SK_ID_CURR'])
features.head(4)
SK_ID_CURR | bureau_count_past_loans | num_loan_types | avg_loans_per_type | bureau_credit_active_binary | |
---|---|---|---|---|---|
0 | 215354 | 11 | 3 | 3.666667 | 0.545455 |
1 | 162297 | 6 | 3 | 2.000000 | 0.500000 |
2 | 402440 | 1 | 1 | 1.000000 | 1.000000 |
3 | 238881 | 8 | 2 | 4.000000 | 0.375000 |
# Per-client mean of the 0/1 "DAYS_CREDIT_ENDDATE > 0" flag, i.e. the share of
# the client's bureau records with a positive end-date offset. The column
# keeps the flag's name, bureau_credit_enddate_binary.
enddate_share = groupby_sk_id_curr['bureau_credit_enddate_binary'].mean()
group_object = enddate_share.reset_index()
features = pd.merge(features, group_object, how='left', on=['SK_ID_CURR'])
features.head(4)
SK_ID_CURR | bureau_count_past_loans | num_loan_types | avg_loans_per_type | bureau_credit_active_binary | bureau_credit_enddate_binary | |
---|---|---|---|---|---|---|
0 | 215354 | 11 | 3 | 3.666667 | 0.545455 | 0.454545 |
1 | 162297 | 6 | 3 | 2.000000 | 0.500000 | 0.166667 |
2 | 402440 | 1 | 1 | 1.000000 | 1.000000 | 1.000000 |
3 | 238881 | 8 | 2 | 4.000000 | 0.375000 | 0.125000 |
# Per-client summary statistics of the numeric bureau columns.
#
# Every (column, statistic) pair is computed in a single groupby pass and
# attached with ONE merge. The previous nested loop issued 60 successive
# merges, each of which copied the ever-widening `features` frame.
# Resulting column names ('<column>_<statistic>') and their order are the
# same as the loop produced.
aggregated_columns = [
    'DAYS_CREDIT', 'CREDIT_DAY_OVERDUE', 'DAYS_CREDIT_ENDDATE',
    'DAYS_ENDDATE_FACT', 'AMT_CREDIT_MAX_OVERDUE',
    'CNT_CREDIT_PROLONG', 'AMT_CREDIT_SUM', 'AMT_CREDIT_SUM_DEBT',
    'AMT_CREDIT_SUM_LIMIT', 'AMT_CREDIT_SUM_OVERDUE', 'DAYS_CREDIT_UPDATE',
    'AMT_ANNUITY',
]
aggregation_funcs = ['mean', 'min', 'max', 'sum', 'var']

# agg() with a list of functions returns (column, statistic) MultiIndex columns.
group_object = groupby_sk_id_curr[aggregated_columns].agg(aggregation_funcs)
group_object.columns = [
    feature + '_' + func for feature, func in group_object.columns
]
# agg() groups the output by source column; restore the statistic-major order
# (all *_mean columns first, then *_min, *_max, *_sum, *_var).
ordered_names = [
    feature + '_' + func
    for func in aggregation_funcs
    for feature in aggregated_columns
]
group_object = group_object[ordered_names].reset_index()

features = features.merge(group_object, on=['SK_ID_CURR'], how='left')
features.head(4)
SK_ID_CURR | bureau_count_past_loans | num_loan_types | avg_loans_per_type | bureau_credit_active_binary | bureau_credit_enddate_binary | DAYS_CREDIT_mean | CREDIT_DAY_OVERDUE_mean | DAYS_CREDIT_ENDDATE_mean | DAYS_ENDDATE_FACT_mean | … | DAYS_CREDIT_ENDDATE_var | DAYS_ENDDATE_FACT_var | AMT_CREDIT_MAX_OVERDUE_var | CNT_CREDIT_PROLONG_var | AMT_CREDIT_SUM_var | AMT_CREDIT_SUM_DEBT_var | AMT_CREDIT_SUM_LIMIT_var | AMT_CREDIT_SUM_OVERDUE_var | DAYS_CREDIT_UPDATE_var | AMT_ANNUITY_var | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 215354 | 11 | 3 | 3.666667 | 0.545455 | 0.454545 | -727.818182 | 0.0 | 2685.300000 | -889.8 | … | 7.641379e+07 | 215965.7 | 2.011109e+09 | 0.0 | 6.043071e+11 | 4.537546e+09 | 3.959070e+09 | 0.0 | 236203.418182 | NaN |
1 | 162297 | 6 | 3 | 2.000000 | 0.500000 | 0.166667 | -1344.500000 | 0.0 | 245.500000 | -1125.0 | … | 6.294475e+06 | 256725.0 | 4.491004e+07 | 0.0 | 7.753264e+12 | 0.000000e+00 | 0.000000e+00 | 0.0 | 290229.766667 | NaN |
2 | 402440 | 1 | 1 | 1.000000 | 1.000000 | 1.000000 | -96.000000 | 0.0 | 269.000000 | NaN | … | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | 238881 | 8 | 2 | 4.000000 | 0.375000 | 0.125000 | -914.750000 | 0.0 | -521.166667 | -847.2 | … | 2.171049e+06 | 1088076.2 | 0.000000e+00 | 0.0 | 2.096805e+10 | 4.421319e+10 | 1.303252e+10 | 0.0 | 808344.267857 | NaN |
4 rows × 66 columns
构造DKF
# Ratio features built from the per-client aggregates.
# Each entry is (feature prefix, numerator column stem, denominator column stem).
ratio_definitions = [
    ('debt_credit', 'AMT_CREDIT_SUM_DEBT', 'AMT_CREDIT_SUM'),
    ('debt_limit', 'AMT_CREDIT_SUM_DEBT', 'AMT_CREDIT_SUM_LIMIT'),
    ('debt_annuity', 'AMT_CREDIT_SUM_DEBT', 'AMT_ANNUITY'),
    ('credit_limit', 'AMT_CREDIT_SUM', 'AMT_CREDIT_SUM_LIMIT'),
    ('credit_annuity', 'AMT_CREDIT_SUM', 'AMT_ANNUITY'),
    ('limit_annuity', 'AMT_CREDIT_SUM_LIMIT', 'AMT_ANNUITY'),
    ('overdue_debt', 'AMT_CREDIT_SUM_OVERDUE', 'AMT_CREDIT_SUM_DEBT'),
]
# (suffix used in the new feature name, suffix of the aggregate it is built from).
# Note the 'avg' features are computed from the '*_mean' aggregates.
statistic_suffixes = [('sum', 'sum'), ('avg', 'mean'), ('min', 'min'), ('max', 'max')]

# NOTE(review): denominators equal to zero produce inf (or NaN for 0/0) here;
# the values are left untouched.
for prefix, numerator, denominator in ratio_definitions:
    for ratio_suffix, agg_suffix in statistic_suffixes:
        top = features[numerator + '_' + agg_suffix]
        bottom = features[denominator + '_' + agg_suffix]
        features[prefix + '_ratio_' + ratio_suffix] = top / bottom
与 TARGET列合并
# `temp` exists only to compute correlation coefficients against TARGET.
# No `how` is given, so pandas performs its default inner join: only clients
# present in both `features` and `target` are kept.
temp = pd.merge(features, target, on='SK_ID_CURR')
temp.shape
(263491, 95)
# Correlation of every column of `temp` with TARGET, sorted ascending, with
# TARGET's self-correlation removed.
target_column_corr = temp.corr()['TARGET']
corrs = target_column_corr.sort_values().drop(labels=['TARGET'])
# Horizontal bar chart, one bar per remaining column.
corrs.plot.barh(figsize=(10, 40))
相关系数小于0的特征列名
# Columns whose correlation with TARGET is negative (despite the variable
# name, the filter is on sign, not on magnitude).
# `corrs` is computed over every column of `temp`, so it still contains the
# join key SK_ID_CURR; the key is excluded here so that a negative ID/TARGET
# correlation cannot drop the identifier column from `selected_feature`.
weak_corr_feature_names = [
    name for name in corrs[corrs < 0].index if name != 'SK_ID_CURR'
]
selected_feature = features.drop(weak_corr_feature_names, axis=1)
Save data
# Keep all features: the full `features` table is written out, not the
# filtered `selected_feature` frame.
features.to_csv('data/bureau_features.csv',index = False)