数据清洗+特征构造:bureau.csv

bureau.csv 客户信用报告(信用历史)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import warnings
# Silence every warning for the rest of the notebook.
warnings.filterwarnings('ignore')

# Read the credit-bureau records and the per-client TARGET labels.
bureau, target = map(pd.read_csv, ('data/bureau.csv', 'data/TARGET.csv'))

# Preview the first rows of the bureau table.
bureau.head()
 
SK_ID_CURR SK_ID_BUREAU CREDIT_ACTIVE CREDIT_CURRENCY DAYS_CREDIT CREDIT_DAY_OVERDUE DAYS_CREDIT_ENDDATE DAYS_ENDDATE_FACT AMT_CREDIT_MAX_OVERDUE CNT_CREDIT_PROLONG AMT_CREDIT_SUM AMT_CREDIT_SUM_DEBT AMT_CREDIT_SUM_LIMIT AMT_CREDIT_SUM_OVERDUE CREDIT_TYPE DAYS_CREDIT_UPDATE AMT_ANNUITY
0 215354 5714462 Closed currency 1 -497 0 -153.0 -153.0 NaN 0 91323.0 0.0 NaN 0.0 Consumer credit -131 NaN
1 215354 5714463 Active currency 1 -208 0 1075.0 NaN NaN 0 225000.0 171342.0 NaN 0.0 Credit card -20 NaN
2 215354 5714464 Active currency 1 -203 0 528.0 NaN NaN 0 464323.5 NaN NaN 0.0 Consumer credit -16 NaN
3 215354 5714465 Active currency 1 -203 0 NaN NaN NaN 0 90000.0 NaN NaN 0.0 Credit card -16 NaN
4 215354 5714466 Active currency 1 -629 0 1197.0 NaN 77674.5 0 2700000.0 NaN NaN 0.0 Consumer credit -21 NaN

TARGET

# Preview the first five rows of the label table.
target.head(n=5)
 
SK_ID_CURR TARGET
0 100002 1
1 100003 0
2 100004 0
3 100006 0
4 100007 0
# Missing-value / outlier handling.
# Values below -40000 in these DAYS_* columns are treated as outliers and
# replaced with NaN. Series.mask builds a new column instead of relying on
# chained indexing (bureau[col][mask] = ...), which pandas may apply to a
# temporary copy and which is a silent no-op under Copy-on-Write.
for day_column in ['DAYS_CREDIT_ENDDATE', 'DAYS_CREDIT_UPDATE', 'DAYS_ENDDATE_FACT']:
    bureau[day_column] = bureau[day_column].mask(bureau[day_column] < -40000)
# Missing values stay NaN; the former bureau.fillna(np.nan) call changed nothing.

# Feature engineering.
# 1 when CREDIT_ACTIVE is anything other than 'Closed', else 0.
bureau['bureau_credit_active_binary'] = (bureau['CREDIT_ACTIVE'] != 'Closed').astype(int)
# 1 when DAYS_CREDIT_ENDDATE is positive, else 0. A NaN compares as False,
# so a missing end date yields 0.
bureau['bureau_credit_enddate_binary'] = (bureau['DAYS_CREDIT_ENDDATE'] > 0).astype(int)
根据 SK_ID_CURR分组(每个客户有多条信用记录)
# Group the bureau rows by client id; every aggregate below reuses this object.
groupby_sk_id_curr = bureau.groupby(['SK_ID_CURR'])

# One row per distinct client id, in order of first appearance. This frame
# collects all engineered features.
features = bureau[['SK_ID_CURR']].drop_duplicates().reset_index(drop=True)
features.head(4)
 
SK_ID_CURR
0 215354
1 162297
2 402440
3 238881
# (rows, columns) of the feature frame; at this point it has one row per distinct SK_ID_CURR.
features.shape
(305811, 1)

特征:每个客户的历史借款记录数
# Number of non-null DAYS_CREDIT entries per client, with SK_ID_CURR moved
# back from the index into a regular column.
days_credit_by_client = groupby_sk_id_curr['DAYS_CREDIT']
group_object = days_credit_by_client.count().reset_index()
group_object.head(4)
 
SK_ID_CURR DAYS_CREDIT
0 100001 7
1 100002 8
2 100003 4
3 100004 2

列名变更

# Give the count column its feature name. index=str also converts the row
# labels to strings, exactly as the in-place rename did.
group_object = group_object.rename(
    index=str,
    columns={'DAYS_CREDIT': 'bureau_count_past_loans'},
)
group_object.head(4)
 
SK_ID_CURR bureau_count_past_loans
0 100001 7
1 100002 8
2 100003 4
3 100004 2

两表合并(根据 SK_ID_CURR)

# Left-join the per-client count onto the feature frame by SK_ID_CURR.
features = pd.merge(features, group_object, how='left', on=['SK_ID_CURR'])
features.head(4)
 
SK_ID_CURR bureau_count_past_loans
0 215354 11
1 162297 6
2 402440 1
3 238881 8

类型

# Number of distinct CREDIT_TYPE values per client, renamed to its feature name.
group_object = (
    groupby_sk_id_curr['CREDIT_TYPE']
    .nunique()
    .reset_index()
    .rename(index=str, columns={'CREDIT_TYPE': 'num_loan_types'})
)

# Left-join onto the feature frame by SK_ID_CURR.
features = pd.merge(features, group_object, how='left', on=['SK_ID_CURR'])
features.head(4)
 
SK_ID_CURR bureau_count_past_loans num_loan_types
0 215354 11 3
1 162297 6 3
2 402440 1 1
3 238881 8 2

特征:每种贷款类型的平均贷款笔数(历史贷款总数 ÷ 贷款类型数)

# Past-loan count divided by the number of distinct loan types, per client.
loan_count = features['bureau_count_past_loans']
type_count = features['num_loan_types']
features['avg_loans_per_type'] = loan_count.div(type_count)
features.head(4)
 
SK_ID_CURR bureau_count_past_loans num_loan_types avg_loans_per_type
0 215354 11 3 3.666667
1 162297 6 3 2.000000
2 402440 1 1 1.000000
3 238881 8 2 4.000000

活跃信贷占比(每个客户的征信记录中,状态不是 Closed 的信贷所占比例)

# Per-client mean of the 0/1 "not Closed" flag, i.e. the share of the
# client's bureau records whose CREDIT_ACTIVE is not 'Closed'.
active_flag_by_client = groupby_sk_id_curr['bureau_credit_active_binary']
group_object = active_flag_by_client.mean().reset_index()

# Left-join onto the feature frame by SK_ID_CURR.
features = pd.merge(features, group_object, how='left', on=['SK_ID_CURR'])
features.head(4)
 
SK_ID_CURR bureau_count_past_loans num_loan_types avg_loans_per_type bureau_credit_active_binary
0 215354 11 3 3.666667 0.545455
1 162297 6 3 2.000000 0.500000
2 402440 1 1 1.000000 1.000000
3 238881 8 2 4.000000 0.375000
# Per-client mean of the 0/1 "DAYS_CREDIT_ENDDATE > 0" flag.
enddate_flag_by_client = groupby_sk_id_curr['bureau_credit_enddate_binary']
group_object = enddate_flag_by_client.mean().reset_index()

# Left-join onto the feature frame by SK_ID_CURR.
features = pd.merge(features, group_object, how='left', on=['SK_ID_CURR'])
features.head(4)
 
SK_ID_CURR bureau_count_past_loans num_loan_types avg_loans_per_type bureau_credit_active_binary bureau_credit_enddate_binary
0 215354 11 3 3.666667 0.545455 0.454545
1 162297 6 3 2.000000 0.500000 0.166667
2 402440 1 1 1.000000 1.000000 1.000000
3 238881 8 2 4.000000 0.375000 0.125000
# Per-client summary statistics of the numeric bureau columns.
# Each statistic is computed for all columns in one groupby call and the
# results are joined onto `features` with a single merge, instead of one
# aggregation and one merge per (statistic, column) pair.
# Resulting columns are named '<column>_<statistic>' and ordered
# statistic-major: all *_mean, then *_min, *_max, *_sum, *_var.
aggregated_columns = ['DAYS_CREDIT', 'CREDIT_DAY_OVERDUE', 'DAYS_CREDIT_ENDDATE',
                      'DAYS_ENDDATE_FACT', 'AMT_CREDIT_MAX_OVERDUE',
                      'CNT_CREDIT_PROLONG', 'AMT_CREDIT_SUM', 'AMT_CREDIT_SUM_DEBT',
                      'AMT_CREDIT_SUM_LIMIT', 'AMT_CREDIT_SUM_OVERDUE', 'DAYS_CREDIT_UPDATE',
                      'AMT_ANNUITY',
                      ]
per_statistic_frames = [
    groupby_sk_id_curr[aggregated_columns].agg(func).add_suffix('_' + func)
    for func in ['mean', 'min', 'max', 'sum', 'var']
]
# All frames share the same SK_ID_CURR index, so a column-wise concat lines them up.
group_object = pd.concat(per_statistic_frames, axis=1).reset_index()
features = features.merge(group_object, on=['SK_ID_CURR'], how='left')

features.head(4)
 
SK_ID_CURR bureau_count_past_loans num_loan_types avg_loans_per_type bureau_credit_active_binary bureau_credit_enddate_binary DAYS_CREDIT_mean CREDIT_DAY_OVERDUE_mean DAYS_CREDIT_ENDDATE_mean DAYS_ENDDATE_FACT_mean DAYS_CREDIT_ENDDATE_var DAYS_ENDDATE_FACT_var AMT_CREDIT_MAX_OVERDUE_var CNT_CREDIT_PROLONG_var AMT_CREDIT_SUM_var AMT_CREDIT_SUM_DEBT_var AMT_CREDIT_SUM_LIMIT_var AMT_CREDIT_SUM_OVERDUE_var DAYS_CREDIT_UPDATE_var AMT_ANNUITY_var
0 215354 11 3 3.666667 0.545455 0.454545 -727.818182 0.0 2685.300000 -889.8 7.641379e+07 215965.7 2.011109e+09 0.0 6.043071e+11 4.537546e+09 3.959070e+09 0.0 236203.418182 NaN
1 162297 6 3 2.000000 0.500000 0.166667 -1344.500000 0.0 245.500000 -1125.0 6.294475e+06 256725.0 4.491004e+07 0.0 7.753264e+12 0.000000e+00 0.000000e+00 0.0 290229.766667 NaN
2 402440 1 1 1.000000 1.000000 1.000000 -96.000000 0.0 269.000000 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 238881 8 2 4.000000 0.375000 0.125000 -914.750000 0.0 -521.166667 -847.2 2.171049e+06 1088076.2 0.000000e+00 0.0 2.096805e+10 4.421319e+10 1.303252e+10 0.0 808344.267857 NaN

4 rows × 66 columns

构造比率特征(债务、授信总额、授信额度、年金、逾期金额两两之间的比值)

# Ratio features built from the per-client aggregates.
# Each entry is (feature prefix, numerator column, denominator column). For
# every pair, four ratios are produced from the matching sum / mean / min /
# max aggregates, named '<prefix>_ratio_<sum|avg|min|max>'.
ratio_definitions = [
    ('debt_credit', 'AMT_CREDIT_SUM_DEBT', 'AMT_CREDIT_SUM'),
    ('debt_limit', 'AMT_CREDIT_SUM_DEBT', 'AMT_CREDIT_SUM_LIMIT'),
    ('debt_annuity', 'AMT_CREDIT_SUM_DEBT', 'AMT_ANNUITY'),
    ('credit_limit', 'AMT_CREDIT_SUM', 'AMT_CREDIT_SUM_LIMIT'),
    ('credit_annuity', 'AMT_CREDIT_SUM', 'AMT_ANNUITY'),
    ('limit_annuity', 'AMT_CREDIT_SUM_LIMIT', 'AMT_ANNUITY'),
    ('overdue_debt', 'AMT_CREDIT_SUM_OVERDUE', 'AMT_CREDIT_SUM_DEBT'),
]
# (suffix used in the ratio name, suffix of the aggregate columns it reads)
ratio_statistics = [('sum', 'sum'), ('avg', 'mean'), ('min', 'min'), ('max', 'max')]

for ratio_prefix, numerator, denominator in ratio_definitions:
    for ratio_suffix, statistic in ratio_statistics:
        ratio = features[numerator + '_' + statistic] / features[denominator + '_' + statistic]
        # A zero denominator gives +/-inf. Store NaN instead, so the ratio is
        # treated as missing and does not turn later correlations into NaN.
        ratio = ratio.replace([np.inf, -np.inf], np.nan)
        features[ratio_prefix + '_ratio_' + ratio_suffix] = ratio

与 TARGET列合并

# temp exists only to compute correlations with TARGET. The default inner
# join keeps just the clients present in both frames.
temp = pd.merge(features, target, on='SK_ID_CURR')
temp.shape
(263491, 95)

相关系数

# Pearson correlation of every column with TARGET, sorted ascending.
# corrwith computes only the column-vs-TARGET pairs, rather than the full
# pairwise matrix that temp.corr() builds just to keep one column of it.
corrs = temp.corrwith(temp['TARGET']).sort_values()
# TARGET's correlation with itself carries no information.
corrs = corrs.drop(['TARGET'])
corrs.plot(kind='barh', figsize=(10, 40))

数据清洗+特征构造:bureau.csv_第1张图片

相关系数小于0的特征列名

# Names of the features whose correlation with TARGET is below 0.
# corrs also contains the join key SK_ID_CURR; it is an identifier, not a
# feature, so it is never put on the drop list.
# NOTE(review): the variable is called "weak", but `corrs < 0` selects
# negatively correlated features, including strongly negative ones. Confirm
# whether an absolute-value threshold was intended.
negative_corr_names = corrs[corrs < 0].index
weak_corr_feature_names = [name for name in negative_corr_names if name != 'SK_ID_CURR']
selected_feature = features.drop(columns=weak_corr_feature_names)

Save data

# Keep all features: the full frame is written out, not `selected_feature`.
features.to_csv(path_or_buf='data/bureau_features.csv', index=False)

你可能感兴趣的:(信贷风险,贷款违约风险预测)