分箱、WOE、IV的计算

%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
import warnings
from scipy import stats
warnings.filterwarnings('ignore')
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8*" and
# later releases dropped the old names, so use whichever name is available.
seaborn_style = "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"
plt.style.use(seaborn_style)
plt.rc('font', family='SimHei', size=13)  # font used to display Chinese characters
plt.rcParams['axes.unicode_minus'] = False  # display minus signs correctly
# Load the data set
data = pd.read_csv(r"./cs_training.csv",encoding='gbk')
# Inspect the data set
# data.head(10)

# Rename the features to their Chinese display names
column = {
    'SeriousDlqin2yrs': '好坏客户',
    'RevolvingUtilizationOfUnsecuredLines': '可用额度比值',
    'age': '年龄',
    'NumberOfTime30-59DaysPastDueNotWorse': '逾期30-59天笔数',
    'DebtRatio': '负债率',
    'MonthlyIncome': '月收入',
    'NumberOfOpenCreditLinesAndLoans': '信贷数量',
    'NumberOfTimes90DaysLate': '逾期90天笔数',
    'NumberRealEstateLoansOrLines': '固定资产贷款量',
    'NumberOfTime60-89DaysPastDueNotWorse': '逾期60-89天笔数',
    'NumberOfDependents': '家属数量',
}
data = data.rename(columns=column)
data.head()
好坏客户 可用额度比值 年龄 逾期30-59天笔数 负债率 月收入 信贷数量 逾期90天笔数 固定资产贷款量 逾期60-89天笔数 家属数量
0 1 0.766127 45 2 0.802982 9120.0 13 0 6 0 2.0
1 0 0.957151 40 0 0.121876 2600.0 4 0 0 0 1.0
2 0 0.658180 38 1 0.085113 3042.0 2 1 0 0 0.0
3 0 0.233810 30 0 0.036050 3300.0 5 0 0 0 0.0
4 0 0.907239 49 1 0.024926 63588.0 7 0 1 0 0.0
from sklearn.ensemble import RandomForestRegressor
# 用随机森林对缺失值预测填充函数
def set_missing(df):
    """Fill missing '月收入' (monthly income) values with random-forest predictions.

    A RandomForestRegressor is fitted on the rows whose '月收入' is present,
    using the columns at positions 0-4 and 6-9 as features. Its predictions,
    rounded to whole numbers, are written into the rows where '月收入' is
    missing.

    Args:
        df: DataFrame whose column at position 5 is '月收入'. The columns at
            positions 0-4 and 6-9 are passed to the regressor as-is
            (assumed numeric and free of missing values -- TODO confirm).

    Returns:
        The same DataFrame object, modified in place. If no '月收入' value is
        missing, it is returned unchanged and no model is fitted.
    """
    missing_income = df['月收入'].isnull()
    # Nothing to impute: skip training, and avoid calling predict() on an
    # empty feature matrix.
    if not missing_income.any():
        return df
    # Put the target column first, followed by the feature columns
    process_df = df.iloc[:, [5, 0, 1, 2, 3, 4, 6, 7, 8, 9]]
    # .values turns each part into a plain array
    known = process_df[~missing_income].values
    unknown = process_df[missing_income].values
    # Features and target of the rows with a known income
    X = known[:, 1:]
    y = known[:, 0]
    rfr = RandomForestRegressor(random_state=0, n_estimators=200, max_depth=3, n_jobs=-1)
    rfr.fit(X, y)
    # Predict the missing incomes and round them to whole numbers
    predicted = rfr.predict(unknown[:, 1:]).round(0)
    # Write the predictions back into the rows that were missing
    df.loc[missing_income, '月收入'] = predicted
    return df
# Fill the many missing monthly-income values with the random forest, then
# drop the few rows that still have missing values, then drop duplicate rows
data = set_missing(data).dropna().drop_duplicates()
data.info()

Int64Index: 145563 entries, 0 to 149999
Data columns (total 11 columns):
好坏客户          145563 non-null int64
可用额度比值        145563 non-null float64
年龄            145563 non-null int64
逾期30-59天笔数    145563 non-null int64
负债率           145563 non-null float64
月收入           145563 non-null float64
信贷数量          145563 non-null int64
逾期90天笔数       145563 non-null int64
固定资产贷款量       145563 non-null int64
逾期60-89天笔数    145563 non-null int64
家属数量          145563 non-null float64
dtypes: float64(4), int64(7)
memory usage: 13.3 MB
# Keep only rows whose three past-due counts are all below 80 and whose age
# is greater than 0
past_due_cols = ['逾期30-59天笔数', '逾期90天笔数', '逾期60-89天笔数']
keep = (data[past_due_cols] < 80).all(axis=1) & (data['年龄'] > 0)
data = data[keep]
col_list = data.columns.values
col_list
array(['好坏客户', '可用额度比值', '年龄', '逾期30-59天笔数', '负债率', '月收入', '信贷数量',
       '逾期90天笔数', '固定资产贷款量', '逾期60-89天笔数', '家属数量'], dtype=object)
# Columns to trim: every column except those at positions 0, 3, 7 and 9
skipped_positions = (0, 3, 7, 9)
new_col_list = [name for pos, name in enumerate(col_list) if pos not in skipped_positions]
# Remove the upper tail one column at a time: keep rows strictly below the
# 99% quantile, computed on the rows that remain at that point
for item in new_col_list:
    upper_bound = data[item].quantile(0.99)
    data = data[data[item] < upper_bound]
    
import woe.feature_process as fp
import woe.eval as eval
data.columns
Index(['好坏客户', '可用额度比值', '年龄', '逾期30-59天笔数', '负债率', '月收入', '信贷数量', '逾期90天笔数',
       '固定资产贷款量', '逾期60-89天笔数', '家属数量'],
      dtype='object')
data.rename(columns={'好坏客户': 'target'}, inplace=True)
# WOE binning, IV computation and WOE transformation.
# Work on a copy: a plain assignment would only alias `data`, so writing the
# WOE values below would overwrite the original feature values as well.
data_woe = data.copy()  # holds the WOE value of every feature
civ_list = []
n_positive = data['target'].sum()
n_negtive = len(data) - n_positive
# 5% of the rows, passed to the woe helpers
# NOTE(review): presumably the minimum sample size per bin -- confirm against the woe package
min_sample = 0.05 * len(data)
# The loop variable is not named `column`, so the rename mapping defined
# earlier under that name is left intact.
for feature in list(data.columns[1:]):
    if data[feature].dtypes == 'object':
        civ = fp.proc_woe_discrete(data, feature, n_positive, n_negtive, min_sample, alpha=0.05)
    else:
        civ = fp.proc_woe_continuous(data, feature, n_positive, n_negtive, min_sample, alpha=0.05)
    civ_list.append(civ)
    data_woe[feature] = fp.woe_trans(data[feature], civ)

civ_df = eval.eval_feature_detail(civ_list,'output_feature_detail_0315.csv')
# Keep only the variables whose IV is above the threshold
iv_thre = 0.001
iv = civ_df[['var_name','iv']].drop_duplicates()
x_columns = iv.var_name[iv.iv > iv_thre]

-------------process continuous variable:可用额度比值-------------
---------------process continuous variable:年龄---------------
-----------process continuous variable:逾期30-59天笔数-----------
--------------process continuous variable:负债率---------------
--------------process continuous variable:月收入---------------
--------------process continuous variable:信贷数量--------------
------------process continuous variable:逾期90天笔数-------------
------------process continuous variable:固定资产贷款量-------------
-----------process continuous variable:逾期60-89天笔数-----------
--------------process continuous variable:家属数量--------------
可用额度比值
年龄
逾期30-59天笔数
负债率
月收入
信贷数量
逾期90天笔数
固定资产贷款量
逾期60-89天笔数
家属数量
civ_df
var_name split_list sub_total_sample_num positive_sample_num negative_sample_num sub_total_num_percentage positive_rate_in_sub_total woe_list iv_list iv
0 可用额度比值 (-INF,0.0] 9352 281 9071 0.071213 0.030047 -0.757463 0.029626 1.097527
1 可用额度比值 (0.0,0.04215617400000002] 29188 365 28823 0.222259 0.012505 -1.652011 0.312454 1.097527
2 可用额度比值 (0.04215617400000002,0.0596119858] 6962 111 6851 0.053014 0.015944 -1.405599 0.059004 1.097527
3 可用额度比值 (0.0596119858,0.13857709429999995] 17901 375 17526 0.136312 0.020949 -1.127495 0.108464 1.097527
4 可用额度比值 (0.13857709429999995,0.21535932080000003] 10113 295 9818 0.077008 0.029170 -0.787977 0.034242 1.097527
5 可用额度比值 (0.21535932080000003,0.30067412204] 8274 296 7978 0.063004 0.035775 -0.577063 0.016386 1.097527
6 可用额度比值 (0.30067412204,0.3974544458] 7510 371 7139 0.057187 0.049401 -0.240106 0.002970 1.097527
7 可用额度比值 (0.3974544458,0.5331554074] 8506 586 7920 0.064771 0.068893 0.113193 0.000872 1.097527
8 可用额度比值 (0.5331554074,0.74050784496] 9985 996 8989 0.076033 0.099750 0.517010 0.025541 1.097527
9 可用额度比值 (0.74050784496,0.90349439404] 7295 1103 6192 0.055550 0.151199 0.991796 0.084555 1.097527
10 可用额度比值 (0.90349439404,+INF) 16238 3360 12878 0.123648 0.206922 1.373441 0.423411 1.097527
0 年龄 (-INF,32.0] 13531 1392 12139 0.103035 0.102875 0.551338 0.039964 0.046040
1 年龄 (32.0,+INF) 117793 6747 111046 0.896965 0.057278 -0.083827 0.006076 0.046040
0 逾期30-59天笔数 (-INF,0.0] 111119 4279 106840 0.846144 0.038508 -0.500593 0.170989 0.606073
1 逾期30-59天笔数 (0.0,+INF) 20205 3860 16345 0.153856 0.191042 1.273765 0.435084 0.606073
0 负债率 (-INF,0.018495376] 10665 466 10199 0.081211 0.043694 -0.368839 0.009420 0.088009
1 负债率 (0.018495376,0.087064379] 8843 577 8266 0.067337 0.065249 0.054956 0.000208 0.088009
2 负债率 (0.087064379,0.138218834] 7533 449 7084 0.057362 0.059604 -0.041551 0.000097 0.088009
3 负债率 (0.138218834,0.191269577] 9148 493 8655 0.069660 0.053892 -0.148363 0.001437 0.088009
4 负债率 (0.191269577,0.229044637] 6898 383 6515 0.052527 0.055523 -0.116807 0.000681 0.088009
5 负债率 (0.229044637,0.26480176767999997] 6780 313 6467 0.051628 0.046165 -0.311244 0.004370 0.088009
6 负债率 (0.26480176767999997,0.33095571454] 12054 618 11436 0.091788 0.051269 -0.201013 0.003398 0.088009
7 负债率 (0.33095571454,0.37664756308] 7551 440 7111 0.057499 0.058270 -0.065603 0.000240 0.088009
8 负债率 (0.37664756308,0.4237495164599999] 6696 406 6290 0.050988 0.060633 -0.023343 0.000028 0.088009
9 负债率 (0.4237495164599999,0.54743575044] 12664 918 11746 0.096433 0.072489 0.167949 0.002929 0.088009
10 负债率 (0.54743575044,0.7263413320000001] 9112 825 8287 0.069386 0.090540 0.409960 0.013976 0.088009
11 负债率 (0.7263413320000001,2.6823588614000204] 9111 1026 8085 0.069378 0.112611 0.652677 0.039439 0.088009
12 负债率 (2.6823588614000204,1009.0] 10925 639 10286 0.083191 0.058490 -0.061614 0.000307 0.088009
13 负债率 (1009.0,+INF) 13344 586 12758 0.101611 0.043915 -0.363574 0.011478 0.088009
0 月收入 (-INF,1159.0] 13281 877 12404 0.101132 0.066034 0.067753 0.000478 0.114078
... ... ... ... ... ... ... ... ... ... ...
7 月收入 (4831.0,5332.68] 7201 470 6731 0.054834 0.065269 0.055274 0.000172 0.114078
8 月收入 (5332.68,5917.0] 7381 432 6949 0.056205 0.058529 -0.060907 0.000203 0.114078
9 月收入 (5917.0,6667.0] 8647 533 8114 0.065845 0.061640 -0.005805 0.000002 0.114078
10 月收入 (6667.0,7916.0] 10306 519 9787 0.078478 0.050359 -0.219886 0.003448 0.114078
11 月收入 (7916.0,8333.0] 7967 327 7640 0.060667 0.041044 -0.434172 0.009484 0.114078
12 月收入 (8333.0,10300.0] 10340 489 9851 0.078737 0.047292 -0.285946 0.005687 0.114078
13 月收入 (10300.0,+INF) 13361 474 12887 0.101741 0.035476 -0.585747 0.027165 0.114078
0 信贷数量 (-INF,3.0] 18632 1864 16768 0.141878 0.100043 0.520272 0.048333 0.067247
1 信贷数量 (3.0,4.0] 10396 620 9776 0.079163 0.059638 -0.040946 0.000130 0.067247
2 信贷数量 (4.0,5.0] 11689 671 11018 0.089009 0.057404 -0.081496 0.000570 0.067247
3 信贷数量 (5.0,6.0] 12373 651 11722 0.094217 0.052615 -0.173693 0.002635 0.067247
4 信贷数量 (6.0,7.0] 12102 629 11473 0.092154 0.051975 -0.186600 0.002958 0.067247
5 信贷数量 (7.0,8.0] 11422 518 10904 0.086976 0.045351 -0.329890 0.008205 0.067247
6 信贷数量 (8.0,9.0] 10219 568 9651 0.077815 0.055583 -0.115675 0.000990 0.067247
7 信贷数量 (9.0,10.0] 8745 488 8257 0.066591 0.055803 -0.111481 0.000788 0.067247
8 信贷数量 (10.0,11.0] 7431 405 7026 0.056585 0.054501 -0.136466 0.000993 0.067247
9 信贷数量 (11.0,13.0] 11199 615 10584 0.085278 0.054916 -0.128456 0.001330 0.067247
10 信贷数量 (13.0,+INF) 17116 1110 16006 0.130334 0.064852 0.048416 0.000312 0.067247
0 逾期90天笔数 (-INF,0.0] 124488 5426 119062 0.947946 0.043587 -0.371422 0.111376 0.800610
1 逾期90天笔数 (0.0,+INF) 6836 2713 4123 0.052054 0.396870 2.298494 0.689234 0.800610
0 固定资产贷款量 (-INF,0.0] 49471 3805 45666 0.376710 0.076914 0.231982 0.022454 0.043142
1 固定资产贷款量 (0.0,1.0] 48153 2429 45724 0.366673 0.050443 -0.218124 0.015867 0.043142
2 固定资产贷款量 (1.0,2.0] 28413 1538 26875 0.216358 0.054130 -0.143694 0.004196 0.043142
3 固定资产贷款量 (2.0,+INF) 5287 367 4920 0.040259 0.069416 0.121318 0.000625 0.043142
0 逾期60-89天笔数 (-INF,0.0] 125162 6053 119109 0.953078 0.048361 -0.262465 0.058584 0.515526
1 逾期60-89天笔数 (0.0,+INF) 6162 2086 4076 0.046922 0.338526 2.047152 0.456942 0.515526
0 家属数量 (-INF,0.0] 79954 4351 75603 0.608830 0.054419 -0.138070 0.010928 0.028199
1 家属数量 (0.0,1.0] 24473 1683 22790 0.186356 0.068770 0.111276 0.002423 0.028199
2 家属数量 (1.0,2.0] 18117 1377 16740 0.137957 0.076006 0.219126 0.007295 0.028199
3 家属数量 (2.0,3.0] 8780 728 8052 0.066858 0.082916 0.313645 0.007553 0.028199

66 rows × 10 columns


x_columns
0        可用额度比值
0            年龄
0    逾期30-59天笔数
0           负债率
0           月收入
0          信贷数量
0       逾期90天笔数
0       固定资产贷款量
0    逾期60-89天笔数
0          家属数量
Name: var_name, dtype: object
iv
var_name iv
0 可用额度比值 1.097527
0 年龄 0.046040
0 逾期30-59天笔数 0.606073
0 负债率 0.088009
0 月收入 0.114078
0 信贷数量 0.067247
0 逾期90天笔数 0.800610
0 固定资产贷款量 0.043142
0 逾期60-89天笔数 0.515526
0 家属数量 0.028199
data_woe.head()
target 可用额度比值 年龄 逾期30-59天笔数 负债率 月收入 信贷数量 逾期90天笔数 固定资产贷款量 逾期60-89天笔数 家属数量
1 0 1.373441 -0.083827 -0.500593 -0.041551 0.461028 -0.040946 -0.371422 0.231982 -0.262465 0.111276
2 0 0.517010 -0.083827 1.273765 0.054956 0.461028 0.520272 2.298494 0.231982 -0.262465 -0.138070
3 0 -0.577063 0.551338 -0.500593 0.054956 0.461028 -0.081496 -0.371422 0.231982 -0.262465 -0.138070
5 0 -0.787977 -0.083827 -0.500593 -0.065603 0.243014 0.520272 -0.371422 -0.218124 -0.262465 0.111276
7 0 0.991796 -0.083827 -0.500593 -0.116807 0.243014 -0.329890 -0.371422 0.231982 -0.262465 -0.138070

模型建立

信用评分卡模型在国外是一种成熟的预测方法,尤其在信用风险评估以及金融风险控制领域得到了比较广泛的使用。其原理是先将模型变量用WOE编码方式离散化,再运用logistic回归模型(一种针对二分类变量的广义线性模型)进行建模。下面将目标变量为1的记为违约用户,目标变量为0的记为正常用户,采用sklearn中的LogisticRegression进行建模

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
# 模型评估
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.metrics import auc
# Extract features and label, then split into training and test sets
col_names = data_woe.columns.values
X = data_woe.loc[:, col_names[1:]]  # feature columns
y = data_woe.loc[:, col_names[0]]  # label column
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)
lr = LogisticRegression(C=1000.0, random_state=0)
result = lr.fit(X_train, y_train)
result
LogisticRegression(C=1000.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=0, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)
# Predict the class label of every test sample
y_pred = lr.predict(X_test)
y_pred
array([0, 0, 0, ..., 0, 0, 0], dtype=int64)
# Predicted probability of the bad-customer class (second column of
# predict_proba), rounded to 5 decimals
prob_pred = [round(u[1], 5) for u in lr.predict_proba(X_test)]
# Accuracy of the predicted labels on the test set
accuracy_score(y_test, y_pred)
0.9387532362048835
# The classes are imbalanced; the author evaluates with the ROC curve
# instead of the precision-recall curve
FPR, TPR, thresholds = metrics.roc_curve(y_test, prob_pred, pos_label=1)
# Area under the ROC curve
metrics.auc(FPR, TPR)
0.8499778184241903
# Plot the ROC curve together with the diagonal reference line
roc_auc = metrics.auc(FPR, TPR)
plt.plot(FPR, TPR, 'b', label='AUC = %0.2f' % roc_auc)  # ROC curve
plt.plot([0, 1], [0, 1], 'r--')
plt.legend(loc='lower right')
plt.xlabel('假正率')
plt.ylabel('真正率')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.show()

![ROC曲线](output_33_0.png)

从上图可知,AUC值为0.85,说明该模型对好坏客户的区分能力较好。需要注意,样本类别不平衡(坏客户约占6%),约0.94的正确率与"全部预测为好客户"的基准水平相近,因此评价模型时应主要参考AUC而不是正确率

评分卡计算方法

odds为违约(bad)用户概率 $p$ 与正常(good)用户概率 $1-p$ 的比值

$$\operatorname{odds}=\frac{p}{1-p}$$

评分卡设定的分值刻度可以通过将分值表示为比率对数的线性表达式来定义。公式如下:

$$score_{总}=A-B\ln(odds)$$

常数 A 和 B 通常被称为补偿和刻度,它们的值可以通过将两个已知或者假设的分值带入 $score_{总}=A-B\ln(odds)$ 中得到。通常,需要两个假设:

  • 在某个特定的比率设定特定的预期分值 $P_{0}$
  • 指定比率翻番的分数(PDO,Point-to-Double Odds)

首先,设定比率为 odds 的特定点的分值为 $P_{0}$。然后,比率为 2odds 的点分值为 $P_{0}-PDO$,带入可以得到

$$B=\frac{PDO}{\log(2)}$$

$$A=P_{0}+B\log\left(odds\right)$$

import math
# PDO: points to double the odds; P0: expected score at the chosen odds;
# B: scale factor
PDO = 20
P0 = 600
B = PDO / math.log(2)
B
28.85390081777927
# A is the offset, computed here with odds of 1/60 at score P0
A = P0 + B * math.log(1 / 60)
A
481.8621880878296

基于Logistic的评分卡构建


最终,评分卡的分值可以写成下列形式:

$$\text{Score}=A-B\left(\beta_{0}+\beta_{1} x_{1}+\cdots+\beta_{p} x_{p}\right)$$

变量 $x_{1},\cdots,x_{p}$ 为自变量对应的WOE,$\beta_{0},\cdots,\beta_{p}$ 为逻辑斯蒂回归方程的系数

# Logistic-regression coefficients: intercept first, then one per feature
coef_list = [result.intercept_[0], *result.coef_[0]]
# 计算信用评分
def credit_socre(data, coef, offset=None, factor=None):
    """Compute a credit score for every row of a feature table.

    For each row the score is::

        offset - factor * (coef[0] + sum_j data[row, j] * coef[j + 1])

    Args:
        data: DataFrame with one row per sample and one numeric column per
            feature.
        coef: Sequence holding the intercept followed by one coefficient per
            column of ``data``, in column order.
        offset: Additive constant of the score. Defaults to the module-level
            ``A``.
        factor: Multiplier applied to the linear term. Defaults to the
            module-level ``B``.

    Returns:
        list: One score per row of ``data``, in row order.
    """
    if offset is None:
        offset = A
    if factor is None:
        factor = B
    n_rows, n_cols = data.shape
    # Vectorised over rows, one column at a time. Each row still accumulates
    # its terms in column order, as a cell-by-cell loop would.
    linear = np.full(n_rows, coef[0], dtype=float)
    for j in range(n_cols):
        linear = linear + data.iloc[:, j].to_numpy() * coef[j + 1]
    return (offset - factor * linear).tolist()
score_list = credit_socre(data_woe.iloc[:, 1:], coef_list)
# Add the score as the last column; the position is computed, not hard-coded
data_woe.insert(len(data_woe.columns), 'credit_score', score_list)
# DataFrame.append was removed in pandas 2.0; pd.concat shows head and tail together
pd.concat([data_woe.head(), data_woe.tail()])

# Insert the credit score into the original data under the same column name.
# Skipped when the column is already there, which happens when `data` and
# `data_woe` refer to the same frame.
if 'credit_score' not in data.columns:
    data.insert(len(data.columns), 'credit_score', score_list)
pd.concat([data.head(), data.tail()])




你可能感兴趣的:(分箱、WOE、IV的计算)