Anyone working in credit risk knows scorecardpy, the scorecard library written by Dr. Shichen Xie (谢士晨), so it needs no further introduction. Today I would like to introduce another standardized scorecard modeling module: the toad library.
toad was incubated inside the risk team at 厚本金融, then open-sourced and continuously maintained. It is full-featured, robust, fast, quickly patched when issues are reported, and well liked in the industry. toad supports standardized credit-score development and greatly simplifies model building, shortening the modeling cycle. This post only walks through the scorecard modeling framework with toad; for the details, please grab some data and experiment yourself, because the author is a bit lazy.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 9 19:44:28 2021
@author: Bonus_F
"""
import pandas as pd
from sklearn.metrics import roc_auc_score,roc_curve,auc
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV as gscv
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import glob
import math
import xgboost as xgb
import toad  # install with: pip install toad
import scorecardpy as sc
# data prepare ------
# load your own data
# dat = pd.read_csv('data.csv')
# df = sc.germancredit()  # the German credit data shipped with scorecardpy also works for testing
import seaborn as sns
# load the Titanic dataset
df = sns.load_dataset('titanic')
# EDA: inspect the data
# detect() reports per-column summary statistics, mainly: missing rate, number of
# unique values, the mean of numeric variables, the mode of categorical variables, etc.
data = toad.detector.detect(df)
ex_list = []  # columns that should not enter training (e.g. ID and time columns)
data.shape
data.columns
data.head(10)
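# The rest of this walk-through assumes a time column and an out-of-time (OOT)
# sample, which titanic does not have. Purely as an illustrative assumption,
# fabricate a 'month' column and carve off a pseudo-OOT slice; in a real
# project both come from your business data.
rng = np.random.RandomState(0)
df['month'] = rng.choice(['2019-03', '2019-04', '2019-05'], size=len(df))
OOT = df[df['month'] == '2019-05'].copy()
ex_list.append('month')  # keep the time column out of training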
# Feature selection
# toad.quality outputs IV (information value), gini, entropy and the number of unique
# values for each feature, sorted by IV. 'target' is the label column; 'iv_only'
# controls whether only IV is computed.
toad.quality(df,'survived',iv_only=True)
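# Rule-of-thumb reading of IV (industry convention, not something toad enforces):
# below 0.02 unpredictive, 0.02-0.1 weak, 0.1-0.3 medium, above 0.3 strong.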
# Efficient binning plus feature selection with toad
# --- parameters of toad.selection.select --- #
# empty: drop features whose missing rate exceeds this threshold, default 0.9
# iv: drop features whose IV is below this threshold, default 0.02
# corr: if the correlation between two features exceeds this threshold, drop the one with the lower IV, default 0.7
# return_drop: also return the dropped feature names, default False
# exclude: feature names excluded from selection, default None
df_selected, dropped = toad.selection.select(df, target='survived', empty=0.95,
                                             iv=0.02, corr=0.8, return_drop=True, exclude=ex_list)
print("keep:",df_selected.shape[1],
"drop empty:",len(dropped['empty']),
"drop iv:",len(dropped['iv']),
"drop corr:",len(dropped['corr']))
print(dropped)
print(df_selected.shape)
# Binning
# *Fit the bins*: c.fit(dataframe, y='target', method='chi', min_samples=None, n_bins=None, empty_separate=False)
# y: label column
# method: binning method; supports 'chi' (chi-square), 'dt' (decision tree), 'kmeans', 'quantile' (equal frequency), 'step' (equal width)
# min_samples: minimum sample size per bin, either a count or a proportion
# n_bins: number of bins; if that many cannot be formed, the maximum feasible number is used
# empty_separate: whether missing values get their own bin
# *Inspect the split points*: c.export()
# *Load manually adjusted bins*: c.load(dict)
# *Apply the binning*: c.transform(dataframe, labels=False)
# labels: whether to output bin labels. With False the output is 0, 1, 2, ... (categorical variables ordered by frequency); with True it is labels like (-inf, 0], (0, 10], (10, inf).
# Note: drop the columns that should not be binned first, especially ID and time columns.
c = toad.transform.Combiner()
c.fit(df_selected.drop(columns=ex_list), y='survived', method='chi',
      min_samples=0.05)
# export the split points
bins = c.export()
# for demonstration, show the bins of a single feature
print('age:', bins['age'])
# apply the binning according to the split points
df_selected_1 = c.transform(df_selected)
# inspect the bins visually after binning
# bin_plot: look at the binning of 'fare' within the development window
col = 'fare'
from toad.plot import bin_plot, badrate_plot
bin_plot(c.transform(df_selected[[col, 'survived']], labels=True), x=col, target='survived')
# Stability across time
# target: label column
# x: time column, as strings
# by: the feature to inspect
# Note: the time column must be pre-grouped and stored as strings; timestamps are not supported.
# observe the stability of 'fare' within the development window, in the OOT sample, and on the full data
badrate_plot(c.transform(df_selected[[col, 'survived', 'month']], labels=True), target='survived', x='month', by=col)
badrate_plot(c.transform(OOT[[col, 'survived', 'month']], labels=True), target='survived', x='month', by=col)
badrate_plot(c.transform(df[[col, 'survived', 'month']], labels=True), target='survived', x='month', by=col)
# Adjust the bins manually: c.set_rules(dict)
# define the new split points by hand (the values below are made up for illustration)
rule = {'fare': [10, 30, 100]}
# apply the adjustment
c.set_rules(rule)
# check the stability of the manual bins (OOT was defined above)
bin_plot(c.transform(df_selected[['fare', 'survived']], labels=True), x='fare', target='survived')
badrate_plot(c.transform(OOT[['fare', 'survived', 'month']], labels=True), target='survived', x='month', by='fare')
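# For reference, a sketch of the two rule formats that set_rules/load accept
# (the 'sex' grouping below is illustrative): continuous features take a list
# of split points, categorical features take a list of value groups.
rule_demo = {
    'fare': [10, 30, 100],          # numeric: split points
    'sex': [['male'], ['female']],  # categorical: each inner list is one bin
}
# c.set_rules(rule_demo)  # uncomment to apply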
# WOE transformation
# initialize the transformer
transer = toad.transform.WOETransformer()
# combiner.transform() then transer.fit_transform(): transform the training data, keeping the target and excluded columns out
train_woe = transer.fit_transform(c.transform(df_selected), df_selected['survived'], exclude=ex_list + ['survived'])
OOT_woe = transer.transform(c.transform(OOT))
print(train_woe.head(3))
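# For reference, the WOE of bin i is ln((bad_i / bad_total) / (good_i / good_total)).
# A quick manual cross-check for one feature (our own sketch; sign conventions
# may differ between implementations):
tab = pd.crosstab(c.transform(df_selected)['age'], df_selected['survived'])
print(np.log((tab[1] / tab[1].sum()) / (tab[0] / tab[0].sum())))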
# Stepwise regression for feature selection; supports forward, backward and both (recommended)
# estimator: the model used for fitting; supports 'ols', 'lr', 'lasso', 'ridge'
# direction: direction of the stepwise search; supports 'forward', 'backward', 'both' (recommended)
# criterion: selection criterion; supports 'aic', 'bic', 'ks', 'auc'
# max_iter: maximum number of iterations
# return_drop: whether to return the dropped column names
# exclude: columns kept out of the search, e.g. ID and time columns
# Tip: in practice direction='both' works best, and estimator='ols' with
# criterion='aic' runs fast while being a good proxy for logistic regression.
# run stepwise regression on the WOE-transformed data
final_data = toad.selection.stepwise(train_woe, target='survived', estimator='ols', direction='both', criterion='aic', exclude=ex_list)
# apply the selected variables to the test/OOT data
final_OOT = OOT_woe[final_data.columns]
print(final_data.shape)
# the variables used for modeling
col = list(final_data.drop(ex_list + ['survived'], axis=1).columns)
# toad.metrics.PSI(df_train, df_test):
# outputs the PSI of each feature; useful for checking the stability of WOE-transformed features
psi_df = toad.metrics.PSI(final_data[col], final_OOT[col])
psi_df = psi_df.reset_index()
psi_df = psi_df.rename(columns = {'index' : 'feature',0:'psi'})
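# For intuition, the PSI of one feature is the sum over bins of
# (actual_pct - expected_pct) * ln(actual_pct / expected_pct). A minimal manual
# sketch (our own, not toad internals); the usual rule of thumb reads
# PSI < 0.1 as stable, 0.1-0.25 as shifting, > 0.25 as unstable:
def psi_manual(expected, actual, bins=10):
    cuts = np.percentile(expected, np.linspace(0, 100, bins + 1))
    e_pct = np.histogram(expected, cuts)[0] / len(expected) + 1e-6
    a_pct = np.histogram(actual, cuts)[0] / len(actual) + 1e-6
    return ((a_pct - e_pct) * np.log(a_pct / e_pct)).sum()
print('manual PSI for %s:' % col[0], psi_manual(final_data[col[0]], final_OOT[col[0]]))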
psi005 = list(psi_df[psi_df.psi < 0.05].feature)
# keep the excluded columns and the target alongside the stable features
for i in ex_list + ['survived']:
    if i not in psi005:
        psi005.append(i)
train_woe_psi = train_woe[psi005]
OOT_woe_psi = OOT_woe[psi005]
print(train_woe_psi.shape)
# binning strengthens the collinearity between variables, so filter once more by correlation
train_woe_psi2, drop_lst = toad.selection.select(train_woe_psi, target='survived', empty=0.9,
                                                 iv=0.02, corr=0.5, return_drop=True, exclude=ex_list)
print("keep:",train_woe_psi2.shape[1],
"drop empty:",len(drop_lst['empty']),
"drop iv:",len(drop_lst['iv']),
"drop corr:",len(drop_lst['corr']))
# final feature selection via stepwise regression
dev_woe_psi_stp = toad.selection.stepwise(train_woe_psi2,
                                          target='survived',
                                          exclude=ex_list,
                                          direction='both',
                                          criterion='aic',
                                          estimator='ols',
                                          intercept=False)
OOT_woe_psi_stp = OOT_woe_psi[dev_woe_psi_stp.columns]
data = pd.concat([dev_woe_psi_stp,OOT_woe_psi_stp])
data.shape
# Next, define the logistic regression and an XGBoost benchmark
# logistic regression
def lr_model(x, y, offx, offy, C):
    model = LogisticRegression(C=C, class_weight='balanced')
    model.fit(x, y)
    y_pred = model.predict_proba(x)[:, 1]
    fpr_dev, tpr_dev, _ = roc_curve(y, y_pred)
    train_ks = abs(fpr_dev - tpr_dev).max()
    print('train_ks : ', train_ks)
    y_pred = model.predict_proba(offx)[:, 1]
    fpr_off, tpr_off, _ = roc_curve(offy, y_pred)
    off_ks = abs(fpr_off - tpr_off).max()
    print('off_ks : ', off_ks)
    from matplotlib import pyplot as plt
    plt.plot(fpr_dev, tpr_dev, label='train')
    plt.plot(fpr_off, tpr_off, label='off')
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC Curve')
    plt.legend(loc='best')
    plt.show()
# XGBoost as an auxiliary check on whether feature crossing is worthwhile
def xgb_model(x, y, offx, offy):
    # class_weight is not an XGBClassifier parameter; scale_pos_weight plays that role
    model = xgb.XGBClassifier(learning_rate=0.05,
                              n_estimators=400,
                              max_depth=3,
                              min_child_weight=1,
                              subsample=1,
                              objective="binary:logistic",
                              scale_pos_weight=1,
                              random_state=1,
                              n_jobs=-1,
                              reg_lambda=300)
    model.fit(x, y)
    print('>>>>>>>>>')
    y_pred = model.predict_proba(x)[:, 1]
    fpr_dev, tpr_dev, _ = roc_curve(y, y_pred)
    train_ks = abs(fpr_dev - tpr_dev).max()
    print('train_ks : ', train_ks)
    y_pred = model.predict_proba(offx)[:, 1]
    fpr_off, tpr_off, _ = roc_curve(offy, y_pred)
    off_ks = abs(fpr_off - tpr_off).max()
    print('off_ks : ', off_ks)
    from matplotlib import pyplot as plt
    plt.plot(fpr_dev, tpr_dev, label='train')
    plt.plot(fpr_off, tpr_off, label='off')
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC Curve')
    plt.legend(loc='best')
    plt.show()
# model training: fit in both directions (train->OOT and OOT->train) to compare
def c_train(data, dep='survived', exclude=None):
    from sklearn.preprocessing import StandardScaler
    std_scaler = StandardScaler()
    # feature names: everything except the target and the excluded columns
    lis = list(data.columns)
    for i in exclude + [dep]:
        if i in lis:
            lis.remove(i)
    data[lis] = std_scaler.fit_transform(data[lis])
    # 'samp_type' marks the sample splits (dev/val/off1/off2)
    devv = data[(data['samp_type'] == 'dev') | (data['samp_type'] == 'val')]
    offf = data[(data['samp_type'] == 'off1') | (data['samp_type'] == 'off2')]
    x, y = devv[lis], devv[dep]
    offx, offy = offf[lis], offf[dep]
    # logistic regression, forward direction
    lr_model(x, y, offx, offy, 0.1)
    # logistic regression, reverse direction
    lr_model(offx, offy, x, y, 0.1)
    # XGBoost, forward direction
    xgb_model(x, y, offx, offy)
    # XGBoost, reverse direction
    xgb_model(offx, offy, x, y)
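# c_train expects a 'samp_type' column marking the splits; an illustrative
# assignment for this walk-through (dev window vs. one OOT segment):
data['samp_type'] = np.where(data['month'] == '2019-05', 'off1', 'dev')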
# ------ cross-comparison analysis --------- #
c_train(data, dep='survived', exclude=ex_list + ['samp_type'])
# fit the final model with logistic regression
lr = LogisticRegression()
lr.fit(final_data[col], final_data['survived'])
# predict on the training data and on the OOT month
pred_train = lr.predict_proba(final_data[col])[:, 1]
pred_OOT_may = lr.predict_proba(final_OOT.loc[final_OOT.month == '2019-05', col])[:, 1]
from toad.metrics import KS, AUC, F1
print('training set')
print('train KS', KS(pred_train, final_data['survived']))
print('train AUC', AUC(pred_train, final_data['survived']))
print('train F1:', F1(pred_train, final_data['survived']))
print('OOT')
oot_target = final_OOT.loc[final_OOT.month == '2019-05', 'survived']
print('OOT KS', KS(pred_OOT_may, oot_target))
print('OOT AUC', AUC(pred_OOT_may, oot_target))
print('OOT F1:', F1(pred_OOT_may, oot_target))
# PSI can likewise be used to validate the stability of the scores themselves
print(toad.metrics.PSI(pred_train, pred_OOT_may))
# Produce the model KS report
# bucket: number of buckets
# method: bucketing method; 'quantile' (equal counts) or 'step' (equal score width) is recommended
# bad_rate is the bad rate of each bucket: (1) the larger the gap between buckets, the better;
# (2) it reveals non-monotonic jumps; (3) it helps find the best cut-off; (4) it supports comparisons (e.g. train vs. OOT)
# bucket the predictions by equal frequency and inspect the differences between groups
toad.metrics.KS_bucket(pred_train, final_data['survived'], bucket=10, method='quantile')
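# For intuition, a minimal pandas sketch of the same idea (independent of
# toad's KS_bucket; column names are our own): bucket the scores into deciles
# and read off the maximum cumulative bad/good separation.
buckets = pd.DataFrame({'pred': pred_train, 'bad': final_data['survived'].values})
buckets['decile'] = pd.qcut(buckets['pred'], 10, labels=False, duplicates='drop')
grouped = buckets.groupby('decile')['bad'].agg(['count', 'sum'])
cum_bad = grouped['sum'].cumsum() / grouped['sum'].sum()
cum_good = (grouped['count'] - grouped['sum']).cumsum() / (grouped['count'] - grouped['sum']).sum()
print('KS by decile:', (cum_bad - cum_good).abs().max())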
# Produce the scorecard, with the FICO-style calibration base score and pdo (points to double the odds)
# toad.ScoreCard turns the logistic regression into a standard scorecard; logistic regression parameters can be passed through for tuning.
# combiner: a fitted toad.Combiner object
# transer: the previously fitted toad.WOETransformer object
# pdo, rate, base_odds, base_score:
# e.g. pdo=60, rate=2, base_odds=20, base_score=750
# means: at odds of 1/20 the output base score is 750, and each time the odds reach double the base odds, the score drops by 60 points
# card: an expert scorecard can also be passed in
# **kwargs: logistic regression parameters (see sklearn.linear_model.LogisticRegression)
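# A sketch of the usual pdo calibration arithmetic behind these parameters
# (our own illustration, not toad internals): score = offset - factor * ln(odds),
# with odds = P(bad) / P(good).
pdo, rate, base_odds, base_score = 60, 2, 1 / 20, 750
factor = pdo / math.log(rate)                      # points per doubling of the odds
offset = base_score + factor * math.log(base_odds)
def prob_to_score(p_bad):
    return offset - factor * math.log(p_bad / (1 - p_bad))
print(prob_to_score(1 / 21))  # odds = 1/20 -> 750.0
print(prob_to_score(1 / 11))  # odds = 1/10 -> 690.0 (odds doubled, 60 points lower)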
card = toad.ScoreCard(
combiner = c,
transer = transer,
#class_weight = 'balanced',
#C=0.1,
#base_score = 600,
#base_odds = 35 ,
#pdo = 60,
#rate = 2
)
# fit the card on the WOE-transformed modeling data
card.fit(final_data[col], final_data['survived'])
# score raw (untransformed) data directly
card.predict(df)
# export the standard scorecard
final_card = card.export(to_frame=True)
final_card.head()
# toad tutorial: https://toad.readthedocs.io/en/latest/tutorial_chinese.html