toad建立评分卡

尝试用toad建评分卡

import pandas as pd
import numpy as np
import toad
from toad.plot import bin_plot
from toad.plot import badrate_plot
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from toad.metrics import KS, AUC, F1
from sklearn.metrics import roc_auc_score,roc_curve,auc
from toad.scorecard import ScoreCard
from matplotlib import pyplot as plt

1.导入数据,观察好坏比

# Load the raw sample and report its size together with the bad/good counts.
data = pd.read_excel(r'l.xlsx')
print('Shape:', data.shape)

# `is_bad` is summed to get the bad count (presumably a 0/1 flag -- confirm);
# the good count is the frequency of label 0.
bad_count = data['is_bad'].sum()
good_count = data['is_bad'].value_counts()[0]
print('bad:', bad_count, 'good:', good_count)

2.数据eda

# Profile every column with toad, then keep only rows without any missing value.
info = toad.detect(data)
data.dropna(axis=0, how='any', inplace=True)

# Columns to be handled as strings; several of them are later given
# string-valued bin rules, so they are cast up front.
categorical_cols = [
    'MD002', 'MD001', 'MD004', 'Z0009',
    'BLK01', 'BLK02', 'BLK03', 'BLK04', 'BLK05',
]
data = data.astype(dict.fromkeys(categorical_cols, 'str'))

3.划分数据集

# The first column is the record ID (per the original note): slice it off.
data = data.iloc[:, 1:]

# Hold out 20% of the rows as the test set; the fixed seed keeps the split
# reproducible between runs.
data_train, data_test = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)
print("train size:", data_train.shape, "\ntest_size:", data_test.shape)

4.数据原始iv值

# IV-only quality report of the training features against `is_bad`.
# The result is not stored; it is only shown when run interactively.
toad.quality(
    data_train,
    target='is_bad',
    iv_only=True,
)

5.特征筛选(各筛选条件为“或”的关系)

# Screen the training features by missing rate, IV and correlation thresholds.
# Per the section heading the criteria are OR-ed: failing any one drops a feature.
# `return_drop=True` additionally returns which features were removed.
train_selected, dropped = toad.selection.select(
    data_train,
    'is_bad',
    empty=0.9,
    iv=0.02,
    corr=0.7,
    return_drop=True,
    exclude=None,
)
print(dropped)

6.分箱

# Fit the binning rules on the selected training features with the 'chi'
# method, asking for 5 bins and a minimum sample share of 0.04.
c = toad.transform.Combiner()
c.fit(
    train_selected,
    y='is_bad',
    method='chi',
    min_samples=0.04,
    n_bins=5,
)
# Export the fitted rules; the value is only shown when run interactively.
c.export()

# Inspect the bins: draw one bin plot per feature.
# Features are picked by name instead of slicing off the last column, so the
# loop stays correct even when `is_bad` is not the final column of the frame.
col_lst = [col for col in train_selected.columns if col != 'is_bad']
for col in col_lst:
    # labels=True makes the combiner emit readable bin labels for the plot.
    binned = c.transform(train_selected[[col, 'is_bad']], labels=True)
    bin_plot(binned, x=col, target='is_bad')

# Manually adjust the bins.
# String-cast features list the category values grouped into each bin;
# numeric features list their split points.
adjusted_rules = {
    'MD002': [['5'], ['4', '0'], ['3'], ['1', '2']],
    'MD001': [['1', '5'], ['2'], ['4', '3']],
    'MD004': [['2', '1'], ['0']],
    'Z0009': [['4.0', '3.0'], ['5.0', '2.0'], ['1.0'], ['0.0', '-1.0', '6.0']],
    'value_1007': [11.0, 37.0, 67.0],
    'value_2008': [46.0, 56.0],
    'gongdai_3': [1],
    'gongdai_4': [0, 1, 28],
}
c.set_rules(adjusted_rules)

7.根据训练好的transer,转化test/OOT数据

# Bin the training data, then fit the WOE transformer on it; `is_bad` is
# excluded so the label itself is left untouched.
transer = toad.transform.WOETransformer()
train_binned = c.transform(train_selected)
train_woe = transer.fit_transform(
    train_binned,
    train_selected['is_bad'],
    exclude=['is_bad'],
)

# Apply the already-fitted combiner and transformer to the held-out data.
test_binned = c.transform(data_test)
test_woe = transer.transform(test_binned)
# The set of variables entering the model can be adjusted here...

8.特征筛选:逐步回归 向前向后双向

# Bidirectional stepwise selection on the WOE-encoded training set, scored by
# AIC with an OLS estimator; `is_bad` is excluded from the candidates.
final_train = toad.selection.stepwise(
    train_woe,
    target=train_woe['is_bad'],
    estimator='ols',
    direction='both',
    criterion='aic',
    exclude=['is_bad'],
)
# The original author's run kept 10 (per their note; depends on the data).
print(final_train.shape)

# Give the test set the same columns, in the same order, as the training set.
final_test = test_woe[final_train.columns]

9.建模

# Fit a class-weight-balanced logistic regression on the WOE features.
# The label is removed by name rather than by position: slicing off "the last
# column" would feed `is_bad` into the model as a feature (target leakage)
# whenever it is not the final column.
lr = LogisticRegression(class_weight='balanced')
lr.fit(final_train.drop(columns='is_bad'), final_train['is_bad'])

10.预测训练集和测试集的y为1的概率

# Probability of the positive class (column 1 of predict_proba) for each split.
# The label is dropped by name so that it can never be passed to the model as
# a feature, whatever its position in the frame.
pred_train = lr.predict_proba(final_train.drop(columns='is_bad'))[:, 1]
pred_test = lr.predict_proba(final_test.drop(columns='is_bad'))[:, 1]

# toad metrics take (score, target).
print('train KS', KS(pred_train, final_train['is_bad']))
print('train AUC', AUC(pred_train, final_train['is_bad']))
print('train F1', F1(pred_train, final_train['is_bad']))

print('test KS', KS(pred_test, final_test['is_bad']))
print('test AUC', AUC(pred_test, final_test['is_bad']))
print('test F1', F1(pred_test, final_test['is_bad']))

# KS from the ROC curve: the largest gap between TPR and FPR over all thresholds.
# Labels are read by name, consistent with the metric calls on the same frames.
fpr_train, tpr_train, _ = roc_curve(final_train['is_bad'], pred_train)
train_ks = abs(fpr_train - tpr_train).max()
print('train_ks : ', train_ks)

# The test-set KS gets its own variable and label; it was previously stored in
# and printed as `train_ks`, overwriting the training value.
fpr_test, tpr_test, _ = roc_curve(final_test['is_bad'], pred_test)
test_ks = abs(fpr_test - tpr_test).max()
print('test_ks : ', test_ks)

# Overlay the train and test ROC curves on the same axes.
roc_series = (
    (fpr_train, tpr_train, 'train'),
    (fpr_test, tpr_test, 'test'),
)
for fpr_values, tpr_values, split_label in roc_series:
    plt.plot(fpr_values, tpr_values, label=split_label)

plt.xlabel('fpr')
plt.ylabel('tpr')
plt.legend(loc='best')
plt.title('ROC Curve')

11.KS bucket输出模型预测分箱后评判信息,包括每组的分数区间,样本量,坏账率,KS等

# Bucket the training predictions into 20 quantile groups and collect the
# per-bucket evaluation table.
buc = toad.metrics.KS_bucket(
    pred_train,
    final_train['is_bad'],
    bucket=20,
    method='quantile',
)

12.逻辑回归模型转标准评分卡(实际意义为:当坏好比为1/20时,输出基准评分750;当坏好比变为基准比率的2倍时,评分下降60分)

# Convert the model into a standard scorecard, reusing the fitted combiner and
# WOE transformer. Per the section heading: base score 750 at a ratio of 1/20,
# and the score falls by 60 points (pdo) when that ratio doubles (rate).
card = ScoreCard(
    combiner=c,
    transer=transer,
    pdo=60,
    rate=2,
    base_odds=20,
    base_score=750,
)
# Fit on the WOE-encoded features; the label is dropped by name so it is never
# treated as a feature, regardless of its column position.
card.fit(final_train.drop(columns='is_bad'), final_train['is_bad'])
# Export the scorecard as a DataFrame.
final_card = card.export(to_frame=True)

参考博文:
https://zhuanlan.zhihu.com/p/90354450
https://github.com/amphibian-dev/toad/blob/master/toad/metrics.py#L33-L120

你可能感兴趣的:(python)