尝试用toad建评分卡
import pandas as pd
import numpy as np
import toad
from toad.plot import bin_plot
from toad.plot import badrate_plot
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from toad.metrics import KS, AUC, F1
from sklearn.metrics import roc_auc_score,roc_curve,auc
from toad.scorecard import ScoreCard
from matplotlib import pyplot as plt
1.导入数据,观察好坏比
# Load the raw sample from the Excel workbook and report its dimensions.
data = pd.read_excel(r'l.xlsx')
print('Shape:', data.shape)
# Report the class balance. The good count uses a boolean mask rather than
# `value_counts()[0]`, which raises KeyError when no row carries the label 0.
print('bad:', data['is_bad'].sum(), 'good:', (data['is_bad'] == 0).sum())
2.数据eda
# Profile every column with toad; the summary is kept in `info` for inspection.
info = toad.detect(data)
# Missing-value handling: discard every row that has at least one missing value.
data.dropna(how='any', axis=0, inplace=True)
# Cast the columns listed in `t` to strings in one step.
# NOTE(review): presumably so they are treated as categorical; the manual bin
# rules set later refer to string values such as '4.0' - confirm.
t = ['MD002', 'MD001', 'MD004', 'Z0009', 'BLK01', 'BLK02', 'BLK03', 'BLK04', 'BLK05']
data[t] = data[t].astype('str')
3.划分数据集
# Drop the first column (the ID column) so it is not carried into modelling.
data = data.iloc[:, 1:]
# Hold out 20% of the rows as the test set; the fixed seed makes the split
# reproducible.
data_train, data_test = train_test_split(
    data,
    random_state=42,
    test_size=0.2,
)
print("train size:", data_train.shape, "\ntest_size:", data_test.shape)
4.数据原始iv值
# Compute the information value (IV) of the raw training columns against
# `is_bad`. The result is not assigned, so it is only visible when this runs
# as the last expression of an interactive cell.
toad.quality(
    data_train,
    target='is_bad',
    iv_only=True,
)
5.特征筛选(各筛选条件为"或"的关系,满足任一条件即剔除)
# Screen the training columns with toad using the empty-rate, IV and
# correlation thresholds below. `return_drop=True` also returns the record of
# dropped columns, which is printed for review.
train_selected, droped = toad.selection.select(
    data_train,
    'is_bad',
    empty=0.9,
    iv=0.02,
    corr=0.7,
    return_drop=True,
    exclude=None,
)
print(droped)
6.分箱
# Fit the binning rules on the selected training columns with method 'chi',
# using `is_bad` as the target.
c = toad.transform.Combiner()
binning_options = {'method': 'chi', 'min_samples': 0.04, 'n_bins': 5}
c.fit(train_selected, y='is_bad', **binning_options)
# Export the fitted rules (the return value is not stored here).
c.export()
# Inspect the bins: draw one bin plot per feature from the binned training data.
# Features are selected by name instead of slicing off the last column, so the
# target is skipped wherever it sits in the frame and no feature is lost.
col_lst = [name for name in train_selected.columns if name != 'is_bad']
for col in col_lst:
    binned = c.transform(train_selected[[col, 'is_bad']], labels=True)
    bin_plot(binned, x=col, target='is_bad')
# Adjust the bins: override the fitted rules for the columns below.
# String-valued columns list their groups of category values; numeric columns
# list their split points.
manual_rules = {
    'MD002': [['5'], ['4', '0'], ['3'], ['1', '2']],
    'MD001': [['1', '5'], ['2'], ['4', '3']],
    'MD004': [['2', '1'], ['0']],
    'Z0009': [['4.0', '3.0'], ['5.0', '2.0'], ['1.0'], ['0.0', '-1.0', '6.0']],
    'value_1007': [11.0, 37.0, 67.0],
    'value_2008': [46.0, 56.0],
    'gongdai_3': [1],
    'gongdai_4': [0, 1, 28],
}
c.set_rules(manual_rules)
7.根据训练好的transer,转化test/OOT数据
# Fit the WOE transformer on the binned training data, leaving the target
# column untouched, then apply the same bins and WOE mapping to the test split.
transer = toad.transform.WOETransformer()
train_binned = c.transform(train_selected)
train_woe = transer.fit_transform(
    train_binned,
    train_selected['is_bad'],
    exclude=['is_bad'],
)
test_binned = c.transform(data_test)
test_woe = transer.transform(test_binned)
# The set of variables entering the model can be adjusted here.
8.特征筛选:逐步回归 向前向后双向
# Bidirectional stepwise selection on the WOE-encoded training data, using an
# OLS estimator and the AIC criterion. `is_bad` is excluded from the candidate
# features.
final_train = toad.selection.stepwise(
    train_woe,
    target=train_woe['is_bad'],
    estimator='ols',
    direction='both',
    criterion='aic',
    exclude=['is_bad'],
)
print(final_train.shape)  # author's note: stepwise regression selected 10
# Keep exactly the same columns, in the same order, for the test split.
kept_columns = final_train.columns
final_test = test_woe[kept_columns]
9.建模
# Fit a class-weight-balanced logistic regression on the WOE features.
lr = LogisticRegression(class_weight='balanced')
# Select the features by dropping the target by name. A positional
# `.iloc[:, :-1]` would feed the label into the model (and discard a real
# feature) whenever `is_bad` is not the last column.
lr.fit(final_train.drop(columns='is_bad'), final_train['is_bad'])
10.预测训练集和测试集的y为1的概率
# Column 1 of predict_proba is the probability of the second class (y = 1).
# Features are selected by dropping the target by name, so the result does not
# depend on `is_bad` being the last column.
pred_train = lr.predict_proba(final_train.drop(columns='is_bad'))[:, 1]
pred_test = lr.predict_proba(final_test.drop(columns='is_bad'))[:, 1]
# Print KS, AUC and F1 for the train split, then for the test split.
for split_name, scores, labels in (
    ('train', pred_train, final_train['is_bad']),
    ('test', pred_test, final_test['is_bad']),
):
    for metric_name, metric in (('KS', KS), ('AUC', AUC), ('F1', F1)):
        print(f'{split_name} {metric_name}', metric(scores, labels))
# ROC curve and KS (maximum |FPR - TPR|) for the training split. The target is
# read by name rather than as "the last column".
fpr_train, tpr_train, _ = roc_curve(final_train['is_bad'], pred_train)
train_ks = abs(fpr_train - tpr_train).max()
print('train_ks : ', train_ks)
# The same for the test split. It is stored and printed under its own name;
# previously it overwrote `train_ks` and was printed with the 'train_ks' label.
fpr_test, tpr_test, _ = roc_curve(final_test['is_bad'], pred_test)
test_ks = abs(fpr_test - tpr_test).max()
print('test_ks : ', test_ks)
# Overlay both ROC curves on one figure.
plt.plot(fpr_train, tpr_train, label='train')
plt.plot(fpr_test, tpr_test, label='test')
plt.xlabel('fpr')
plt.ylabel('tpr')
plt.legend(loc='best')
plt.title('ROC Curve')
11.KS bucket输出模型预测分箱后评判信息,包括每组的分数区间,样本量,坏账率,KS等
# Bucket the training predictions into 20 groups (method 'quantile') and keep
# toad's per-bucket report in `buc`.
buc = toad.metrics.KS_bucket(
    pred_train,
    final_train['is_bad'],
    bucket=20,
    method='quantile',
)
12.逻辑回归模型转标准评分卡(实际意义为当比率为1/20,输出基准评分750,当比率为基准比率2倍时,基准分下降60分)
# Build the scorecard from the fitted combiner and WOE transformer. The scaling
# parameters (pdo, rate, base_odds, base_score) are passed straight to toad.
card = ScoreCard(
    combiner=c,
    transer=transer,
    pdo=60,
    rate=2,
    base_odds=20,
    base_score=750,
)
# Select the features by dropping the target by name. A positional
# `.iloc[:, :-1]` would include the label as a feature whenever `is_bad` is not
# the last column.
# NOTE(review): `lr` was fit with class_weight='balanced', but no such option is
# passed to ScoreCard here - confirm whether the card model should match.
card.fit(final_train.drop(columns='is_bad'), final_train['is_bad'])
# Export the final card as a DataFrame.
final_card = card.export(to_frame=True)
参考博文:
https://zhuanlan.zhihu.com/p/90354450
https://github.com/amphibian-dev/toad/blob/master/toad/metrics.py#L33-L120