R 语言快速构建机器学习,基于某大佬的scorecard包。
# github主页 - R版: http://github.com/shichenxie/scorecard
# 加载[data.table](http://r-datatable.com)与scorecard包
library(data.table) # 一个超高性能的数据处理包
library(scorecard)
# 数据准备 ------
# 加载scorecard包中的德国信贷数据集。该数据集包含了1000个信贷样本
# 20个特征,其详细介绍请参考[UCI的德国信贷数据集]
链接:https://archive.ics.uci.edu/ml/datasets/Statlog+(German+Credit+Data)
data("germancredit")
# dim(germancredit)
# 数据集的违约标签为creditability
# 将其中坏样本的标签赋值为1,好样本赋值为0。
dt = setDT(germancredit)[
, creditability := ifelse(creditability=="bad",1,0)]
# 样本粗筛 var_filter
# 默认删除信息值小于0.02、缺失率大于95%或单类别比例大于95%的变量
# 可通过iv_limit, missing_limit, identical_limit分别设定。
# var_rm与var_kp指定需要强制删除或强制保留的变量
dt_s = var_filter(dt, "creditability")
# dim(dt_s)
# 如果return_rm_reason=TRUE,则返回删除变量的原因
# dt_s = var_filter(dt, "creditability", return_rm_reason=TRUE)
# dt_s$rm # 删除变量的原因
# dt_s$dt # 粗筛之后的数据集
# 拆分数据集为训练集与测试集 split_df
# y为标签,如果不指定则随机拆分,反之则按照y值分层拆分
# ratio为拆分后两个数据集样本数比例
# seed为随机种子,用于重现样本拆分
dt_list = split_df(dt_s, y="creditability", ratio=0.6, seed=30)
train = dt_list$train; test = dt_list$test;
# 由于数据集样本较少,后面的分箱过程采用全样本进行
# 分箱与woe转换 ------
# 最优分箱 woebin,该函数通过决策树的形式寻找最优分箱点。
# 默认当stop_limit信息值增益率小于0.1, 或max_bin_num分箱数大于6(缺失值除外)时停止分箱。
bins = woebin(dt_s, "creditability", print_step=1)
# class(bins)
# 打印第一个变量的分箱
bins[[1]]
# 绘制变量分箱图woebin_plot
woebin_plot(bins[[1]])
# 手动调整分箱 woebin
# 通过breaks_list指定分箱点,其中类别变量通过 %,% 相连
breaks_adj = list(
age.in.years=c(26, 35, 40),
other.debtors.or.guarantors=c("none", "co-applicant%,%guarantor"))
bins_adj = woebin(dt_s, "creditability", breaks_list=breaks_adj, print_step=0)
# 交互式调整分箱woebin_adj
# breaks_adj = woebin_adj(dt_s, "creditability", bins)
# bins_adj = woebin(dt_s, "creditability", breaks_list=breaks_adj, print_step=0)
# 原始数据集转换为对应的woe值woebin_ply
train_woe = woebin_ply(train, bins_adj, print_step=0)
test_woe = woebin_ply(test, bins_adj, print_step=0)
# 逻辑回归 ------
m1 = glm( creditability ~ ., family = "binomial", data = train_woe)
# summary(m1)
# 基于AIC筛选变量
# 也可通过lasso实现变量筛选,具体参考上面提到的幻灯片
m_step = step(m1, direction="both", trace = FALSE)
m2 = eval(m_step$call)
# summary(m2)
# 模型评估 ------
# 预测违约概率
train_pred = predict(m2, train_woe, type='response')
test_pred = predict(m2, test_woe, type='response')
# ks & roc plot
# type可设定返回的模型评估指标,包括"ks", "lift", "roc", "pr"
perf_eva(train$creditability, train_pred, title = "train")
perf_eva(test$creditability, test_pred, title = "test")
# 评分卡与信用评分 ------
# 默认基础分points0为600,
# 对应的坏好比odds0为1/19,
# 坏好比翻倍的分数pdo为50分
card = scorecard(bins_adj, m2)
# 基于评分卡,计算相应的信用评分
# only_total_score 如果为TRUE只返回总评分,FALSE返回各个变量的评分
train_score = scorecard_ply(train, card, only_total_score = TRUE, print_step = 0)
test_score = scorecard_ply(test, card, only_total_score = TRUE, print_step = 0)
# 模型稳定性评估
# x_limits, x_tick_break分别指定计算psi时的评分范围与间隔
perf_psi(
score = list(train = train_score, test = test_score),
label = list(train = train$creditability, test = test$creditability),
x_limits = c(250, 700), x_tick_break = 50 )