R 语言快速构建信用评分卡模型---scorecard包

前言

本文介绍如何用 R 语言快速构建信用评分卡模型,基于 GitHub 用户 shichenxie 开发的 scorecard 包。

# github主页 - R版: http://github.com/shichenxie/scorecard

 

# 加载[data.table](http://r-datatable.com)与scorecard包

library(data.table) # 一个超高性能的数据处理包

library(scorecard)

 

# Data preparation ------
# Load the German credit dataset that ships with the scorecard package.
# Per the original tutorial it holds 1000 credit samples and 20 features;
# see the UCI German Credit Data page for a detailed description:
# https://archive.ics.uci.edu/ml/datasets/Statlog+(German+Credit+Data)
data("germancredit")
# dim(germancredit)

# The default label column is `creditability`.
# Recode it so that "bad" samples become 1 and every other non-missing value
# becomes 0. setDT() converts `germancredit` to a data.table in place (by
# reference) instead of copying it, and `:=` then updates the column in place.
dt <- setDT(germancredit)[
  ,
  creditability := ifelse(creditability == "bad", 1, 0)
]

 

# Coarse variable screening with var_filter().
# According to the original tutorial, the defaults drop any variable whose
# information value is below 0.02, whose missing rate is above 95%, or whose
# single-value share is above 95% (verify against the installed version).
# The thresholds are controlled by iv_limit, missing_limit and
# identical_limit; var_rm / var_kp force variables to be removed / kept.
dt_s <- var_filter(dt, y = "creditability")
# dim(dt_s)

# With return_rm_reason = TRUE the reason each variable was dropped is
# returned alongside the filtered data:
# dt_s <- var_filter(dt, y = "creditability", return_rm_reason = TRUE)
# dt_s$rm  # reasons for removal
# dt_s$dt  # dataset after screening

 

# Split the screened data into training and test sets with split_df().
# Per the original tutorial:
#   y     - label column; if omitted the split is purely random, otherwise it
#           is stratified on the values of y
#   ratio - proportion of rows assigned to the resulting datasets
#   seed  - random seed, so the split can be reproduced
dt_list <- split_df(dt_s, y = "creditability", ratio = 0.6, seed = 30)
train <- dt_list[["train"]]
test <- dt_list[["test"]]
# Because the dataset is small, the binning step below uses the full sample
# (dt_s) rather than the training set only.

 

# Binning and WOE transformation ------
# Optimal binning with woebin(); per the original tutorial it searches for
# the best split points in a decision-tree-like fashion.
# The tutorial states that binning stops by default once the information-value
# gain ratio (stop_limit) drops below 0.1 or the number of bins (missing values
# excluded) exceeds the configured maximum of 6 -- NOTE(review): parameter
# names and defaults differ between scorecard versions; confirm locally.
bins <- woebin(dt_s, y = "creditability", print_step = 1)
# class(bins)

# Show the binning table of the first variable.
bins[[1]]

# Plot the binning of the first variable with woebin_plot().
woebin_plot(bins[[1]])

 

# Manually adjusted binning with woebin().
# Break points are supplied through breaks_list; for categorical variables the
# categories that should share one bin are joined with "%,%".
breaks_adj <- list(
  age.in.years = c(26, 35, 40),
  other.debtors.or.guarantors = c("none", "co-applicant%,%guarantor")
)

bins_adj <- woebin(
  dt_s,
  y = "creditability",
  breaks_list = breaks_adj,
  print_step = 0
)

# Interactive alternative using woebin_adj():
# breaks_adj <- woebin_adj(dt_s, "creditability", bins)
# bins_adj <- woebin(dt_s, "creditability", breaks_list = breaks_adj, print_step = 0)

 

# Replace the raw values of each dataset with the WOE value of the bin they
# fall into, using the adjusted bins (woebin_ply).
train_woe <- woebin_ply(train, bins = bins_adj, print_step = 0)
test_woe <- woebin_ply(test, bins = bins_adj, print_step = 0)

 

# Logistic regression ------
# Full model: the label regressed on every WOE-transformed column.
m1 <- glm(
  creditability ~ .,
  family = "binomial",
  data = train_woe
)
# summary(m1)

# Variable selection by AIC (stepwise, both directions, no progress output).
# The original tutorial notes that lasso can be used for selection as well and
# points to slides that are not part of this file.
m_step <- step(m1, direction = "both", trace = FALSE)

# Refit the model chosen by step() by re-evaluating its stored call.
m2 <- eval(m_step$call)
# summary(m2)

 

 

# Model evaluation ------
# Predicted default probabilities on the training and test sets.
train_pred <- predict(m2, newdata = train_woe, type = "response")
test_pred <- predict(m2, newdata = test_woe, type = "response")

# KS & ROC plots.
# The original tutorial says `type` selects the evaluation metrics, one or
# more of "ks", "lift", "roc", "pr" -- NOTE(review): this matches the older
# perf_eva() interface; confirm against the installed scorecard version.
#
# `label` and `pred` are passed by name on purpose: the positional order of
# these two arguments differs between scorecard releases (older: label, pred;
# newer: pred, label), so a positional call can silently swap them.
perf_eva(label = train$creditability, pred = train_pred, title = "train")
perf_eva(label = test$creditability, pred = test_pred, title = "test")

 

# Scorecard and credit scores ------
# Build the scorecard from the adjusted bins and the selected model.
# Per the original tutorial the defaults are: base score points0 = 600 at
# bad/good odds odds0 = 1/19, with pdo = 50 points to double the odds.
card <- scorecard(bins = bins_adj, model = m2)

# Apply the scorecard to obtain credit scores.
# only_total_score = TRUE returns just the total score; FALSE would also
# return the score contributed by each variable.
train_score <- scorecard_ply(
  train,
  card,
  only_total_score = TRUE,
  print_step = 0
)
test_score <- scorecard_ply(
  test,
  card,
  only_total_score = TRUE,
  print_step = 0
)

 

# Model stability assessment (PSI) between the training and test scores.
# x_limits and x_tick_break set the score range and the bin width used when
# computing PSI, per the original tutorial.
perf_psi(
  score = list(train = train_score, test = test_score),
  label = list(
    train = train[["creditability"]],
    test = test[["creditability"]]
  ),
  x_limits = c(250, 700),
  x_tick_break = 50
)

 

 

你可能感兴趣的:(数据分析,R语言系列,信用模型)