R语言建模---Home Credit Default Risk

前言

这是kaggle上关于Credit Risk的一个建模流程,其中非常有重要参考价值的点在于其衍生变量构造这个板块,值得我们借鉴。

#数据下载地址:https://www.kaggle.com/c/home-credit-default-risk/data

###########建模流程############

#加载需要使用的包

library(tidyverse)

library(xgboost)

library(magrittr)

set.seed(0)

 

# -----------------------------------------------------
getwd()   # show the current working directory
setwd('C:/Users/lenovo/Desktop/HOME credic fraud')  # set the project folder as working directory

cat("Loading data...\n")

# NOTE: the original lines appended "-----#comment" after each call; a run of
# "-" is parsed by R as subtraction operators and leaves the expression
# unfinished, so sourcing the file failed. Plain "#" comments are used instead.
tr <- read_csv("application_train.csv")  # load the training data
te <- read_csv("application_test.csv")   # load the test data

dim(tr)   # training-set dimensions
head(tr)  # first rows (head() shows 6 by default, not 5)
names(tr) # column names
dim(te)   # test-set dimensions

 

# Read each auxiliary table and convert every character column to an integer
# code (its factor level). funs() is defunct in current dplyr, so the
# formula (~) shorthand is used instead.
bureau <- read_csv("bureau.csv") %>%
  mutate_if(is.character, ~ as.integer(factor(.)))

# Quick inspection of the bureau table.
dim(bureau)
head(bureau)
str(bureau)

cred_card_bal <- read_csv("credit_card_balance.csv") %>%
  mutate_if(is.character, ~ as.integer(factor(.)))

pos_cash_bal <- read_csv("POS_CASH_balance.csv") %>%
  mutate_if(is.character, ~ as.integer(factor(.)))

prev <- read_csv("previous_application.csv") %>%
  mutate_if(is.character, ~ as.integer(factor(.)))

 

#---------------------------
cat("Preprocessing...\n")

# Per-client (SK_ID_CURR) mean of every bureau column, plus the number of
# bureau records per client. n() inside summarise() replaces the original
# separate count() pipeline, which depended on both results sharing the
# same group ordering.
avg_bureau <- bureau %>%
  group_by(SK_ID_CURR) %>%
  summarise(across(everything(), ~ mean(.x, na.rm = TRUE)),
            buro_count = n())

# Inspect one client's rows. NOTE: the original script ran the avg_bureau
# inspection BEFORE avg_bureau was created, which errors; it must come after.
subset(bureau, SK_ID_CURR == '100001')
View(subset(avg_bureau, SK_ID_CURR == '100001'))

 

# Per-client mean of every credit-card-balance column, plus the number of
# balance records per client (n() replaces the defunct funs()/count pattern).
avg_cred_card_bal <- cred_card_bal %>%
  group_by(SK_ID_CURR) %>%
  summarise(across(everything(), ~ mean(.x, na.rm = TRUE)),
            card_count = n())

 

 

# Per-client mean of every POS/cash-balance column plus a record count.
# The original count pipeline did group_by(SK_ID_PREV, SK_ID_CURR) followed
# by group_by(SK_ID_CURR); the second call overrides the first (default
# .add = FALSE), so the result was simply the row count per SK_ID_CURR —
# expressed directly with n() here.
avg_pos_cash_bal <- pos_cash_bal %>%
  group_by(SK_ID_CURR) %>%
  summarise(across(everything(), ~ mean(.x, na.rm = TRUE)),
            pos_count = n())

 

# Per-client mean of every previous-application column plus the number of
# previous applications per client.
avg_prev <- prev %>%
  group_by(SK_ID_CURR) %>%
  summarise(across(everything(), ~ mean(.x, na.rm = TRUE)),
            nb_app = n())

 

tri <- seq_len(nrow(tr))  # training-row indices (seq_len is safe for n = 0)
y <- tr$TARGET            # binary default target

# Stack train + test, attach the per-client aggregates, integer-encode the
# remaining character columns, and convert to a numeric matrix for xgboost.
tr_te <- tr %>%
  select(-TARGET) %>%
  bind_rows(te) %>%
  left_join(avg_bureau, by = "SK_ID_CURR") %>%
  left_join(avg_cred_card_bal, by = "SK_ID_CURR") %>%
  left_join(avg_pos_cash_bal, by = "SK_ID_CURR") %>%
  left_join(avg_prev, by = "SK_ID_CURR") %>%
  mutate_if(is.character, ~ as.integer(factor(.))) %>%
  data.matrix()

# Free the large intermediates.
rm(tr, te, prev, avg_prev, bureau, avg_bureau, cred_card_bal,
   avg_cred_card_bal, pos_cash_bal, avg_pos_cash_bal)
gc()

 

#---------------------------
cat("Preparing data...\n")
# xgboost is already attached at the top of the script; the duplicate
# library(xgboost) call has been dropped.
dtest <- xgb.DMatrix(data = tr_te[-tri, ])  # test rows (no label)
tr_te <- tr_te[tri, ]                       # keep only the training rows
# 90/10 stratified train/validation split; FALSE spelled out instead of the
# reassignable shorthand F.
tri <- caret::createDataPartition(y, p = 0.9, list = FALSE) %>% c()
dtrain <- xgb.DMatrix(data = tr_te[tri, ], label = y[tri])
dval <- xgb.DMatrix(data = tr_te[-tri, ], label = y[-tri])
cols <- colnames(tr_te)

rm(tr_te, y, tri); gc()

 

#---------------------------
cat("Training model...\n")

# xgboost hyper-parameters. nrounds is kept OUTSIDE the parameter list: it is
# an argument of xgb.train(), not a booster parameter, and leaving it in the
# list makes xgboost emit an "unused parameter" warning.
nrounds <- 2000
p <- list(objective = "binary:logistic",
          booster = "gbtree",
          eval_metric = "auc",
          nthread = 8,
          eta = 0.025,
          max_depth = 6,
          min_child_weight = 19,
          gamma = 0,
          subsample = 0.8,
          colsample_bytree = 0.632,
          alpha = 0,
          lambda = 0.05)

# Train with early stopping on the validation AUC.
m_xgb <- xgb.train(p, dtrain, nrounds, list(val = dval),
                   print_every_n = 50, early_stopping_rounds = 200)

 

# Plot the 30 most important features.
imp <- xgb.importance(cols, model = m_xgb)
xgb.plot.importance(imp, top_n = 30)

#---------------------------
# Score the test set and write a submission file named after the best
# validation AUC.
sub <- read_csv("sample_submission.csv")
sub <- mutate(sub,
              SK_ID_CURR = as.integer(SK_ID_CURR),
              TARGET = predict(m_xgb, dtest))
out_file <- paste0("tidy_xgb_", round(m_xgb$best_score, 4), ".csv")
write_csv(sub, out_file)

 

 

#######################################################

#version2

 

# Load the parallel package.
# NOTE(review): the cluster created below is never used anywhere in this
# script; if it is kept, release it with stopCluster(cl) when finished.
library(parallel)
# Number of CPU cores available on this machine.
clnum <- detectCores()
# Start a cluster on every available core (here clnum = 4).
cl <- makeCluster(getOption("cl.cores", clnum))
# memory.limit() exists only on Windows and was removed in R >= 4.2;
# guard the call so the script also runs on other platforms.
if (.Platform$OS.type == "windows") memory.limit(size = 20480)

 

#---------------------------
# Version 2: richer feature engineering over all auxiliary tables.
getwd()
setwd('C:/Users/lenovo/Desktop/HOME credic fraud')
cat("Loading data...\n")
library(tidyverse)
library(xgboost)
library(magrittr)
set.seed(5)

#---------------------------
cat("Loading data...\n")

# Raw Home Credit tables (see the kaggle data page linked above).
bbalance <- read_csv("bureau_balance.csv")
bureau <- read_csv("bureau.csv")
cc_balance <- read_csv("credit_card_balance.csv")
payments <- read_csv("installments_payments.csv")
pc_balance <- read_csv("POS_CASH_balance.csv")
prev <- read_csv("previous_application.csv")
tr <- read_csv("application_train.csv")
te <- read_csv("application_test.csv")

 

#---------------------------
cat("Preprocessing...\n")

# Summary statistics applied to every column of each grouped table.
# funs(..., .args = list(na.rm = TRUE)) is defunct in current dplyr; a named
# list of formulas produces the same "<col>_<stat>" output names.
# n_distinct() takes na.rm directly, matching the original .args forwarding.
fn <- list(mean = ~ mean(., na.rm = TRUE),
           sd = ~ sd(., na.rm = TRUE),
           min = ~ min(., na.rm = TRUE),
           max = ~ max(., na.rm = TRUE),
           sum = ~ sum(., na.rm = TRUE),
           n_distinct = ~ n_distinct(., na.rm = TRUE))

 

# Inspection helpers kept (commented out) from the original notebook:
#View(subset(bbalance, SK_ID_BUREAU=='5715448'))
#bbalance[which(bbalance$SK_ID_BUREAU=="5715448"),]
#View(subset(sum_bbalance, SK_ID_BUREAU=='5715448'))

# Monthly bureau-balance rows summarised (fn statistics) per bureau record.
sum_bbalance <- bbalance %>%
  mutate_if(is.character, ~ as.integer(factor(.))) %>%
  group_by(SK_ID_BUREAU) %>%
  summarise_all(fn)
rm(bbalance); gc()

 

# Bureau records enriched with their balance summaries, then summarised
# (fn statistics) per client.
sum_bureau <- bureau %>%
  left_join(sum_bbalance, by = "SK_ID_BUREAU") %>%
  select(-SK_ID_BUREAU) %>%
  mutate_if(is.character, ~ as.integer(factor(.))) %>%
  group_by(SK_ID_CURR) %>%
  summarise_all(fn)
rm(bureau, sum_bbalance); gc()

 

# Credit-card balances summarised (fn statistics) per client.
sum_cc_balance <- cc_balance %>%
  select(-SK_ID_PREV) %>%
  mutate_if(is.character, ~ as.integer(factor(.))) %>%
  group_by(SK_ID_CURR) %>%
  summarise_all(fn)
rm(cc_balance); gc()

 

# Installment-payment features: payment ratio/shortfall plus clamped
# days-past-due / days-before-due, summarised (fn statistics) per client.
sum_payments <- payments %>%
  select(-SK_ID_PREV) %>%
  mutate(PAYMENT_PERC = AMT_PAYMENT / AMT_INSTALMENT,
         PAYMENT_DIFF = AMT_INSTALMENT - AMT_PAYMENT,
         # pmax(x, 0) replaces ifelse(x > 0, x, 0); NA stays NA either way
         DPD = pmax(DAYS_ENTRY_PAYMENT - DAYS_INSTALMENT, 0),
         DBD = pmax(DAYS_INSTALMENT - DAYS_ENTRY_PAYMENT, 0)) %>%
  group_by(SK_ID_CURR) %>%
  summarise_all(fn)
rm(payments); gc()

 

# POS/cash balances summarised (fn statistics) per client.
sum_pc_balance <- pc_balance %>%
  select(-SK_ID_PREV) %>%
  mutate_if(is.character, ~ as.integer(factor(.))) %>%
  group_by(SK_ID_CURR) %>%
  summarise_all(fn)
rm(pc_balance); gc()

 

# Previous-application features. The value 365243 is Home Credit's sentinel
# for "no date"; recode it to NA (na_if) before aggregating so it does not
# distort the summary statistics.
sum_prev <- prev %>%
  select(-SK_ID_PREV) %>%
  mutate_if(is.character, ~ as.integer(factor(.))) %>%
  mutate(across(c(DAYS_FIRST_DRAWING, DAYS_FIRST_DUE,
                  DAYS_LAST_DUE_1ST_VERSION, DAYS_LAST_DUE,
                  DAYS_TERMINATION),
                ~ na_if(., 365243)),
         APP_CREDIT_PERC = AMT_APPLICATION / AMT_CREDIT) %>%
  group_by(SK_ID_CURR) %>%
  summarise_all(fn)
rm(prev); gc()

 

tri <- seq_len(nrow(tr))  # indices of the training rows in the stacked table
y <- tr$TARGET            # binary default target

# Stack train + test, join all per-client summaries, integer-encode the
# remaining character columns, then derive ratio/interaction features.
# Divisions may produce NaN/Inf here; they are replaced with NA later.
tr_te <- tr %>%
  select(-TARGET) %>%
  bind_rows(te) %>%
  left_join(sum_bureau, by = "SK_ID_CURR") %>%
  left_join(sum_cc_balance, by = "SK_ID_CURR") %>%
  left_join(sum_payments, by = "SK_ID_CURR") %>%
  left_join(sum_pc_balance, by = "SK_ID_CURR") %>%
  left_join(sum_prev, by = "SK_ID_CURR") %>%
  select(-SK_ID_CURR) %>%
  mutate_if(is.character, ~ as.integer(factor(.))) %>%
  # rowSums(is.na(.)) replaces apply(., 1, ...): vectorised, same values
  mutate(na = rowSums(is.na(.)),
         # 365243 is the "no date" sentinel for DAYS_EMPLOYED as well
         DAYS_EMPLOYED = ifelse(DAYS_EMPLOYED == 365243, NA, DAYS_EMPLOYED),
         DAYS_EMPLOYED_PERC = sqrt(DAYS_EMPLOYED / DAYS_BIRTH),
         INCOME_CREDIT_PERC = AMT_INCOME_TOTAL / AMT_CREDIT,
         INCOME_PER_PERSON = log1p(AMT_INCOME_TOTAL / CNT_FAM_MEMBERS),
         ANNUITY_INCOME_PERC = sqrt(AMT_ANNUITY / (1 + AMT_INCOME_TOTAL)),
         LOAN_INCOME_RATIO = AMT_CREDIT / AMT_INCOME_TOTAL,
         ANNUITY_LENGTH = AMT_CREDIT / AMT_ANNUITY,
         CHILDREN_RATIO = CNT_CHILDREN / CNT_FAM_MEMBERS,
         CREDIT_TO_GOODS_RATIO = AMT_CREDIT / AMT_GOODS_PRICE,
         INC_PER_CHLD = AMT_INCOME_TOTAL / (1 + CNT_CHILDREN),
         SOURCES_PROD = EXT_SOURCE_1 * EXT_SOURCE_2 * EXT_SOURCE_3,
         CAR_TO_BIRTH_RATIO = OWN_CAR_AGE / DAYS_BIRTH,
         CAR_TO_EMPLOY_RATIO = OWN_CAR_AGE / DAYS_EMPLOYED,
         PHONE_TO_BIRTH_RATIO = DAYS_LAST_PHONE_CHANGE / DAYS_BIRTH,
         PHONE_TO_EMPLOY_RATIO = DAYS_LAST_PHONE_CHANGE / DAYS_EMPLOYED)

 

# Names of the FLAG_DOCUMENT_* indicator columns.
docs <- str_subset(names(tr), "FLAG_DOC")
# Other FLAG_* indicator columns. NOTE(review): the leading (?!...) lookaheads
# only constrain the match start position, so this relies on ICU regex
# matching semantics — confirm the resulting column set is the intended one.
live <- str_subset(names(tr), "(?!NFLAG_)(?!FLAG_DOC)(?!_FLAG_)FLAG_")
# Median income per organization type, exposed as a named list so it can be
# spliced into recode() later (%$% exposes the summarise columns by name).
inc_by_org <- tr_te %>%
group_by(ORGANIZATION_TYPE) %>%
summarise(m = median(AMT_INCOME_TOTAL)) %$%
setNames(as.list(m), ORGANIZATION_TYPE)

# Drop the large intermediates that are no longer needed.
rm(tr, te, fn, sum_bureau, sum_cc_balance,
sum_payments, sum_pc_balance, sum_prev); gc()

 

# Add document/contact indicator aggregates and EXT_SOURCE combinations,
# then replace NaN/Inf (produced by the earlier ratio features) with NA and
# convert to a numeric matrix for xgboost. funs() is defunct in current
# dplyr, so mutate_all uses the formula shorthand.
tr_te %<>%
  mutate(DOC_IND_KURT = apply(tr_te[, docs], 1, moments::kurtosis),
         LIVE_IND_SUM = apply(tr_te[, live], 1, sum),
         NEW_INC_BY_ORG = recode(tr_te$ORGANIZATION_TYPE, !!!inc_by_org),
         NEW_EXT_SOURCES_MEAN = apply(tr_te[, c("EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3")], 1, mean),
         NEW_SCORES_STD = apply(tr_te[, c("EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3")], 1, sd)) %>%
  mutate_all(~ ifelse(is.nan(.), NA, .)) %>%      # NaN -> NA
  mutate_all(~ ifelse(is.infinite(.), NA, .)) %>% # Inf -> NA
  data.matrix()

 

#---------------------------
cat("Preparing data...\n")
dtest <- xgb.DMatrix(data = tr_te[-tri, ])  # test rows (no label)
tr_te <- tr_te[tri, ]                       # keep only the training rows
# 90/10 stratified train/validation split; FALSE spelled out instead of the
# reassignable shorthand F.
tri <- caret::createDataPartition(y, p = 0.9, list = FALSE) %>% c()
dtrain <- xgb.DMatrix(data = tr_te[tri, ], label = y[tri])
dval <- xgb.DMatrix(data = tr_te[-tri, ], label = y[-tri])
cols <- colnames(tr_te)

rm(tr_te, y, tri); gc()

 

#---------------------------
cat("Training model...\n")

# Version-2 hyper-parameters. nrounds is an xgb.train() argument, not a
# booster parameter, so it is kept outside the list (avoids xgboost's
# "unused parameter" warning).
nrounds <- 2000
p <- list(objective = "binary:logistic",
          booster = "gbtree",
          eval_metric = "auc",
          nthread = 4,
          eta = 0.05,
          max_depth = 6,
          min_child_weight = 30,
          gamma = 0,
          subsample = 0.85,
          colsample_bytree = 0.7,
          colsample_bylevel = 0.632,
          alpha = 0,
          lambda = 0)

# Re-seed for reproducible training, then fit with early stopping on the
# validation AUC.
set.seed(5)
m_xgb <- xgb.train(p, dtrain, nrounds, list(val = dval),
                   print_every_n = 50, early_stopping_rounds = 300)

 

# Plot the 30 most important features.
imp <- xgb.importance(cols, model = m_xgb)
xgb.plot.importance(imp, top_n = 30)

#---------------------------
# Score the test set and write a submission file named after the best
# validation AUC.
sub <- read_csv("sample_submission.csv")
sub <- mutate(sub,
              SK_ID_CURR = as.integer(SK_ID_CURR),
              TARGET = predict(m_xgb, dtest))
out_file <- paste0("tidy_xgb_", round(m_xgb$best_score, 5), ".csv")
write_csv(sub, out_file)

 

参考文章:原文链接(注:原文的链接 URL 在转载/抽取过程中丢失)

 

 

你可能感兴趣的:(R语言系列,数据分析,信用模型)