lightgbm algorithm case of kaggle(上)

lightgbm algorithm case of kaggle(上)_第1张图片

作者简介Introduction

苏高生,西南财经大学统计学硕士毕业,现就职于中国电信,主要负责企业存量客户大数据分析、数据建模。研究方向:机器学习,最喜欢的编程语言:R语言,没有之一。

E-mail:[email protected]

往期回顾:

Xgboost算法——Kaggle案例

The rxfastforest algorithm case of kaggle

640?wx_fmt=png

零、案例背景介绍与建模思路说明

1.背景介绍

本案例使用的数据为kaggle中“Santander Customer Satisfaction”比赛的数据。此案例为不平衡二分类问题,目标为最大化auc值(ROC曲线下方面积)。竞赛题目链接为:https://www.kaggle.com/c/santander-customer-satisfaction 。目前此比赛已经结束。

2.建模思路

本文档采用微软开源的lightgbm算法进行分类,运行速度极快,超过xgboost算法与rxFastForest算法。

1) 读取数据;

2) 并行运算:由于lightgbm包可以通过设置相应参数进行并行运算,因此不再调用doParallel与foreach包进行并行运算;

3) 特征选择:使用mlr包提取了99%的信息增益;

4) 调参:逐步调试lgb.cv函数的参数,并多次调试,直到满意为止;

5) 集成预测结果:在每个参数的适宜范围内随机抽取参数值构建lightgbm模型,并将多个模型进行集成,输出预测结果;本案例所用程序输出结果的AUC值为0.832023,高于Private Leaderboard排名第一的结果(0.829072)。

一、读取数据

options(java.parameters = "-Xmx8g") ## Used during feature selection; must be set before the packages are loaded, otherwise it has no effect

library(readr)

# Read the competition training set from a local CSV file.
lgb_tr1 <- read_csv("C:/Users/Administrator/Documents/kaggle/scs_lgb/train.csv")

# Read the competition test set from a local CSV file.
lgb_te1 <- read_csv("C:/Users/Administrator/Documents/kaggle/scs_lgb/test.csv")

二、数据探索

1.设置并行运算

library(dplyr)

library(mlr)

library(parallelMap)

# Start a socket-based parallelMap backend, passing 2 as the worker count.
# NOTE(review): no matching parallelStop() is visible in this part of the
# script — confirm it is called later.
parallelStartSocket(2)

2.数据各列初步探索

# Per-column summary of the raw training data, opened in the data viewer.
summarizeColumns(lgb_tr1) %>% View()

3.处理缺失值

# Impute missing values: every integer and numeric column is filled with its
# column mean. Training and test data are imputed independently, each with
# its own column means.
imp_tr1 <- impute(
    as.data.frame(lgb_tr1),
    classes = list(
        integer = imputeMean(),
        numeric = imputeMean()
    )
)

imp_te1 <- impute(
    as.data.frame(lgb_te1),
    classes = list(
        integer = imputeMean(),
        numeric = imputeMean()
    )
)

## Column summary after imputation. In the original text this call was fused
## onto the end of the comment line and therefore never executed.
summarizeColumns(imp_tr1$data) %>% View()

4.观察训练数据类别的比例–数据类别不平衡

# Count the rows per TARGET value in the raw training data.
table(lgb_tr1$TARGET)

5.剔除数据集中的常数列

# Drop constant columns from the imputed training data.
lgb_tr2 <- removeConstantFeatures(imp_tr1$data)

# Drop constant columns from the imputed test data. This runs separately from
# the training data, so the two results may keep different columns.
lgb_te2 <- removeConstantFeatures(imp_te1$data)

6.保留训练数据集与测试数据集相同的列

# Keep only the columns present in BOTH the training and the test data.
# stringsAsFactors = FALSE keeps the names as character on R < 4.0, where
# data.frame() would otherwise turn them into factors.
tr2_name <- data.frame(tr2_name = colnames(lgb_tr2), stringsAsFactors = FALSE)

te2_name <- data.frame(te2_name = colnames(lgb_te2), stringsAsFactors = FALSE)

tr2_name_inner <- tr2_name %>%
    inner_join(te2_name, by = c('tr2_name' = 'te2_name'))

# TARGET exists only in the training data, so it does not survive the join;
# set it aside and re-attach it afterwards.
TARGET <- data.frame(TARGET = lgb_tr2$TARGET)

# Drop the first shared column, as the original did.
# NOTE(review): presumably that column is the row identifier — confirm.
# `[-1]` stays correct when only one column is shared, whereas `2:n` would
# count backwards (2, 1) in that case.
shared_features <- tr2_name_inner$tr2_name[-1]

lgb_tr2 <- lgb_tr2[, shared_features, drop = FALSE]

lgb_te2 <- lgb_te2[, shared_features, drop = FALSE]

lgb_tr2 <- cbind(lgb_tr2, TARGET)

三、特征筛选–信息增益

library(lightgbm)

library(ggplot2)

1.试算最优的weight参数

# Trial run before feature selection: how does the relative weight given to
# TARGET == 1 rows affect the 10-fold cross-validated AUC?
grid_search <- expand.grid(
    weight = seq(1, 30, 2)
)

lgb_rate_1 <- numeric(length = nrow(grid_search))

for (i in seq_len(nrow(grid_search))) {
    # Use the grid value (1, 3, 5, ...) rather than the loop index, so that
    # the weight used for training is the weight shown on the plot's x axis.
    pos_weight <- grid_search[i, 'weight']
    lgb_weight <- (lgb_tr2$TARGET * pos_weight + 1) /
        sum(lgb_tr2$TARGET * pos_weight + 1)

    # NOTE(review): only the first 300 columns are used as features —
    # confirm this is intended and that TARGET is not among them.
    lgb_train <- lgb.Dataset(
        data = data.matrix(lgb_tr2[, 1:300]),
        label = lgb_tr2$TARGET,
        free_raw_data = FALSE,
        weight = lgb_weight
    )

    # Parameters. learning_rate and num_threads are passed in `params`
    # rather than through `...` of lgb.cv.
    params <- list(
        objective = 'binary',
        metric = 'auc',
        learning_rate = .1,
        num_threads = 2
    )

    # Cross-validation
    lgb_tr2_mod <- lgb.cv(
        params,
        data = lgb_train,
        nrounds = 300,
        stratified = TRUE,
        nfold = 10,
        early_stopping_rounds = 10
    )

    # Keep the validation AUC of the last recorded round. In the original
    # text the `[...]` subscript sat on its own line, which is a parse error.
    auc_trace <- unlist(lgb_tr2_mod$record_evals$valid$auc$eval)
    lgb_rate_1[i] <- auc_trace[length(auc_trace)]
}

grid_search$perf <- lgb_rate_1

ggplot(grid_search, aes(x = weight, y = perf)) +
    geom_point()

结论:从此图可知auc值受权重影响不大,在weight=9时达到最大,weight>=11时呈负相关

2.特征选择

1)特征选择

# Convert the target to a factor and wrap the data in an mlr classification
# task.
lgb_tr2[['TARGET']] <- factor(lgb_tr2[['TARGET']])

lgb.task <- makeClassifTask(data = lgb_tr2, target = 'TARGET')

# Oversample the task with rate 9 before scoring the features.
lgb.task.smote <- oversample(lgb.task, rate = 9)

# Compute information-gain filter values; fv_time records how long it took.
fv_time <- system.time({
    fv <- generateFilterValuesData(lgb.task.smote, method = 'information.gain')
})

2)制图查看

library(ggvis)

# Plot the filter values computed above.
plotFilterValues(fv)

# Same data via mlr's ggvis-based plotting function.
plotFilterValuesGGVIS(fv)

3)提取99%的信息增益(lightgbm算法效率极高,因此可以取更多的变量)

# Rank features by information gain and compute each one's cumulative share
# of the total gain.
fv_data2 <- fv$data %>%
    arrange(desc(information.gain)) %>%
    mutate(info_gain_cul = cumsum(information.gain) / sum(information.gain))

# Keep the top-ranked features that together account for up to 99% of the gain.
fv_data2_filter <- fv_data2 %>% filter(info_gain_cul <= 0.99)

dim(fv_data2_filter)

# as.character() guards against `name` being a factor; combining a factor
# with 'TARGET' below could otherwise produce level codes instead of names.
fv_feature <- as.character(fv_data2_filter$name)

# drop = FALSE keeps a data frame even if only one feature is selected.
lgb_tr3 <- lgb_tr2[, c(fv_feature, 'TARGET'), drop = FALSE]

lgb_te3 <- lgb_te2[, fv_feature, drop = FALSE]

4)写出数据

# Save the feature-filtered training data (selected features plus TARGET).
write_csv(lgb_tr3, 'C:/users/Administrator/Documents/kaggle/scs_lgb/lgb_tr3.csv')

# Save the feature-filtered test data (selected features only).
write_csv(lgb_te3, 'C:/users/Administrator/Documents/kaggle/scs_lgb/lgb_te3.csv')

四、算法

# Reload the feature-filtered data written in the previous step.
# read_csv (readr, loaded at the top of the script) replaces rxImport, which
# belongs to RevoScaleR — a package this script never loads. as.data.frame()
# keeps the result a plain data frame.
lgb_tr <- as.data.frame(
    read_csv('C:/Users/Administrator/Documents/kaggle/scs_lgb/lgb_tr3.csv')
)

lgb_te <- as.data.frame(
    read_csv('C:/Users/Administrator/Documents/kaggle/scs_lgb/lgb_te3.csv')
)

1.调试weight参数

# Tune the relative weight of TARGET == 1 rows on the feature-filtered data,
# scored by 10-fold cross-validated AUC.
grid_search <- expand.grid(
    weight = 1:30
)

perf_weight_1 <- numeric(length = nrow(grid_search))

for (i in seq_len(nrow(grid_search))) {
    # Read the weight from the grid (identical to the loop index here, but
    # it stays correct if the grid is changed).
    pos_weight <- grid_search[i, 'weight']
    lgb_weight <- (lgb_tr$TARGET * pos_weight + 1) /
        sum(lgb_tr$TARGET * pos_weight + 1)

    # NOTE(review): columns 1:137 are assumed to be exactly the feature
    # columns (TARGET last) — confirm.
    lgb_train <- lgb.Dataset(
        data = data.matrix(lgb_tr[, 1:137]),
        label = lgb_tr$TARGET,
        free_raw_data = FALSE,
        weight = lgb_weight
    )

    # Parameters. learning_rate and num_threads are passed in `params`
    # rather than through `...` of lgb.cv.
    params <- list(
        objective = 'binary',
        metric = 'auc',
        learning_rate = .1,
        num_threads = 2
    )

    # Cross-validation
    lgb_tr_mod <- lgb.cv(
        params,
        data = lgb_train,
        nrounds = 300,
        stratified = TRUE,
        nfold = 10,
        early_stopping_rounds = 10
    )

    # Validation AUC of the last recorded round. In the original text the
    # `[...]` subscript sat on its own line, which is a parse error.
    auc_trace <- unlist(lgb_tr_mod$record_evals$valid$auc$eval)
    perf_weight_1[i] <- auc_trace[length(auc_trace)]
}

grid_search$perf <- perf_weight_1

ggplot(grid_search, aes(x = weight, y = perf)) +
    geom_point() +
    geom_smooth()

结论:从此图可知auc值在weight=2时达到最大

2.调试learning_rate参数

# Tune learning_rate over 2^-8 ... 2^-1, scored by 10-fold CV AUC.
grid_search <- expand.grid(
    learning_rate = 2 ^ (-(8:1))
)

perf_learning_rate_1 <- numeric(length = nrow(grid_search))

# Row weights do not depend on the grid, so compute them once.
# For a 0/1 TARGET: weight 3 for TARGET == 1, weight 1 otherwise, scaled to
# sum to 1.
lgb_weight <- (lgb_tr$TARGET * 2 + 1) / sum(lgb_tr$TARGET * 2 + 1)

for (i in seq_len(nrow(grid_search))) {
    # NOTE(review): columns 1:137 are assumed to be exactly the feature
    # columns (TARGET last) — confirm.
    lgb_train <- lgb.Dataset(
        data = data.matrix(lgb_tr[, 1:137]),
        label = lgb_tr$TARGET,
        free_raw_data = FALSE,
        weight = lgb_weight
    )

    # Parameters. num_threads is passed in `params` rather than through
    # `...` of lgb.cv.
    params <- list(
        objective = 'binary',
        metric = 'auc',
        learning_rate = grid_search[i, 'learning_rate'],
        num_threads = 2
    )

    # Cross-validation
    lgb_tr_mod <- lgb.cv(
        params,
        data = lgb_train,
        nrounds = 300,
        stratified = TRUE,
        nfold = 10,
        early_stopping_rounds = 10
    )

    # Validation AUC of the last recorded round. In the original text the
    # `[...]` subscript sat on its own line, which is a parse error.
    auc_trace <- unlist(lgb_tr_mod$record_evals$valid$auc$eval)
    perf_learning_rate_1[i] <- auc_trace[length(auc_trace)]
}

grid_search$perf <- perf_learning_rate_1

ggplot(grid_search, aes(x = learning_rate, y = perf)) +
    geom_point() +
    geom_smooth()

结论:从此图可知auc值在learning_rate=2^(-3)时达到最大

3.调试num_leaves参数

# Tune num_leaves (50 ... 1000) with learning_rate fixed at 0.125.
grid_search <- expand.grid(
    learning_rate = .125,
    num_leaves = seq(50, 1000, 50)
)

perf_num_leaves_1 <- numeric(length = nrow(grid_search))

# Row weights do not depend on the grid, so compute them once.
lgb_weight <- (lgb_tr$TARGET * 2 + 1) / sum(lgb_tr$TARGET * 2 + 1)

for (i in seq_len(nrow(grid_search))) {
    # NOTE(review): columns 1:137 are assumed to be exactly the feature
    # columns (TARGET last) — confirm.
    lgb_train <- lgb.Dataset(
        data = data.matrix(lgb_tr[, 1:137]),
        label = lgb_tr$TARGET,
        free_raw_data = FALSE,
        weight = lgb_weight
    )

    # Parameters. num_threads is passed in `params` rather than through
    # `...` of lgb.cv.
    params <- list(
        objective = 'binary',
        metric = 'auc',
        learning_rate = grid_search[i, 'learning_rate'],
        num_leaves = grid_search[i, 'num_leaves'],
        num_threads = 2
    )

    # Cross-validation
    lgb_tr_mod <- lgb.cv(
        params,
        data = lgb_train,
        nrounds = 300,
        stratified = TRUE,
        nfold = 10,
        early_stopping_rounds = 10
    )

    # Validation AUC of the last recorded round. In the original text the
    # `[...]` subscript sat on its own line, which is a parse error.
    auc_trace <- unlist(lgb_tr_mod$record_evals$valid$auc$eval)
    perf_num_leaves_1[i] <- auc_trace[length(auc_trace)]
}

grid_search$perf <- perf_num_leaves_1

ggplot(grid_search, aes(x = num_leaves, y = perf)) +
    geom_point() +
    geom_smooth()

结论:从此图可知auc值在num_leaves=600时达到最大

4.调试min_data_in_leaf参数

# Tune min_data_in_leaf (2 ... 128) with learning_rate and num_leaves fixed.
grid_search <- expand.grid(
    learning_rate = .125,
    num_leaves = 600,
    min_data_in_leaf = 2 ^ (1:7)
)

perf_min_data_in_leaf_1 <- numeric(length = nrow(grid_search))

# Row weights do not depend on the grid, so compute them once.
lgb_weight <- (lgb_tr$TARGET * 2 + 1) / sum(lgb_tr$TARGET * 2 + 1)

for (i in seq_len(nrow(grid_search))) {
    # NOTE(review): columns 1:137 are assumed to be exactly the feature
    # columns (TARGET last) — confirm.
    lgb_train <- lgb.Dataset(
        data = data.matrix(lgb_tr[, 1:137]),
        label = lgb_tr$TARGET,
        free_raw_data = FALSE,
        weight = lgb_weight
    )

    # Parameters. num_threads is passed in `params` rather than through
    # `...` of lgb.cv.
    params <- list(
        objective = 'binary',
        metric = 'auc',
        learning_rate = grid_search[i, 'learning_rate'],
        num_leaves = grid_search[i, 'num_leaves'],
        min_data_in_leaf = grid_search[i, 'min_data_in_leaf'],
        num_threads = 2
    )

    # Cross-validation
    lgb_tr_mod <- lgb.cv(
        params,
        data = lgb_train,
        nrounds = 300,
        stratified = TRUE,
        nfold = 10,
        early_stopping_rounds = 10
    )

    # Validation AUC of the last recorded round.
    auc_trace <- unlist(lgb_tr_mod$record_evals$valid$auc$eval)
    perf_min_data_in_leaf_1[i] <- auc_trace[length(auc_trace)]
}

grid_search$perf <- perf_min_data_in_leaf_1

ggplot(grid_search, aes(x = min_data_in_leaf, y = perf)) +
    geom_point() +
    geom_smooth()

结论:从此图可知auc值对min_data_in_leaf不敏感,因此不做调整

5.调试max_bin参数

# Coarse search of max_bin (32 ... 1024) with learning_rate and num_leaves
# fixed.
grid_search <- expand.grid(
    learning_rate = .125,
    num_leaves = 600,
    max_bin = 2 ^ (5:10)
)

perf_max_bin_1 <- numeric(length = nrow(grid_search))

# Row weights do not depend on the grid, so compute them once.
lgb_weight <- (lgb_tr$TARGET * 2 + 1) / sum(lgb_tr$TARGET * 2 + 1)

for (i in seq_len(nrow(grid_search))) {
    # The dataset is rebuilt on every pass, as in the original, so that each
    # max_bin value is applied to a fresh lgb.Dataset.
    # NOTE(review): columns 1:137 are assumed to be exactly the feature
    # columns (TARGET last) — confirm.
    lgb_train <- lgb.Dataset(
        data = data.matrix(lgb_tr[, 1:137]),
        label = lgb_tr$TARGET,
        free_raw_data = FALSE,
        weight = lgb_weight
    )

    # Parameters. num_threads is passed in `params` rather than through
    # `...` of lgb.cv.
    params <- list(
        objective = 'binary',
        metric = 'auc',
        learning_rate = grid_search[i, 'learning_rate'],
        num_leaves = grid_search[i, 'num_leaves'],
        max_bin = grid_search[i, 'max_bin'],
        num_threads = 2
    )

    # Cross-validation
    lgb_tr_mod <- lgb.cv(
        params,
        data = lgb_train,
        nrounds = 300,
        stratified = TRUE,
        nfold = 10,
        early_stopping_rounds = 10
    )

    # Validation AUC of the last recorded round. In the original text the
    # `[...]` subscript sat on its own line, which is a parse error.
    auc_trace <- unlist(lgb_tr_mod$record_evals$valid$auc$eval)
    perf_max_bin_1[i] <- auc_trace[length(auc_trace)]
}

grid_search$perf <- perf_max_bin_1

ggplot(grid_search, aes(x = max_bin, y = perf)) +
    geom_point() +
    geom_smooth()

结论:从此图可知auc值在max_bin=2^6时达到最大,需要再次微调max_bin值

6.微调max_bin参数

# Fine search of max_bin (30 ... 120 in steps of 10).
grid_search <- expand.grid(
    learning_rate = .125,
    num_leaves = 600,
    max_bin = 10 * (3:12)
)

perf_max_bin_2 <- numeric(length = nrow(grid_search))

# Row weights do not depend on the grid, so compute them once.
lgb_weight <- (lgb_tr$TARGET * 2 + 1) / sum(lgb_tr$TARGET * 2 + 1)

for (i in seq_len(nrow(grid_search))) {
    # The dataset is rebuilt on every pass, as in the original, so that each
    # max_bin value is applied to a fresh lgb.Dataset.
    # NOTE(review): columns 1:137 are assumed to be exactly the feature
    # columns (TARGET last) — confirm.
    lgb_train <- lgb.Dataset(
        data = data.matrix(lgb_tr[, 1:137]),
        label = lgb_tr$TARGET,
        free_raw_data = FALSE,
        weight = lgb_weight
    )

    # Parameters. num_threads is passed in `params` rather than through
    # `...` of lgb.cv.
    params <- list(
        objective = 'binary',
        metric = 'auc',
        learning_rate = grid_search[i, 'learning_rate'],
        num_leaves = grid_search[i, 'num_leaves'],
        max_bin = grid_search[i, 'max_bin'],
        num_threads = 2
    )

    # Cross-validation
    lgb_tr_mod <- lgb.cv(
        params,
        data = lgb_train,
        nrounds = 300,
        stratified = TRUE,
        nfold = 10,
        early_stopping_rounds = 10
    )

    # Validation AUC of the last recorded round. In the original text the
    # `[...]` subscript sat on its own line, which is a parse error.
    auc_trace <- unlist(lgb_tr_mod$record_evals$valid$auc$eval)
    perf_max_bin_2[i] <- auc_trace[length(auc_trace)]
}

grid_search$perf <- perf_max_bin_2

ggplot(grid_search, aes(x = max_bin, y = perf)) +
    geom_point() +
    geom_smooth()

结论:从此图可知auc值在max_bin=30时达到最大

7.调试min_data_in_bin参数

# Tune min_data_in_bin (2 ... 512) with the previously chosen values fixed.
grid_search <- expand.grid(
    learning_rate = .125,
    num_leaves = 600,
    max_bin = 30,
    min_data_in_bin = 2 ^ (1:9)
)

perf_min_data_in_bin_1 <- numeric(length = nrow(grid_search))

# Row weights do not depend on the grid, so compute them once.
lgb_weight <- (lgb_tr$TARGET * 2 + 1) / sum(lgb_tr$TARGET * 2 + 1)

for (i in seq_len(nrow(grid_search))) {
    # The dataset is rebuilt on every pass, as in the original, so that each
    # binning setting is applied to a fresh lgb.Dataset.
    # NOTE(review): columns 1:137 are assumed to be exactly the feature
    # columns (TARGET last) — confirm.
    lgb_train <- lgb.Dataset(
        data = data.matrix(lgb_tr[, 1:137]),
        label = lgb_tr$TARGET,
        free_raw_data = FALSE,
        weight = lgb_weight
    )

    # Parameters. num_threads is passed in `params` rather than through
    # `...` of lgb.cv.
    params <- list(
        objective = 'binary',
        metric = 'auc',
        learning_rate = grid_search[i, 'learning_rate'],
        num_leaves = grid_search[i, 'num_leaves'],
        max_bin = grid_search[i, 'max_bin'],
        min_data_in_bin = grid_search[i, 'min_data_in_bin'],
        num_threads = 2
    )

    # Cross-validation
    lgb_tr_mod <- lgb.cv(
        params,
        data = lgb_train,
        nrounds = 300,
        stratified = TRUE,
        nfold = 10,
        early_stopping_rounds = 10
    )

    # Validation AUC of the last recorded round.
    auc_trace <- unlist(lgb_tr_mod$record_evals$valid$auc$eval)
    perf_min_data_in_bin_1[i] <- auc_trace[length(auc_trace)]
}

grid_search$perf <- perf_min_data_in_bin_1

ggplot(grid_search, aes(x = min_data_in_bin, y = perf)) +
    geom_point() +
    geom_smooth()

结论:从此图可知auc值在min_data_in_bin=64时达到最大,但是变化极其细微,因此不做调整

8.调试feature_fraction参数

# Tune feature_fraction (0.50 ... 1.00 in steps of 0.02).
grid_search <- expand.grid(
    learning_rate = .125,
    num_leaves = 600,
    max_bin = 30,
    min_data_in_bin = 64,
    feature_fraction = seq(.5, 1, .02)
)

perf_feature_fraction_1 <- numeric(length = nrow(grid_search))

# Row weights do not depend on the grid, so compute them once.
lgb_weight <- (lgb_tr$TARGET * 2 + 1) / sum(lgb_tr$TARGET * 2 + 1)

for (i in seq_len(nrow(grid_search))) {
    # NOTE(review): columns 1:137 are assumed to be exactly the feature
    # columns (TARGET last) — confirm.
    lgb_train <- lgb.Dataset(
        data = data.matrix(lgb_tr[, 1:137]),
        label = lgb_tr$TARGET,
        free_raw_data = FALSE,
        weight = lgb_weight
    )

    # Parameters. num_threads is passed in `params` rather than through
    # `...` of lgb.cv.
    params <- list(
        objective = 'binary',
        metric = 'auc',
        learning_rate = grid_search[i, 'learning_rate'],
        num_leaves = grid_search[i, 'num_leaves'],
        max_bin = grid_search[i, 'max_bin'],
        min_data_in_bin = grid_search[i, 'min_data_in_bin'],
        feature_fraction = grid_search[i, 'feature_fraction'],
        num_threads = 2
    )

    # Cross-validation
    lgb_tr_mod <- lgb.cv(
        params,
        data = lgb_train,
        nrounds = 300,
        stratified = TRUE,
        nfold = 10,
        early_stopping_rounds = 10
    )

    # Validation AUC of the last recorded round.
    auc_trace <- unlist(lgb_tr_mod$record_evals$valid$auc$eval)
    perf_feature_fraction_1[i] <- auc_trace[length(auc_trace)]
}

grid_search$perf <- perf_feature_fraction_1

ggplot(grid_search, aes(x = feature_fraction, y = perf)) +
    geom_point() +
    geom_smooth()

结论:从此图可知auc值在feature_fraction=.64时达到最大,feature_fraction在[.62, .70]之间时,auc值保持稳定,表现较好;从.64开始呈下降趋势

9.调试min_sum_hessian参数

# Tune min_sum_hessian (0 ... 0.02 in steps of 0.001).
grid_search <- expand.grid(
    learning_rate = .125,
    num_leaves = 600,
    max_bin = 30,
    min_data_in_bin = 64,
    feature_fraction = .64,
    min_sum_hessian = seq(0, .02, .001)
)

perf_min_sum_hessian_1 <- numeric(length = nrow(grid_search))

# Row weights do not depend on the grid, so compute them once.
lgb_weight <- (lgb_tr$TARGET * 2 + 1) / sum(lgb_tr$TARGET * 2 + 1)

for (i in seq_len(nrow(grid_search))) {
    # NOTE(review): columns 1:137 are assumed to be exactly the feature
    # columns (TARGET last) — confirm.
    lgb_train <- lgb.Dataset(
        data = data.matrix(lgb_tr[, 1:137]),
        label = lgb_tr$TARGET,
        free_raw_data = FALSE,
        weight = lgb_weight
    )

    # Parameters. num_threads is passed in `params` rather than through
    # `...` of lgb.cv.
    params <- list(
        objective = 'binary',
        metric = 'auc',
        learning_rate = grid_search[i, 'learning_rate'],
        num_leaves = grid_search[i, 'num_leaves'],
        max_bin = grid_search[i, 'max_bin'],
        min_data_in_bin = grid_search[i, 'min_data_in_bin'],
        feature_fraction = grid_search[i, 'feature_fraction'],
        min_sum_hessian = grid_search[i, 'min_sum_hessian'],
        num_threads = 2
    )

    # Cross-validation
    lgb_tr_mod <- lgb.cv(
        params,
        data = lgb_train,
        nrounds = 300,
        stratified = TRUE,
        nfold = 10,
        early_stopping_rounds = 10
    )

    # Validation AUC of the last recorded round.
    auc_trace <- unlist(lgb_tr_mod$record_evals$valid$auc$eval)
    perf_min_sum_hessian_1[i] <- auc_trace[length(auc_trace)]
}

grid_search$perf <- perf_min_sum_hessian_1

ggplot(grid_search, aes(x = min_sum_hessian, y = perf)) +
    geom_point() +
    geom_smooth()

结论:从此图可知auc值在min_sum_hessian=0.004时达到最大,建议min_sum_hessian取值在[0.001, 0.005]区间,auc趋于最大

10.调试lambda参数

# Joint grid over lambda_l1 and lambda_l2 (each 0 ... 0.01 in steps of 0.002).
grid_search <- expand.grid(
    learning_rate = .125,
    num_leaves = 600,
    max_bin = 30,
    min_data_in_bin = 64,
    feature_fraction = .64,
    min_sum_hessian = .004,
    lambda_l1 = seq(0, .01, .002),
    lambda_l2 = seq(0, .01, .002)
)

perf_lamda_1 <- numeric(length = nrow(grid_search))

# Row weights do not depend on the grid, so compute them once.
lgb_weight <- (lgb_tr$TARGET * 2 + 1) / sum(lgb_tr$TARGET * 2 + 1)

for (i in seq_len(nrow(grid_search))) {
    # NOTE(review): columns 1:137 are assumed to be exactly the feature
    # columns (TARGET last) — confirm.
    lgb_train <- lgb.Dataset(
        data = data.matrix(lgb_tr[, 1:137]),
        label = lgb_tr$TARGET,
        free_raw_data = FALSE,
        weight = lgb_weight
    )

    # Parameters. num_threads is passed in `params` rather than through
    # `...` of lgb.cv.
    params <- list(
        objective = 'binary',
        metric = 'auc',
        learning_rate = grid_search[i, 'learning_rate'],
        num_leaves = grid_search[i, 'num_leaves'],
        max_bin = grid_search[i, 'max_bin'],
        min_data_in_bin = grid_search[i, 'min_data_in_bin'],
        feature_fraction = grid_search[i, 'feature_fraction'],
        min_sum_hessian = grid_search[i, 'min_sum_hessian'],
        lambda_l1 = grid_search[i, 'lambda_l1'],
        lambda_l2 = grid_search[i, 'lambda_l2'],
        num_threads = 2
    )

    # Cross-validation
    lgb_tr_mod <- lgb.cv(
        params,
        data = lgb_train,
        nrounds = 300,
        stratified = TRUE,
        nfold = 10,
        early_stopping_rounds = 10
    )

    # Validation AUC of the last recorded round.
    auc_trace <- unlist(lgb_tr_mod$record_evals$valid$auc$eval)
    perf_lamda_1[i] <- auc_trace[length(auc_trace)]
}

grid_search$perf <- perf_lamda_1

# One panel per lambda_l2 value, with lambda_l1 on the x axis.
ggplot(data = grid_search, aes(x = lambda_l1, y = perf)) +
    geom_point() +
    facet_wrap(~ lambda_l2, nrow = 5)

结论:从此图可知建议取lambda_l1 = .002, lambda_l2 = .008,即在不影响auc值的情况下,尽量增加lambda值以降低模型复杂度

11.调试drop_rate参数

# Tune drop_rate (0 ... 1 in steps of 0.1).
grid_search <- expand.grid(
    learning_rate = .125,
    num_leaves = 600,
    max_bin = 30,
    min_data_in_bin = 64,
    feature_fraction = .64,
    min_sum_hessian = .004,
    lambda_l1 = .002,
    lambda_l2 = .008,
    drop_rate = seq(0, 1, .1)
)

perf_drop_rate_1 <- numeric(length = nrow(grid_search))

# Row weights do not depend on the grid, so compute them once.
lgb_weight <- (lgb_tr$TARGET * 2 + 1) / sum(lgb_tr$TARGET * 2 + 1)

for (i in seq_len(nrow(grid_search))) {
    # NOTE(review): columns 1:137 are assumed to be exactly the feature
    # columns (TARGET last) — confirm.
    lgb_train <- lgb.Dataset(
        data = data.matrix(lgb_tr[, 1:137]),
        label = lgb_tr$TARGET,
        free_raw_data = FALSE,
        weight = lgb_weight
    )

    # Parameters. num_threads is passed in `params` rather than through
    # `...` of lgb.cv.
    # NOTE(review): the LightGBM parameter docs describe drop_rate as used
    # only with boosting = 'dart'; boosting is left at its default here, so
    # this grid may have no effect on the model — confirm before relying on
    # the result.
    params <- list(
        objective = 'binary',
        metric = 'auc',
        learning_rate = grid_search[i, 'learning_rate'],
        num_leaves = grid_search[i, 'num_leaves'],
        max_bin = grid_search[i, 'max_bin'],
        min_data_in_bin = grid_search[i, 'min_data_in_bin'],
        feature_fraction = grid_search[i, 'feature_fraction'],
        min_sum_hessian = grid_search[i, 'min_sum_hessian'],
        lambda_l1 = grid_search[i, 'lambda_l1'],
        lambda_l2 = grid_search[i, 'lambda_l2'],
        drop_rate = grid_search[i, 'drop_rate'],
        num_threads = 2
    )

    # Cross-validation
    lgb_tr_mod <- lgb.cv(
        params,
        data = lgb_train,
        nrounds = 300,
        stratified = TRUE,
        nfold = 10,
        early_stopping_rounds = 10
    )

    # Validation AUC of the last recorded round.
    auc_trace <- unlist(lgb_tr_mod$record_evals$valid$auc$eval)
    perf_drop_rate_1[i] <- auc_trace[length(auc_trace)]
}

grid_search$perf <- perf_drop_rate_1

ggplot(data = grid_search, aes(x = drop_rate, y = perf)) +
    geom_point() +
    geom_smooth()

结论:从此图可知auc值在drop_rate=0.3时达到最大,在[.3, .4]之间较好;在[0, 1]变化不大

12.调试max_drop参数

# Tune max_drop (1, 3, 5, 7, 9) with drop_rate fixed at 0.3.
grid_search <- expand.grid(
    learning_rate = .125,
    num_leaves = 600,
    max_bin = 30,
    min_data_in_bin = 64,
    feature_fraction = .64,
    min_sum_hessian = .004,
    lambda_l1 = .002,
    lambda_l2 = .008,
    drop_rate = .3,
    max_drop = seq(1, 10, 2)
)

perf_max_drop_1 <- numeric(length = nrow(grid_search))

# Row weights do not depend on the grid, so compute them once.
lgb_weight <- (lgb_tr$TARGET * 2 + 1) / sum(lgb_tr$TARGET * 2 + 1)

for (i in seq_len(nrow(grid_search))) {
    # NOTE(review): columns 1:137 are assumed to be exactly the feature
    # columns (TARGET last) — confirm.
    lgb_train <- lgb.Dataset(
        data = data.matrix(lgb_tr[, 1:137]),
        label = lgb_tr$TARGET,
        free_raw_data = FALSE,
        weight = lgb_weight
    )

    # Parameters. num_threads is passed in `params` rather than through
    # `...` of lgb.cv.
    # NOTE(review): the LightGBM parameter docs describe drop_rate and
    # max_drop as used only with boosting = 'dart'; boosting is left at its
    # default here, so this grid may have no effect on the model — confirm
    # before relying on the result.
    params <- list(
        objective = 'binary',
        metric = 'auc',
        learning_rate = grid_search[i, 'learning_rate'],
        num_leaves = grid_search[i, 'num_leaves'],
        max_bin = grid_search[i, 'max_bin'],
        min_data_in_bin = grid_search[i, 'min_data_in_bin'],
        feature_fraction = grid_search[i, 'feature_fraction'],
        min_sum_hessian = grid_search[i, 'min_sum_hessian'],
        lambda_l1 = grid_search[i, 'lambda_l1'],
        lambda_l2 = grid_search[i, 'lambda_l2'],
        drop_rate = grid_search[i, 'drop_rate'],
        max_drop = grid_search[i, 'max_drop'],
        num_threads = 2
    )

    # Cross-validation
    lgb_tr_mod <- lgb.cv(
        params,
        data = lgb_train,
        nrounds = 300,
        stratified = TRUE,
        nfold = 10,
        early_stopping_rounds = 10
    )

    # Validation AUC of the last recorded round.
    auc_trace <- unlist(lgb_tr_mod$record_evals$valid$auc$eval)
    perf_max_drop_1[i] <- auc_trace[length(auc_trace)]
}

grid_search$perf <- perf_max_drop_1

ggplot(data = grid_search, aes(x = max_drop, y = perf)) +
    geom_point() +
    geom_smooth()

结论:从此图可知auc值在max_drop=5时达到最大,在[1, 10]区间变化较小


==========未完待续==========




 往期精彩内容整理合集 

2017年R语言发展报告(国内)

R语言中文社区历史文章整理(作者篇)

R语言中文社区历史文章整理(类型篇)

640?wx_fmt=jpeg

公众号后台回复关键字即可学习

回复 R                  R语言快速入门及数据挖掘 
回复 Kaggle案例  Kaggle十大案例精讲(连载中)
回复 文本挖掘      手把手教你做文本挖掘
回复 可视化          R语言可视化在商务场景中的应用 
回复 大数据         大数据系列免费视频教程 
回复 量化投资      张丹教你如何用R语言量化投资 
回复 用户画像      京东大数据,揭秘用户画像
回复 数据挖掘     常用数据挖掘算法原理解释与应用
回复 机器学习     人工智能系列之机器学习与实践
回复 爬虫            R语言爬虫实战案例分享

你可能感兴趣的:(lightgbm algorithm case of kaggle(上))