[code] Chap 10 for ESL

Figure 10.2

Firstly define the function to generate simulated data.

# function for generating data for figure 10.2
gen_eq_10_2_data <- function(N = 2000, p = 10){
      X <- matrix( rnorm(N*p), nrow = N )
      Y <- ifelse( rowSums(X^2) > qchisq(0.5, df = 10), 1, -1 )
      
      data = data.frame(X = X, Y = Y)
      return(data)
}

Then build Adaboost.M1. model and plot the figure

# Set working directory
setwd('/Users/Yanfang/Documents/Book/ESL')

## install.packages('gbm')
library(gbm)
library(ggplot2)

# Source data generated function
source('gen_eq_10_2_data.R')

# dimensions to be simulated
p <- 10  # the dimension of the feature vector
N_train <- 2000 # the number of samples to used in training
N_test <- 10000 # the number of samples to used in testing

# Extract the data to be classified:
D_train <- gen_eq_10_2_data(N = N_train, p = p)  # training data 
D_test <- gen_eq_10_2_data(N = N_test, p = p)    # testing data
D_train[ D_train$Y == -1, p+1 ] <- 0 # set -1 to 0 to fit the function gbm    
D_test[ D_test$Y == -1, p+1 ] <- 0   # set -1 to 0 to fit the function gbm

# Generate the formular used to fit our model of trees:
terms <- paste( colnames(D_train)[1:p], collapse = '+' )
formula <- formula( paste( colnames(D_train)[p+1], '~', terms) )

# Do training with the maximum number of trees:
n_trees <- 400

print('Running Adaboost ...\n')
obj <- gbm( formula, data = D_train, distribution = 'adaboost', 
            n.trees = n_trees, shrinkage = 1, verbose = TRUE )

# Plot training error as a function of the number of trees:
E_train <- matrix( 0, nrow = n_trees, ncol = 1 )
for(n in seq(1, n_trees)){
      Yhat <- predict( obj, D_train[, 1:p], n.trees = n )
      E_train[n] <- mean(ifelse( Yhat > 0, 1, 0 ) != D_train[, p+1])
}

E_test <- matrix( 0, nrow = n_trees, ncol = 1 )
for(n in seq(1, n_trees)){
      Yhat <- predict( obj, D_test[, 1:p], n.trees = n )
      E_test[n] <- mean(ifelse( Yhat > 0, 1, 0 ) != D_test[, p+1])
}

# Final plots of training error & testing error
D_plot <- data.frame( X = 1:n_trees, E_train = E_train, E_test = E_test)

ggplot( data = D_plot, aes(X) ) + 
      geom_line( aes(X, E_train), col = 'red' ) +
      geom_line( aes(X, E_test), col = 'blue' ) +
      geom_hline( yintercept = 0, linetype = 'dashed') +
      geom_text( x = 200, y = 0.03, label = 'Training Error' ) +
      geom_text( x = 100, y = 0.12, label = 'Testing Error' ) +
      xlab( 'Boosting Interations' ) +
      ylab( 'Test Error' ) + 
      theme_bw() +
      theme( plot.title = element_text(hjust = 0.5) ) +
      ggtitle( 'Figure 10.2 (with training error)' )

ggplot( data = D_plot, aes(X) ) + 
      geom_line( aes(X, E_test), col = 'blue' ) +
      geom_hline( yintercept = 0, linetype = 'dashed') +
      geom_text( x = 100, y = 0.12, label = 'Testing Error' ) +
      xlab( 'Boosting Interations' ) +
      ylab( 'Test Error' ) + 
      theme_bw() +
      theme( plot.title = element_text(hjust = 0.5) ) +
      ggtitle( 'Figure 10.2 (with training error)' )

[code] Chap 10 for ESL_第1张图片

[code] Chap 10 for ESL_第2张图片

Figure 10.3

Firstly define the misclassification error & average exponential loss functions.

# Misclassification error rate function
mis_error <- function(y, yhat){
      result <- mean(y != yhat)
      return(result)
}

# Exponential loss 
## Be care: here yhat is not -1 or 1 but the function without sign
exp_loss <- function(y, yhat){
      result <- sum(exp(- y*yhat))/length(y)
      return(result)
}

Then build the adaboost model and plot figure 10.3.

# Set working directory
setwd('/Users/Yanfang/Documents/Book/ESL')

# Packages to be required
library(gbm)  # generalized boosted regression models
library(ggplot2)

# Source related function
source('mis_fig_10_3.R')
source('gen_eq_10_2_data.R')

# Dimension to be simulated
N_train <- 2000
N_test <- 10000
p <- 10

# Data generation
D_train <- gen_eq_10_2_data(N = N_train, p = p)
D_train[ D_train$Y == -1, p+1 ] <- 0

# Set parameters to be used
n_trees <- 400
mis_error_vec <- rep(0, length = n_trees)
exp_loss_vec <- rep(0, length = n_trees)

terms <- paste(colnames(D_train)[1:p], collapse = '+')
formula <- formula( paste(colnames(D_train)[p+1], '~', terms) )

# Model to be trained
print('Running Adaboost ...')
obj <- gbm(formula, distribution = 'adaboost', 
           data = D_train, n.trees = n_trees, shrinkage = 1, verbose = TRUE)

for(nt in seq( 1, n_trees)){
      fx_train <- predict(obj, newdata = D_train[, 1:p], n.trees = nt)
      yhat_train <- ifelse( fx_train > 0, 1, 0 )
      mis_error_vec[nt] <- mis_error( D_train$Y, yhat_train )
      exp_loss_vec[nt] <- exp_loss( ifelse(D_train$Y == 0, -1, 1), fx_train ) # note here is fx_train not yhat_train
}

# Final plots
D_plot <- data.frame(X = 1:n_trees, mis_error_vec = mis_error_vec, 
                     exp_loss_vec = exp_loss_vec)

ggplot( data = D_plot, aes(X) ) + 
      geom_line( aes(X, mis_error_vec), col = 'red' ) +
      geom_line( aes(X, exp_loss_vec), col = 'blue' ) + 
      geom_hline( yintercept = 0, linetype = 'dashed' ) +
      geom_text( x = 50, y = 0.2, label = 'Misclassification Error', size = 4, col = 'red' ) +
      geom_text( x = 100, y = 0.38, label = 'Average Exponential Loss', size = 4, col = 'blue' ) + 
      xlab( 'Boosting Interation' ) +
      ylab( 'Training Error' ) +
      theme_bw() +
      theme( plot.title = element_text(hjust = 0.5) ) +
      ggtitle( 'Figure 10.3' ) 

[code] Chap 10 for ESL_第3张图片

你可能感兴趣的:(statistics,machine,learning,R,code,for,"ESL",R,data,science)