Firstly define the function to generate simulated data.
# function for generating data for figure 10.2
gen_eq_10_2_data <- function(N = 2000, p = 10){
X <- matrix( rnorm(N*p), nrow = N )
Y <- ifelse( rowSums(X^2) > qchisq(0.5, df = 10), 1, -1 )
data = data.frame(X = X, Y = Y)
return(data)
}
Then build Adaboost.M1. model and plot the figure
# Set working directory
setwd('/Users/Yanfang/Documents/Book/ESL')
## install.packages('gbm')
library(gbm)
library(ggplot2)
# Source data generated function
source('gen_eq_10_2_data.R')
# dimensions to be simulated
p <- 10 # the dimension of the feature vector
N_train <- 2000 # the number of samples to used in training
N_test <- 10000 # the number of samples to used in testing
# Extract the data to be classified:
D_train <- gen_eq_10_2_data(N = N_train, p = p) # training data
D_test <- gen_eq_10_2_data(N = N_test, p = p) # testing data
D_train[ D_train$Y == -1, p+1 ] <- 0 # set -1 to 0 to fit the function gbm
D_test[ D_test$Y == -1, p+1 ] <- 0 # set -1 to 0 to fit the function gbm
# Generate the formular used to fit our model of trees:
terms <- paste( colnames(D_train)[1:p], collapse = '+' )
formula <- formula( paste( colnames(D_train)[p+1], '~', terms) )
# Do training with the maximum number of trees:
n_trees <- 400
print('Running Adaboost ...\n')
obj <- gbm( formula, data = D_train, distribution = 'adaboost',
n.trees = n_trees, shrinkage = 1, verbose = TRUE )
# Plot training error as a function of the number of trees:
E_train <- matrix( 0, nrow = n_trees, ncol = 1 )
for(n in seq(1, n_trees)){
Yhat <- predict( obj, D_train[, 1:p], n.trees = n )
E_train[n] <- mean(ifelse( Yhat > 0, 1, 0 ) != D_train[, p+1])
}
E_test <- matrix( 0, nrow = n_trees, ncol = 1 )
for(n in seq(1, n_trees)){
Yhat <- predict( obj, D_test[, 1:p], n.trees = n )
E_test[n] <- mean(ifelse( Yhat > 0, 1, 0 ) != D_test[, p+1])
}
# Final plots of training error & testing error
D_plot <- data.frame( X = 1:n_trees, E_train = E_train, E_test = E_test)
ggplot( data = D_plot, aes(X) ) +
geom_line( aes(X, E_train), col = 'red' ) +
geom_line( aes(X, E_test), col = 'blue' ) +
geom_hline( yintercept = 0, linetype = 'dashed') +
geom_text( x = 200, y = 0.03, label = 'Training Error' ) +
geom_text( x = 100, y = 0.12, label = 'Testing Error' ) +
xlab( 'Boosting Interations' ) +
ylab( 'Test Error' ) +
theme_bw() +
theme( plot.title = element_text(hjust = 0.5) ) +
ggtitle( 'Figure 10.2 (with training error)' )
ggplot( data = D_plot, aes(X) ) +
geom_line( aes(X, E_test), col = 'blue' ) +
geom_hline( yintercept = 0, linetype = 'dashed') +
geom_text( x = 100, y = 0.12, label = 'Testing Error' ) +
xlab( 'Boosting Interations' ) +
ylab( 'Test Error' ) +
theme_bw() +
theme( plot.title = element_text(hjust = 0.5) ) +
ggtitle( 'Figure 10.2 (with training error)' )
Firstly define the misclassification error & average exponential loss functions.
# Misclassification error rate function
mis_error <- function(y, yhat){
result <- mean(y != yhat)
return(result)
}
# Exponential loss
## Be care: here yhat is not -1 or 1 but the function without sign
exp_loss <- function(y, yhat){
result <- sum(exp(- y*yhat))/length(y)
return(result)
}
Then build the adaboost model and plot figure 10.3.
# Set working directory
setwd('/Users/Yanfang/Documents/Book/ESL')
# Packages to be required
library(gbm) # generalized boosted regression models
library(ggplot2)
# Source related function
source('mis_fig_10_3.R')
source('gen_eq_10_2_data.R')
# Dimension to be simulated
N_train <- 2000
N_test <- 10000
p <- 10
# Data generation
D_train <- gen_eq_10_2_data(N = N_train, p = p)
D_train[ D_train$Y == -1, p+1 ] <- 0
# Set parameters to be used
n_trees <- 400
mis_error_vec <- rep(0, length = n_trees)
exp_loss_vec <- rep(0, length = n_trees)
terms <- paste(colnames(D_train)[1:p], collapse = '+')
formula <- formula( paste(colnames(D_train)[p+1], '~', terms) )
# Model to be trained
print('Running Adaboost ...')
obj <- gbm(formula, distribution = 'adaboost',
data = D_train, n.trees = n_trees, shrinkage = 1, verbose = TRUE)
for(nt in seq( 1, n_trees)){
fx_train <- predict(obj, newdata = D_train[, 1:p], n.trees = nt)
yhat_train <- ifelse( fx_train > 0, 1, 0 )
mis_error_vec[nt] <- mis_error( D_train$Y, yhat_train )
exp_loss_vec[nt] <- exp_loss( ifelse(D_train$Y == 0, -1, 1), fx_train ) # note here is fx_train not yhat_train
}
# Final plots
D_plot <- data.frame(X = 1:n_trees, mis_error_vec = mis_error_vec,
exp_loss_vec = exp_loss_vec)
ggplot( data = D_plot, aes(X) ) +
geom_line( aes(X, mis_error_vec), col = 'red' ) +
geom_line( aes(X, exp_loss_vec), col = 'blue' ) +
geom_hline( yintercept = 0, linetype = 'dashed' ) +
geom_text( x = 50, y = 0.2, label = 'Misclassification Error', size = 4, col = 'red' ) +
geom_text( x = 100, y = 0.38, label = 'Average Exponential Loss', size = 4, col = 'blue' ) +
xlab( 'Boosting Interation' ) +
ylab( 'Training Error' ) +
theme_bw() +
theme( plot.title = element_text(hjust = 0.5) ) +
ggtitle( 'Figure 10.3' )