
# --------------------------------------------------------

# ST4061 / ST6041

# 2021-2022

# Eric Wolsztynski

# ...

# Exercises Section 3: Classification Exercises

# --------------------------------------------------------

library(class) # contains knn()
library(MASS)  # contains lda() and qda()
library(car)   # contains recode()
library(ISLR)  # contains the Default dataset
library(pROC)  # contains roc()

###############################################################

### Exercise 1: kNN on iris data

###############################################################

set.seed(1)

# shuffle dataset first:

z = iris[sample(1:nrow(iris)),]

# Here we focus on sepal information only

plot(z[,1:2], col=c(1,2,4)[z[,5]], pch=20, cex=2)

x = z[,1:2] # sepal variables only

y = z$Species

# Here's how to use the knn() classifier:

K = 5

n = nrow(x)

# split the data into train+test:

i.train = sample(1:n, 100)

x.train = x[i.train,]

x.test = x[-i.train,]

y.train = y[i.train]

y.test = y[-i.train]

ko = knn(x.train, x.test, y.train, K)

tb = table(ko, y.test)

1 - sum(diag(tb)) / sum(tb) # overall classification error rate

caret::confusionMatrix(data=ko, reference=y.test) # requires the caret package

# Build a loop around that to find best k:

# (NB: assess effect of various k-values

# on the same data-split)

Kmax = 30

acc = numeric(Kmax)

for(k in 1:Kmax){
    ko = knn(x.train, x.test, y.train, k)
    tb = table(ko, y.test)
    acc[k] = sum(diag(tb)) / sum(tb)
}

plot(1-acc, pch=20, t='b', xlab='k')
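# One possible way to pick and re-use the best k from this run
# (NB: which.max() returns the first maximum, so ties resolve to the
# smallest such k, and the result depends on this particular split):
best.k = which.max(acc)
ko.best = knn(x.train, x.test, y.train, best.k)
tb.best = table(ko.best, y.test)
1 - sum(diag(tb.best)) / sum(tb.best) # test error rate for best k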

###############################################################

### Exercise 2: GLM on 2-class iris data

###############################################################

n = nrow(iris)

is = sample(1:n, size=n, replace=FALSE)

dat = iris[is,-c(3,4)] # shuffled version of the original set

# recode into 2-class problem:

dat$is.virginica = as.numeric(dat$Species=="virginica")

dat$Species = NULL # "remove" this component

names(dat)

is = 1:100 # training set

fit = glm(is.virginica~., data=dat, subset=is, family=binomial(logit))

pred = predict(fit, newdata=dat[-is,], type="response")

y.test = dat$is.virginica[-is] # true test set classes

# boxplot(pred~y.test, names=c("other","virginica"))

# boxplot(pred~y.test, names=c("cancer absent","cancer present"))

abline(h=0.5, col=3)

abline(h=0.1, col=4)

# for varying cut-off (i.e. threshold) values, compute the
# corresponding predicted labels and confusion matrix:

err = NULL

for(cut.off in seq(.1, .9, by=.1)){
    pred.y = as.numeric(pred>cut.off)
    tb = table(pred.y, y.test)
    err = c(err, 1-sum(diag(tb))/sum(tb))
}

plot(seq(.1, .9, by=.1), err, t='b')
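# Rather than scanning cut-offs by hand, the trade-off can be
# summarised with a ROC curve; a minimal sketch using pROC
# (loaded above):
roc.glm = roc(y.test, pred)
plot(roc.glm)
roc.glm$auc # area under the ROC curve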

###############################################################

### Exercise 3: LDA assumptions

###############################################################

## (1) 2-class classification problem

dat = iris

dat$Species = as.factor(ifelse(iris$Species=="virginica",1,0))

# to recode cleanly, you could use for instance:

dat$Species = car::recode(dat$Species, "0='other'; 1='virginica'")

# or:

# levels(dat$Species) = c("other","virginica")

#

par(mfrow=c(1,2))

plot(iris[,1:2], pch=20, col=c(1,2,4)[iris$Species], cex=2)

legend("topright",col=c(1,2,4),

legend=levels(iris$Species),

pch=20, bty='n')

plot(dat[,1:2], pch=20, col=c(1,4)[dat$Species], cex=2)

legend("topright",col=c(1,4),

legend=levels(dat$Species),

pch=20, bty='n')

## Explore distribution of predictors:

# boxplots seem relatively symmetric:

par(mfrow=c(2,2))

for(j in 1:4){
    boxplot(dat[,j]~dat$Species,
            ylab='predictor',
            col=c('cyan','pink'),
            main=names(dat)[j])
}

# but we'd rather check for Normality more specifically:

par(mfrow=c(2,4), font.lab=2, cex=1.2)

for(j in 1:4){
    hist(dat[which(dat$Species=='other'),j], col='cyan',
         xlab='predictor for class other',
         main=names(dat)[j])
    hist(dat[which(dat$Species!='other'),j], col='pink',
         xlab='predictor for class virginica',
         main=names(dat)[j])
}

# could also use QQ-plots:

par(mfrow=c(2,4), cex=1.2)

for(j in 1:4){
    x.other = dat[which(dat$Species=='other'),j]
    qqnorm(x.other, pch=20, col='cyan', main=names(dat)[j])
    abline(a=mean(x.other), b=sd(x.other))
    x.virginica = dat[which(dat$Species!='other'),j]
    qqnorm(x.virginica, pch=20, col='pink', main=names(dat)[j])
    abline(a=mean(x.virginica), b=sd(x.virginica))
}

# So what do you think?

## Check initial assumption of equality of variances:

# Bartlett's test with H0: all variances are equal

for(j in 1:4){
    print( bartlett.test(dat[,j]~dat$Species)$p.value )
}

# Shapiro's test with H0: the distribution is Normal

for(j in 1:4){

  print( shapiro.test(dat[which(dat$Species=='virginica'),j])$p.value )

}

for(j in 1:4){

  print( shapiro.test(dat[which(dat$Species=='other'),j])$p.value )

}
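# A compact alternative (just a convenience rearrangement of the two
# loops above): gather all Shapiro-Wilk p-values into one matrix,
# with rows = class and columns = predictor:
sw.p = sapply(1:4, function(j){
    c(other = shapiro.test(dat[which(dat$Species=='other'),j])$p.value,
      virginica = shapiro.test(dat[which(dat$Species=='virginica'),j])$p.value)
})
colnames(sw.p) = names(dat)[1:4]
round(sw.p, 4)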

## Fit LDA model to this dataset and check accuracy:

lda.o = lda(Species~., data=dat)

(lda.o)

# can we track some of the values in that summary?

table(dat$Species)/nrow(dat)

rbind(

    apply(dat[which(dat$Species=='other'),1:4], 2, mean),

    apply(dat[which(dat$Species=='virginica'),1:4], 2, mean)

)

# what about the coefficients of linear discriminants?

x = as.matrix(dat[,1:4])

proj = x %*% lda.o$scaling

plot(proj, pch=20, col=dat$Species, cex=2)

# little hack to recover the fitted values quickly

predo = predict(lda.o, newdata=dat)

y = predo$x

plot(proj, y)

plot(y, predo$posterior[,2])

boxplot(y ~ (predo$posterior[,2]>.5))

boxplot(proj ~ (predo$posterior[,2]>.5))

# NB: the way these coefficients are calculated follows the MANOVA approach.
# Popular hack to recover the fitted class labels:

fitted.values = predict(lda.o, newdata=dat)$class 

boxplot(y~dat$Species)

boxplot(proj~dat$Species)

(tb.2 = table(fitted.values, dat$Species))

sum(diag(tb.2)) / sum(tb.2)

## (2) 3-class classification problem

dat = iris

## Explore distribution of predictors:

# boxplots seem relatively symmetric:

par(mfrow=c(2,2))

# here's just a loop to save having to write 4 boxplot

# instructions with names by hand (being lazy often

# makes for nicer code):

for(j in 1:4){
    boxplot(dat[,j]~dat$Species,
            xlab='Species',
            ylab='predictor',
            col=c('cyan','pink','green'), # one colour per class
            main=names(dat)[j])
}

# but we'd rather check for Normality more specifically:

Ls = levels(dat$Species)

par(mfcol=c(3,4))

for(j in 1:4){
    hist(dat[which(dat$Species==Ls[1]),j], col='cyan', main=names(dat)[j])
    hist(dat[which(dat$Species==Ls[2]),j], col='pink', main=names(dat)[j])
    hist(dat[which(dat$Species==Ls[3]),j], col='green', main=names(dat)[j])
}

# could also use QQ-plots:

par(mfcol=c(3,4))

for(j in 1:4){
    x1 = dat[which(dat$Species==Ls[1]),j]
    qqnorm(x1, pch=20, col='cyan', main=names(dat)[j])
    abline(a=mean(x1), b=sd(x1))
    x2 = dat[which(dat$Species==Ls[2]),j]
    qqnorm(x2, pch=20, col='pink', main=names(dat)[j])
    abline(a=mean(x2), b=sd(x2))
    x3 = dat[which(dat$Species==Ls[3]),j]
    qqnorm(x3, pch=20, col='green', main=names(dat)[j])
    abline(a=mean(x3), b=sd(x3))
}

# So what do you think now?

## Check initial assumption of equality of variances:

# Bartlett's test with H0: all variances are equal

print( bartlett.test(dat[,1]~dat$Species)$p.value )

print( bartlett.test(dat[,2]~dat$Species)$p.value )

print( bartlett.test(dat[,3]~dat$Species)$p.value )

print( bartlett.test(dat[,4]~dat$Species)$p.value )

## or if in lazy mode:

for(j in 1:4){
    print( bartlett.test(dat[,j]~dat$Species)$p.value )
}

## Fit LDA model to this dataset and check accuracy:

lda.o = lda(Species~., data=dat)

(lda.o)

fitted.values = predict(lda.o, newdata=dat)$class
(tb.3 = table(fitted.values, dat$Species))

sum(diag(tb.3)) / sum(tb.3)
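# Class-specific accuracy can also be read off the confusion matrix:
# diag/colSums gives the per-class recall (proportion of each true
# class recovered):
round(diag(tb.3) / colSums(tb.3), 3)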

###############################################################

### Exercise 4: LDA

###############################################################

## (1) 2-class classification problem

dat = iris

dat$Species = as.factor(ifelse(iris$Species=="virginica",1,0))

levels(dat$Species) = c("other","virginica")

n = nrow(dat)

set.seed(4061)

dat = dat[sample(1:n),] # shuffle dataset

i.train = 1:100

dat.train = dat[i.train,]

dat.test = dat[-i.train,]

lda.o = lda(Species~., data=dat.train)

lda.p = predict(lda.o, newdata=dat.test)

names(lda.p)

(tb = table(lda.p$class, dat.test$Species))

sum(diag(tb))/sum(tb)

# QDA:

qda.o = qda(Species~., data=dat.train)

qda.p = predict(qda.o, newdata=dat.test)

(tb = table(qda.p$class, dat.test$Species))

sum(diag(tb))/sum(tb)
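# A hedged sketch comparing LDA and QDA beyond raw accuracy, using
# posterior probabilities and pROC (loaded above):
lda.post = predict(lda.o, newdata=dat.test)$posterior[,2]
qda.post = predict(qda.o, newdata=dat.test)$posterior[,2]
roc(dat.test$Species, lda.post)$auc
roc(dat.test$Species, qda.post)$auc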

## (2) 3-class classification problem

dat = iris

n = nrow(dat)

set.seed(4061)

dat = dat[sample(1:n),]

i.train = 1:100

dat.train = dat[i.train,]

dat.test = dat[-i.train,]

# LDA:

lda.o = lda(Species~., data=dat.train)

lda.p = predict(lda.o, newdata=dat.test)

names(lda.p)

(tb = table(lda.p$class, dat.test$Species))

sum(diag(tb))/sum(tb)

# QDA:

qda.o = qda(Species~., data=dat.train)

qda.p = predict(qda.o, newdata=dat.test)

(tb = table(qda.p$class, dat.test$Species))

sum(diag(tb))/sum(tb)
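# Per-class recall for the 3-class QDA fit (the same line applied to
# the LDA table above gives the LDA equivalent):
round(diag(tb) / colSums(tb), 3)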

###############################################################

### Exercise 5: benchmarking

###############################################################

## (1) benchmarking on unscaled data

set.seed(4061)

n = nrow(Default)

dat = Default[sample(1:n, n, replace=FALSE), ]

# get a random training sample containing 70% of original sample:

i.cv = sample(1:n, round(.7*n), replace=FALSE)

dat.cv = dat[i.cv,] # use this for CV (train+test)

dat.valid = dat[-i.cv,] # save this for later (after CV) (HOLD-OUT)

# tuning of the classifiers:

K.knn = 3

# perform K-fold CV:

K = 10

N = length(i.cv)

folds = cut(1:N, K, labels=FALSE)
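table(folds) # sanity check: cut() yields K roughly equal-sized folds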

acc.knn = acc.glm = acc.lda = acc.qda = numeric(K)

auc.knn = auc.glm = auc.lda = auc.qda = numeric(K)

#

for(k in 1:K){ # 10-fold CV loop
    # split into train and test samples:
    i.train = which(folds!=k)
    dat.train = dat.cv[i.train, ]
    dat.test = dat.cv[-i.train, ]
    # adapt these sets for kNN (which needs a numeric feature matrix):
    x.train = dat.train[,-1]
    y.train = dat.train[,1]
    x.test = dat.test[,-1]
    y.test = dat.test[,1]
    x.train[,1] = as.numeric(x.train[,1])
    x.test[,1] = as.numeric(x.test[,1])
    # train classifiers:
    knn.o = knn(x.train, x.test, y.train, K.knn)
    glm.o = glm(default~., data=dat.train, family=binomial(logit))
    lda.o = lda(default~., data=dat.train)
    qda.o = qda(default~., data=dat.train)
    # test classifiers:
    knn.p = knn.o
    glm.p = ( predict(glm.o, newdata=dat.test, type="response") > 0.5 )
    lda.p = predict(lda.o, newdata=dat.test)$class
    qda.p = predict(qda.o, newdata=dat.test)$class
    tb.knn = table(knn.p, y.test)
    tb.glm = table(glm.p, y.test)
    tb.lda = table(lda.p, y.test)
    tb.qda = table(qda.p, y.test)
    # store prediction accuracies:
    acc.knn[k] = sum(diag(tb.knn)) / sum(tb.knn)
    acc.glm[k] = sum(diag(tb.glm)) / sum(tb.glm)
    acc.lda[k] = sum(diag(tb.lda)) / sum(tb.lda)
    acc.qda[k] = sum(diag(tb.qda)) / sum(tb.qda)
    #
    # ROC/AUC analysis:
    # WARNING: knn()'s "prob" attribute is NOT Pr(Y=1 | X), but
    # Pr(Y = Y_hat | X), so it is left out of the AUC comparison:
    # knn.p = attributes(knn(x.train, x.test, y.train, K.knn, prob=TRUE))$prob
    glm.p = predict(glm.o, newdata=dat.test, type="response")
    lda.p = predict(lda.o, newdata=dat.test)$posterior[,2]
    qda.p = predict(qda.o, newdata=dat.test)$posterior[,2]
    # auc.knn[k] = roc(y.test, knn.p)$auc
    auc.glm[k] = roc(y.test, glm.p)$auc
    auc.lda[k] = roc(y.test, lda.p)$auc
    auc.qda[k] = roc(y.test, qda.p)$auc
}

boxplot(acc.knn, acc.glm, acc.lda, acc.qda,

main="Overall CV prediction accuracy",

names=c("kNN","GLM","LDA","QDA"))

boxplot(auc.glm, auc.lda, auc.qda,

main="Overall CV AUC",

names=c("GLM","LDA","QDA"))

##### Taking a closer look at performance
# (using predictions from the last CV fold only):
roc(y.test, glm.p)$auc

library(caret)

(tb = table(y.test, glm.p>.5))

pred = as.factor(glm.p>.5)

pred = car::recode(pred, "FALSE='No'; TRUE='Yes'")

caret::confusionMatrix(data=pred, reference=y.test)

sum(diag(tb))/sum(tb)
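# A hedged sketch of how sensitivity and specificity could be read off
# caret's output (assuming "Yes" is the positive class):
cm = caret::confusionMatrix(data=pred, reference=y.test, positive="Yes")
cm$byClass[c("Sensitivity","Specificity")]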

##### Further exercises for you to do:

## adapt code to evaluate sensitivity and specificity

## add validation analysis...

## repeat on scaled data...

###############################################################

### Exercise 6: benchmarking, again

###############################################################

## (1) benchmarking on unscaled data

set.seed(4061)

n = nrow(Default)

dat = Default[sample(1:n, n, replace=FALSE), ]

# get a random training sample containing 70% of original sample:

i.cv = sample(1:n, round(.7*n), replace=FALSE)

x = dat.cv = dat[i.cv,] # use this for CV (train+test)

dat.valid = dat[-i.cv,] # save this for later (after CV)

# Recover ROC curve manually from whole set:

n = nrow(x)

thrs = seq(.05, .95, by=.05) # cut-off values to scan (define before use)
acc = numeric(length(thrs))
sens = spec = numeric(length(thrs))

# NB: the GLM fit does not depend on the threshold, so fit it once:
glmo = glm(default~., data=x, family=binomial)
for(ithr in 1:length(thrs)){
    thr = thrs[ithr]
    tb = table(glmo$fitted.values>thr, x$default)
    acc[ithr] = sum(diag(tb))/sum(tb)
    #
    # One possible completion (an assumption, not the only way): with
    # rows = (fitted > thr) and columns = true class ("No"/"Yes"),
    # sensitivity = TP/(TP+FN) and specificity = TN/(TN+FP).
    # NB: this indexing assumes both rows appear in tb, which can fail
    # at extreme thresholds.
    sens[ithr] = tb["TRUE","Yes"] / sum(tb[,"Yes"])
    spec[ithr] = tb["FALSE","No"] / sum(tb[,"No"])
}

plot(acc)

plot(spec, sens)
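# A quick consistency check (not part of the original exercise):
# compare with pROC's curve computed from the same fitted values:
plot(roc(x$default, glmo$fitted.values))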

# Evaluate a cross-validated ROC curve manually:

n = nrow(x)

K = 10

train.acc = test.acc = matrix(NA, nrow=K, ncol=length(thrs))

folds = cut(1:n, K, labels=FALSE)


for(ithr in 1:length(thrs)){
    thr = thrs[ithr]
    for(k in 1:K){
        itrain = which(folds!=k)
        glmo = glm(default~., data=x, family=binomial, subset=itrain)
        tb = table(glmo$fitted.values>thr, x$default[itrain])
        train.acc[k, ithr] = sum(diag(tb))/sum(tb)
        #
        # calculate sensitivity for a given threshold
        # ...
        # calculate specificity for a given threshold
        # ...
        # prediction on the held-out fold:
        p.test = predict(glmo, x[-itrain,], type='response')
        tb = table(p.test>thr, x$default[-itrain])
        test.acc[k, ithr] = sum(diag(tb))/sum(tb)
    }
}

boxplot(test.acc)

# warnings()

mean(train.acc) 

mean(test.acc)
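# Mean CV accuracy per threshold; plotting both profiles shows how the
# train/test gap varies with the cut-off (a minimal sketch):
plot(thrs, colMeans(train.acc), t='b', pch=20,
     ylim=range(train.acc, test.acc),
     xlab='threshold', ylab='mean CV accuracy')
points(thrs, colMeans(test.acc), t='b', pch=20, col=2)
legend('bottomright', legend=c('train','test'), col=c(1,2), pch=20, bty='n')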
