library(ISLR)
library(glmnet)
library(class) # contains knn()
library(MASS) # to have lda()
library(carData)
library(car)
library(pROC)
library(caret)
library(tree)
library(randomForest)
library(gbm)
library(xgboost)
library(e1071)
library(neuralnet)
library(nnet) # implements single layer NNs
library(mlbench) # includes dataset BostonHousing
library(neuralnet)
library(NeuralNetTools)
library(leaps) # contains regsubsets()
library(fdm2id)
library(survival)
# --------------------------------------------------------
# ST4061 / ST6041
# 2021-2022
# Eric Wolsztynski
#### Exercises Section 1: Date pre-processing####
# --------------------------------------------------------
library(glmnet)
library(survival)
library(ISLR)
###############################################################
#### Exercise 1: effect of scaling ####
#scale:标准化"(x-miu)/sigma"
###############################################################
summary(iris)
#取前100个作为训练样本
dat = iris[1:100,]
#去掉没用的level
dat$Species = droplevels(dat$Species)
##由于只取前100行数据作为训练样本,virginica 这个species没用到,所以用droplevels把它去掉
##所以在数据前期处理时,如果新选择的训练样本中有没用到的level,应该用droplevels把它删掉
#分别取自变量x 因变量y(species)
x = dat[,1:4]
y = dat$Species
# 对数据做scale 标准化
## method 1 先scale 再自变量因变量分离
dats = dat
dats[,1:4] = apply(dats[,1:4],2,scale)
##method 2 先自变量因变量分离,再对自变量做标准化
## we can apply scaling to the x data directly:
xs = apply(x,2,scale)
# (1)
##PCA
##iris数据中有四个成分Sepal.Length Sepal.Width Petal.Length Petal.Width;方差最大的为PCA中的第一主成分
##(但是我们并不能看出来第一主成分是原数据中的哪个成分),方差第n大的是PCA中的第n主成分
## 用prcomp()做PCA,summary得到的第一行,是对应主成分的标准差;第二行是Var(PCi)/sum(Var(PCi)); 第三行是第二行的累计和
pca.unscaled = prcomp(x)
pca.scaled = prcomp(x,scale=TRUE)
pca.scaled.2 = prcomp(xs) # should be the same as pca.scaled
summary(pca.unscaled)
summary(pca.scaled)
## Proportion of Variance越大,说明该成分包含的信息越多,如果该成分信息变化,那么总体受影响更大
## summary(pca.unscaled)里,Cumulative Proportion means:
## if you take the first component(PC1), you will capture the 91%, which means it contains the most information
## if you take the first two components, you will capture the 98%, the other two dimensions in the data sets are probably redundant.
## 如果你有前两个成分的数据,那你就抓住了98%的信息, 那么剩下两个成分就是多余的,不重要的
# plot the data on its first 2 dimensions in each space:
#biplot:主成分分析散点图(双标图)
#分析biplot:
#两环境线段之间的夹角的余弦值是它们的相关系数,夹角小于90度表示正相关,说明两环境对品种排序相似,大于90度表示负相关,表示两环境对品种排序相反,等于90度说明两环境不相关。夹角较小说明试验点是重复设置的,去掉一个不影响对品种的评价。
#环境线段的长度是试验点对品种的区分能力,线段越长,区分能力越强。
par(mfrow=c(1,2))
plot(x[,1:2], pch=20, col=y, main="Data in original space")
biplot(pca.unscaled, main="Data in PCA space")
abline(v=0, col='orange')
# see the binary separation in orange along PC1?
# re-use this into biplot in original space:
pca.cols = c("blue","orange")[1+as.numeric(pca.unscaled$x[,1]>0)]
plot(x[,1:2], pch=20, col=pca.cols,
main="Data in original space\n colour-coded using PC1-split")
par(mfrow=c(2,2))
plot(pca.unscaled) # scree plot
biplot(pca.unscaled) # biplot
plot(pca.scaled) # scree plot
biplot(pca.scaled) # biplot
# now analyse this plot :)
# 从图中可以看出两点:
#1、柱状图中,PL和PW对应的柱子很长,说明他们两个包含的信息多(包含主要信息)
#2、在biplot图中,PL和PW的线方向一致,说明他们两个的相关性很大
# Petal.Length和Petal.Width capture the same kind of direction of information, Sepal.Length and Sepal.Width have smaller contribution than those two.
# (2)logistic regression
#dat——unscaled,dats——scaled
logreg.unscaled = glm(Species~., data=dat) # make this work
#family='binomial' means do logistic regression
logreg.unscaled = glm(Species~., data=dat, family='binomial')
logreg.scaled = glm(Species~., data=dats, family='binomial')
#family='binomial' means do logistic regression
# discuss...
# (...unscaled and scaled 's coefficients are the fits different?)
cbind(coef(logreg.unscaled), coef(logreg.scaled))
#the signs are the same(两个数据的正负符号是一致的)
#but the contributions of each of the variables has changed, sometimes relate to each other(但是系数和影响变化了,但是有时是有联系的)
#have a swap between SP and SL, probably because there is a lot of redundancy between them.(SP和SL的大小交换了,可能因为有太多冗杂的东西在里面)
cor(dats$Petal.Length,dats$Petal.Width)
#0.9793217 means a high correlation them, 相关性很强,SP和SL可以混同(confused),他们可以相互交换(interchangeable)
# (... does this align 一致 with the PCA analysis?)
# (yes, we see a change in the role of each variable in the information space,但是PCA里PL和PW是主成分PC1 PC2,他们在逻辑回归里的系数也是很大的两个
# (... and why this align 一致 with the PCA analysis?为什么PCA和逻辑回归会一致)
# (They are both linear techniques)
# (3)
#lasso要先把数据变成matrix
x.m = model.matrix(Species~.+0, data=dat)#unscaled
lasso.cv = cv.glmnet(x.m, y, family="binomial")#用cv.glmnet去找到后面lasso要用的lambda
lasso.unscaled = glmnet(x.m, y, family="binomial", lambda=lasso.cv$lambda.min)
lasso.pred = predict(lasso.unscaled, newx=x.m, type="class")
#
xs.m = model.matrix(Species~.+0, data=dats)#scaled
lasso.cv = cv.glmnet(xs.m, y, family="binomial")
lasso.scaled = glmnet(xs.m, y, family="binomial", lambda=lasso.cv$lambda.min)
lasso.s.pred = predict(lasso.scaled, newx=xs.m, type="class")
#
cbind(coef(lasso.unscaled), coef(lasso.scaled))
#由此可见,scaled和unscaled的intercept很不一样,这是因为我们在scaled里centred all the covariance,which means the data in scaled are all central zero,所以there is very little need for an intercept parameter
table(lasso.pred, lasso.s.pred)
###############################################################
#### Exercise 2: data imputation ####
###############################################################
par(mfrow=c(1,1))
summary(lung)
boxplot(lung$meal.cal~lung$sex, col=c("cyan","pink"))
# can you think of other ways of analyzing this?
# (1) lung cancer data: compare meal.cal values between male and female cohorts,
# and discuss w.r.t. gender-specific data imputation
# NB: "missing at random" vs "missing not at random"??
?lung
View(lung)
dim(lung)#228
dim(na.omit(lung))#167
#数据中有NA,处理missing value
#167/228= 0.7324561,有73%的数据,剩下的是不全(丢失)的数据
#data imputation:fill out the missing value补全丢失数据
#replacing the missing values with the mean or media of the variable for the data set.
nas = is.na(lung$meal.cal) # track missing values 看看哪些值是NA
table(nas, lung$sex) #统计男女中各自NA的个数
imales = which(lung$sex==1)#将data set男性的数据位置找出来
m.all = mean(lung$meal.cal, na.rm=TRUE)#男女总体的mean
m.males = mean(lung$meal.cal[imales], na.rm=TRUE)#男性的mean
m.females = mean(lung$meal.cal[-imales], na.rm=TRUE)#女性的mean
t.test(lung$meal.cal[imales], lung$meal.cal[-imales])#预测男女包含missing value的mean是否相等
# p= 0.01989<0.05, 意味着alternative hypothesis: true difference in means is not equal to 0, 男女包含missing value的mean是不相等的,所以要用不同的imputation值去填补他们
# significant difference, hence must use different imputation
# values for each gender
# (2) Run Cox PHMs on original, overall imputed and gender-specific imputed
# datsets, using the cohort sample mean for data imputation. Compare and discuss
#用不同的数据去填充missing value,比较他们用COX模型拟合的差别
# model fitting output.
dat1 = dat2 = lung
# impute overall mean in dat1:
dat1$meal.cal[nas] = m.all #假如都用整体的mean来填充
# impute gender=sepcific mean in dat2:
dat2$meal.cal[(is.na(lung$meal.cal) & (lung$sex==1))] = m.males#用女的均值填充女
dat2$meal.cal[(is.na(lung$meal.cal) & (lung$sex==2))] = m.females#用男的均值填充男
#Fit Cox proportional hazard models: Cox比例风险回归模型
?coxph()
cox0 = coxph(Surv(time,status)~.,data=lung) #没有填充missing value
cox1 = coxph(Surv(time,status)~.,data=dat1) #直接填充不分男女
cox2 = coxph(Surv(time,status)~.,data=dat2) #分男女填充
summary(cox0)
summary(cox1)
summary(cox2)
cbind(coef(coxph(Surv(time,status)~.,data=lung)),
coef(coxph(Surv(time,status)~.,data=dat1)),coef(coxph(Surv(time,status)~.,data=dat2)))
round(cbind(coef(coxph(Surv(time,status)~.,data=lung)),
coef(coxph(Surv(time,status)~.,data=dat1)),coef(coxph(Surv(time,status)~.,data=dat2))),3)
#由比较可见,没有填充和两种填充方法三者meal.cal coefficient的结果差不多
# - dat1 and dat2 yield increased sample size (from 167 to 209, both imputed
# datasets having 209 observations) dat1和dat2产生了增加的样本大小(从167到209,都有209个观测数据集)
# - overall coefficient effects comparable between the 2 sets
# - marginal differences in covariate effect and significance between lung and {dat1;dat2}
# - no substantial difference between dat1 and dat2 outputs
###############################################################
#### Exercise 3: data imputation ####
###############################################################
library(ISLR)
dat = Hitters
View(Hitters)
# (1) (Deletion)
?na.omit()
sdat = na.omit(dat)#返回删除NA后的数据
sx = model.matrix(Salary~.+0,data=sdat)
sy = sdat$Salary
cv.l = cv.glmnet(sx,sy)
slo = glmnet(sx,sy,lambda=cv.l$lambda.min)
# (2) Simple imputation (of Y) using overall mean
ina = which(is.na(dat$Salary))#把NA的位置找出来
dat$Salary[ina] = mean(dat$Salary[-ina])
x = model.matrix(Salary~.+0,data=dat)
y = dat$Salary
cv.l = cv.glmnet(x,y)
lo = glmnet(x,y,lambda=cv.l$lambda.min)
# (3)
#删除 OR 补充缺失值;fit lasso模型;再用拟合好的lasso模型做出估计的y^;比较真实值与用lasso模型估计出来的估计值的标准差
slop = predict(slo,newx=sx)#用fit好的lasso模型
lop = predict(lo,newx=x)
sqrt(mean((slop-sy)^2))
sqrt(mean((lop-y)^2))#标准差差不多
#画图比较用delete和Simple imputation两种方法拟合的模型做出来的每一个y^的差距,如果是45度直线,说明两种方法预测的y^是差不多的
plot(slop,lop[-ina])
abline(a=0,b=1)
abline(lm(lop[-ina]~slop), col='navy')
# What could we do instead of imputing the Y?
###############################################################
#### Exercise 4: resampling ####
###############################################################
###############################################################
#### Exercise 5: resampling (CV vs bootstrapping)####
###############################################################
# Implement this simple analysis and discuss - think about
# (sub)sample sizes!
# 跳过shuff过程
#n = nrow(trees)#都要先洗牌,重新随机排列一下原数据
#trees = trees[sample(1:n, n), ]
x = trees$Girth # sorted in increasing order...
y = trees$Height
plot(x, y, pch=20)
summary(lm(y~x))
N = nrow(trees)
# (1) 10-fold CV on original dataset
set.seed(4060)
K = 10
slp <- numeric(K)
itc <- numeric(K)
cc <- numeric(K)
folds = cut(1:N, K, labels=FALSE)
for(k in 1:K){
i = which(folds==k)
# train:
lmo = lm(y[-i]~x[-i])
slp[k] <- lmo$coefficient[2]
itc[k] <- lmo$coefficient[1]
cc[k] = summary(lmo)$coef[2,2]
# (NB: no testing here, so not the conventional use of CV)
}
mean(cc)#0.3443734
# (2) 10-fold CV on randomized dataset
# shuffle it
set.seed(1)
mix = sample(1:nrow(trees), replace=FALSE)
xr = trees$Girth[mix]
yr = trees$Height[mix]
set.seed(4060)
K = 10
ccr = numeric(K)
folds = cut(1:N, K, labels=FALSE)
for(k in 1:K){
i = which(folds==k)
lmo = lm(yr[-i]~xr[-i])
ccr[k] = summary(lmo)$coef[2,2]
}
mean(ccr)#0.3408507,shuff后的mean更小一些
mean(cc)#0.3443734
#(3)
sd(ccr)#0.01854404,shuff后的sd更小一些
sd(cc)#0.03990858
boxplot(cc,ccr)
t.test(cc,ccr)#p>0.05,拒绝备择假设alternative hypothesis,true difference in means is equal to 0
var.test(cc,ccr)#F-test (var.test()) p<0.05,接受备择假设alternative hypothesis,true ratio of variances is not equal to 1
?var.test
#p>0.05,accept H0, rejept H1(alternative hypothesis)
# (4) Bootstrapping (additional note)用BT来做,跟CV比较
set.seed(4060)
K = 100
cb = numeric(K)
for(i in 1:K){
# bootstrapping
ib = sample(1:N,N,replace=TRUE)
lmb = lm(y[ib]~x[ib])
cb[i] = summary(lmb)$coef[2,2]
}
mean(cb)#0.3163658,不如shuff后的CV结果好
sd(cb)#0.0505108,不如shuff后的CV结果好
dev.new()
par(font=2, font.axis=2, font.lab=2)
boxplot(cbind(cc,ccr,cb), names=c("CV","CVr","Bootstrap"))
t.test(cc,cb)
round(cc, 3)
#(5)
#we have 36.8% points out-of-BT, so using 368 points as test set, 632 points as train set
#(6)
N = 1000
x = runif(N, 2, 20)
y = 2 + 5*x + rnorm(N)
R=33
K1=3
pred.err.cv=numeric(R*K1)
###CV
for(j in 1:R){
mix=sample(1:N,replace = F)
xr=x[mix]
yr=y[mix]
folds=cut(1:N,K1,labels = F)
data53=as.data.frame(cbind(xr,yr))
mse = pred.err = numeric(K1)
for(k in 1:K1){
i.train = which(folds!=k)
o = lm(yr~xr, data=data53, subset=i.train)
i.test = which(folds==k)
yp = predict(o, newdata=data53[i.test,])
pred.err[k] = mean((yp-yr[i.test])^2)
}
pred.err.cv[c(K1*j-K1+1):c(K1*j)]=pred.err
}
mean(pred.err.cv)
###Bootstrapping command+shift+c 全选加注释
# K2 = R*K1
# pred.err.BT = numeric(K2)
# for(j in 1:K2){
# mix=sample(1:N,replace = T)
# xr=x[mix]
# yr=y[mix]
# data53=as.data.frame(cbind(xr,yr))
# o = lm(yr~xr, data=data53)
# yh = o$fitted.values
# pred.err.BT[j] = mean((yh-yr)^2)
# }
# mean(pred.err.BT)
# --------------------------------------------------------
# ST4061 / ST6041
# 2021-2022
# Eric Wolsztynski
# ...
#### Exercises Section 2: Regularization ####
# --------------------------------------------------------
###############################################################
#### Exercise 1: tuning LASSO ####
###############################################################
# Have a go at this exercise yourself...
# you can refer to ST4060 material:)
library(ISLR)
library(glmnet)
Hitters = na.omit(Hitters)
x = model.matrix(Salary~., Hitters)[,-1]
y = Hitters$Salary
cv.rd <- cv.glmnet(x,y)
ridge_mod = glmnet(x, y, alpha = 0, lambda = cv.rd$lambda.min)
pred.rd <- predict(ridge_mod, newx=x)
length(pred.rd)
length(y)
sqrt(mean((pred.rd - y)^2))
summary(ridge_mod)
coef(ridge_mod)
ridge_mod$lambda
the_grid = 10^seq(10, -2, length = 100)
n <- length(the_grid)
criterion = numeric(length(the_grid))
for(i in 1:n){
fit = glmnet(x, y, alpha = 0, lambda = the_grid[i])
pred.fit <- predict(fit, newx=x)
criterion[i] = sqrt(mean((pred.fit - y)^2))
}
which.min(criterion)
the_grid[which.min(criterion)]
###############################################################
#### Exercise 2: tuning LASSO + validation split ####
###############################################################
# Have a go at this exercise yourself too...
# you can refer to ST4060 material:)
# --------------------------------------------------------
# ST4061 / ST6041
# 2021-2022
# Eric Wolsztynski
# ...
#### Exercises Section 3: Classification Exercises ####
# --------------------------------------------------------
# install.packages("class")
# install.packages("MASS")
# install.packages("car")
# install.packages("ISLR")
# install.packages("pROC")
# install.packages("carData")
library(class) # contains knn()
library(MASS) # to have lda()
library(carData)
library(car)
library(ISLR) # contains the datasets
library(pROC)
###############################################################
#### Exercise 1: kNN on iris data ####
###############################################################
set.seed(1)
# shuffle dataset first:
z = iris[sample(1:nrow(iris)),]
# Here we focus on sepal information only
plot(z[,1:2], col=c(1,2,4)[z[,5]],
pch=20, cex=2)
x = z[,1:2] # sepal variables only
y = z$Species
# Here's how to use the knn() classifier:
K = 1
n = nrow(x)
# split the data into train+test:
i.train = sample(1:n, 100)
x.train = x[i.train,]
x.test = x[-i.train,]
y.train = y[i.train]
y.test = y[-i.train]
ko = knn(x.train, x.test, y.train, K)
tb = table(ko, y.test)
1 - sum(diag(tb)) / sum(tb) # overall classification error rate, error rate越小越好
library(caret)
confusionMatrix(data=ko, reference=y.test)#显示the error matrix
#Here, Accuracy : 0.7 which is sum(diag(tb)) / sum(tb); 95%Confidence interval around its accuracy matrix;还能比较3种class的Sensitivity 和 Specificity
?confusionMatrix#创建一个给定特定边界的混淆矩阵
# Build a loop around that to find best k:
# (NB: assess effect of various k-values
# on the same data-split)
Kmax = 30
acc = numeric(Kmax)#acc越大越好
for(k in 1:Kmax){
ko = knn(x.train, x.test, y.train, k)
tb = table(ko, y.test)
acc[k] = sum(diag(tb)) / sum(tb)
}
plot(1-acc, pch=20, t='b', xlab='k')#从图中找到最低点,就是最好的k
#评价:if k is too small(overfitting), decision boundry is overly flexible, low bias+large variance
# if k is too large(underfitting), decision boundry is not flexible enough, high bias+small variance
###############################################################
#### Exercise 2: GLM(logistic regression) on 2-class iris data ####
###############################################################
n = nrow(iris)
is = sample(1:n, size=n, replace=FALSE)
dat = iris[is,-c(3,4)] # shuffled version of the original set, and hold sepal variables only
# record into 2-class problem:转化成2-class problem “是virginica”=1 OR “不是virginica ”=0
dat$is.virginica = as.numeric(dat$Species=="virginica")
dat$Species = NULL # "remove" this component
names(dat)
is = 1:100 # training set
fit = glm(is.virginica~., data=dat, subset=is, family=binomial(logit))
pred = predict(fit, newdata=dat[-is,], type="response")#返回值是estimated probabilities
hist(pred)
?predict.glm
y.test = dat$is.virginica[-is] # true test set classes
boxplot(pred~y.test, names=c("other","virginica"))#通过箱线图可以看出“virginica”和“not virginica”通过glm分开的比较彻底
abline(h=0.5, col=3)
#roughly 70% of data would be classified correctly.
# for varying cut-off (ie threshold) values, compute corresponding
# predicted labels, and corresponding confusion matrix:
#find the best cutoff
err = NULL
for(cut.off in seq(.1, .9, by=.1)){
pred.y = as.numeric(pred>cut.off)
tb = table(pred.y, y.test)
err = c(err, (1-sum(diag(tb))/sum(tb)))
}
plot(seq(.1, .9, by=.1), err, t='b')
#choose the best cutoff to classification.
cut.off=0.2
pred.y = as.numeric(pred>cut.off)
tb = table(pred.y, y.test)
err = (1-sum(diag(tb))/sum(tb))
###############################################################
#### Exercise 3: LDA assumptions ####
###############################################################
## (1) 2-class classification problem
#多类转化成2类的三种方法
#(1)转换成0 1
dat = iris
dat$Species = as.factor(ifelse(iris$Species=="virginica",1,0))
#(2)转换成带命名的(更直观)
#用0 1 区分是virginica还是非virginica并不直观,不如用virginica or other 来区分,如下:
# to recode cleanly, you could use for instance:
dat$Species = car::recode(dat$Species, "0='other'; 1='virginica'")
#(3)先转化成0 1,再命名
# or:直接让levels变成virginica or other
#dat$Species = as.factor(ifelse(iris$Species=="virginica",1,0))#先变成0 1
#levels(dat$Species) = c("other","virginica")#再对0 1 命名
##(2)粗略查看分类情况
par(mfrow=c(1,2))
plot(iris[,1:2], pch=20, col=c(1,2,4)[iris$Species], cex=2)
legend("topright",col=c(1,2,4),
legend=levels(iris$Species),
pch=20, bty='n')#3-class
plot(dat[,1:2], pch=20, col=c(1,4)[dat$Species], cex=2)
legend("topright",col=c(1,4),
legend=levels(dat$Species),
pch=20, bty='n')#2-class:virginica or not virginica
#由于LDA需要满足两个assumption:normal-distribution;equal variance,否则将不能用LDA方法分类
#First assumption: all histograms are looking like normal distribution
#Second assumption: 每个predictor的方差相同
##First,we need to explore distribution of each predictors(每一列指标):(看是否满足正态分布)
head(dat)
#here, the first four columns are the predictors(or features/attributes/covariates/variables), the last column is class
# 1.boxplots seem relatively symmetric画箱线图看他是否对称:
#look at each column of the dataset:
par(mfrow=c(2,2))
for(j in 1:4){
boxplot(dat[,j]~dat$Species,
ylab='predictor',
col=c('cyan','pink'),
main=names(dat)[j])
}
#here you can see there is a clear separation between the two classes, except the sepal.width
#so we decide to use the three other predictors to make a classification
#由于正态分布是对称的,如果箱线图的箱子是对称(symmetric)的,那么很可能是满足正态分布的,如果箱子不对称,那么会有skew
#看correlation
cor(dat[,1:4])
#here we can see there is 96%(超过90%就可以这么说) correlation between Petal.Width and Petal.Length, which means these two predictors can tell us the same information
#2.只看箱线图是不够的,还要看是否是histogram正态分布 but we'd rather check for Normality more specifically:
par(mfrow=c(2,4), font.lab=2, cex=1.2)
for(j in 1:4){
hist(dat[which(dat$Species=='other'),j], col='cyan',
xlab='predictor for class other',
main=names(dat)[j])
hist(dat[which(dat$Species!='other'),j], col='pink',
xlab='predictor for class virginica',
main=names(dat)[j])
}
#PL and PW for the other class are definitely not normal,其他的图都是approximately normal
#It means there may be a limitation of our analysis if we decide to use LDA here.
#这里其实就不适合用LDA了,但是为了了解这个方法,我们继续往下做
#3.除了用柱状图判断是否normal,也可以用QQ图 could also use QQ-plots:
par(mfrow=c(2,4), cex=1.2)
for(j in 1:4){
x.other = dat[which(dat$Species=='other'),j]
qqnorm(x.other, pch=20, col='cyan',
main=names(dat)[j])
abline(a=mean(x.other), b=sd(x.other))
x.virginica = dat[which(dat$Species!='other'),j]
qqnorm(x.virginica, pch=20, col='pink',
main=names(dat)[j])
abline(a=mean(x.virginica), b=sd(x.virginica))
}
# So what do you think?
#在45度线上的是normal distribution,很明显PL和PW在other上不是
## Check initial assumption of equality of variances:all predictors have same variance for each class
# Bartlett's test with H0: all variances are equal, p>0.05 接受H0,有相同的variance
for(j in 1:4){
print( bartlett.test(dat[,j]~dat$Species)$p.value )
}
#只有SL满足
# Shapiro's test with H0: the distribution is Normal
for(j in 1:4){
print( shapiro.test(dat[which(dat$Species=='virginica'),j])$p.value )
}#p都>0.05,Normal
for(j in 1:4){
print( shapiro.test(dat[which(dat$Species=='other'),j])$p.value )
}#PL和PW的p value 很小,不normal
## Fit LDA model to this dataset and check accuracy:
lda.o = lda(Species~., data=dat)
(lda.o)
# can we track some of the values in that summary?
table(dat$Species)/nrow(dat)
#Here tells us there are one third data belongs to virginica, and two thirds belongs to other class.
#It means we don't have balance in how each class represented in the data set.
rbind(
apply(dat[which(dat$Species=='other'),1:4], 2, mean),
apply(dat[which(dat$Species=='virginica'),1:4], 2, mean)
)#按列求每一列的均值
# what about the coefficients of linear discriminates?那么线性微分的系数呢?
x = as.matrix(dat[,1:4])
proj = x %*% lda.o$scaling #取每一列的数据,乘lda.o$scaling
plot(proj, pch=20, col=dat$Species, cex=2)
#we can see, the projections increase as we move from one class to another
# little hack to recover the fitted values quickly
predo = predict(lda.o, newdata=dat)
names(predo)
predo$class#显示每一个测试样本的分类
predo$posterior#有0.9990663400的概率第一个测试样本属于other class,有0.0009336600的概率它属于virginica
y = predo$x
??predo
plot(proj, y)#由此我们能看出,the scores used by LDA in order to generate the predictions and probabilities are the result of proj by using lda.o$scaling directly.(y=proj)
plot(y, predo$posterior[,2])
boxplot(y ~ (predo$posterior[,2]>.5))#第二列probabilities > 0.5 是virginica(TRUE)
boxplot(proj ~ (predo$posterior[,2]>.5))#两种画出来的图是一样的,因为y和proj相等
# NB: The way these coefs is calculated follows the MANOVA approach
# popular hack to recover the fitted values:
fitted.values = predict(lda.o, newdata=dat)$class
boxplot(y~dat$Species)
boxplot(proj~dat$Species)
(tb.2 = table(fitted.values, dat$Species))
sum(diag(tb.2)) / sum(tb.2)#accuracy
#LDA will be more efficient than LR when class are well separated.
## (2) 3-class classification problem
dat = iris
## Explore distribution of predictors:
# boxplots seem relatively symmetric:
par(mfrow=c(2,2))
# here's just a loop to save having to write 4 boxplot
# instructions with names by hand (being lazy often
# makes for nicer code):
for(j in 1:4){
boxplot(dat[,j]~dat$Species,
xlab = 'Species',
ylab = 'predictor',
col=c('cyan','pink'),
main=names(dat)[j])
}
# but we'd rather check for Normality more specifically:
Ls = levels(dat$Species)
par(mfcol=c(3,4))
for(j in 1:4){
hist(dat[which(dat$Species==Ls[1]),j], col='cyan',
main=names(dat)[j])
hist(dat[which(dat$Species==Ls[2]),j], col='pink',
main=names(dat)[j])
hist(dat[which(dat$Species==Ls[3]),j], col='green',
main=names(dat)[j])
}
# could also use QQ-plots:
par(mfcol=c(3,4))
for(j in 1:4){
x1 = dat[which(dat$Species==Ls[1]),j]
qqnorm(x1, pch=20, col='cyan', main=names(dat)[j])
abline(a=mean(x1), b=sd(x1))
x2 = dat[which(dat$Species==Ls[2]),j]
qqnorm(x2, pch=20, col='pink', main=names(dat)[j])
abline(a=mean(x2), b=sd(x2))
x3 = dat[which(dat$Species==Ls[3]),j]
qqnorm(x3, pch=20, col='green', main=names(dat)[j])
abline(a=mean(x3), b=sd(x3))
}
# So what do you think now?
## Check initial assumption of equality of variances:
# Bartlett's test with H0: all variances are equal
print( bartlett.test(dat[,1]~dat$Species)$p.value )
print( bartlett.test(dat[,2]~dat$Species)$p.value )
print( bartlett.test(dat[,3]~dat$Species)$p.value )
print( bartlett.test(dat[,4]~dat$Species)$p.value )
## or if in lazy mode:
for(j in 1:4){
print( bartlett.test(dat[,j]~dat$Species)$p.value )
}
## Fit LDA model to this dataset and check accuracy:
lda.o = lda(Species~., data=dat)
(lda.o)
ftted.values = predict(lda.o, newdata=dat)$class
(tb.3 = table(ftted.values, dat$Species))
sum(diag(tb.3)) / sum(tb.3)
###############################################################
### Exercise 4: LDA
###############################################################
## (1) 2-class classification problem
dat = iris
dat$Species = as.factor(ifelse(iris$Species=="virginica",1,0))
levels(dat$Species) = c("other","virginica")
n = nrow(dat)
set.seed(4061)
dat = dat[sample(1:n),] # shuffle dataset
i.train = 1:100
dat.train = dat[i.train,]
dat.test = dat[-i.train,]
lda.o = lda(Species~., data=dat.train)
lda.p = predict(lda.o, newdata=dat.test)
names(lda.p)
(tb = table(lda.p$class, dat.test$Species))
sum(diag(tb))/sum(tb)
# QDA:
qda.o = qda(Species~., data=dat.train)
qda.p = predict(qda.o, newdata=dat.test)
(tb = table(qda.p$class, dat.test$Species))
sum(diag(tb))/sum(tb)
## (2) 3-class classification problem
dat = iris
n = nrow(dat)
set.seed(4061)
dat = dat[sample(1:n),]
i.train = 1:100
dat.train = dat[i.train,]
dat.test = dat[-i.train,]
# LDA:
lda.o = lda(Species~., data=dat.train)
lda.p = predict(lda.o, newdata=dat.test)
names(lda.p)
(tb = table(lda.p$class, dat.test$Species))
sum(diag(tb))/sum(tb)
# QDA:
qda.o = qda(Species~., data=dat.train)
qda.p = predict(qda.o, newdata=dat.test)
(tb = table(qda.p$class, dat.test$Species))
sum(diag(tb))/sum(tb)
###############################################################
### Exercise 5: benchmarking
###############################################################
## (1) benchmarking on unscaled data
#ROC图像 横纵坐标都越接近1越好, AOC是线与横轴包围的面积,也是越大越好
#sensitivity:纵坐标
#specificity:横坐标
#confusionMatrix:混淆矩阵
set.seed(4061)
n = nrow(Default)
dat = Default[sample(1:n, n, replace=FALSE), ]
# get a random training sample containing 70% of original sample:
i.cv = sample(1:n, round(.7*n), replace=FALSE)
dat.cv = dat[i.cv,] # use this for CV (train+test)
dat.valid = dat[-i.cv,] # save this for later (after CV) (HOLD-OUT)
# tuning of the classifiers:
K.knn = 3
# perform K-fold CV:
K = 10
N = length(i.cv)
folds = cut(1:N, K, labels=FALSE)
acc.knn = acc.glm = acc.lda = acc.qda = numeric(K)
auc.knn = auc.glm = auc.lda = auc.qda = numeric(K)
#
for(k in 1:K){ # 10-fold CV loop
# split into train and test samples:
i.train = which(folds!=k)
dat.train = dat.cv[i.train, ]
dat.test = dat.cv[-i.train, ]
# adapt these sets for kNN:
x.train = dat.train[,-1]
y.train = dat.train[,1]
x.test = dat.test[,-1]
y.test = dat.test[,1]
x.train[,1] = as.numeric(x.train[,1])
x.test[,1] = as.numeric(x.test[,1])
# train classifiers:
knn.o = knn(x.train, x.test, y.train, K.knn)
glm.o = glm(default~., data=dat.train, family=binomial(logit))
lda.o = lda(default~., data=dat.train)
qda.o = qda(default~., data=dat.train)
# test classifiers:
knn.p = knn.o
glm.p = ( predict(glm.o, newdata=dat.test, type="response") > 0.5 )
lda.p = predict(lda.o, newdata=dat.test)$class
qda.p = predict(qda.o, newdata=dat.test)$class
tb.knn = table(knn.p, y.test)
tb.glm = table(glm.p, y.test)
tb.lda = table(lda.p, y.test)
tb.qda = table(qda.p, y.test)
# store prediction accuracies:
acc.knn[k] = sum(diag(tb.knn)) / sum(tb.knn)
acc.glm[k] = sum(diag(tb.glm)) / sum(tb.glm)
acc.lda[k] = sum(diag(tb.lda)) / sum(tb.lda)
acc.qda[k] = sum(diag(tb.qda)) / sum(tb.qda)
#
# ROC/AUC analysis:AUC值越大越好
# WARNING: THIS IS NOT Pr(Y=1 | X), BUT Pr(Y = Y_hat | X):
knn.p = attributes(knn(x.train, x.test, y.train, K.knn, prob=TRUE))$prob
glm.p = predict(glm.o, newdata=dat.test, type="response")
lda.p = predict(lda.o, newdata=dat.test)$posterior[,2]
qda.p = predict(qda.o, newdata=dat.test)$posterior[,2]
auc.knn[k] = roc(y.test, knn.p)$auc
auc.glm[k] = roc(y.test, glm.p)$auc
auc.lda[k] = roc(y.test, lda.p)$auc
auc.qda[k] = roc(y.test, qda.p)$auc
}
boxplot(acc.knn, acc.glm, acc.lda, acc.qda,
main="Overall CV prediction accuracy",
names=c("kNN","GLM","LDA","QDA"))
boxplot(auc.knn,auc.glm, auc.lda, auc.qda,
main="Overall CV AUC",
names=c("KNN","GLM","LDA","QDA"))
##### Taking a closer look at performance
roc(y.test, glm.p)$auc#真实值 预测值
plot(roc(y.test, glm.p))
#取threshold为0.5,用caret建立混淆矩阵,计算精确度
library(caret)
(tb = table(y.test, glm.p>.5))
pred = as.factor(glm.p>.5)
pred = car::recode(pred, "FALSE='No'; TRUE='Yes'")
caret::confusionMatrix(y.test, pred)
sum(diag(tb))/sum(tb)
(683+3)/(683+3+14)#=98%
#看总体数据的真实情况,No占96.67%,与98%相差不大,说明预测有效
table(Default$default)
table(Default$default)/nrow(Default)
##### Further exercises for you to do:
## adapt code to evaluate sensitivity and specificity
## add validation analysis...
## repeat on scaled data... 会对KNN有影响
###############################################################
### Exercise 6: benchmarking, again
###############################################################
## (1) benchmarking on unscaled data
set.seed(4061)
n = nrow(Default)
dat = Default[sample(1:n, n, replace=FALSE), ]
# get a random training sample containing 70% of original sample:
i.cv = sample(1:n, round(.7*n), replace=FALSE)
x = dat.cv = dat[i.cv,] # use this for CV (train+test)
dat.valid = dat[-i.cv,] # save this for later (after CV)
# Recover ROC curve manually from whole set:
n = nrow(x)
acc = numeric(length(thrs))
sens = spec = numeric(length(thrs))
thrs = seq(.05,.95,by=.05)
for(ithr in 1:length(thrs)){
thr = thrs[ithr]
glmo = glm(default~., data=x,
family=binomial)
tb = table(glmo$fitted.values>thr, x$default)
acc[ithr] = sum(diag(tb))/sum(tb)
#
# calculate sensitivity for a given threshold
sens[ithr] = tb[2,2]/sum(tb[,2])
# calculate specificity for a given threshold
spec[ithr] = tb[1,1]/sum(tb[,1])
# prediction
}
plot(1-spec,sens,xlim=c(0,1),ylim=c(0,1),t='b')
abline(h=c(0,1),v=c(0,1),col=8)
abline(a=0,b=1,col=8)
plot(acc)
plot(spec, sens)#能看出来spec增加,sens会下降
confusionMatrix(y.test,pred)
# Evaluate a cross-validated ROC curve manually:
# 手工评估交叉验证的ROC曲线:
n = nrow(x)
K = 10
train.acc = test.acc = matrix(NA, nrow=K, ncol=length(thrs))
folds = cut(1:n, K, labels=FALSE)
k = 1
thrs = seq(.05,.95,by=.05)
for(ithr in 1:length(thrs)){
thr = thrs[ithr]
for(k in 1:K){
itrain = which(folds!=k)
glmo = glm(default~., data=x,
family=binomial,
subset=itrain)
tb = table(glmo$fitted.values>thr, x$default[itrain])
train.acc[k, ithr] = sum(diag(tb))/sum(tb)
#
# calculate sensitivity for a given threshold
sens[ithr] = tb[2,2]/sum(tb[,2])
# calculate specificity for a given threshold
spec[ithr] = tb[1,1]/sum(tb[,1])
# prediction
p.test = predict(glmo, x[-itrain,], type='response')
tb = table(p.test>thr, x$default[-itrain])
test.acc[k,ithr] = sum(diag(tb))/sum(tb)
}
}
boxplot(test.acc)
mean(train.acc)
mean(test.acc)
####eric上课讲的2018CA1整体代码####
# install.packages("mlbench")
library(mlbench)
data(Sonar)
N = nrow(Sonar)
P = ncol(Sonar)-1
M = 150
set.seed(1)
mdata = Sonar[sample(1:N),]#shuffle
itrain = sample(1:N,M)#150/208作为训练样本
x = mdata[, -ncol (mdata)]
y = mdata$Class
xm = as.matrix(x)
#1)
N-M
#2)
lasso.opt = cv.glmnet(xm[itrain,],
y[itrain],
alpha=1,
family='binomial')
lasso.opt$lambda.min
#0.01103539
#3)
lasso.mod = glmnet(xm[itrain,],
y[itrain],
alpha=1,
family='binomial',
lambda=lasso.opt$lambda.min)
coef(lasso.mod)
#4)
tree.mod = tree(y~., data=x, subset=itrain)
summary(tree.mod)
sort(summary(tree.mod) $used)
#11 variables
#5)
rf.mod = randomForest(y~., data=x, subset=itrain)
varImpPlot(rf.mod)
#6)
tree.pred = predict(tree.mod, x[-itrain,], 'class')
rf.pred = predict(rf.mod, x[-itrain,], 'class')
(tb.tree = table(tree.pred,y[-itrain]))
(tb.rf = table(rf.pred,y[-itrain]))
1-sum(diag(tb.tree))/sum(tb.tree)
1-sum(diag(tb.rf))/sum(tb.rf)
#The classification error rate is smaller, means the model is more accuracy.
#7)
tree.p = predict(tree.mod, x[-itrain,], 'vector')[,2]
rf.p = predict(rf.mod, x[-itrain,], 'prob')[,2]
auc.tree = roc(response=y[-itrain], predictor=tree.p)$auc
auc.rf = roc(response=y[-itrain], predictor=rf.p)$auc
c(auc.tree, auc.rf)
#The auc is greater,means this model has more sensitive and specificity.
#The classification error rate is smaller, means the model is more accuracy.
#Since both error rate and auc improved, we can say that RF model is better rather than a tree for classification.
# --------------------------------------------------------
# ST4061 / ST6041
# 2021-2022
# Eric Wolsztynski
# ...
#### Exercises Section 4: Tree-based methods 用树来分类####
# --------------------------------------------------------
###############################################################
#### Exercise 1: growing and pruning a tree ####
###############################################################
#decision tree:从所有变量中挑出一个最重要的来作为根结点,后面的每一阶都是从所有变量里选最重要的一个作为下一个节点,这会使得有重复的变量被选中,画出的树特别大,造成overfitting
#因此,我们需要对做出的tree做删减pruning,目的是找到一个最理想的tree size分级大小,也就是方差dev最小的那个
install.packages("tree")
library(ISLR) # contains the dataset
library(tree) # contains... tree-building methods
# Recode response variable so as to make it a classification problem
High = ifelse(Carseats$Sales<=8, "No", "Yes")
# Create a data frame that includes both the predictors and response
# (a data frame is like an Excel spreadsheet, if you like)
CS = data.frame(Carseats, High)
CS$Sales = NULL#把sale去掉
CS$High = as.factor(CS$High) # <-- this bit was missing 必须有 不然后面会warning
# Fit the tree using all predictors (except for variable Sales,
# which we have "recoded" into a cateorical response variable)
# to response variable High
tree.out = tree(High~., CS)
summary(tree.out)
# plot the tree
plot(tree.out)
text(tree.out, pretty=0)
#The tree is fully grown, there is lots of little branches, maybe have a very detailed breakdown, means there is an overfitting of the dataset
# pruning:修剪
?cv.tree
set.seed(3)
cv.CS = cv.tree(tree.out, FUN=prune.misclass)#添加prune.misclass函数以找到合适的size
names(cv.CS)
# - size:
# number of terminal nodes in each tree in the cost-complexity pruning sequence.
#two method to control the depth of the tree:
# (1)- deviance:
# total deviance of each tree in the cost-complexity pruning sequence.
# (2)- k:
# the value of the cost-complexity pruning parameter of each tree in the sequence.
cv.CS
par(mfrow=c(1,2))
plot(cv.CS$size,cv.CS$dev,t='b')#find which size has the smallest deviation 93 (偏差:观测值-真实值)
min(cv.CS$dev)
cv.CS$size[which.min(cv.CS$dev)]
abline(v=cv.CS$size[which.min(cv.CS$dev)])#marking the 最合适size的location
plot(cv.CS$k,cv.CS$dev,t='b')#还可以找到最小的dev对应的最合适的k
# use pruning:
# - use which.min(cv.CS$dev) to get the location of the optimum
# - retrieve the corresponding tree size
# - pass this information on to pruning function
opt.size = cv.CS$size[which.min(cv.CS$dev)] #Find the optimal size
# see:
plot(cv.CS$size,cv.CS$dev,t='b')
abline(v=cv.CS$size[which.min(cv.CS$dev)])
ptree = prune.misclass(tree.out, best=opt.size) #using the optimal size to pruning the tree
ptree
summary(ptree)
par(mfrow=c(1,2))
plot(tree.out)#initially
text(tree.out, pretty=0)
plot(ptree)#pruned
text(ptree, pretty=0)
#Compare with the initial treem, there is fewer branches in prune tree.
###############################################################
#### Exercise 2: apply CV and ROC analysis ####
###############################################################
# Train/test:
set.seed(2)
n = nrow(CS)
itrain = sample(1:n, 200)#一共有400组数据,随机选取其中的200个作为训练向本
CS.test = CS[-itrain,]
High.test = High[-itrain]
# argument 'subset' makes it easy to handle training/test splits:
tree.out = tree(High~., CS, subset=itrain)#总体的树,没有修剪过
summary(tree.out)
plot(tree.out)
text(tree.out, pretty=0)
# prediction from full tree:
tree.pred = predict(tree.out, CS.test, type="class")#总体的树的预测值
(tb1 = table(tree.pred,High.test))#总体的树对High的混淆矩阵
# prediction from pruned tree:
ptree.pred = predict(ptree, CS.test, type="class")#修剪过的树的预测值
(tb2 = table(ptree.pred,High.test)) # 修剪过的树的confusion matrix
sum(diag(tb1))/sum(tb1)#总体的树的准确度#classification rate
sum(diag(tb2))/sum(tb2)#修剪过的树的准确度
#预测错的值老师叫做misclassified obsevations
#预测错的概率叫misclassification rate
#准确度高的老师叫做is better suited to predict unseen data
# perform ROC analysis
library(pROC)
# here we specify 'type="vector"' to retrieve continuous scores
# as opposed to predicted labels, so that we can apply varying
# threshold values to these scores to build the ROC curve:
#在ROC中,图像越靠近左上角越好
ptree.probs = predict(ptree, CS.test, type="vector")#在画ROC的时候,必须用type="vector"
roc.p = roc(response=(High.test), predictor=ptree.probs[,1])
plot(roc.p)
#AUC
#AUC是曲线下面的面积,越大越好
#下面是两种方法,都可以
#取值范围在0.5和1之间
#越接近1.0,检测方法真实性越高;
#等于0.5时,则真实性最低,无应用价值
#AUC < 0.5,比随机猜测还差
auc(roc.p)
roc.p$auc
###############################################################
#### Exercise 3: find the tree ####
###############################################################
#Bagging
#由于decision tree有许多缺点,比如重复选择,太庞大,overfitting,用一个更为优化的分类方法叫做Bagging
#用Bootstrap,每次BT里选一个新的variable做节点往下分,high variance, low bias,用于分类和回归都可以
#但是每一层选出来的variable会与前面的有很强的相关性
# ... can you find it?
###############################################################
#### Exercise 4: grow a random forest ####
###############################################################
#更好的一种方法,each learner:可以针对某一种特性来分类random subset is considering feature Q
#不容易overfit,不敏感
library(tree)
library(ISLR)
#install.packages("randomForest")
library(randomForest)
# ?Carseats
High = as.factor(ifelse(Carseats$Sales <= 8, 'No', 'Yes'))
CS = data.frame(Carseats, High)
CS$Sales = NULL
P = ncol(CS)-1 # number of features(把High去掉)
# grow a single (unpruned) tree
tree.out = tree(High~., CS)
# fitted values for "training set"按照decision tree直接全部预测
tree.yhat = predict(tree.out, CS, type="class")
# grow a forest:建立一个森林
rf.out = randomForest(High~., CS)
# fitted values for "training set"按照randomForest直接全部预测
rf.yhat = predict(rf.out, CS, type="class")
#If you want to perform bagging instead of random forest
# compare to bagging: using all features,not include y 这种方法叫bagging(用了全部的variables);rf可以随便选用mtry,自定义用多少feature
bag.out = randomForest(High~., CS, mtry=P)#添加mtry= P = ncol(CS)-1 # number of features(把High去掉)
# fitted values for "training set"
bag.yhat = predict(bag.out, CS, type="class")#按照bag直接全部预测
# confusion matrix for tree:
(tb.tree = table(tree.yhat, High))#按照decision tree直接全部预测的混淆矩阵
# confusion matrix for RF
(tb.rf = table(rf.yhat, High))#按照randomForest直接全部预测的混淆矩阵,可以看出分类全部正确,perfect
# confusion matrix for bagging
(tb.bag = table(bag.yhat, High))#按照bag直接全部预测的混淆矩阵
# Note this is different to the confusion of RF
# matrix for the OOB observations:OOB对建树时未使用的数据(bootstrap没用到的数据)进行误差估计
(tb.rf2 = rf.out$confusion)#rf.out是真实的数据直接放到RF里,不做预测,说明真实情况下按照用RF方法应该有部分数据是错误分类的
sum(diag(tb.tree))/sum(tb.tree)#decision tree方法精确度91%
sum(diag(tb.rf))/sum(tb.rf)#RF方法精确度1(完全正确)但不是什么数据都是1,这里是巧了
sum(diag(tb.bag))/sum(tb.bag)#accurate=1
sum(diag(tb.rf2))/sum(tb.rf2)
#原本bag做出来的精确度应该比rf低,但现在它比rf的精确度高了,1>0.8141762, 说明bag出现了over fit
# train-test split
set.seed(6041)
N = nrow(CS)
itrain = sample(1:N, 200)
CS.train = CS[itrain,]
CS.test = CS[-itrain,]
tree.out = tree(High~., CS.train)
# fitted values for "train set"
tree.yhat = predict(tree.out, CS.train, type="class")
# fitted values for "test set"
tree.pred = predict(tree.out, CS.test, type="class")
rf.out = randomForest(High~., CS.train)
# fitted values for "training set"
rf.yhat = predict(rf.out, CS.train, type="class")
# fitted values for "test set"
rf.pred = predict(rf.out, CS.test, type="class")
bag.out = randomForest(High~., CS.train, mtry=(ncol(CS)-2))
# fitted values for "training set"
bag.yhat = predict(bag.out, CS.train, type="class")
# fitted values for "test set"
bag.pred = predict(bag.out, CS.test, type="class")
# confusion matrix for tree (test data):
(tb.tree = table(tree.pred, CS.test$High))
# confusion matrix for RF (test data):
(tb.rf = table(rf.pred, CS.test$High))#better performance of rf
# confusion matrix for Bagging (test data):
(tb.bag = table(bag.pred, CS.test$High))#bagging is not as good as rf
sum(diag(tb.tree))/sum(tb.tree)
sum(diag(tb.rf))/sum(tb.rf)
sum(diag(tb.bag))/sum(tb.bag)
###############################################################
#### Exercise 5: benchmarking ####
###############################################################
###### Exercise 5: benchmarking (this exercise is left as homework)######
# bring in that code from Section 2 (below) and add to it:
library(class) # contains knn()
library(ISLR) # contains the datasets
library(pROC)
library(tree)
library(randomForest)
#### (1) benchmarking on unscaled data对未缩放数据进行基准测试####
#对一类测试对象的某项性能指标进行定量的和可对比的测试
set.seed(4061)
n = nrow(Default)
dat = Default[sample(1:n, n, replace=FALSE), ]
i.cv = sample(1:n, round(.7*n), replace=FALSE)
dat.cv = dat[i.cv,] # use this for CV (train+test)
dat.valid = dat[-i.cv,] # save this for later (after CV)
# tuning of the classifiers:
K.knn = 3
K = 10
N = length(i.cv)
folds = cut(1:N, K, labels=FALSE)
acc.knn = acc.glm = acc.lda = acc.qda = numeric(K)#prediction accuracies
auc.knn = auc.glm = auc.lda = auc.qda = numeric(K)#AUC
acc.rf = auc.rf = numeric(K)
for(k in 1:K){ # 10-fold CV loop
# split into train and test samples:
#首先要划分训练样本和测试样本
i.train = which(folds!=k)
dat.train = dat.cv[i.train, ]
dat.test = dat.cv[-i.train, ]
#作出KNN拟合
# adapt these sets for kNN:
x.train = dat.train[,-1]
y.train = dat.train[,1]
x.test = dat.test[,-1]
y.test = dat.test[,1]
x.train[,1] = as.numeric(x.train[,1])
x.test[,1] = as.numeric(x.test[,1])
# train classifiers:
knn.o = knn(x.train, x.test, y.train, K.knn)
#通训练样本作出其他拟合
glm.o = glm(default~., data=dat.train, family=binomial(logit))# 做logistic regression
lda.o = lda(default~., data=dat.train)#做LDA
qda.o = qda(default~., data=dat.train)#做QDA
rf.o = randomForest(default~., data=dat.train)#做randomForest
#用测试样本进行预测
# test classifiers:
# (notice that predict.glm() does not have a functionality to
# return categorical values, so we copmute them based on the
# scores by applying a threshold of 50%)
# 测试分类器:
# (注意 predict.glm() 没有功能
# 返回分类值,因此我们根据
# 通过应用 50% 的阈值得分)
knn.p = knn.o
glm.p = ( predict(glm.o, newdata=dat.test, type="response") > 0.5 )
lda.p = predict(lda.o, newdata=dat.test)$class
qda.p = predict(qda.o, newdata=dat.test)$class
rf.p = predict(rf.o, newdata=dat.test)
#做各个方法的测试样本的混淆矩阵
# corresponding confusion matrices:
tb.knn = table(knn.p, y.test)
tb.glm = table(glm.p, y.test)
tb.lda = table(lda.p, y.test)
tb.qda = table(qda.p, y.test)
tb.rf = table(rf.p, y.test)
#每个方法的准确度
# store prediction accuracies:
acc.knn[k] = sum(diag(tb.knn)) / sum(tb.knn)
acc.glm[k] = sum(diag(tb.glm)) / sum(tb.glm)
acc.lda[k] = sum(diag(tb.lda)) / sum(tb.lda)
acc.qda[k] = sum(diag(tb.qda)) / sum(tb.qda)
acc.rf[k] = sum(diag(tb.rf)) / sum(tb.rf)
#
# ROC/AUC analysis:
#FandonForest没有
# WARNING: THIS IS NOT PR(Y=1 | X), BUT Pr(Y = Y_hat | X):
knn.p = attributes(knn(x.train, x.test, y.train, K.knn, prob=TRUE))$prob
glm.p = predict(glm.o, newdata=dat.test, type="response")
lda.p = predict(lda.o, newdata=dat.test)$posterior[,2]
qda.p = predict(qda.o, newdata=dat.test)$posterior[,2]
#auc.knn[k] = roc(y.test, knn.p)$auc
auc.glm[k] = roc(y.test, glm.p)$auc
auc.lda[k] = roc(y.test, lda.p)$auc
auc.qda[k] = roc(y.test, qda.p)$auc
}
boxplot(acc.knn, acc.glm, acc.lda, acc.qda,
main="Overall CV prediction accuracy",
names=c("kNN","GLM","LDA","QDA"))
#下面两个的区别在于有没有KNN的AUC(因为KNN的AUC的图和其他方法放不在一起)
boxplot(auc.glm, auc.lda, auc.qda,
main="Overall CV AUC",
names=c("GLM","LDA","QDA"))
boxplot(auc.knn, auc.glm, auc.lda, auc.qda,
main="Overall CV AUC",
names=c("kNN","GLM","LDA","QDA"))
###############################################################
### Exercise 6: Variable importance from RF ####
###############################################################
#由RF看哪个variable是最重要的(系数越大越重要)
library(ISLR)
library(randomForest)
# ?Carseats
High = as.factor(ifelse(Carseats$Sales <= 8, 'No', 'Yes'))
CS = data.frame(Carseats, High)
CS$Sales = NULL
# grow a forest:
rf.out = randomForest(High~., CS)
# compare to bagging:
bag.out = randomForest(High~., CS, mtry=(ncol(CS)-1))
cbind(rf.out$importance, bag.out$importance)
#bag会让原本重要的variable更重要,会让不重要的variable更不重要,because essentially those strong predictors have a better chance to get selected every time.
#比如,CompPrice,用bag之后重要性增加了,后面四个variable,用bag之后重要性减少了,还可以说用rf,CompPrice的重要性是US的3倍
par(mfrow=c(1,2))
varImpPlot(rf.out, pch=15, main="Ensemble method 1")#Price最重要,urban最不重要
varImpPlot(bag.out, pch=15, main="Ensemble method 2")
#Mean decrease Gini, 纵坐标是减小的幅度,减小的幅度越大,也就是数值越大,variable越重要。
?randomForest
###############################################################
#### Exercise 7: gradient boosting ####
#baging 所有的自变量都是要用的; boosting是加权的
###############################################################
# gradient boosting model 梯度增强模型
# 之前的方法,下一阶与上一阶会有相关性,为了减小correlation
# 找到真实值与预测值的残差,下一阶基于上一阶的残差分析,每一阶都弥补了上一阶的残差,可以减小偏差
# 缺点:容易over fit,比较敏感
library(ISLR) # contains the dataset
library(tree) # contains... tree-building methods
#install.packages('gbm')
library(gbm) # contains the GB model implementation
library(pROC)
# Recode response variable so as to make it a classification problem
High = as.factor(ifelse(Carseats$Sales<=8, "No", "Yes"))
CS = data.frame(Carseats, High)
# remove Sales from data frame, just to make formulas simpler!
CS$Sales = NULL
####(1)####
set.seed(1)
itest = sample(1:nrow(CS), round(nrow(CS)/4))#题目里面要求选100个作为测试样本
CS.test = CS[itest,]#测试样本
CS = CS[-itest,]
####(2)####
set.seed(4061)
# Note:
gbm(High~., data=CS, distribution="bernoulli")
#对于分类问题,选择bernoulli或者adaboost,前者更为推荐
#对于连续因变量(回归),选择gaussian或者laplace
# so we must recode the levels...
CS$High = (as.numeric(CS$High=="Yes")) # 前面HIGH是factor,这里又变会numeric(yes=1,no=0), this hack could be useful
gb.out = gbm(High~., data=CS,
distribution="bernoulli", # use "gaussian" instead for regression
n.trees=5000, # size of the ensemble
interaction.depth=1) # depth of the trees, 1 = stumps 限制每棵树的深度
#distribution:模型计算损失函数时,需要对输出变量的数据分布做出假设。
#对于分类问题,选择bernoulli或者adaboost,前者更为推荐
#对于连续因变量,选择gaussian或者laplace
#n.trees:即number of iteration—迭代次数。迭代次数的选择与学习速率密切相关
#interaction.depth和n.minobsinnode:
#子决策树即基础学习器的深度和决策树叶节点包含的最小观测树,
#若基础学习器训练得过于复杂,将提升模型对于样本的拟合能力而导致过拟合问题,
#因此子决策树深度不宜过大,
#叶节点可包含的最小观测书不宜过小。
summary(gb.out$train.error)
# inspect output:
par(mar=c(4.5,6,1,1))
summary(gb.out, las=1)#summary函数返回自变量的相对重要性
plot(gb.out)
plot(gb.out, i="Price")# i is the index of variable or which variable you want to pick
#从图中可以看出,随着price的增加,预测的High数值的中位数越小
plot(gb.out, i="ShelveLoc")
#由于ShelveLoc是离散的,画出来是点,可以看出随着ShelveLoc变好,预测的High数值越大
gb.p = predict(gb.out, newdata=CS.test, n.trees=5000)
gb.p
roc.gb = roc(response=CS.test$High, predictor=gb.p)
plot(roc.gb)
roc.gb$auc
# compare AUC's with a Random Forest:
library(randomForest)
CS$High = as.factor(CS$High)
rf.out = randomForest(High~., CS, ntree=5000)
# fitted values for "training set"
rf.p = predict(rf.out, CS.test, type="prob")[,2]
roc.rf = roc(response=CS.test$High, predictor=rf.p)
plot(roc.rf, add=TRUE, col=2)
roc.gb$auc
roc.rf$auc
#AUC高的那个 great illustrate the model
###############################################################
#### Exercise 8: gradient boosting using caret...####
###############################################################
# Plug in the following snips of code within demo code
# Section4b_demo_using_caret.R:
############ (I) Classification example ############
### Gradient boosting (using caret) for classification
rm(list=ls()) # clear the environment
library(ISLR) # contains the data
library(caret) # contains everything else
set.seed(4061) # for reproducibility
# Set up the data (take a subset of the Hitters dataset)
data(Hitters)
Hitters = na.omit(Hitters)
dat = Hitters
n = nrow(dat)
NC = ncol(dat)
# Change the response variable to a factor to make this a
# classification problem:
dat$Salary = as.factor(ifelse(Hitters$Salary>median(Hitters$Salary),
"High","Low"))
# Data partition
itrain = sample(1:n, size=round(.7*n))
dat.train = dat[itrain,]
dat.validation = dat[-itrain,] # independent validation set for later
# x = select(dat.train,-"Salary") ### if using dplyr
# training set:
x = dat.train
x$Salary = NULL
y = as.factor(dat.train$Salary)
gb.out = train(Salary~., data=dat.train, method='gbm', distribution='bernoulli')
gb.fitted = predict(gb.out) # corresponding fitted values
gb.pred = predict(gb.out, dat.validation)
confusionMatrix(reference=dat.validation$Salary, data=gb.pred,
mode="everything")
############ (II) Regression example ############
### Gradient boosting (using caret) for regression
rm(list=ls()) # clear the environment
# Set up the data (take a subset of the Hitters dataset)
data(Hitters)
Hitters = na.omit(Hitters)
dat = Hitters
# hist(dat$Salary)
dat$Salary = log(dat$Salary)
n = nrow(dat)
NC = ncol(dat)
# Data partition
itrain = sample(1:n, size=round(.7*n))
dat.train = dat[itrain,]
dat.validation = dat[-itrain,]
x = dat.train
x$Salary = NULL
y = dat.train$Salary
ytrue = dat.validation$Salary
gb.out = train(Salary~., data=dat.train, method='gbm', distribution='gaussian')
gb.fitted = predict(gb.out) # corresponding fitted values
gb.pred = predict(gb.out, dat.validation)
mean((gb.pred-ytrue)^2)
# --------------------------------------------------------
# ST4061 / ST6041
# 2021-2022
# Eric Wolsztynski
# ...
#### Section 4: demo code for xgboost (Extreme GB) ####
# --------------------------------------------------------
rm(list=ls()) # clear the environment
library(ISLR) # contains the data
library(xgboost) # XGBoost... and xgb.DMatrix()
library(caret)
set.seed(4061) # for reproducibility
# Set up the data (take a subset of the Hitters dataset)
data(Hitters)
Hitters = na.omit(Hitters)
dat = Hitters
n = nrow(dat)
NC = ncol(dat)
# Change the response variable to a factor to make this a
# classification problem:
dat$Salary = as.factor(ifelse(Hitters$Salary>median(Hitters$Salary),
"High","Low"))
# Data partition
itrain = sample(1:n, size=round(.7*n))
dat.train = dat[itrain,]
dat.validation = dat[-itrain,] # independent validation set for later
# x = select(dat.train,-"Salary") ### if using dplyr
# training set:
x = dat.train
x$Salary = NULL
y = dat.train$Salary
# test set:
x.test = dat.validation
x.test$Salary = NULL
y.test = dat.validation$Salary
# XGBoost...
set.seed(4061)
# (a) line up the data in the required format
# train set:
xm = model.matrix(y~., data=x)[,-1]
x.xgb = xgb.DMatrix(xm)
# test set:
xm.test = model.matrix(y.test~., x.test)[,-1]
x.xgb.test = xgb.DMatrix(xm.test)
# (b) training...
# NB: one can run xgbboost() with default parameters by using:
xgb.ctrl = trainControl(method="none")
# otherwise:
xgb.ctrl = trainControl(method="cv", number=10, returnData=FALSE)
xgb.model = train(x.xgb, y, trControl=xgb.ctrl, method="xgbTree")
# NB: use argument tuneGrid to specify custom grids of values for
# tuning parameters. Otherwise train() picks its own grids.
xgb.model$bestTune
# (c) testing...
xgb.pred = predict(xgb.model, newdata=x.xgb.test)
confusionMatrix(data=xgb.pred, reference=y.test)
# --------------------------------------------------------
# The below demo code is only for information. There is no need
# to spend time looking into it for ST4061/ST6041 tests/exam!
#
# There are a number of parameters to be tuned for XGBoost:
modelLookup('xgbTree')
# All or a subset of these parameters can be tuned in a sequential
# manner. For each tuning parameter, we can define a grid of
# potential values and search for an optimal value within that grid.
#
# Careful! Running this code will take some time...
#
# (1) Max number of trees (just an example!):
tune.grid = expand.grid(nrounds = seq(500, 1000, by=100),
eta = c(0.025, 0.05, 0.1, 0.3),
max_depth = c(2, 3, 4, 5, 6),
gamma = 0,
colsample_bytree = 1,
min_child_weight = 1,
subsample = 1)
xgb.ctrl = trainControl(method="cv", number=10, returnData=FALSE)
xgb.tune = train(x.xgb, y, trControl=xgb.ctrl, tuneGrid=tune.grid, method="xgbTree")
#
# (2) Max tree depth and min child weight (just an example!):
tune.grid = expand.grid(nrounds = seq(500, 1000, by=100),
eta = xgb.tune$bestTune$eta,
max_depth = c(1:xgb.tune$bestTune$max_depth+2),
gamma = 0,
colsample_bytree = 1,
min_child_weight = c(1:3),
subsample = 1)
xgb.ctrl = trainControl(method="cv", number=10, returnData=FALSE)
xgb.tune = train(x.xgb, y, trControl=xgb.ctrl, tuneGrid=tune.grid, method="xgbTree")
#
# (3) sampling (just an example!):
tune.grid = expand.grid(nrounds = seq(500, 1000, by=100),
eta = xgb.tune$bestTune$eta,
max_depth = xgb.tune$bestTune$max_depth,
gamma = 0,
colsample_bytree = seq(0.2,1,by=.2),
min_child_weight = xgb.tune$bestTune$min_child_weight,
subsample = seq(.5,1,by=.1))
xgb.ctrl = trainControl(method="cv", number=10, returnData=FALSE)
xgb.tune = train(x.xgb, y, trControl=xgb.ctrl, tuneGrid=tune.grid, method="xgbTree")
#
# (4) gamma (just an example!):
tune.grid = expand.grid(nrounds = seq(500, 1000, by=100),
eta = xgb.tune$bestTune$eta,
max_depth = xgb.tune$bestTune$max_depth,
gamma = seq(0,1,by=.1),
colsample_bytree = xgb.tune$bestTune$colsample_bytree,
min_child_weight = xgb.tune$bestTune$min_child_weight,
subsample = xgb.tune$bestTune$subsample)
xgb.ctrl = trainControl(method="cv", number=10, returnData=FALSE)
xgb.tune = train(x.xgb, y, trControl=xgb.ctrl, tuneGrid=tune.grid, method="xgbTree")
#
# (5) learning rate (just an example!):
tune.grid = expand.grid(nrounds = seq(500, 5000, by=100),
eta = c(0.01,0.02,0.05,0.075,0.1),
max_depth = xgb.tune$bestTune$max_depth,
gamma = xgb.tune$bestTune$gamma,
colsample_bytree = xgb.tune$bestTune$colsample_bytree,
min_child_weight = xgb.tune$bestTune$min_child_weight,
subsample = xgb.tune$bestTune$subsample)
xgb.ctrl = trainControl(method="cv", number=10, returnData=FALSE)
xgb.tune = train(x.xgb, y, trControl=xgb.ctrl, tuneGrid=tune.grid, method="xgbTree")
#
# Then fit:
xgb.ctrl = trainControl(method="cv", number=10, returnData=FALSE)
xgb.tune = train(x.xgb, y, trControl=xgb.ctrl, tuneGrid=tune.grid, method="xgbTree")
# testing:
xgb.pred = predict(xgb.model, newdata=x.xgb.test)
confusionMatrix(data=xgb.pred, reference=y.test)