#### Section 4: demo code for xgboost (Extreme GB) ####
# --------------------------------------------------------
rm(list=ls()) # clear the environment
library(ISLR) # contains the data
library(xgboost) # XGBoost... and xgb.DMatrix()
library(caret)
set.seed(4061) # for reproducibility
# Set up the data (take a subset of the Hitters dataset)
data(Hitters)
Hitters = na.omit(Hitters)
dat = Hitters
n = nrow(dat)
NC = ncol(dat)
# Change the response variable to a factor to make this a
# classification problem:
dat$Salary = as.factor(ifelse(Hitters$Salary>median(Hitters$Salary),
"High","Low"))
# Data partition
itrain = sample(1:n, size=round(.7*n))
dat.train = dat[itrain,]
dat.validation = dat[-itrain,] # independent validation set for later
# x = select(dat.train,-"Salary") ### if using dplyr
# training set:
x = dat.train
x$Salary = NULL
y = dat.train$Salary
# test set:
x.test = dat.validation
x.test$Salary = NULL
y.test = dat.validation$Salary
# XGBoost...
set.seed(4061)
# (a) line up the data in the required format
# train set:
xm = model.matrix(y~., data=x)[,-1]
x.xgb = xgb.DMatrix(xm)
# test set:
xm.test = model.matrix(y.test~., x.test)[,-1]
x.xgb.test = xgb.DMatrix(xm.test)
# (b) training...
# NB: one can run xgbboost() with default parameters by using:
xgb.ctrl = trainControl(method="none")
# otherwise:
xgb.ctrl = trainControl(method="cv", number=10, returnData=FALSE)
xgb.model = train(x.xgb, y, trControl=xgb.ctrl, method="xgbTree")
# NB: use argument tuneGrid to specify custom grids of values for
# tuning parameters. Otherwise train() picks its own grids.
xgb.model$bestTune
# (c) testing...
xgb.pred = predict(xgb.model, newdata=x.xgb.test)
confusionMatrix(data=xgb.pred, reference=y.test)
# --------------------------------------------------------
# The below demo code is only for information. There is no need
# to spend time looking into it for ST4061/ST6041 tests/exam!
#
# There are a number of parameters to be tuned for XGBoost:
modelLookup('xgbTree')
# All or a subset of these parameters can be tuned in a sequential
# manner. For each tuning parameter, we can define a grid of
# potential values and search for an optimal value within that grid.
#
# Careful! Running this code will take some time...
#
# (1) Max number of trees (just an example!):
tune.grid = expand.grid(nrounds = seq(500, 1000, by=100),
eta = c(0.025, 0.05, 0.1, 0.3),
max_depth = c(2, 3, 4, 5, 6),
gamma = 0,
colsample_bytree = 1,
min_child_weight = 1,
subsample = 1)
xgb.ctrl = trainControl(method="cv", number=10, returnData=FALSE)
xgb.tune = train(x.xgb, y, trControl=xgb.ctrl, tuneGrid=tune.grid, method="xgbTree")
#
# (2) Max tree depth and min child weight (just an example!):
tune.grid = expand.grid(nrounds = seq(500, 1000, by=100),
eta = xgb.tune$bestTune$eta,
max_depth = c(1:xgb.tune$bestTune$max_depth+2),
gamma = 0,
colsample_bytree = 1,
min_child_weight = c(1:3),
subsample = 1)
xgb.ctrl = trainControl(method="cv", number=10, returnData=FALSE)
xgb.tune = train(x.xgb, y, trControl=xgb.ctrl, tuneGrid=tune.grid, method="xgbTree")
#
# (3) sampling (just an example!):
tune.grid = expand.grid(nrounds = seq(500, 1000, by=100),
eta = xgb.tune$bestTune$eta,
max_depth = xgb.tune$bestTune$max_depth,
gamma = 0,
colsample_bytree = seq(0.2,1,by=.2),
min_child_weight = xgb.tune$bestTune$min_child_weight,
subsample = seq(.5,1,by=.1))
xgb.ctrl = trainControl(method="cv", number=10, returnData=FALSE)
xgb.tune = train(x.xgb, y, trControl=xgb.ctrl, tuneGrid=tune.grid, method="xgbTree")
#
# (4) gamma (just an example!):
tune.grid = expand.grid(nrounds = seq(500, 1000, by=100),
eta = xgb.tune$bestTune$eta,
max_depth = xgb.tune$bestTune$max_depth,
gamma = seq(0,1,by=.1),
colsample_bytree = xgb.tune$bestTune$colsample_bytree,
min_child_weight = xgb.tune$bestTune$min_child_weight,
subsample = xgb.tune$bestTune$subsample)
xgb.ctrl = trainControl(method="cv", number=10, returnData=FALSE)
xgb.tune = train(x.xgb, y, trControl=xgb.ctrl, tuneGrid=tune.grid, method="xgbTree")
#
# (5) learning rate (just an example!):
tune.grid = expand.grid(nrounds = seq(500, 5000, by=100),
eta = c(0.01,0.02,0.05,0.075,0.1),
max_depth = xgb.tune$bestTune$max_depth,
gamma = xgb.tune$bestTune$gamma,
colsample_bytree = xgb.tune$bestTune$colsample_bytree,
min_child_weight = xgb.tune$bestTune$min_child_weight,
subsample = xgb.tune$bestTune$subsample)
xgb.ctrl = trainControl(method="cv", number=10, returnData=FALSE)
xgb.tune = train(x.xgb, y, trControl=xgb.ctrl, tuneGrid=tune.grid, method="xgbTree")
#
# Then fit:
xgb.ctrl = trainControl(method="cv", number=10, returnData=FALSE)
xgb.tune = train(x.xgb, y, trControl=xgb.ctrl, tuneGrid=tune.grid, method="xgbTree")
# testing:
xgb.pred = predict(xgb.model, newdata=x.xgb.test)
confusionMatrix(data=xgb.pred, reference=y.test)
# --------------------------------------------------------
# ST4061 / ST6041
# 2021-2022
# Eric Wolsztynski
# ...
#### Section 5: SVM Support Vector Machine ####
# --------------------------------------------------------
rm(list=ls()) # clear out running environment
library(randomForest)
library(class)
library(pROC)
library(e1071)
#SVM:一种复杂的分类工具
#### Example using SVM on the "best-student" data ####
# simulate data:生成随机数
set.seed(1)
n = 100
mark = rnorm(n, m=50, sd=10)
choc = rnorm(n, m=60, sd=5)
summary(mark)
summary(choc)
#评分标准--f(x) separating hyperplane: f(x)=int+a*mark + b*choc
int = 10
a = 2
b = 4
# rating of students on basis of their marks and level of appreciation of chocolate:
# 根据学生的分数和对巧克力的欣赏程度给他们打分
#生成data set:
mod = int + a*mark + b*choc # values for true model,生成separating hyperplane,该线/平面上是理论值
z = rnorm(n,s=8) # additive noise
obs = mod + z #围绕separating hyperplane生成一系列实际观测值
#分类:
y = as.factor(ifelse(obs>350,1,0)) # classification data,y记录了每个实际观测值所对应的真实分类
plot(mod, obs, xlab="model", ylab="observations", pch=20, cex=2)
plot(mark, choc, xlab="x1 (mark)", ylab="x2 (choc)",
pch=20, col=c(2,4)[as.numeric(y)], cex=2)
legend("topright", box.col=8, bty='n',
legend=c("good student","better student"),
pch=15, col=c(2,4))
table(y)#可以看到how complicated the data set is to be classify
set.seed(1)
# split the data into train+test (50%-50%):
# 生成的dataset是由观测值mark&choc组成的x,对应分类组成的y构成的
x = data.frame(mark,choc)
i.train = sample(1:n, 50)
x.train = x[i.train,]
x.test = x[-i.train,]
y.train = y[i.train]
y.test = y[-i.train]
class(x.train)
class(y.train)
xm = as.matrix(x.train)
# fit an SVM as follows:
# ?svm # in e1071
set.seed(1)
svmo = svm(xm, y.train, kernel='polynomial') #kernel参数可换,根据plot图像看一下大概是什么分割线
#把x和y输入,通过肉眼观察,假定kernel是多项式(曲线)的
#直线linear、曲线polynomial、圆radial basis
svmo
names(svmo)
# cf. ?svm:
# "Parameters of SVM-models usually must be tuned to yield sensible results!"
# 支持向量机模型的参数通常必须调整以产生合理的结果
#为了找到最合理的svm参数(find the maximum-margin hyperplane,离margin近的点才对其有影响)
# one can tune the model as follows:在指定范围ranges&gamma内找到最合适的Parameters
set.seed(1)
svm.tune = e1071::tune(svm, train.x=x.train, train.y=y.train,
kernel='polynomial',
ranges=list(cost=10^(-2:4), gamma=c(0.25,0.5,1,1.5,2)))
# 这里列出一系列想要尝试的调节参数
svm.tune
svm.tune$best.parameters#最好的
# then fit final SVM for optimal parameters:测试样本
svmo.final = svm(xm, y.train, kernel='polynomial',
gamma=svm.tune$best.parameters$gamma,
cost=svm.tune$best.parameters$cost)
# corresponding confusion matrices:混淆矩阵看分的怎么样
table(svmo$fitted,y.train)
table(svmo.final$fitted,y.train)
# we can also use caret for easy comparison:直接看accuracy
library(caret)
caret::confusionMatrix(svmo$fitted,y.train)$overall[1]
caret::confusionMatrix(svmo.final$fitted,y.train)$overall[1]
# assessing model fit to training data评估模型是否适合训练数据
identical(fitted(svmo), svmo$fitted)#TRUE——got the right information here
# to identify support vectors: 能左右hyperplane的点
# either svmo$index (indices), or svmo$SV (coordinates)
length(svmo$index)#tuned前的
length(svmo.final$index)#tuned后的,tuned后能左右hyperplane的点变少了,说明分得更干净了,svm拟合的结果更好了
# visualize:
plot(x.train, pch=20, col=c(1,2)[y.train], cex=2)
points(svmo$SV, pch=14, col=4, cex=2) # explain why this does not work!?
# apply scaling to dataset to see SV's:一定要先归一化,才能描出用到的点(用于找到最好参数的离margin近的点,能左右hyperplane的点)
plot(apply(x.train,2,scale), pch=20, col=c(1,2)[y.train], cex=2)
points(svmo$SV, pch=14, col=4, cex=2)
points(svmo.final$SV, pch=5, col=3, cex=2)
# If you want to use predict(), use a formula-type
# expression when calling svm(). Because of this,
# we re-shape our dataset:
#如果你想使用predict(),在调用svm()时使用公式类型的表达式。因此,我们重新塑造我们的数据集:
dat.train = data.frame(x=x.train, y=y.train)
dat.test = data.frame(x=x.test)
# decision boundary visualization:
svmo = svm(y~., data=dat.train)
plot(svmo, dat.train,
svSymbol = 15, dataSymbol = 'o',
col=c('cyan','pink')) # this is plot.svm()
svmo.final = svm(y~., data=dat.train, kernel='polynomial',
gamma=svm.tune$best.parameters$gamma,
cost=svm.tune$best.parameters$cost) #最好的拟合
plot(svmo.final, dat.train,
svSymbol = 15, dataSymbol = 'o',
col=c('cyan','pink')) # this is plot.svm()
# How to generate predictions from SVM fit:
# fitting the SVM model:
svmo = svm(y~., data=dat.train,
kernel='polynomial',
gamma=svm.tune$best.parameters$gamma,
cost=svm.tune$best.parameters$cost)
# Note that if we need probabilities P(Y=1)
# (for example to calculate ROC+AUC),
# we need to set 'probability=TRUE' also in
# fitting the SVM model:
svmo = svm(y~., data=dat.train, probability=TRUE,
kernel='polynomial',
gamma=svm.tune$best.parameters$gamma,
cost=svm.tune$best.parameters$cost)
#Generate predictions from SVM fit:
svmp = predict(svmo, newdata=dat.test, probability=TRUE)
roc.svm = roc(response=y.test, predictor=attributes(svmp)$probabilities[,2])
roc.svm$auc#越接近1越好
plot(roc.svm)#very happy
# compare with RF:
rf = randomForest(y~., data=dat.train)
rfp = predict(rf, dat.test, type='prob')
roc.rf = roc(y.test, rfp[,2])
roc.rf$auc
plot(roc.svm)
par(new=TRUE)
plot(roc.rf, col='yellow')
legend("bottomright", bty='n',
legend=c("RF","SVM"),
lty=1, lwd=3, col=c('yellow',1))
#random forest 比不过svm
--------------------------------------------------------
# ST4061 / ST6041
# 2021-2022
# Eric Wolsztynski
# ...
##### Section 5: demo code for effect of kernel on SVM ####
# Here we simulate 2D data that have a circular spatial
# distribution, to see the effect of the choice of kernel
# shape on decision boundaries
# --------------------------------------------------------
rm(list=ls())
library(e1071)
# Simulate circular data...
# simulate 2-class circular data:
set.seed(4061)
n = 100
S1 = 15; S2 = 3
x1 = c(rnorm(60, m=0, sd=S1), rnorm(40, m=0, sd=S2))
x2 = c(rnorm(60, m=0, sd=S1), rnorm(40, m=0, sd=S2))
# corresponding 2D circular radii:
rads = sqrt(x1^2+x2^2)
# make up the 2 classes in terms of whether lower or greater than median radius:
c1 = which(rads c2 = c(1:n)[-c1] # now we apply scaling factors to further separate the # 2 classes: x1[c1] = x1[c1]/1.2 x2[c1] = x2[c1]/1.2 x1[c2] = x1[c2]*1.2 x2[c2] = x2[c2]*1.2 # label data according to class membership: lab = rep(1,n) lab[c2] = 2#lab里,c1对应的位置为1;c2对应的位置为2 par(mfrow=c(1,1)) plot(x1,x2,col=c(1,2)[lab], pch=c(15,20)[lab], cex=1.5) # create final data frame: x = data.frame(x1,x2) y = as.factor(lab) dat = cbind(x,y) # apply SVMs with different choices of kernel shapes: svmo.lin = svm(y~., data=dat, kernel='linear') svmo.pol = svm(y~., data=dat, kernel='polynomial') svmo.rad = svm(y~., data=dat, kernel='radial') svmo.sig = svm(y~., data=dat, kernel='sigmoid') plot(svmo.lin, dat, col=c("cyan","pink"), svSymbol=15) plot(svmo.pol, dat, col=c("cyan","pink"), svSymbol=15) plot(svmo.rad, dat, col=c("cyan","pink"), svSymbol=15) plot(svmo.sig, dat, col=c("cyan","pink"), svSymbol=15) #### NOTE: the code below is outside the scope of this course! 注意:下面的代码超出了本课程的范围! #### It is used here for illustrations purposes only. # this call format is easier when using predict(): svmo.lin = svm(x, y, kernel='linear', scale=F) svmo.pol = svm(x, y, kernel='polynomial', scale=F) svmo.rad = svm(x, y, kernel='radial', scale=F) svmo.sig = svm(x, y, kernel='sigmoid', scale=F) # evaluate the SVM boundaries on a regular 2D grid of points: ng = 50 xrg = apply(x, 2, range) x1g = seq(xrg[1,1], xrg[2,1], length=ng) x2g = seq(xrg[1,2], xrg[2,2], length=ng) xgrid = expand.grid(x1g, x2g) plot(x, col=c(1,2)[y], pch=20) abline(v=x1g, col=8, lty=1) abline(h=x2g, col=8, lty=1) # ygrid.lin = predict(svmo.lin, xgrid) ygrid.pol = predict(svmo.pol, xgrid) ygrid.rad = predict(svmo.rad, xgrid) ygrid.sig = predict(svmo.sig, xgrid) par(mfrow=c(2,2), font.lab=2, font.axis=2) CEX = .5 COLS = c(1,3) DCOLS = c(2,4) # plot(xgrid, pch=20, col=COLS[as.numeric(ygrid.lin)], cex=CEX, main="Linear kernel") points(x, col=DCOLS[as.numeric(y)], pch=20) # points(x[svmo.lin$index,], pch=21, cex=2) points(svmo.lin$SV, pch=21, cex=2) # same as previous line! # plot(xgrid, pch=20, col=COLS[as.numeric(ygrid.pol)], cex=CEX, main="Polynomial kernel") points(x, col=DCOLS[as.numeric(y)], pch=20) points(x[svmo.pol$index,], pch=21, cex=2) # plot(xgrid, pch=20, col=COLS[as.numeric(ygrid.rad)], cex=CEX, main="Radial kernel") points(x, col=DCOLS[as.numeric(y)], pch=20) points(x[svmo.rad$index,], pch=21, cex=2) # plot(xgrid, pch=20, col=COLS[as.numeric(ygrid.sig)], cex=CEX, main="Sigmoid kernel") points(x, col=DCOLS[as.numeric(y)], pch=20) points(x[svmo.sig$index,], pch=21, cex=2) # Alternative plot: par(mfrow=c(2,2), font.lab=2, font.axis=2) CEX = .5 COLS = c(1,3) DCOLS = c(2,4) # L1 = length(x1g) L2 = length(x2g) # plot(xgrid, pch=20, col=COLS[as.numeric(ygrid.lin)], cex=CEX, main="Linear kernel") bnds = attributes(predict(svmo.lin, xgrid, decision.values=TRUE))$decision contour(x1g, x2g, matrix(bnds, L1, L2), level=0, add=TRUE, lwd=2) # plot(xgrid, pch=20, col=COLS[as.numeric(ygrid.pol)], cex=CEX, main="Polynomial kernel") bnds = attributes(predict(svmo.pol, xgrid, decision.values=TRUE))$decision contour(x1g, x2g, matrix(bnds, L1, L2), level=0, add=TRUE, lwd=2) # plot(xgrid, pch=20, col=COLS[as.numeric(ygrid.rad)], cex=CEX, main="Radial kernel") bnds = attributes(predict(svmo.rad, xgrid, decision.values=TRUE))$decision contour(x1g, x2g, matrix(bnds, L1, L2), level=0, add=TRUE, lwd=2) # plot(xgrid, pch=20, col=COLS[as.numeric(ygrid.sig)], cex=CEX, main="Sigmoid kernel") bnds = attributes(predict(svmo.sig, xgrid, decision.values=TRUE))$decision contour(x1g, x2g, matrix(bnds, L1, L2), level=0, add=TRUE, lwd=2) # NB: naive Bayes decision boundary is obtained with # contour(x1g, x2g, matrix(bnds, L1, L2), level=0.5, add=TRUE, col=4, lwd=2) # -------------------------------------------------------- # ST4061 / ST6041 # 2021-2022 # Eric Wolsztynski # ... #### Exercises Section 5: SVM #### # -------------------------------------------------------- rm(list=ls()) library(randomForest) library(class) library(pROC) library(e1071) library(caret) library(ISLR) ############################################################### #### Exercise 1: using SVM to classify (High Carseat sales dataset) #### ############################################################### library(ISLR) # contains the dataset # Recode response variable so as to make it a classification problem High = ifelse(Carseats$Sales<=8, "No", "Yes") CS = data.frame(Carseats, High) CS$Sales = NULL #construct dataset x = CS x$High = NULL y = CS$High # split the data into train+test (50%-50%): n = nrow(CS) set.seed(4061) i.train = sample(1:n, 350) x.train = x[i.train,] x.test = x[-i.train,] y.train = y[i.train] y.test = y[-i.train] class(x.train) class(y.train) # ?svm : svm有两种形式 # svmo = svm(xm, y.train, kernel='polynomial')##svm(x, y = NULL, scale = TRUE, type = NULL, kernel ="radial") # svmo = svm(y~., data=dat.train)##svm(formula, data = NULL, ..., subset, na.action = na.omit, scale = TRUE) #由于x必须喂一个matrix,y必须喂一个numeric,所以x.train=as.matrix(x.train),y.train=as.numeric(y.train) # (3) Explain why this does not work: svmo = svm(x.train, y.train, kernel='polynomial') # >> The problem is the presence of categorical variables in # the dataset. They must be "recoded" into numerical variables # for svm() to analyse their spatial contribution. # (4) Carry out the appropriate fix from your conclusion from (a). # Then, fit two SVM models, one using a linear kernel, the other # a polynomial kernel. Compare the two appropriately. #将两个SVM分类器适合于适当的“按摩”训练集,一个使用线性核函数,另一个使用多项式核函数。 #修正的做法: NC = ncol(x) # x = x[,-c(NC-1,NC)] # take out the last two columns (predictors) xm = model.matrix(y~.+0, data=x)#remove the intercept xm.train = xm[i.train,] xm.test = xm[-i.train,] y.train = as.factor(y.train) # so that svm knows it's classification svmo.lin = svm(xm.train, y.train, kernel='linear') svmo.pol = svm(xm.train, y.train, kernel='polynomial') svmy.lin = fitted(svmo.lin)#fitted:显示x的对应类别y svmy.pol = fitted(svmo.pol) table(y.train, fitted(svmo.lin)) table(y.train, fitted(svmo.pol)) # (5) Comparison... # * visual (there are better ways of visualising!): par(mfrow=c(1,3)) yl = as.integer(y=="Yes")+1 plot(apply(xm,2,scale), pch=c(15,20)[yl], col=c(1,4)[yl], cex=c(1.2,2)[yl], main="The data") # plot(apply(xm.train,2,scale), pch=c(15,20)[y.train], col=c(1,4)[y.train], cex=1, main="linear") points(svmo.lin$SV, pch=5, col=2, cex=1.2) # plot(apply(xm.train,2,scale), pch=c(15,20)[y.train], col=c(1,4)[y.train], cex=1, main="polynomial") points(svmo.pol$SV, pch=5, col=2, cex=1.2) # * in terms of training fit: svmy.lin = fitted(svmo.lin) svmy.pol = fitted(svmo.pol) table(y.train, svmy.lin) table(y.train, svmy.pol) # * test error: pred.lin = predict(svmo.lin, newdata=xm.test, probability=TRUE) pred.pol = predict(svmo.pol, newdata=xm.test) # ... the above does not work well: summary(pred.lin) # --> these are not probabilities! That's because we need to specify # ", probability=TRUE" also when fitting the SVM, in order to enable # probabilities to be computed and returned... # 这些都不是概率!这是因为我们在拟合SVM时也需要指定“,probability=TRUE”,以便能够计算和返回概率…… # SO IF WE WANT TO GENERATE TEST-SET PREDICTIONS, THIS IS THE WAY: svmo.lin = svm(xm.train, y.train, kernel='linear', probability=TRUE) svmo.pol = svm(xm.train, y.train, kernel='polynomial', probability=TRUE) pred.lin = predict(svmo.lin, newdata=xm.test, probability=TRUE) pred.pol = predict(svmo.pol, newdata=xm.test, probability=TRUE) y.test = as.factor(y.test) confusionMatrix(y.test, pred.lin) confusionMatrix(y.test, pred.pol) # * AUC (we need to extract P(Y=1|X)) p.lin = attributes(pred.lin)$probabilities[,2] p.pol = attributes(pred.pol)$probabilities[,2] y.test = as.factor(y.test) roc(response=y.test, predictor=p.lin)$auc roc(response=y.test, predictor=p.pol)$auc #sensitivity和specificity都很高 ############################################################### #### Exercise 2: 3-class problem (iris dataset) #### ############################################################### x = iris x$Species = NULL y = iris$Species set.seed(4061) n = nrow(x) i.train = sample(1:n, 100) x.train = x[i.train,] x.test = x[-i.train,] y.train = y[i.train] y.test = y[-i.train] # (a) plot(x.train[,1:2], pch=20, col=c(1,2,4)[as.numeric(y.train)], cex=2) # (b) dat = data.frame(x.train, y=as.factor(y.train)) svmo.lin = svm(y~., data=dat, kernel='linear') svmo.pol = svm(y~., data=dat, kernel='polynomial') svmo.rad = svm(y~., data=dat, kernel='radial') # # number of support vectors: summary(svmo.lin) summary(svmo.pol) summary(svmo.rad) #The number of support vectors less, the more complicated controversy. # test error: pred.lin = predict(svmo.lin, newdata=x.test) pred.pol = predict(svmo.pol, newdata=x.test) pred.rad = predict(svmo.rad, newdata=x.test) cm.lin = confusionMatrix(y.test, pred.lin) cm.pol = confusionMatrix(y.test, pred.pol) cm.rad = confusionMatrix(y.test, pred.rad) c(cm.lin$overall[1], cm.pol$overall[1], cm.rad$overall[1]) #rad more accurcte. # (c) tune the model(via cross-validation)... set.seed(4061) svm.tune = e1071::tune(svm, train.x=x.train, train.y=y.train, kernel='radial', ranges=list(cost=10^(-2:2), gamma=c(0.5,1,1.5,2))) print(svm.tune) names(svm.tune) # retrieve optimal hyper-parameters bp = svm.tune$best.parameters # use these to obtain final SVM fit: svmo.rad.tuned = svm(y~., data=dat, kernel='radial', cost=bp$cost, gamma=bp$gamma) summary(svmo.rad) summary(svmo.rad.tuned)#Not changed much,means it's got more to do with the shape of the kernel itself #, rather than how the model is tunes for this dataset. # test set predictions from tuned SVM model: pred.rad.tuned = predict(svmo.rad.tuned, newdata=x.test) cm.rad.tuned = confusionMatrix(y.test, pred.rad.tuned) c(cm.rad$overall[1], cm.rad.tuned$overall[1]) # so maybe not an exact science!? # ... in fact these performances are comparable, bear in mind CV assessment is # itself subject to variability... #These are estimates of prediction performance, there is uncertainty related to the points where the value you have here, #which means when you see 100% or 96%, you are really looking at the same thing. So you need to look at the confidence interval #around these value to understand what you are really seeing a difference or not. ############################################################### #### Exercise 3: SVM using caret #### ############################################################### # Set up the data (take a subset of the Hitters dataset) data(Hitters) Hitters = na.omit(Hitters) dat = Hitters n = nrow(dat) NC = ncol(dat) # Change into a classification problem: dat$Salary = as.factor(ifelse(Hitters$Salary>median(Hitters$Salary), "High","Low")) # Data partition set.seed(4061) itrain = sample(1:n, size=round(.7*n)) dat.train = dat[itrain,] dat.validation = dat[-itrain,] # independent validation set x = dat.train # training set x$Salary = NULL y = as.factor(dat.train$Salary) ### Random forest rf.out = caret::train(Salary~., data=dat.train, method='rf') rf.pred = predict(rf.out, dat.validation) rf.cm = confusionMatrix(reference=dat.validation$Salary, data=rf.pred, mode="everything") ### SVM (linear) svm.out = caret::train(Salary~., data=dat.train, method="svmLinear") svm.pred = predict(svm.out, dat.validation) svm.cm = confusionMatrix(reference=dat.validation$Salary, data=svm.pred, mode="everything") # modelLookup('svmRadial') ### SVM (radial) svmR.out = caret::train(Salary~., data=dat.train, method="svmRadial") svmR.pred = predict(svmR.out, dat.validation) svmR.cm = confusionMatrix(reference=dat.validation$Salary, data=svmR.pred, mode="everything") perf = rbind(rf.cm$overall, svm.cm$overall, svmR.cm$overall) row.names(perf) = c("RF","SVM.linear","SVM.radial") round(perf, 4) perf = cbind(rf.cm$overall, svm.cm$overall, svmR.cm$overall) colnames(perf) = c("RF","SVM.linear","SVM.radial") round(perf, 4) #评价就看accuracy,结合一开始的图像,图像不像是线性的,所以rf应该比较好 ############################################################### #### Exercise 4: SVM-based regression #### ############################################################### x = iris x$Sepal.Length = NULL y = iris$Sepal.Length#using Sepal.Length as response variable pairs(iris[,1:4]) ?pairs#pairs生成一个配对的散点图矩阵,矩阵由X中的每列的列变量对其他各列列变量的散点图组成 set.seed(4061) n = nrow(x) i.train = sample(1:n, 100) x.train = x[i.train,] x.test = x[-i.train,] y.train = y[i.train] y.test = y[-i.train] dat.train = cbind(x.train,y=y.train) # specify statistical training settings: ctrl = caret::trainControl(method='cv') # perform statistical training: svm.o = caret::train(y~., data=dat.train, method="svmLinear", trControl=ctrl)#trControl=ctrl # compute test set predictions: svm.p = predict(svm.o, newdata=x.test) # and corresponding MSE: mean( (y.test-svm.p)^2 ) par(pty='s') #makes or a square plot box rr = range(c(y.test, svm.p)) plot(y.test, svm.p, pch=20, xlab="true values", ylab="predicted values", xlim=rr,ylim=rr) abline(a=0,b=1) #Here is a very good enlightenment #### Section 6 神经网络#### ####分类——归一化;回归——标准化#### #learning rate:is applied to sort of calibrate the speed of the learning process data. #DL deep learning vs ML meachine learning ------------------------------------------------------------ #### Example 1 : iris data with neuralnet #### #install.packages('neuralnet') library(neuralnet) n = nrow(iris) dat = iris[sample(1:n), ] # shuffle initial dataset NC = ncol(dat) nno = neuralnet(Species~., data=dat, hidden=c(6,5))#知道我们要建几层,每层几个节点时 #hidden 第一层6个,第二层5个的神经网络 plot(nno, information=FALSE,#不要标数据 col.entry='red', col.out='green', show.weights=FALSE) plot(nno, information=TRUE,#要标数据 col.entry='red', col.out='green', show.weights=TRUE) #(1)‘blue’ bits are bias #(2)在黑色线条上的数字是weights, 可以是positive, 也可以是negative的 #### Example 2: single layer NN - regression - effect of scaling #### library(nnet) # implements single layer NNs library(mlbench) # includes dataset BostonHousing #install.packages('mlbench') data(BostonHousing) # load the dataset #View(BostonHousing) # train neural net n = nrow(BostonHousing) itrain = sample(1:n, round(.7*n), replace=FALSE)#要70%的数据并取整作为训练样本 nno = nnet(medv~., data=BostonHousing, subset=itrain, size=5)#size 隐藏层中的单元数 #只知道几个神经元,但不知道有几层 #?nnet summary(nno$fitted.values) #We saw the fitted value are all 1, it because the information was not scaled, we should do scale first. # the above output indicates the algorithm did not # converge, probably due to explosion of the gradients... #都是1,说明算法没有收敛,需要归一化值(50 是这个数据的最大值,除以最大值): # We try again, this time normalizing the values # (50 is the max value for this data, see its range): nno = nnet(medv/50~., data=BostonHousing, subset=itrain, size=5) summary(nno$fitted.values)# there was thus a need to normalise the response variable... # 测试神经网络 preds = predict(nno, newdata=BostonHousing[-itrain,]) * 50 # (we multiply by 50 to fall back to original data domain) #(由于之前除以了50,我们乘以 50 以回退到原始数据域) summary(preds) # RMSE:偏方误差 sqrt(mean((preds-BostonHousing$medv[-itrain])^2)) # compare with lm(): lmo = lm(medv~., data=BostonHousing, subset=itrain) lm.preds = predict(lmo, newdata= BostonHousing[-itrain,]) # RMSE: sqrt(mean((lm.preds-BostonHousing$medv[-itrain])^2)) #Compare with lm, we have a lower test RMSE. #平价的时候看一下length(itrain)和dim(BostonHousing),看有没有足够的训练样本和测试样本 # Further diagnostics may highlight various aspects of the # model fit - always run these checks!每次都要检查跟lm的比较 # 进一步的诊断可能会突出模型拟合的各个方面 - 始终运行这些检查! par(mfrow=c(2,2)) ################################################################################ #PLOT(1): residuals form LM against NN plot(lmo$residuals, nno$residuals*50) abline(a=0, b=1, col="limegreen") #there are some extreme residuals from LM ################################################################################ #PLOT(2): residuals from LM against Original Train Data - no relationship plot(BostonHousing$medv[itrain], lmo$residuals, pch=20) ################################################################################ #PLOT(3): residuals from LM against Original Train Data (in grey) # plus(+) residuals from NN against Original Train Data plot(BostonHousing$medv[itrain], lmo$residuals, pch=20, col=8) points(BostonHousing$medv[itrain], nno$residuals*50, pch=20) ################################################################################ #PLOT(4): QQ plot #In general, Noise is assumed to be normal distributed or Gaussian... #But in NN, we do not make this assumption. Since between X(input) and Y(output), we hope to use some non-linear function. #But QQ plot is still a useful tool. qqnorm(nno$residuals) abline(a=mean(nno$residuals), b=sd(nno$residuals), col=2) #### Example 3: effect of tuning parameters (iris data) #### rm(list=ls()) n = nrow(iris) # 像往常一样打乱初始数据集(删除第 4 个值,减少数据量,使数据更难准确)打乱数据、重排 #移除Petal.Width数据 dat = iris[sample(1:n),-4] NC = ncol(dat) #data scaling 不是变成0,1分布,而是将数据缩放至为 [0,1] ####scale function: y_normalized = (y-min(y)) / (max(y)-min(y)):#### #dat离第四列character型的数据不用归一化,所以dat[,-NC] mins = apply(dat[,-NC],2,min) maxs = apply(dat[,-NC],2,max) dats = dat dats[,-NC] = scale(dats[,-NC],center=mins,scale=maxs-mins) # 设置训练样本: itrain = sample(1:n, round(.7*n), replace=FALSE) nno = nnet(Species~., data=dats, subset=itrain, size=5) # 预测: nnop = predict(nno, dats[-itrain,]) head(nnop)#返回向量、矩阵、表格、数据框或函数的第一部分 #这是从概率中获取预测标签的一种方法: preds = max.col(nnop) #找到矩阵每一行的最大位置在第几列,用来做混淆矩阵 #(上面的行为每一行选择概率最高的列,即每个观察值)或者我们可以直接使用它: preds = predict(nno, dats[-itrain,], type='class') tbp = table(preds, dats$Species[-itrain])#混淆矩阵 sum(diag(tbp))/sum(tbp) #准确率 # #nnet里size怎样影响正确率 ####找到最合适的size#### # 在这里,我们尝试使用 1 到 10 的尺寸进行说明,但您可以随意使用这些值! sizes = c(1:10) rate = numeric(length(sizes)) # 训练集分类准确率 ratep = numeric(length(sizes)) # 测试集分类准确率 for(d in 1:length(sizes)){ nno = nnet(Species~., data=dats, subset=itrain, size=sizes[d]) tb = table(max.col(nno$fitted.values), dats$Species[itrain]) rate[d] = sum(diag(tb))/sum(tb) # now looking at test set predictions nnop = predict(nno, dats[-itrain,]) tbp = table(max.col(nnop), dats$Species[-itrain]) ratep[d] = sum(diag(tbp))/sum(tbp) } plot(rate, pch=20, t='b', xlab="layer size", ylim=range(c(rate,ratep))) points(ratep, pch=15, t='b', col=2) legend('bottomright', legend=c('training','testing'), pch=c(20,15), col=c(1,2), bty='n') # 注意训练集和测试集的表现不一定相似...... #由此找到最好最合适的size #nnet里decay(权重衰减参数,默认为 0) 怎样影响正确率 decays = seq(1,.0001,lengt=11) rate = numeric(length(decays)) # train-set classification rate ratep = numeric(length(decays)) # test-set classification rate for(d in 1:length(decays)){ # fit NN with that particular decay value (decays[d]): nno = nnet(Species~., data=dats, subset=itrain, size=10, decay=decays[d]) # corresponding train set confusion matrix: tb = table(max.col(nno$fitted.values), dats$Species[itrain]) rate[d] = sum(diag(tb))/sum(tb) # now looking at test set predictions: nnop = predict(nno, dats[-itrain,]) tbp = table(max.col(nnop), dats$Species[-itrain]) ratep[d] = sum(diag(tbp))/sum(tbp) } plot(decays, rate, pch=20, t='b', ylim=range(c(rate,ratep))) points(decays, ratep, pch=15, t='b', col=2) legend('topright', legend=c('training','testing'), pch=c(20,15), col=c(1,2), bty='n') rm(list=ls()) ######Exercise ###### #### Exercise 1#### # 1. What type of neural network does this code implement? FFNN #有两种函数能实现神经网络,一种是step function——大于某一个值是1,小于是0,非黑即白;一种是activation function——有确切的数字输出 # 2. library(MASS) library(neuralnet) # --- NN with one 10-node hidden layer nms = names(Boston)[-14] f = as.formula(paste("medv ~", paste(nms, collapse = " + "))) # fit a single-layer, 10-neuron NN: set.seed(4061) out.nn = neuralnet(f, data=Boston, hidden=c(10), rep=5, linear.output=FALSE) #plot(out.nn, information=TRUE, col.entry='red', col.out='green',show.weights=TRUE) #create single-hidden layer neural network and repeat 5 times # without using an activation function: set.seed(4061) out.nn.lin = neuralnet(f, data=Boston, hidden=c(10), rep=1, linear.output=TRUE) # Warning message: 算法在 stepmax 内的 1 次重复中没有收敛,所以需要运行此代码两遍 # Algorithm did not converge in 1 of 1 repetition(s) within the stepmax. #线性输出: set.seed(4061) out.nn.tanh = neuralnet(f, data=Boston, hidden=c(10), rep=5, linear.output=FALSE, act.fct='tanh') p1 = predict(out.nn, newdata=Boston) p2 = predict(out.nn.tanh, newdata=Boston) sqrt(mean((p1-Boston$medv)^2)) sqrt(mean((p2-Boston$medv)^2)) #参数: #linear.output: logical,线性输出为TRUE,nonlinear为FALSE #rep: 神经网络训练的重复次数 #act.fct: 一个可微函数,用于平滑协变量或神经元与权重的叉积的结果 #act.fct: 默认“logistic”,也可以是“tanh” #### Exercise 2 #### library(neuralnet) set.seed(4061) n = nrow(iris) dat = iris[sample(1:n), ] # shuffle initial dataset NC = ncol(dat) nno = neuralnet(Species~., data=dat, hidden=c(6,5)) plot(nno) #### Exercise 3 ##### #Load dataset MASS::Boston and perform a 70%-30% split for training and test sets #respectively. Use set.seed(4061) when splitting the data and also every time you run a #neural network. #1. Compare single-layer neural network fits from the neuralnet and nnet libraries. #Can you explain any difference you may find? # 2. Change the "threshold" argument value to 0.001 in the call to neuralnet, and #comment on your findings (this run might take a bit more time to converge) #加载数据集 MASS::Boston 并分别对训练集和测试集执行 70%-30% 的拆分。 #1. 比较来自神经网络和 nnet 库的单层神经网络拟合。 #解释可能发现的任何区别吗? #2. 在对神经网络的调用中将“阈值”参数值更改为 0.001, #并评论发现(此运行可能需要更多时间才能收敛) #当外界刺激达到一定的阀值时,神经元才会受刺激,影响下一个神经元。 #超过阈值,就会引起某一变化,不超过阈值,无论是多少,都不产生影响 rm(list=ls()) library(neuralnet) library(nnet) # implements single layer NNs library(MASS) # includes dataset BostonHousing data(Boston) # load the dataset # train neural nets n = nrow(Boston) itrain = sample(1:n, round(.7*n), replace=FALSE) dat = Boston dat$medv = dat$medv/50 #归一化 dat.train = dat[itrain,] dat.test = dat[-itrain,-14]#自变量的测试样本 y.test = dat[-itrain,14]#因变量的测试样本 #nnet 单层五个神经元 nno1 = nnet(medv~., data=dat.train, size=5, linout=TRUE) fit1 = nno1$fitted.values mean((fit1-dat.train$medv)^2) #偏差 #neuralnet 单层五个神经元 nno2 = neuralnet(medv~., data=dat.train, hidden=c(5), linear.output=TRUE) fit2 = predict(nno2, newdata=dat.train)[,1] mean((fit2-dat.train$medv)^2) #偏差 ##阈值threshold0.0001的neuralnet nms = names(dat)[-14] f = as.formula(paste("medv ~", paste(nms, collapse = " + ")))#下面所用的函数太长,所以先写出来 nno3 = neuralnet(f, data=dat.train, hidden=5, threshold=0.0001)#threshold 阈值####f跟medv有啥区别??#### #Threshold in 'neuralnet' is originally 0.01. Now we set it to be 0.0001. fit3 = predict(nno3, newdata=dat.train)[,1] mean((fit3-dat.train$medv)^2)#0.005276877 #even much better! #用mean来看模型能不能用,mean不能太大 # test neural nets predict一定要乘回去50 y.test = y.test*50 p1 = predict(nno1, newdata=dat.test)*50 p2 = predict(nno2, newdata=dat.test)*50 p3 = predict(nno3, newdata=dat.test)*50 mean((p1-y.test)^2) mean((p2-y.test)^2) mean((p3-y.test)^2) #test的mean用来看哪个模型更好 # explain these differences? names(nno1)#nnet names(nno2)#neuralnet # nnet: # - activation function: logistic # - algorithm: BFGS in optim # - decay: 0 # - learning rate: NA # - maxit: 100 # neuralnet: # - activation function: logistic # - algorithm: (some form of) backpropagation # - decay: ? # - learning rate: depending on algorithm # - maxit:? # so what is it? #nnet里的activation function: #不是所有的信号都要做反应,需要activation function去看需要对哪些信号作出反应 #hide层和output层都有activation function #hide层的activation function是由act.fun决定的——tanch正切或者logistic #output层的activation function是由linout决定的 #做regression时一般linout=T,表明output层的activation function是identical的,就是输入是啥输出就是啥,不用做改变 #linout=F output的activation function是logistic,输出值要变成逻辑变量 #默认值hide和output都是logistic #### Exercise 4 #### #Fit a single-layer feed-forward neural network using nnet to #Report on fitted values. #使用 nnet 拟合单层前馈神经网络 #报告拟合值。 rm(list=ls()) library(caret) library(neuralnet) library(nnet) library(ISLR) #set up the data (take a subset of the Hitters dataset)设置数据(获取 Hitters 数据集的子集) dat = na.omit(Hitters) #返回删除NA后的向量a 因为该数据里有缺失值 #is.na(Hitters) n = nrow(dat) NC = ncol(dat) # Then try again after normalizing the response variable to [0,1]:将响应变量归一化为 [0,1] dats = dat dats$Salary = (dat$Salary-min(dat$Salary)) / diff(range(dat$Salary)) # train neural net itrain = sample(1:n, round(.7*n), replace=FALSE) dat.train = dat[itrain,] dats.train = dats[itrain,] dat.test = dat[-itrain,] dats.test = dats[-itrain,] #data Salary which is not scaled: do not work #归一化前 dat是归一化前,dats是归一化后 nno = nnet(Salary~., data=dat.train, size=10, decay=c(0.1)) summary(nno$fitted.values) #data Salary is scaled, but no regularization: do not work either #归一化后 nno.s = nnet(Salary~., data=dats.train, size=10, decay=c(0)) summary(nno.s$fitted.values) #data Salary is scaled, and also have regularization progress: works! #归一化后 nno.s = nnet(Salary~., data=dats.train, size=10, decay=c(0.1)) summary(nno.s$fitted.values) #Our last attempt above was a success. #But we should be able to get a proper fit even for decay=0... #what's going on? Can you get it to work? #改进!添加系数linout=1 #(A1) Well, it's one of these small details in how you call a function; #here we have to specify 'linout=1' because we are considering a regression problem #for regression problem: class k = 1 #data Salary which is not scaled: set.seed(4061) nno = nnet(Salary~., data=dat.train, size=10, decay=c(0.1), linout=1) summary(nno$fitted.values) #data Salary which is scaled, but with no decay: set.seed(4061) nno.s = nnet(Salary~., data=dats.train, size=10, decay=c(0), linout=1) summary(nno.s$fitted.values) #data Salary which is scaled, and also with decay: set.seed(4061) nno.s = nnet(Salary~., data=dats.train, size=10, decay=c(0.1), linout=1) summary(nno.s$fitted.values) #改进!写function # (A2) but let's do the whole thing again more cleanly... # 重新编码和放缩数据对结果影响的比较 # re-encode and scale dataset properly 正确重新编码和缩放数据集 myrecode <- function(x){ # function recoding levels into numerical values #函数将级别重新编码为数值 if(is.factor(x)){ levels(x) return(as.numeric(x)) } else { return(x) } } myscale <- function(x){ # function applying normalization to [0,1] scale #对 [0,1] 尺度应用归一化的函数 minx = min(x,na.rm=TRUE) maxx = max(x,na.rm=TRUE) return((x-minx)/(maxx-minx)) } datss = data.frame(lapply(dat,myrecode)) datss = data.frame(lapply(datss,myscale)) # replicate same train-test split: #复制相同的训练测试拆分: datss.train = datss[itrain,] datss.test = datss[-itrain,] nno.ss.check = nnet(Salary~., data=datss.train, size=10, decay=0, linout=1) summary(nno.ss.check$fitted.values) # use same scaled data but with decay as before: #使用相同的缩放数据,但与以前一样衰减: nno.ss = nnet(Salary~., data=datss.train, size=10, decay=c(0.1), linout=1) summary(nno.ss$fitted.values) # evaluate on test data (with same decay for both models): #评估测试数据(两个模型的衰减相同): datss.test$Salary - dats.test$Salary pred.s = predict(nno.s, newdata=dats.test) pred.ss = predict(nno.ss, newdata=datss.test) mean((dats.test$Salary-pred.s)^2) mean((datss.test$Salary-pred.ss)^2) #Feed-forward neural network(FFNN) #• Single or multiplelayers 单层或多层 #• Forward propagationonly 仅前向传播 #• Number of layers determines function complexity 层数决定功能复杂度 #• Typically uses a nonlinear activation function 通常使用非线性激活函数 #• Some definitions specify a unique hidden layer, others allow any number of layers一些定义指定一个唯一的隐藏层,其他定义允许任意数量的层 #Multilayer Perceptron(MLP) #Recurrent neural network(RNN) #Long short-term memory neural network(LSTMNN) #Convolutional neural network(CNN) #### Exercise 6: Olden index ##### #使用 NeuralNetTools::olden() 计算以下数据集的变量重要性, #每次拟合一个 7 神经元单层 FFNN (nnet): #1. 鸢尾花数据集(使用全套); #2. 波士顿数据集 #。 与从随机森林获得的变量重要性评估进行比较。 rm(list=ls()) library(nnet) library(NeuralNetTools) library(randomForest) library(MASS) myscale <- function(x){ minx = min(x,na.rm=TRUE) maxx = max(x,na.rm=TRUE) return((x-minx)/(maxx-minx)) } # (1) Iris data # shuffle dataset... set.seed(4061) n = nrow(iris) dat = iris[sample(1:n),] # rescale predictors... dat[,1:4] = myscale(dat[,1:4]) # fit Feed-Forward Neural Network... set.seed(4061) nno = nnet(Species~., data=dat, size=c(7), linout=FALSE, entropy=TRUE) pis = nno$fitted.values matplot(pis, col=c(1,2,4), pch=20) y.hat = apply(pis, 1, which.max) # fitted values table(y.hat, dat$Species) # compute variable importance... #神经网络中输入变量的相对重要性作为原始输入隐藏、隐藏输出连接权重的乘积之和 vimp.setosa = olden(nno, out_var='setosa', bar_plot=FALSE) vimp.virginica = olden(nno, out_var='virginica', bar_plot=FALSE) vimp.versicolor = olden(nno, out_var='versicolor', bar_plot=FALSE) names(vimp.setosa) par(mfrow=c(1,2)) plot(iris[,3:4], pch=20, col=c(1,2,4)[iris$Species], cex=2) plot(iris[,c(1,3)], pch=20, col=c(1,2,4)[iris$Species], cex=2) dev.new() plot(olden(nno, out_var='setosa')) plot(olden(nno, out_var='virginica')) plot(olden(nno, out_var='versicolor')) v.imp = cbind(vimp.setosa$importance, vimp.virginica$importance, vimp.versicolor$importance) rownames(v.imp) = names(dat)[1:4] colnames(v.imp) = levels(dat$Species) (v.imp) #正负值代表positive 还是nagative effect #绝对值越大,自变量对因变量的影响越大 # fit RF... set.seed(4061) rfo = randomForest(Species~., data=dat, ntrees=1000) rfo$importance # how can we compare variable importance assessments? cbind(apply(v.imp, 1, sum), #所有自变量放在一起对y的影响重要性(有抵消) apply(abs(v.imp), 1, sum), #取绝对值 rfo$importance) #三个值都大的自变量have overall contribution # (2) Boston data set.seed(4061) n = nrow(Boston) dat = Boston[sample(1:n),] # rescale predictors... dats = myscale(dat) dats$medv = dat$medv/50 set.seed(4061) nno = nnet(medv~., data=dats, size=7, linout=1) y.hat = nno$fitted.values plot(y.hat*50, dat$medv)#从图中看准确度 mean((y.hat*50-dat$medv)^2)#偏差 v.imp = olden(nno, bar_plot=FALSE) plot(v.imp) # fit RF...里面的重要性,跟神经网络拟合得到的重要性进行比较 set.seed(4061) rfo = randomForest(medv~., data=dat, ntrees=1000) rfo$importance # how can we compare variable importance assessments?我们如何比较变量重要性评估? cbind(v.imp, rfo$importance) round(cbind(v.imp/sum(abs(v.imp)), rfo$importance/sum(rfo$importance)),3)*100 #重要性的百分比 #一些变量两边数都大,突出standout #一些变量差距两边很大 # should we use absolute values of Olden's index?应该使用奥尔登指数的绝对值 par(mfrow=c(2,1)) barplot(abs(v.imp[,1]), main="importance from NN", names=rownames(v.imp), las=2) barplot(rfo$importance[,1], main="importance from RF", names=rownames(v.imp), las=2) # 作图比较 # or possibly normalize across all values for proportional contribution? par(mfrow=c(2,1)) NNN = sum(abs(v.imp[,1])) NRF = sum(abs(rfo$importance[,1])) barplot(abs(v.imp[,1])/NNN, main="importance from NN", names=rownames(v.imp), las=2) barplot(rfo$importance[,1]/NRF, main="importance from RF", names=rownames(v.imp), las=2) # 把上面两个图合在一起looks alright... now make it a nicer comparative plot :) par(font=2, font.axis=2) imps = rbind(NN=abs(v.imp[,1])/NNN, RF=rfo$importance[,1]/NRF) cols = c('cyan','pink') barplot(imps, names=colnames(imps), las=2, beside=TRUE, col=cols, ylab="relative importance (%)", main="Variable importance from NN and RF") legend("topleft", legend=c('NN','RF'), col=cols, bty='n', pch=15)