# ST4061

#Q2

#(a)

# Two working copies of the built-in airquality data:
#   dat.nona - only the rows with no missing value in any column
#   dat      - every row, missing values included
dat.nona <- na.omit(airquality)
dat <- airquality

# Month and Day are calendar labels, so both copies store them as factors.
dat.nona[c("Month", "Day")] <- lapply(dat.nona[c("Month", "Day")], as.factor)
dat[c("Month", "Day")] <- lapply(dat[c("Month", "Day")], as.factor)

# Fix the RNG state for the randomised steps that follow in the script.
set.seed(4061)

# Scatterplot matrix of all variables.
pairs(dat)

# Pairwise correlations; the ones involving Ozone use the complete-case copy
# because cor() is called here without any NA handling.
with(dat, cor(Wind, Temp))
with(dat.nona, cor(Ozone, Wind))
with(dat.nona, cor(Ozone, Temp))

#(c)

summary(dat)

# Mean-impute Solar.R: flag its missing entries and fill them with the
# Solar.R average taken over the complete-case rows (dat.nona).
nas <- is.na(dat[["Solar.R"]])
mean.solar <- mean(dat.nona[["Solar.R"]])
dat$Solar.R <- replace(dat$Solar.R, nas, mean.solar)

# Rows that still contain a missing value in some other column are dropped.
sdat <- na.omit(dat)

#install.packages("glmnet")
library(glmnet)

# Design matrix without an intercept column, and the response vector.
sx <- model.matrix(Wind ~ . + 0, data = sdat)
sy <- sdat$Wind

# Pick lambda by cross-validation, then refit at that single value.
cv.l <- cv.glmnet(sx, sy)
slo <- glmnet(sx, sy, lambda = cv.l$lambda.min)

# Mean squared error of the fitted values on the same data.
pre.Wind <- predict(slo, newx = sx)
mean((sy - pre.Wind)^2) # value recorded by the original author: 8.136048

#(d)

# Fit a straight-line model of Ozone on Temp. Rows whose Ozone is missing are
# left out of the fit by lm()'s default NA handling.
linearModel <- lm(Ozone ~ Temp, data = dat)

# View model summary
summary(linearModel)

# Create a new variable holding Temp squared (Temp2)
data <- dat
data$Temp2 <- data$Temp^2

# Fit quadratic regression model
quadraticModel <- lm(Ozone ~ Temp + Temp2, data = data)

# View model summary
summary(quadraticModel)

# Impute the missing Ozone values with the quadratic model's predictions.
# The mask has to be computed from Ozone itself: the existing `nas` vector
# flags the rows where Solar.R was missing, so indexing Ozone with it would
# overwrite observed Ozone values and leave the real gaps unfilled.
ozone.na <- is.na(dat$Ozone)
dat$Ozone[ozone.na] <- predict(quadraticModel, newdata = data[ozone.na, ])

newdat <- dat

# model.matrix() silently drops rows that still contain NA, while newy keeps
# every row; check that the two line up before handing them to glmnet.
newx <- model.matrix(Wind ~ . + 0, data = newdat)
newy <- newdat$Wind
stopifnot(nrow(newx) == length(newy))

# Lasso with lambda chosen by cross-validation, refitted at lambda.min.
cv.l <- cv.glmnet(newx, newy)
slo <- glmnet(newx, newy, lambda = cv.l$lambda.min)

# Mean squared error of the fitted values on the same data.
pre.Wind <- predict(slo, newx = newx)
mean((pre.Wind - newy)^2) # value recorded by the original author: 9.003919

#(e)

#install.packages("tree")

library(tree) # tree-building methods

# Regression tree for Wind using every other column of dat as a predictor.
tree.out <- tree(Wind ~ ., data = dat)
summary(tree.out)

# Draw the tree and label its splits.
plot(tree.out)
text(tree.out, pretty = 0)

# Mean squared error of the tree's fitted values (the original note called
# this "RSS", but the squared residuals are averaged, not summed).
pre.wind <- predict(tree.out, newdata = dat)
mean((newy - pre.wind)^2) # value recorded by the original author: 4.716928

#(f)

# Prune the full tree back to the best subtree with 10 terminal nodes.
ptree <- prune.tree(tree.out, best = 10)
ptree
summary(ptree)

# Mean squared error of the pruned tree's fitted values (averaged squared
# residuals, labelled "RSS" in the original notes).
pre.wind <- predict(ptree, newdata = dat)
mean((newy - pre.wind)^2) # value recorded by the original author: 5.681356

# Plot the pruned tree with split labels.
plot(ptree)
text(ptree, pretty = 0)

#Q3

#(a)

#install.packages("ISLR")

#install.packages("gbm")

#install.packages("randomForest")

library(ISLR) # provides the Khan data
library(gbm)
library(randomForest)

# Predictor matrices are used as supplied; the class labels become factors.
x.train <- Khan[["xtrain"]]
x.test <- Khan[["xtest"]]
y.train <- as.factor(Khan[["ytrain"]])
y.test <- as.factor(Khan[["ytest"]])

# Fix the RNG state for the model fits that follow.
set.seed(4061)

# Class frequencies and sample sizes of the two sets.
table(y.train)
table(y.test)
length(y.train)
length(y.test)

#(c)

# Grow a classification forest. The response must be the factor y.train:
# passing the numeric labels (Khan$ytrain) makes randomForest fit a
# regression on the class codes, whose predictions then have to be rounded
# and can fall outside the set of valid classes.
rf.out <- randomForest(y.train ~ ., data = as.data.frame(x.train))

# Fitted classes and confusion matrix for the training set.
rf.yhat <- predict(rf.out, as.data.frame(x.train), type = "response")
tb.rf.train <- table(rf.yhat, y.train)
tb.rf.train

#(d)

# Predicted classes and confusion matrix for the test set. Predictions are
# factors carrying the training levels, so the table is square and its
# diagonal counts the correct classifications.
rf.pred <- predict(rf.out, as.data.frame(x.test), type = "response")
tb.rf <- table(rf.pred, y.test)
tb.rf

# Test-set classification accuracy.
sum(diag(tb.rf)) / sum(tb.rf)

#(e)

# Positions of the predictors whose importance score in the fitted forest
# exceeds 0.4.
which(rf.out[["importance"]] > 0.4)

#(f)

# Variable-importance plot for the same forest, drawn with filled squares.
varImpPlot(rf.out, pch = 15)

#(g)

set.seed(4061)

# The response has four classes, so distribution = "bernoulli" (which needs a
# 0/1 response) cannot be used here; fit a multinomial model on the factor
# labels instead.
# NOTE(review): recent gbm releases warn that multinomial support is fragile;
# confirm the behaviour with the installed gbm version.
gb.out <- gbm(y.train ~ .,
              data = as.data.frame(x.train),
              distribution = "multinomial")

# Class probabilities for the test set, using every fitted tree. For a
# multinomial fit the result is an observations x classes x 1 array.
gb.prob <- predict(gb.out,
                   newdata = as.data.frame(x.test),
                   n.trees = gb.out$n.trees,
                   type = "response")

# Predicted class = the most probable one, expressed on the same levels as
# y.test so that the confusion matrix is square.
gb.p <- factor(levels(y.train)[apply(gb.prob[, , 1], 1, which.max)],
               levels = levels(y.test))

tb.gb <- table(gb.p, y.test)

# Test-set classification accuracy.
sum(diag(tb.gb)) / sum(tb.gb)

# You may also be interested in: (ST4061)