#Q2
#(a)
# keep a complete-case copy alongside the full dataset
dat.nona = na.omit(airquality)
dat = airquality
# treat Month and Day as categorical predictors
dat.nona$Month = as.factor(dat.nona$Month)
dat.nona$Day = as.factor(dat.nona$Day)
dat$Month = as.factor(dat$Month)
dat$Day = as.factor(dat$Day)
set.seed(4061)
pairs(dat)
cor(dat$Wind,dat$Temp)
cor(dat.nona$Ozone,dat.nona$Wind)
cor(dat.nona$Ozone,dat.nona$Temp)
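# Optional supplementary check (not part of the original answer): correlation matrix
# of the numeric columns, using pairwise complete observations to handle the NAs.
round(cor(airquality[, 1:4], use = "pairwise.complete.obs"), 2)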
#(c)
summary(dat)
# mean-impute the missing Solar.R values, using the complete-case mean
nas.solar = is.na(dat$Solar.R)
mean.solar = mean(dat.nona$Solar.R)
dat$Solar.R[nas.solar] = mean.solar
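# Quick sanity check (extra, not required): no Solar.R values should remain missing.
sum(is.na(dat$Solar.R))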
sdat=na.omit(dat)
#install.packages("glmnet")
library(glmnet)
# lasso on the complete cases: design matrix without an intercept column
# (glmnet adds its own intercept)
sx = model.matrix(Wind~.+0,data=sdat)
sy = sdat$Wind
cv.l = cv.glmnet(sx,sy) # 10-fold CV for the lasso (alpha=1 by default)
slo = glmnet(sx,sy,lambda=cv.l$lambda.min)
pre.Wind = predict(slo,newx=sx)
mean((pre.Wind-sy)^2) # training MSE: 8.136048
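# Optional extra check (not part of the marked answer): the CV curve and the
# coefficients retained at lambda.min.
plot(cv.l)
coef(cv.l, s = "lambda.min")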
#(d)
#fit linear model
linearModel <- lm(Ozone ~ Temp, data=dat)
#view model summary
summary(linearModel)
#create a new variable for squared temperature (Temp2)
data=dat
data$Temp2 <- data$Temp^2
#fit quadratic regression model
quadraticModel <- lm(Ozone ~ Temp + Temp2, data=data)
#view model summary
summary(quadraticModel)
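# Optional extra check: the linear and quadratic fits are nested, so an F-test via
# anova() indicates whether the Temp^2 term adds explanatory power.
anova(linearModel, quadraticModel)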
# impute the missing Ozone values using the quadratic model in Temp
nas.ozone = is.na(dat$Ozone)
dat$Ozone[nas.ozone] = predict(quadraticModel, newdata=data[nas.ozone,])
newdat = dat
# refit the lasso on the data with model-based Ozone imputation
newx = model.matrix(Wind~.+0,data=newdat)
newy = newdat$Wind
cv.l = cv.glmnet(newx,newy)
slo = glmnet(newx,newy,lambda=cv.l$lambda.min)
pre.Wind=predict(slo,newx = newx)
mean((pre.Wind-newy)^2)#9.003919
#(e)
#install.packages("tree")
library(tree) # contains... tree-building methods
tree.out = tree(Wind~., dat)
summary(tree.out)
# plot the tree
plot(tree.out)
text(tree.out, pretty=0)
#training MSE
pre.wind=predict(tree.out,newdata = dat)
mean((pre.wind-newy)^2)#4.716928
#(f)
# pruning:
ptree = prune.tree(tree.out, best=10)
ptree
summary(ptree)
#training MSE
pre.wind=predict(ptree,newdata = dat)
mean((pre.wind-newy)^2)#5.681356
#plot pruned tree
plot(ptree)#pruned
text(ptree, pretty=0)
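# Optional extra (uses the default deviance-based cost-complexity sequence in
# cv.tree): cross-validate over subtree sizes to see whether best=10 is reasonable.
cv.out = cv.tree(tree.out)
plot(cv.out$size, cv.out$dev, type="b", xlab="tree size", ylab="deviance")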
#Q3
#(a)
#install.packages("ISLR")
#install.packages("gbm")
#install.packages("randomForest")
library(ISLR) # for the data
library(gbm)
library(randomForest)
x.train = Khan$xtrain
x.test = Khan$xtest
y.train = as.factor(Khan$ytrain)
y.test = as.factor(Khan$ytest)
set.seed(4061)
table(y.train)
table(y.test)
length(y.train)
length(y.test)
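# Extra check: dimensions of the gene expression matrices (2308 genes, so far more
# predictors than samples).
dim(x.train)
dim(x.test)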
#(c)
# grow a forest:
# grow a forest (classification, since the response is a factor):
rf.out = randomForest(x=x.train, y=y.train)
# fitted values for "training set"
rf.yhat = predict(rf.out, x.train, type="class")
tb.rf = table(rf.yhat, y.train)
#(d)
# fitted values for "test set"
rf.pred = predict(rf.out, x.test, type="class")
tb.rf = table(rf.pred, y.test)
sum(diag(tb.rf))/sum(tb.rf)
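# Optional extra: the out-of-bag confusion matrix gives an internal error estimate
# without using the test set.
rf.out$confusion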
#(e)
which(rf.out$importance>0.4) # indices of genes whose importance exceeds 0.4
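# Optional alternative view (uses the default importance measure stored in the
# first column of rf.out$importance): genes ranked by decreasing importance.
head(sort(rf.out$importance[,1], decreasing=TRUE), 10)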
#(f)
varImpPlot(rf.out, pch=15)
#(g)
set.seed(4061)
# the outcome has 4 tumour classes, so use the multinomial distribution
# (bernoulli is only appropriate for 0/1 outcomes)
gb.out = gbm(y.train~., data=as.data.frame(x.train),
             distribution="multinomial", n.trees=100)
gb.p = predict(gb.out, newdata=as.data.frame(x.test), n.trees=100, type="response")
# predicted class = class with the largest predicted probability
gb.class = factor(apply(gb.p[,,1], 1, which.max), levels=1:4, labels=levels(y.train))
tb.gb = table(gb.class, y.test)
sum(diag(tb.gb))/sum(tb.gb)
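# Optional extra: relative influence of the genes in the boosted model
# (plotit=FALSE just returns the table instead of drawing the barplot).
head(summary(gb.out, plotit=FALSE), 10)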