R notes ST4060 17/02/2022

# w2----------------

# Q2.1---------

rm(list = ls())

N <- 1000

x <- rexp(N,rate=0.5)

hist(x,freq = F)

xx <- seq(0,15,by=0.1)# didn't remember how to add this

lines(xx,dexp(xx,rate = 0.5),col='red')# this one is the theoretical density

lines(density(x),col='blue')# didn't remember how to add this; this one is the estimated density, keep the two separate

?density()

lines(density(x,kernel = "rectangular"),col='orange')

# how to plot the KDE directly

plot(density(x), lwd=3, col=2)

rug(x)

?rug# adds a rug representation (1-d plot) of the data to an existing plot

density(x)$bw

ud <- density(x,bw=density(x)$bw/10)# under-smoothing: a much smaller bandwidth than the default

ov <- density(x,bw=density(x)$bw*10)# over-smoothing: a much larger bandwidth than the default

plot(ud)

lines(ov)

# Q2.2---------

eps <- 0.95

N <- 100

u <- runif(N)# uniform draws decide which component each observation comes from

x <- numeric(N) # we want to show the generated sample, so call it x rather than y

for(i in 1:N){# this is just how the mixture is defined

  if(u[i]>eps){

    x[i] <- rt(1,df=3)

  }else{

    x[i] <- rnorm(1)

  }


}

hist(x)

# Q2.3---------

n <- 1000

set.seed(4060)

# (a)

x1 <- rnorm(n,mean=2,sd=1)

f1 <- density(x1)

hist(x1,freq = F)

xx1 <- seq(-1,7,by=0.1)

lines(xx1,dnorm(xx1,mean = 2,sd=1),col='red')

lines(f1,col='blue')

f1 <- density(x1,adjust =1.3)

lines(f1,col='orange')

mse1 <- mean((dnorm(density(x1)$x,mean=2,sd=1)-density(x1)$y)^2)# MSE of the default KDE against the true N(2,1) density

lines(density(x1,adjust = 1.5),col='darkgreen')# an even smoother KDE for comparison

# (b)

x2 <- rt(n,df=3)

f2 <- density(x2)$y

hist(x2,freq = F,ylim = c(0,0.6))# control the y-axis range here!!!

xx2 <- seq(-15,15,by=0.1)

lines(xx2,dt(xx2,df=3),col='red')

lines(density(x2),col='blue')

hist(x2,freq = F,ylim = c(0,0.6),breaks = 21)

xx2 <- seq(-15,15,by=0.1)

lines(xx2,dt(xx2,df=3),col='red')

lines(density(x2),col='blue')

# (c)

x3 <- rexp(n,rate = 2)

f3 <- density(x3)$y

hist(x3,freq = F,ylim = c(0,1.5))

lines(density(x3),col='blue')

xx3 <- seq(0,3.5,length=1000)

lines(xx3,dexp(xx3,rate=2),col='red')

# Q2.4---------

library(KernSmooth)

dat <-  faithful

# (a)

bw = apply(faithful, 2, sd)/2

fx <- bkde2D(dat,bandwidth = bw)

plot(dat,xlim=c(1,6),ylim=c(35,105))

contour(fx$x1, fx$x2, fx$fhat,add = T)# didn't remember this

persp(fx$x1, fx$x2, fx$fhat, shade=.15, theta=-30, phi=15)

# 总结

# the smaller h (bw / adjust), the more squiggly the KDE: taller and narrower peaks

# the larger h (bw / adjust), the smoother and flatter the KDE
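# a minimal sketch of this rule of thumb on a toy sample (z is hypothetical, not part of the exercise):

z <- rnorm(200)

plot(density(z, adjust = 0.2), col = 'red', main = 'adjust = 0.2 / 1 / 3')# small adjust: squiggly, tall and narrow

lines(density(z), col = 'black')# default bandwidth

lines(density(z, adjust = 3), col = 'blue')# large adjust: smooth and flat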

# (b)

lp.fit = locpoly(faithful$eruptions, faithful$waiting, bandwidth=1)# didn't remember this

# when asked only for a local polynomial fit this is all you write; the smaller the bandwidth the wigglier the fit; 1 is not the default value, pick one yourself

plot(faithful, pch=20)

lines(lp.fit, lwd=3, col=1)

?locpoly

# Q2.5---------

# I couldn't do this one at all

# (a)

n = 1000

set.seed(4060)

Mu = c(1,4)# pick your own values

Sigma = c(2,1)# pick your own values

X1 = rnorm(n, mean=Mu[1], sd=Sigma[1])

X2 = rnorm(n, mean=Mu[2], sd=Sigma[2])

X = cbind(X1,X2)

# (b)

library('MASS')

bw = c(.5, .5) * 4

f.kde2d = kde2d(X1, X2, h=bw) # h takes two bandwidth values, one per coordinate; it is not an x-versus-y regression

names(f.kde2d)

plot(X,xlim=c(-6,12),ylim=c(0,8))

contour(f.kde2d$x,f.kde2d$y,f.kde2d$z,add = T)# contour() needs all three components here, or you can simply pass f.kde2d directly

# (c) not examinable

# w3----------------

rm(list = ls())

# Q5.1 MC integration-----

# compute the integral of e^(-x) over (2,4) (about 0.117); didn't remember this

# g(x) = e^(-x); x ~ f(x) = 1/2, i.e. Unif(2,4); the integral equals 2*E[g(X)]

# the fact that x ~ f(x) = 1/2 = Unif(2,4) is the key point!!!!!

M <- 1000

a <- 2

b <- 4

x <- runif(M,a,b)

theta.est <- (sum(exp(-x))/M)*2;theta.est
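# check against the exact value of the integral, which is known in this example:

exp(-2) - exp(-4)# about 0.1170, so the MC estimate above should be close to this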

# Q5.2-------

N <- 1000

x <- 1

a <- x

b <- -x

t <- runif(N,min=b,max = a)

prob <- ((sum((1/sqrt(2*pi))*exp(-0.5*(t^2))))/N)*2

rs <- 0.5+prob/2
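# reasoning: the integrand is the N(0,1) density, so prob estimates P(-1 < Z < 1) = 2*(pnorm(1) - 0.5),

# hence pnorm(1) = 0.5 + prob/2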

# check:

pnorm(1)

# to evaluate the variance of the estimator, wrap the above in a Monte Carlo loop

M <- 999

v <- numeric(M)

for(i in 1:M){

  t <- runif(N,min=b,max = a)

  prob <- ((sum((1/sqrt(2*pi))*exp(-0.5*(t^2))))/N)*2

  rs <- 0.5+prob/2

  v[i] <- rs

}

mean(v)#check

var(v)

quantile(v,c(0.025,0.975))

# Q5.3-------

# (a)

n <- 1000

x <- rnorm(n)

is <- order(x)

xk <- x[is]

k <- 0

xnew <- xk[c((k+1):(n-k))]

length(xnew)

mn <- numeric(n/2)

for(k in 0:(n/2-1)){

  xnew <- xk[c((k+1):(n-k))]

  mn[k+1] <- mean(xnew)

}

mn

# (b)

k <- 1 # here k is fixed

M = 1000

n = 20 #sample size

tm <- numeric(M)

mn <- numeric(M)

for (m in 1:M) {

  x <- rnorm(n,mean=15,sd=3)

  mn[m] <- mean(x)

  xs <- sort(x)

  xnew <- xs[c((k+1):(n-k))]

  tm[m] <- mean(xnew)

}

mse1 <- mean((mn-15)^2)

mse2 <- mean((tm-15)^2)

b1 <- mean(mn-15)

b2 <- mean(tm-15)

mse1

var(mn)+b1^2# MSE = variance + bias^2

mse2

var(tm)+b2^2
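# these two sums should closely match mse1 and mse2; any small difference comes from var() dividing by M-1 rather than M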

# Q5.4-------

# (a)

N <- 50

M <- 100

a <- 7

b <- 3

x <- rep(c(1:5), N/5)

m <- 0.5

s <- 1.2

rseed=0

LSvec = RMvec = matrix(0,2,M)

set.seed(rseed)

library(MASS)

library(VGAM)

# (b)

for(i in 1:M){

  e <- rlnorm(N,m,s)# m = 0.5 is the meanlog set above; do not reuse m as the loop index

  y <- a+b*x+e

  mylm <- lm(y~x)

  myrlm <- rlm(y~x,method="MM")

  LSvec[,i] <- coef(mylm)# store the least-squares estimates of (a,b)

  RMvec[,i] <- coef(myrlm)# store the robust MM estimates of (a,b)

}
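# one possible way to compare the two slope estimators across the M replications (a sketch, not from the original notes):

boxplot(LSvec[2,], RMvec[2,], names=c("lm (LS)","rlm (MM)"), main="Slope estimates under lognormal errors")

abline(h=b, col='red')# true slope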

# Q5.5-------

M <- 10000

N <- 50

x <- rnorm(N,mean = 0,sd=2)

s1 <- s2 <- numeric(M)

?var()

for(m in 1:M){

  x <- rnorm(N,mean = 0,sd=2)

  s1[m] <- sd(x)

  s2[m] <- sqrt(mean((x-mean(x))^2))# this one is the biased (divide-by-N) estimator, remember

}

mean(s1)

mean(s2)

boxplot(s1,s2)

M <- 10000

N <- 100

x <- rnorm(N,mean = 0,sd=2)

s1 <- s2 <- numeric(M)

?var()

for(m in 1:M){

  x <- rnorm(N,mean = 0,sd=2)

  s1[m] <- sd(x)

  s2[m] <- sqrt((N-1)/N)*s1[m]# the biased (divide-by-N) version of the sd, derived from sd()

}

sd(s1)

sd(s2)

boxplot(s1,s2)

# w4----------------

rm(list = ls())

iris <- iris

x <- subset(iris$Sepal.Length,iris$Species=='setosa')

xbar <- mean(x,na.rm = T)

n <- length(x)

B <- 100

xb <- numeric(B)

for(b in 1:B){

  #idx <- sample(1:n,n,replace = T)

  tst <- sample(x,n,replace = T)# so this is equivalent to resampling indices as in the commented lines

  #xnew <- x[idx]

  xb[b] <- mean(tst)

}

mean(xb)

mean(xb)-xbar#bias

#estimate of SE:

sd(xb)

# Q5.9-------

cars <- cars

x <- cars$speed

y <- cars$dist

B <- 10000

n <- nrow(cars)

lm1 <- lm(y~x)

summary(lm1)

lm1$coefficients[2]

slp <- itc <- numeric(B)

for(b in 1:B){

  idx <- sample(1:n,n,replace = T)

  ynew <- y[idx]

  xnew <- x[idx]

  lmb <- lm(ynew~xnew)

  slp[b] <- lmb$coefficients[2]

  itc[b] <- lmb$coefficients[1]

}

# Bs SE estimate:

sd(slp)

sd(itc)

#Bs estimate of bias:

mean(slp)-coef(lm1)[2]

# naive bootstrap 95% CI

quantile(slp,c(0.025,0.975))
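# for comparison, the basic ("reverse percentile") bootstrap CI reflects the same quantiles about the full-data estimate (a sketch):

2*coef(lm1)[2] - quantile(slp, probs=c(0.975,0.025))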

# MC matrix-------

# fill every entry of the matrix with an MC mean estimate

M <- 100

mu <- 10

sigma <- 3

Ns <- c(20,30,50,100,200,500)

mt <- matrix(NA,nrow = M,ncol = length(Ns))

for (j in 1:length(Ns)) {

  for (m in 1:M) {

    x <- rnorm(Ns[j],mean = mu,sd=sigma)

    mt[m,j] <- mean(x)

  }

}

apply(mt,2,mean)

boxplot(mt,names=Ns,xlab='sample size N')

# theoretical CI:

lb <- mu-1.96*sigma/sqrt(Ns)#lower bound

ub <- mu+1.96*sigma/sqrt(Ns)#upper bound

# empirical/practical CI:

lb <- apply(mt,2,mean)-1.96*sigma/sqrt(Ns)#lower bound

ub <- apply(mt,2,mean)+1.96*sigma/sqrt(Ns)#upper bound
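# an alternative, fully empirical CI taken directly from the MC sampling distribution (a sketch):

apply(mt, 2, quantile, probs = c(0.025, 0.975))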

# Q5.6-------

M <- 100

ndf <- c(2,4,10)

n <- 100

mt <- matrix(NA,nrow = M,ncol = 3)

for(j in 1:3){

  for (i in 1:M) {

    x <- rchisq(n,df=ndf[j])

    mt[i,j] <- mean(x)

  }

}

boxplot(mt[,1],mt[,2],mt[,3])

# note that E(X) = df for a chi-square; I didn't remember this
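# quick check of that fact:

apply(mt,2,mean)# column means should be close to 2, 4 and 10 respectively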

sd <- apply(mt,2,sd)#the estimates of standard error of xbar

# w5----------------

# 2020-21 CA1 Q1------

set.seed(6040)

M <- 1000

N <- 1000

r <- numeric(M)

for(m in 1:M){

  x <- sample(1:N,N,replace = T)

  xnew <- unique(x)

  r[m] <- length(xnew)/N

}

mean(r)

# 2020-21 CA1 Q2------

set.seed(6015)

M <- 1000

N <- 100

a <- 3

b <- 2

mn <- numeric(M)

for(m in 1:M){

  x <- rgamma(N,shape = a,rate = b)

  mn[m] <- mean(x)

}

# (a)

mean(mn)

# (b)

# not surprising: the true value is a/b = 1.5, and the larger M and N are, the closer this gets to 1.5
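# quick check of the true value:

a/b# = 1.5, the mean of a Gamma(shape = 3, rate = 2)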

# (c)

sd(mn)

# 2020-21 CA1 Q3------

# read carefully: the question is only about Height (in feet) and Girth

set.seed(4060)

dat <- trees

B <- 100

slp <- numeric(B)

n <- nrow(dat)

for(b in 1:B){

  idx <- sample(c(1:n),n,replace = T)

  ynew <- dat$Height[idx]

  xnew1 <- dat$Girth[idx]

  xnew2 <- dat$Volume[idx]

  lmb <- lm(ynew~xnew1+xnew2)

  slp[b] <- lmb$coefficient[2]

}

lmb <- lm(dat$Height~dat$Girth+dat$Volume)

lmb$coefficients[2]

# (a)

boxplot(slp)

# (b)

mean(slp)

# (c)

sd(slp)

# (d)

# bootstrap empirical (basic) CI

lso <- lm(Height~Girth+Volume,data=dat)$coef# the full-data coefficients; lso[2] is the Girth slope

2*lso[2] - quantile(slp, probs=c(0.975,0.025))# I couldn't write this one out

# w6----------------

mtcars <- mtcars

n <- nrow(mtcars)

dat <- mtcars[sample(1:n,n),]

y = dat$mpg

x <- dat[,-1]

glm1 <- glm(y~.,data = x)

yhat <- glm1$fitted.values# extract the fitted values

plot(y,yhat)

boxplot((y-yhat)^2)

mean((y-yhat)^2)

train.idx <- c(1:floor(n/2))

train.idx <- c(1:16)

glm.train <- glm(y[train.idx]~.,data = x[train.idx,])

o1 <- glm(y~.,data = x,subset = train.idx)# equivalent to the fit above

ytrain <- glm.train$fitted.values

ytst <- predict(glm.train,newdata = x[-train.idx,])

mse.train <- mean((glm.train$residuals)^2)

mse.tst <- mean((y[-train.idx]-ytst)^2)

boxplot((glm.train$residuals)^2,(y[-train.idx]-ytst)^2)

boxplot((ytrain-y[train.idx])^2)

# now add the bootstrap:

B <- 100

mseb.train <- mseb.tst <- numeric(B)

for(b in 1:B){

  idx <- sample(1:n,n,replace = T)

  glm.t <- glm(y~.,data=x,subset = idx)

  mseb.train[b] <- mean((glm.t$residuals)^2)

  ytst <- predict(glm.t,newdata=x[-idx,])

  mseb.tst[b] <- mean((y[-idx]-ytst)^2)

}

boxplot(mseb.train,mseb.tst)

# k-fold CV

K <- 5

msecv.train <- msecv.tst <- numeric(K)

folds <- cut(1:n,K,labels = F)

table(folds)

for(k in 1:K){

  idx <- which(folds!=k)

  glm.t <- glm(y~.,data = x,subset = idx)

  msecv.train[k] <- mean((glm.t$residuals)^2)

  ytst <- predict(glm.t,newdata = x[-idx,])

  msecv.tst[k] <- mean((y[-idx]-ytst)^2)

}

boxplot(msecv.train,msecv.tst)

B <- 10

mseb.train <- mseb.tst <- numeric(B)

for(b in 1:B){

  idx <- sample(1:n,n,replace = T)

  glm.t <- glm(y~.,data=x,subset = idx)

  mseb.train[b] <- mean((glm.t$residuals)^2)

  ytst <- predict(glm.t,newdata=x[-idx,])

  mseb.tst[b] <- mean((y[-idx]-ytst)^2)

}

boxplot(mseb.train,mseb.tst)

# with the bootstrap, about 32 - 32*0.632 ≈ 12 observations (roughly 10) end up in the test set

# with K = 3 CV, about 32/3 ≈ 10 observations are used for testing

# so with K = 3 we would expect the two pairs of errors to be comparable; here K = 5, and the

# K-fold CV errors are more accurate and less widespread; increasing B reduces the spread of the bootstrap errors but produces many outliers
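# a quick empirical check of the 0.632 rule for n = 32 (a sketch):

mean(replicate(1000, length(unique(sample(1:32, 32, replace = TRUE))))) / 32# about 0.63 of the rows are in-bag on average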

# Q5.8--------------

rm(list = ls())

library(bootstrap)

dat <- law

cor(dat$LSAT,dat$GPA)

B <- 1000

n <- nrow(dat)

cr <- numeric(B)

for(b in 1:B){

  idx <- sample(1:n,n,replace = T)

  x1 <- dat$LSAT[idx]

  x2 <- dat$GPA[idx]

  cr[b] <- cor(x1,x2)

}

mean(cr)

# bias

mean(cr)-cor(dat$LSAT,dat$GPA)

# naive CI

quantile(cr,c(0.025,0.975))

# Q5.9--------------

rm(list = ls())

dat <- cars

# (a)

lm1 <- lm(dat$dist~dat$speed)

lm1$coefficients

# (b)

B <- 10000

n <- nrow(dat)

slp <- numeric(B)

itc <- numeric(B)

for(b in 1:B){

  idx <- sample(1:n,n,replace = T)

  xb <- dat$speed[idx]

  yb <- dat$dist[idx]

  lmb <- lm(yb~xb)

  slp[b] <- lmb$coefficient[2]

  itc[b] <- lmb$coefficient[1]

}

lm1$coefficient

mean(slp)

mean(itc)

# (c)

# didn't know how to do this

# the question just asks to look at the distributions of the two estimators; a histogram is enough

hist(slp)

hist(itc)

# (d)

# didn't know how to do this

# remember how to look at the joint distribution------

library(ellipse)

plot(itc,slp,cex=0.5,pch=20)

e <- ellipse(lm1)

plot(e,t='l')

bcoef <- cbind(itc,slp)

points(bcoef,pch=20,col=8,cex=.5)

points(itc,slp,pch=20,col=1,cex=.5)# same thing

sd(slp)

sd(itc)

# 用命令直接Bs--------

library(boot)

# define your own function; its 1st argument must be the data and its 2nd the bootstrap indices:

my.mean <- function(x, ib){

  return( mean(x[ib]) )

}

B = 100

x = cars$dist

ob = boot(x, my.mean, R=B)

# 'original' in ob is the statistic computed on the original sample, i.e. mean(x)

# Q5.10 how to use nls----------

rm(list = ls())

x = mtcars$disp

y = mtcars$mpg

plot(x,y,pch=20)# there is an inflexion, so a nonlinear model may work better

# nonlinear regression, e.g. polynomial / exponential

theta <- c(3,-0.01)

nls1 <- nls(y~exp(a+b*x),start = list(a=3,b=-0.01))

fitted(nls1)

plot(fitted(nls1),y)

plot(fitted(nls1)-y)# these are the residuals; there should be no pattern

hist(fitted(nls1)-y)

# (a) how to extract the nls parameters--------

summary(nls1)

summary(nls1)$parameter[1,1]

summary(nls1)$parameter[2,1]

plot(x,y,pch=20)

is = order(x)# returns the positions of the values in increasing order

#20 19 18 26 28  3 ......

x[20]

min(x)

lines(x[is], fitted(nls1)[is], pch=20, cex=2, col=4, lwd=2)

?order

# (b)

z <- density(x)

plot(z)

# (c)

set.seed(1)

B = 100

n <- length(x)

theta1 <- numeric(B)

theta2 <- numeric(B)

for(b in 1:B){

  idx <- sample(1:n,n,replace = T)

  xnew <- x[idx]

  ynew <- y[idx]

  bnl <- nls(ynew~exp(a+b*xnew),start = list(a=3,b=-0.01))

  theta1[b] <- summary(bnl)$parameter[1,1]

  theta2[b] <- summary(bnl)$parameter[2,1]

}

mean(theta1)

mean(theta2)

sd(theta1)

sd(theta2)

#se:

sd(theta1)

# nonparametric 95% confidence intervals = the naive (percentile) method
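# e.g. the naive (percentile) bootstrap CIs for the two nls parameters above:

quantile(theta1, c(0.025, 0.975))

quantile(theta2, c(0.025, 0.975))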

# Q2.11----------

#  coerce = to force (vocabulary note)

rm(list = ls())

library(MASS)

x = Boston[,c('crim','indus','rm','tax')]

y = Boston$medv

lm1 <- lm(y~.,data = x)

# (a)

n <- nrow(x)

a <- c(1:floor(n/2))

lma <- lm(y~.,data = x,subset = a)

preds <- predict(lma,newdata = x[-a,])

length(preds)

rmsea <- sqrt(mean((y[-a]-preds)^2))

# (b)--------leave-one-out CV

preds <- resi <- numeric(n)

for(i in 1:n){

  tot <- c(1:n)

  b <- tot[-i]

  lmb <- lm(y~.,data = x,subset = b)

  preds[i] <- predict(lmb,newdata = x[i,])

  resi[i] <- y[i]-preds[i]

  rmseb <- sqrt(mean((resi)^2))

}

# (c)

K <- 5

folds <- cut(1:n,K,labels = F)

rmse <- numeric(K)

for(k in 1:K){

  train.idx <- which(folds!=k)

  tst.idx <- which(folds==k)

  lmc <- lm(y~.,data = x,subset = train.idx)

  predsc <- predict(lmc,newdata = x[tst.idx,])

  rmse[k] <- sqrt(mean((y[tst.idx]-predsc)^2))

}

# but we forgot to shuffle the data first, so:

dat <- Boston[sample(1:n,n),]

x = dat[,c('crim','indus','rm','tax')]

y = dat$medv

K <- 5

folds <- cut(1:n,K,labels = F)

rmse <- numeric(K)

for(k in 1:K){

  train.idx <- which(folds!=k)

  tst.idx <- which(folds==k)

  lmc <- lm(y~.,data = x,subset = train.idx)

  predsc <- predict(lmc,newdata = x[tst.idx,])

  rmse[k] <- sqrt(mean((y[tst.idx]-predsc)^2))

}

rmse

# CV repeated R times---------

# reshuffle the data at the start of each repeat

R <- 10

K <- 5

folds <- cut(1:n,K,labels = F)

rmse <- numeric(0)# results are appended inside the loop below

for (r in 1:R) {

  is <- sample(1:n,n)

  x <- x[is,]

  y <- y[is]

  for(k in 1:K){

    train.idx <- which(folds!=k)

    tst.idx <- which(folds==k)

    lmc <- lm(y~.,data = x,subset = train.idx)

    predsc <- predict(lmc,newdata = x[tst.idx,])

    rmse <- c(rmse,sqrt(mean((y[tst.idx]-predsc)^2)))# this is how to accumulate results inside a nested for loop

  }

}

rmse

# w7----------------

rm(list = ls())

# in-class example, Section 3 p.8--------

attach(pressure)

x <- temperature

y <- pressure

plot(x,y,pch=20,cex=0.5)# clearly not a linear pattern

lm1 <- lm(y~x)

lines(x,fitted(lm1))

lmf <- lm(y~x+I(x^2)+I(x^3)+I(x^4)+I(x^5))

plot(x,y,pch=20,cex=0.5)

lines(x,fitted(lmf))

nls1 <- nls(y~exp(a+b*x),start = list(a=0,b=0.02))

summary(nls1)

lines(x,fitted(nls1),col='red')

cc <- coef(nls1)

coefficients(nls1)

curve(exp(cc[1]+cc[2]*x),col='blue',add=T)# same as using lines()

nls1 <- nls(y~exp(a+b*x),start = list(a=0,b=0.02))

summary(nls1)

nlstst <- nls(y~exp(a+b*x),start = list(a=0,b=0.02),trace = T)

summary(nlstst)

crit <- function(theta,x,y){

  return( sum( (y-x*theta)^2 ) )

}

thbar = 1.8# true value of theta

#x = rep(c(1,3,7,8), len=100)

#y = x*thbar + rnorm(x)

optim.out = optim(par=c(1), fn=crit, x=x, y=y, method='L-BFGS-B',lower=c(-1), upper=c(0))

optim.out # $par gives the estimated value of theta

curve(exp(1e-07*x),col='blue',add=T)

# grid search----------

# nls(y~exp(a+b*x),start = list(a=0,b=0.02))

# now suppose we know nothing about the values of a and b

x <- temperature

y <- pressure

L <- 10

a.grid <- seq(-1,1,length=L)

b.grid <- seq(0.01,0.5,length=L)

crit <- matrix(NA,ncol=L,nrow=L)

for(i in 1:L){

  for(j in 1:L){

    crit[i,j] <- sum( (y-exp(a.grid[i]+b.grid[j]*x))^2 )


  }

}

persp(a.grid,b.grid,crit,theta=45,col = 'lightblue')

b.grid <- seq(0.01,0.02,length=L)

crit <- matrix(NA,ncol=L,nrow=L)

for(i in 1:L){

  for(j in 1:L){

    crit[i,j] <- sum( (y-exp(a.grid[i]+b.grid[j]*x))^2 )


  }

}

persp(a.grid,b.grid,crit,theta=45,col = 'lightblue')

contour(a.grid,b.grid,crit)

# what a good grid-search criterion surface looks like---------

# BONUS: a nicer example of a criterion surface plot...

rm(list=ls())

set.seed(4060)

n = 100

x = runif(n,-1,1)

y = .2*x+.3*x^2

plot(x,y)

L = 100

ag = seq(-1,1,length=L)

bg = seq(-1,1,length=L)

crit = matrix(NA,nrow=L,ncol=L)

for(i in 1:L){

  for(j in 1:L){

    crit[i,j] = sum( (y-ag[i]*x-bg[j]*x^2)^2 )

  }

}

persp(ag,bg,crit,col='lightblue',theta=45)

contour(ag,bg,crit)

# how to fit any model using optim()----------

remove(list = ls())

x <- temperature

y <- pressure

crit <- function(th,x,y){

  a <- th[1]

  b <- th[2]

  return(sum((y-exp(a+b*x))^2))

}

range(x)

range(y)

?optim

opt <- optim(par = c(0,0.02),fn=crit,x=x,y=y)

# argument par: initial values for the parameters to be optimized over.

opt$par

opt <- optim(par = c(0,0.02),fn=crit,x=x,y=y,

            method = 'L-BFGS-B', lower=c(0.01,0.02), upper=c(0.1,0.03))

opt$par

opt <- optim(par = c(0,0.02),fn=crit,x=x,y=y,

            method = 'L-BFGS-B', lower=c(0,0.02), upper=c(0.1,0.03))

opt$par

opt <- optim(par = c(0,0.02),fn=crit,x=x,y=y,

            method = 'L-BFGS-B', lower=c(-1,0.02), upper=c(0.1,0.03))

opt$par

opt <- optim(par = c(0,0.02),fn=crit,x=x,y=y,

            method = 'L-BFGS-B', lower=c(-0.6,0.02), upper=c(0.1,0.03))

opt$par

nls1 <- nls(y~exp(a+b*x),start = list(a=0,b=0.02))

summary(nls1)

yhat <- exp(opt$par[1]+opt$par[2]*x)

plot(x,y)

lines(x,fitted(nls1),col='red')

lines(x,yhat,col='blue')

opt1 <- optim(par = c(0,0.02),fn=crit,x=x,y=y)

yhat1 <- exp(opt1$par[1]+opt1$par[2]*x)

lines(x,yhat1,col='orange')# this one is close to the nls fit

# should we add a third parameter c?

crit2 <- function(th,x,y){

  # Must have theta as 1st parameter and return a single value...

  # Here we implement a scaled exponential model

  # for the sake of the example:

  a = th[1]

  b = th[2]

  c = th[3] ## additional scaling parameter

  return( sum( (y-c*exp(a+b*x))^2 ) )

}

opt2 <- optim(par = c(0,0.02,1),fn=crit2,x=x,y=y)

opt2$par

yhat2 <- opt2$par[3]*exp(opt2$par[1] + (opt2$par[2])*x)# because c is a scale parameter

# The result is not great: it is essentially identical to the fit without c, so there is no need for this extra parameter

plot(x, y, pch=20,

    xlab="Temperature (Celsius)",

    ylab="Vapor pressure (ml of mercury)",

    main="Example: polynomial regression")

lines(x, yhat2, col='navy')

lines(x, fitted(nls1), col='blue')

lines(x, yhat, col=2, lwd=3)

# w8----------------

#1D overfitting

rm(list = ls())

dat <- na.omit(airquality)

x <- dat$Temp

y <- dat$Ozone

plot(x,y)

dat <- data.frame(x,y)# makes subset= and predict() easier; note data.frame, not as.data.frame!!!!

x <- data.frame(x)# doing only this does not work!!!!

n <- nrow(x)

train.idx <- which(y<100)# which() also returns positions

#tst.idx <- which(y>=100)

lmt <- lm(y~.,data = dat,subset = train.idx)

preds <- predict(lmt,newdata = dat[-train.idx,])

sqrt(mean( (preds-y[-train.idx])^2 ))

# multivariate overfitting

rm(list = ls())

dat <- mtcars

n <- nrow(dat)

R <- 20

K <- 2

fit <- numeric(K*R)

mse <- numeric(K*R)

fit.l <- numeric(K*R)

mse.l <- numeric(K*R)

folds <- cut(1:n,K,labels = F)

set.seed(4060)

library('glmnet')

for( r in 1:R){

  dat <- dat[sample(1:n,n),]

  x <- dat[,-1]

  y <- dat$mpg

  xm <- as.matrix(x)# use this throughout, in case x contains categorical variables

  # glm can recognise categorical variables by itself

  for(k in 1:K){

    train.idx <- which(folds!=k)

    glm1 <- glm(y~.,data = x,subset = train.idx)

    yh <- fitted(glm1)

    fit[k+(r-1)*K] <- mean((yh-y[train.idx])^2)

    preds <- predict(glm1,newdata=x[-train.idx,])

    mse[k+(r-1)*K] <- mean((preds-y[-train.idx])^2)

    cv.lb = cv.glmnet(xm[train.idx,],y[train.idx],grouped=FALSE)

    # 10-fold CV

    # the line above is how to find the lasso lambda (a hyper-parameter: it is not part of the model, only a penalty scale)

    lasso =  glmnet(xm[train.idx,],y[train.idx],lambda=cv.lb$lambda.min)

    yh.l = predict(lasso,newx=xm[train.idx,])# how to get the fitted values (yhat)

    fit.l[k+(r-1)*K] = mean((yh.l-y[train.idx])^2)

    yp.l = predict(lasso,newx=xm[-train.idx,])# predictions on the test set

    mse.l[k+(r-1)*K] = mean((yp.l-y[-train.idx])^2)#MSE

  }

}

boxplot(fit,mse,fit.l,mse.l,

        names=c("glm fit","glm error","lasso fit","lasso error"),

        ylim=c(0,300))

boxplot(fit,mse,fit.l,mse.l,

        names=c("glm fit","glm error","lasso fit","lasso error"),

        ylim=c(0,25))

# now the data contain categorical variables

rm(list = ls( ))

library(ISLR)

set.seed(6026)

dat = na.omit(Hitters)

n = nrow(dat)

dat$Salary = log(dat$Salary)# the raw data are too skewed; taking logs makes them closer to normal

REPS = 50

train.error = test.error = numeric(REPS)

lasso.train.error = lasso.test.error = numeric(REPS)

for(j in 1:REPS){

  dat = dat[sample(1:n, n, replace=FALSE),]


  i.train = c(1:200)# use the first 200 rows as training, the remaining 63 as test

  train = dat[i.train,]

  test = dat[-i.train,]

  fit = glm(Salary~., data=train)

  pred = predict(fit, newdata=test)


  train.error[j] = mean(fit$residuals^2)

  test.error[j] = mean((pred-test$Salary)^2)


  xm = model.matrix(Salary~., data=train)[,-1]

  newxm = model.matrix(Salary~., data=test)[,-1]

  y = train$Salary

  lam = cv.glmnet(xm, y)

  lasso = glmnet(xm, y, lambda=lam$lambda.min)

  lasso.pred = predict(lasso, newx=newxm)


  lasso.fit = predict(lasso, newx=xm)

  lasso.train.error[j] = mean((lasso.fit-train$Salary)^2)

  lasso.test.error[j] = mean((lasso.pred-test$Salary)^2)

}

summary(fit)

coef(lasso) # how to see which coefficients the lasso has muted down (shrunk towards zero)-----------

par(font=2, font.axis=2)

boxplot(train.error, test.error,

        main="MLB player salary prediction",

        names=c("Training error", "Test error"),

        col=c('pink','cyan'))

# another example: Prestige

rm(list = ls( ))

library(car)  # contains dataset Prestige

library('glmnet')

pairs(Prestige)

y = Prestige$income

x = Prestige[,c("education","prestige","women")]

class(x) # note x is of class "data.frame" as opposed to "matrix"

lmo = lm(y~., data=x) # this works though (check out ?lm)

summary(lmo)

# how to predict from this fit:

newx = data.frame(education=seq(11,16,length=5),prestige=seq(14,80,length=5),women=seq(10,90,length=5))

lmo.pred = predict(lmo, newdata=newx)

# predict just one new observation:

newx = data.frame(education=c(13),prestige=c(61),women=c(12))

lmo.pred = predict(lmo, newdata=newx)

xm = model.matrix(y~., data=x)[,-1]

lasso.cv = cv.glmnet(xm, y, alpha=1)#Cross-validation for glmnet

lasso = glmnet(xm, y, alpha=1, lambda=lasso.cv$lambda.min)

# let's compare LASSO with our earlier linear regression output:

cbind(coef(lasso), coef(lmo))

# how to predict from this fit:

# ?predict.glmnet

class(newx)

lasso.pred = predict(lasso, newx=as.matrix(newx))

# NB:

cbind(lasso.pred, lmo.pred) # close enough to the lm() predictions

mean((lasso.pred-lmo.pred)^2)

# percentage error-----------

mean((lasso.pred-lmo.pred)/lmo.pred)*100 # less than 1% error...

# only -0.1155595%, so the original model is good enough

# another example: Blood Pressure

library(glmnet)

setwd('/Users/apple/Desktop/22-r/ST4060/ST4060txt-csv')

dat = read.table(file="Blood Pressure.txt",header=TRUE)

str(dat)

dat$PatientID = NULL # getting rid of this variable

# GLM fit

glm.fit = glm(Systolic~., data=dat)

mean(glm.fit$residuals^2)

# LASSO - note: fitting these models is rather code-heavy...

alpha = 1

# prepare the data:

xm = model.matrix(Systolic~., data=dat)[,-1]

y = dat$Systolic

# compute best lambda:

lam = cv.glmnet(xm, y, alpha=alpha)

# c) fit model using best lambda:

lasso = glmnet(xm, y, lambda=lam$lambda.min)

# d) recover fitted values:

lasso.fit = predict(lasso, newx=xm)

# e) calculate RSS or MSE from LASSO

mean((lasso.fit-y)^2)

rm(list = ls( ))

dat = na.omit(airquality)

x = dat$Temp

y = dat$Ozone

plot(x,y)

nl.crit <- function(x, y, a, b){

  return( mean( (y - a*exp(b*x))^2 ) )

}

# GRID SEARCH:----------

Pa = Pb = 100

a.grid = seq(0,2,length=Pa)

b.grid = seq(0,1,length=Pb)

crit.values = matrix(NA, nrow=Pa, ncol=Pb)

for(i in 1:Pa){

  for(j in 1:Pb){

    crit.values[i,j] = nl.crit(x,y,a.grid[i],b.grid[j])

  }

}

ij.best = which.min(crit.values)

ij.best = arrayInd(ij.best, .dim=dim(crit.values))

# location of the minimum of the surface

# grid search solutions:

a.best = a.grid[ij.best[1]]

b.best = b.grid[ij.best[2]]

fitted = a.best * exp(b.best*x)

nlso = nls(y~a*exp(b*x), start=list(a=a.best, b=b.best))

a.best

b.best

plot(x, y, pch=20, xlab="Temperature (degrees F)",

    ylab="Ozone (ppb)",

    xlim=c(55,105))

points(x, fitted(nlso), pch=20, col='orange')

# optim() can also be used directly to obtain a smooth fit---------

dat = na.omit(airquality)

x = dat$Temp

y = dat$Ozone

#### Pick a nonlinear model and fit it using optim()

# First, write a function that evaluates the criterion:

criterion <- function(param,x,y){

  # The function has to be defined this way

  # (with param as its 1st argument)

  # if we are going to apply optim() to it... cf. ?optim

  a = param[1]

  b = param[2]

  return( mean( (y - a*exp(b*x))^2 ) )

}

# Then, apply optim() to it:

oo = optim(par=list(a=1, b=0.01), fn=criterion, x=x, y=y)

# now reconstruct fitted values from optim() output:

a.hat = oo$par[1]

b.hat = oo$par[2]

y.hat = a.hat * exp(b.hat*x)

plot(x,y)

is <- order(x)

lines(x[is], y.hat[is], pch=21, col=2)

# w9----------------

# B-spline--------

# using dataset ahmd$mx from library MortalityLaws, plot the crude force of mortality curve.

rm(list = ls())

library('MortalityLaws')

age <- c(0:110)

mM <- ahmd$mx[,4]

plot(age,mM,t='b',pch=20)# Crude force of mortality (2010)

plot(age,log(mM))

library('splines')

BM3 <- bs(age)

BM6 <- bs(age,df=6)

matplot(age,BM3,t='b')#Plot Columns of Matrices

matplot(age,BM6,t='b')

BMtst <- bs(age,knots = F)#

# Compute the design matrix for 3 dof’s and quote the projection for age 40 onto

# it. Then reconstruct the B-spline approximation to the crude male force data

# using linear regression. Plot the output over the data points and criticize.

age <- c(0:110)

mM <- ahmd$mx[,4]

BM <- bs(age)# compute bases

lm1 <- lm(mM~BM)

y <- numeric(length(age))# a vector to hold the reconstructed fit

for (i in 1:length(age)) {

  y[i] <- as.numeric(sum(lm1$coef[2:4]*BM[i,])+lm1$coef[1])

}

lm1$coefficients

plot(age,mM,pch=20)

lines(age,y,t='l',col=4)# these two curves are identical

lines(age,fitted(lm1),t='l',col=2)

y-fitted(lm1)

# a better B-spline:

BM <- bs(age,df=6)# compute bases

lm1 <- lm(mM~BM)

y <- numeric(length(age))

for (i in 1:length(age)) {

  y[i] <- as.numeric(sum(lm1$coef[2:7]*BM[i,])+lm1$coef[1])

}

plot(age,mM,pch=20)

lines(age,y,t='l',col=4)

lines(age,fitted(lm1),t='l',col=2)

# Continuing from previous exercise, compute the B-spline as a power series,

# i.e. compute the traditional cubic spline representation

#   S(x) = beta0 + beta1*x + beta2*x^2 + beta3*x^3 + sum_{j=1..J} alpha_j*(x - x_j)_+^3

# where (x - x_j)_+^3 = max((x - x_j)^3, 0), using J=3 interior knots at 40, 60 and 80.

# Compare with the fitted values from the regression onto the design matrix

# obtained from splines::bs() for those knots.

x <- c(0:110)

mM <- ahmd$mx[,4]

BM <- bs(x,knots = c(40,60,80))#也叫design matrix

lm2 <- lm(mM~BM)

xtest <- 41

pmax((xtest-40)^3,0)# quick check of the truncated cubic at a value above the knot

xtest <- 39

pmax((xtest-40)^3,0)# and below the knot (gives 0); keep x as the full age vector for the lines below

k40 <- pmax((x-40)^3,0)

k60 <- pmax((x-60)^3,0)

k80 <- pmax((x-80)^3,0)

lm3 <- lm(mM~x+I(x^2)+I(x^3)+k40+k60+k80+0)

plot(x,mM,pch=20)

lines(x,fitted(lm3),col='red')

lines(x,fitted(lm2),col='blue')

par(mfrow=c(1,2))

matplot(BM)# original basis for B-spline

POW <- cbind(x,x^2,x^3,k40,k60,k80)

matplot(POW)

# w10----------------

# loess handles several x variables; lowess takes only one x

rm(list = ls())

dat = na.omit(airquality)

t = dat$Temp

y = dat$Ozone

plot(t,y,xlim = c(50,110) )

range(t)

newx <- c(95:100)

abline(v=newx,col='blue')

lo <- loess(y~t)

summary(lo)

# check output values are in the same order as input values:

summary(c(lo$x - t)) # ok, they are

summary(c(lo$y - y)) # both are in the same order as the input

pred.lo <- predict(lo,newdata = data.frame(t=newx))# the name t must match the predictor fed to loess

points(newx,pred.lo,col='red')# loess cannot predict at points outside the range of the observed x values

low <- lowess(t,y)

summary(low)

summary(c(low$x - t))      # NO, THEY ARE NOT

summary(c(low$x - sort(t))) # coz lowess() sorted the input values!

sqrt( mean( (y-fitted(nlso))^2 ) )# nlso is the exponential nls fit from the w8 section; re-run that fit first

sqrt( mean( (y-fitted(lo))^2 ) )

sqrt( mean( (y[order(t)]-low$y)^2 ) )

pred.low <- approx(low$x,low$y,xout = newx,rule = 2)

# lowess has no predict() method, so we use linear interpolation instead

# rule=2 re-uses the boundary value outside the data range; rule=1 returns NA there
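# a tiny illustration of the two rules on made-up values (not part of the exercise):

approx(x=c(1,2,3), y=c(10,20,30), xout=4, rule=1)$y# NA outside the data range

approx(x=c(1,2,3), y=c(10,20,30), xout=4, rule=2)$y# 30, the boundary value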

pred.low

pred.low$y

points(newx,pred.low$y,col='red',pch=20,cex=.8)

# this is still not great: beyond the data range the prediction becomes a flat line, because approx()

# does linear interpolation and, with rule=2, simply re-uses the boundary value, so once there are no

# points to the right the curve is necessarily constant

tp <- approx(lo$x,y=lo$fitted,xout = newx,rule = 2)

# linear interpolation applied to the loess fit

tp$y

points(newx,tp$y,pch=20,col='orange')

# this is still not ideal, since everything is based on linear interpolation

# using a spline fixes this

# Using B-splines------------

library('splines')

KN <- quantile(t,c(0.15,0.40, 0.60, 0.70, 0.80, 0.90))

BM <- bs(t,knots = KN)

bl <- lm(y~BM)# don't write it this way any more (see below)

ny = predict(BM, newx)

pred.bspline = predict(bl, newdata=as.data.frame(ny)) # nope, the column names do not match

# the correct way to predict from a B-spline

B.spline = lm(y~.,data=data.frame(BM))# use lm(y~., data=...) rather than lm(y~BM); no other way

# this is what makes predict() work properly

pred.bspline = predict(B.spline, newdata=data.frame(ny))

pred.bspline

points(newx,pred.bspline,col='red')

# P-spline--------

rm(list = ls())

dat = na.omit(airquality)

t = dat$Temp

y = dat$Ozone

plot(t,y,xlim = c(50,110) )

newx <- c(95:100)

abline(v=newx,col='blue')

ps <- smooth.spline(t,y,cv=T)

# how to predict from the P-spline:

pred.pspline = predict(ps, x=newx)# here x= is an argument of predict.smooth.spline

KN <- quantile(t,c(0.15,0.40, 0.60, 0.70, 0.80, 0.90))

BM <- bs(t,knots = KN)

b.s <- lm(y~.,data = data.frame(BM))

ny = predict(BM, newx)

pred.bspline <- predict(b.s,newdata =data.frame(ny) )

points(t, fitted(b.s), pch=20, col='brown', cex=1.5)

points(t, fitted(ps), pch=15, col='navy')

legend("topleft", legend=c("data","NLS fit","B-spline","P-spline"),

      col=c(8,4,2,'navy'), pch=c(20,20,20,15), bty='n')

points(newx,pred.bspline, col='brown', pch=14)# these points follow the general pattern of the data

points(pred.pspline, col=3, pch=14)

plot(t,y,xlim = c(50,110) )

points(t, fitted(b.s), pch=20, col='brown', cex=1.5)

points(t, fitted(ps), pch=15, col='navy')

# here the P-spline does better than the B-spline

# controlling the B-spline

x <- t

KN = quantile(x, c(0.1, 0.4, 0.7))

BM = bs(x, knots=KN)

KN = quantile(x, c(0.05, 0.10, 0.20, 0.23, 0.45, 0.65, 0.95))

BM = bs(x, knots=KN)

KN = quantile(x, c(0.15,0.45, 0.65, 0.95))

BM = bs(x, knots=KN)

KN = quantile(x, c(0.15,0.45, 0.65, 0.95, 0.99)) # this one is very dramatic

BM = bs(x, knots=KN)

KN = quantile(x, c(0.15,0.45, 0.65, 0.70, 0.85, 0.91))  # this one fits reasonably well

BM = bs(x, knots=KN)

KN = quantile(x, c(0.15,0.45, 0.65, 0.70, 0.85, 0.92))

BM = bs(x, knots=KN)

KN = quantile(x, c(0.15,0.45, 0.65, 0.70, 0.85, 0.93))

BM = bs(x, knots=KN)

# with different knots the estimate near the end is very different; the ends of a B-spline are hard to control

B.spline.df = lm(y~., data=as.data.frame(BM))

newBM = predict(BM, newx)

pred.bspline = predict(B.spline.df, newdata=as.data.frame(newBM))

plot(x, y, pch=20, col=8,

    xlab="Temperature (degrees F)", ylab="Ozone (ppb)",

    xlim=c(55,105))

points(x, fitted(B.spline.df), pch=20, col='red')

lines(newx, pred.bspline, col='navy', pch=8, t='b')

# now try to control this with P-splines

# NOTE: control degrees of freedom in P-splines

P.spline3 = smooth.spline(x, y, df=3)# df controls the flexibility (effective degrees of freedom) of the fit

P.spline5 = smooth.spline(x, y, df=5)

P.spline10 = smooth.spline(x, y, df=10)# the smaller df, the flatter the curve

pred3 = predict(P.spline3, x=newx)

pred5 = predict(P.spline5, x=newx)

pred10 = predict(P.spline10, x=newx)

#

plot(x, y, pch=20, col=8,

    xlab="Temperature (degrees F)", ylab="Ozone (ppb)",

    xlim=c(55,105))

points(x, fitted(P.spline3), pch=15, col='navy',cex=.5)

points(x, fitted(P.spline5), pch=15, col='blue',cex=.5)

points(x, fitted(P.spline10), pch=15, col='cyan',cex=.5)

points(pred3, col='navy', pch=20)

points(pred5, col='blue', pch=20)

points(pred10, col='cyan', pch=20)

# one of these curves looks very like the B-spline, so controlling the smoothing parameter may be a good approach:

# NOTE: control smoothness (ie penalty) parameter in P-splines

P.spline10a = smooth.spline(x, y, df=10)

P.spline10b = smooth.spline(x, y, df=10, spar=.9)# spar=smoothing parameter

P.spline10c = smooth.spline(x, y, df=10, spar=.1)

#

plot(x, y, pch=20, col=8,

    xlab="Temperature (degrees F)", ylab="Ozone (ppb)",

    xlim=c(55,105))

points(x, fitted(P.spline10a), pch=15, col='navy',cex=.5)# this one uses the default spar

points(x, fitted(P.spline10b), pch=15, col='blue',cex=.5)# spar=0.9, very flat

points(x, fitted(P.spline10c), pch=15, col='red',cex=.5)# very sensitive, spar=0.1

# plot the latter as a curve (requires re-ordering the points):

is = order(x)# just how to draw the fitted curve (re-order by x)

xs = x[is]

lines(xs, fitted(P.spline10c)[is], pch=15, col='red')

# Q4.1----------

rm(list=ls())

setwd('/Users/apple/Desktop/22-r/ST4060/ST4060txt-csv')

library(splines)

LT <- read.table('irl_lifetable_2005.txt',header = T )

head(LT)

SLT = LT[c(1:106),]

mx = SLT[,8]

x = SLT$Age

onls = nls(mx~A+B*c^x,start=list(A=.0003, B=.00002, c=1.108))

ABc = summary(onls)$coef[,1]# these play the role of the 'true' parameter values, unknowable in practice

set.seed(1)

x = seq(0, 110, by=1)

mx = ABc[1]+ABc[2]*ABc[3]^x  # this is the theoretical model

mxn = mx

s1 = which(x<86)

s2 = which(x>85)

mxn[s1] = pmax(0.005,mx[s1]+rnorm(length(s1),0,.03))

mxn[s2] = mx[s2]+rnorm(length(s2),0,.06)# mxn has noise in it

dat = cbind(x,mx,mxn)

# (a)

plot(x,mx)# the true (noiseless) curve, which we could never compute from data

points(x,mxn,cex=0.8,pch=20)# the noisy data, i.e. what we actually observe and must work with

nls.ns = nls(mxn~A+B*c^x,start=list(A=.0003, B=.00002, c=1.108))

lines(x,fitted(nls.ns),col='red')

ABc.ns = summary(nls.ns)$coef[,1]# extract the coefficients of the nls fit

# (b)

ps <- smooth.spline(x,mxn)

?smooth.spline

summary(ps)

lines(x,fitted(ps),col='blue')

sy = ps$y # the question says "Repeat the previous step, but on the smoothed curve obtained

# from a generic P-spline", hence this step!!!!

nls.ps = nls(sy~A+B*c^x,start=list(A=.0003, B=.00002, c=1.108))

ABc.ps = summary(nls.ps)$coef[,1]

# MSE:

(ABc-ABc.ns)^2

(ABc-ABc.ps)^2

(ABc-ABc.ns)^2-(ABc-ABc.ps)^2 # the P-spline version is still a bit smaller

# percentage errors:

((ABc-ABc.ns)/ABc)

((ABc-ABc.ps)/ABc)

((ABc-ABc.ns)/ABc)-((ABc-ABc.ps)/ABc) # the P-spline version is still a bit smaller

# the above is a way of doing a simulation based on real-life data

# the take-away: when fitting a complicated model, it can help to smooth the data first (here with a P-spline)

# in order to reduce the level of noise, which a nonlinear fitting procedure would be very

# sensitive to

# Q4.2 ----------------------

rm(list = ls())

dat <- read.csv('insdata.csv')

age = dat$Age

mF = dat$mF

# (a)

sp <- smooth.spline(age,mF,cv=T)

plot(age,mF,pch=20,cex=0.8)

lines(age,fitted(sp),col='red')

# (b)

par = sp$spar

sp2 <- smooth.spline(age,mF,cv=F,spar=par/2)

lines(age,fitted(sp2),col='blue')

# spar=smoothing parameter

# df controls the flexibility (effective degrees of freedom) of the fit

# (c)

sp$x-sp2$x # check that sp$x and sp2$x are identical

# (d)

sp$x-age# also check this against the original ages

mean((mF-sp$y)^2)

mean((mF-sp2$y)^2)

sqrt(mean((mF-sp$y)^2))

sqrt(mean((mF-sp2$y)^2))

# judging by the numbers the second fit looks better, since its RMSE is smaller, but it corresponds

# to the wiggly blue curve in the plot above, which is not what we want.

# so be clear about the goal: here we want a smooth curve, not the closest possible fit,

# so the fit with the larger RMSE is actually preferable

sd(mF-sp$y) #var

sd(mF-sp2$y)#var

mean(mF-sp$y) #bias

mean(mF-sp2$y)#bias

# the bias is very small, so most of the MSE comes from the variance; hence the smoother

# spline has the larger MSE

# (e)

KN <- quantile(age,c(0.25,0.5,0.75))

BM <- bs(age,knots = KN)

matplot(age,BM,t='b')# don't forget age on the x-axis!!!!!!

# (f)

round(BM[which(age==60),],4)# these are the coordinates (basis values) at age 60

abline(v=60, lty=3)

abline(h=BM[which(age==60),], lty=2)# how to indicate them on the plot

# (g)

bs <- lm(mF~.,data=data.frame(BM))

bs$coefficients

# (h)

mean((bs$residuals)^2)

mean((mF-sp$y)^2)

mean((mF-sp2$y)^2)

plot(age,mF)

lines(sp,col='red')

lines(sp2,col='blue')

lines(age,fitted(bs),col='orange')

# b-spline is very sensitive to the tail end, that's because the way of

# arranging the knots

# This B-spline seems comparable to the 1st P-spline, and

# not as effective as the second one.

# (i)

# first, see that we're missing some ages:

plot(age)

# interpolation grid:

all.ages = seq(min(age),max(age),by=1)

plot(all.ages)

lip <- approx(sp$x,sp$y,xout = all.ages)# writing all the arguments explicitly does no harm

sd(lip$y)

lo <- lowess(age,mF)

lilo <- approx(lo$x,lo$y,xout = all.ages)

lpred <- predict(sp, x = all.ages)$y# assumed definition of lpred (not given above): the P-spline prediction on the interpolation grid

lpred-lilo$y

sd(lilo$y)

# question----------

plot(age,mF)

lines(all.ages,lip$y,col='red')

lines(all.ages,lpred,col='blue')

# Q4.3 ----------------------

rm(list = ls())

library('car')

library(splines)

dat <- Prestige

x = Prestige$income

y = Prestige$prestige

# B-spline:

BM <- bs(x,df=6)

bs <- lm(y~.,data = data.frame(BM))

bmse <- mean((bs$residuals)^2)

# P-spline:

ps <- smooth.spline(x,y)

ps$x-x

# For the P-spline, we need to interpolate!

range(x)

range(ps$x)

# the ranges are equal, so rule=1 and rule=2 give the same result; if xout went beyond the range of x, use rule=2

yp <- approx(ps$x,ps$y,xout=x)$y

yp2 <- approx(ps$x,ps$y,xout=x,rule = 2)$y

pmse <- mean((y-yp)^2)

# w11----------------

rm(list = ls())

x = iris[,c(2:4)]

y = iris[,5]

K <- 2

ko <- kmeans(x,K)

?kmeans

is <- c(1,3)

plot(x[,is],col=c(1,2,4)[ko$cluster],pch=20,cex=2)

z <- apply(x,2,scale)

(x[1,1]-mean(x$Sepal.Width))/sd(x$Sepal.Width)-z[1,1]

koz <- kmeans(z,K)

plot(x[,is],col=c(1,2,4)[koz$cluster],pch=20,cex=2)

pairs(iris[,1:4],pch=20)

cor(iris[,1:4])

dat <- EuStockMarkets

plot(EuStockMarkets, lwd=1.5)

pca <- prcomp(EuStockMarkets)

?prcomp #Principal Components Analysis

par(mfcol=c(4,1), mar=c(1,3,0,1))

# decompose the four-dimensional set of time series into principal-component time series

for(i in 1:4){

  plot(pca$x[,i], t='l', lwd=2)

}

summary(pca)

??EuStockMarkets

names(pca)

pca$x

plot(pca)

plot(prcomp(iris[,1:4]))

plot(prcomp(iris[,1:4], scale.=TRUE))

# Logistic regression based classification--------------

rm(list = ls())

library(ISLR)

dat <- Smarket

head(Smarket)

?Smarket

glm1 <- glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,

            data=dat, family=binomial)

summary(glm1)

n <- nrow(dat)

itrain <- which(dat$Year<2005)

glm1.cv <- glm(Direction~Lag1+Lag2,

              data=dat,subset = itrain, family=binomial)

summary(glm1.cv)

po <- predict(glm1.cv,type = 'response')

summary(po)

glm.pred= rep("Down", length(po))

glm.pred[po>.5] = "Up"

tb <- table(glm.pred,dat$Direction[itrain])

tb

sum(tb)

length(itrain)

1-sum(diag(tb))/sum(tb)# overall error rate

Direction.2005 <- dat$Direction[-itrain]

po.2005 <- predict(glm1.cv,type ='response',newdata = dat[-itrain,])

length(po.2005)

length(po.2005)+length(po)

nrow(dat)

glm.pred.2005= rep("Down", length(po.2005))

glm.pred.2005[po.2005>.5] = "Up"

tb.test = table(glm.pred.2005,dat$Direction[-itrain])

1-sum(diag(tb.test))/sum(tb.test)

# Logistic regression based classification

# another example using iris--------------------------------------

# here the problem is reduced to a binary (two-class) classification

rm(list = ls())

iris <- iris

n <- nrow(iris)

n

x <- iris[sample(1:n,n),]

is.var <- (x$Species=='virginica') # a logical test, returns TRUE/FALSE

# is.var <- which(x$Species=='virginica') # would return positions, but we want TRUE/FALSE

x$Species <- is.var

glmo <- glm(Species~., x, family=binomial)

glmo$fitted.values - predict(glmo, type='response')

# anyway, just check that the response and the predictor data are not literally the same vector:

a <- x$Sepal.Length

b <- x[,-1]

lm1 <- lm(a~.,data = b)

lm2 <- lm(a~.,data = x)

lm3 <- lm(Sepal.Length~.,data = x)

n = nrow(x)

K = 10

train.acc = test.acc = numeric(K)

folds = cut(1:n, K, labels=FALSE)

k = 1

# rm(k)

thr = .8

for(k in 1:K){

  itrain = which(folds!=k)

  glmo = glm(Species~., data=x,

            family=binomial,

            subset=itrain)

  tb1 = table(glmo$fitted.values>thr, x$Species[itrain])

  train.acc[k] = sum(diag(tb1))/sum(tb1)

  # prediction

  p.test = predict(glmo, x[-itrain,], type='response')

  tb2 = table(p.test>thr, x$Species[-itrain])

  test.acc[k] = sum(diag(tb2))/sum(tb2)

}

mean(train.acc) 

mean(test.acc)

tb1

tb2

# Illustration of hierarchical clustering ------------------------------------------------

# i.e. building a hierarchy of nested clusters

data(eurodist)

hc = hclust(eurodist, method="ward")

plot(hc)

hc = hclust(eurodist, method="single")

plot(hc)

hc = hclust(eurodist, method="complete")

plot(hc)

# another clustering example

# partition-based clustering------------------------------------------------

rm(list = ls())

x = iris[,1:4]

plot(x[,1:2], pch=20)

plot(x[,1:2], pch=20, col=c(1,2,4)[iris$Species])

plot(x[,c(3,4)], pch=20,col=c(1,2,4)[iris$Species])

# hierarchical clustering

# this scaling step (xs) is not actually used in what follows

xs = apply(x, 2, scale)

ho = hclust(dist(x), method="ward.D")

hc = cutree(ho, k=3)

plot(x[,1:2], pch=20, col=hc)
