# selectvar.R-20170907

# datap=cbind(condata[,1],pc)
# colnames(datap)=c('y',paste("x",1:9,sep=""))
# l=lm(y~.,data=data.frame(datap))
# summary(l)

# Build the analysis matrix: column 1 is the response y, columns 2:40 are
# the 39 predictors. Dummy-coded factor levels are named "xK=level";
# x1/x2/x17 are continuous, "x7^2"/"x7" are quadratic and linear terms.
# NOTE(review): assumes `condata3` exists in the workspace — confirm.
# (Collapses the original two-step copy + coercion into one assignment;
# paste0() replaces the paste(..., sep = "") anti-pattern, same strings.)
condata2 <- as.matrix(condata3)
colnames(condata2) <- c(
  "y", "x1", "x2", "x7^2", "x7", "x17",
  paste0("x3=", 0:1), paste0("x4=", 0:6),
  paste0("x5=", 0:1), paste0("x6=", 0:1),  # x7 dummies intentionally omitted
  paste0("x8=", 0:1), paste0("x9=", 0:1), paste0("x10=", 0:1),
  paste0("x11=", 0:1), paste0("x12=", 0:1), paste0("x13=", 0:2),
  paste0("x14=", 0:3), paste0("x15=", 0:1), paste0("x16=", 0:1)
)

# Constrained least-squares regression (lsei package):
# minimize ||A x - b|| subject to the stated constraints.
library(lsei)
# BUG FIX: lsei()'s inequality constraint is e %*% x >= f, so the
# non-negativity constraint x >= 0 needs e = +diag(1, 39). The original
# e = -diag(1, 39) imposed x <= 0, contradicting both the lower = 0 call
# below and the nnls() fit that follows.
lsei(condata2[, 2:40], condata[, 1], c = NULL, d = NULL,
     e = diag(1, 39), f = rep(0, 39), lower = -Inf, upper = Inf)
# The same non-negative fit expressed via box constraints.
lsei(condata2[, 2:40], condata[, 1], c = NULL, d = NULL,
     e = NULL, f = NULL, lower = rep(0, 39), upper = Inf)

# Non-negative least squares fit; keep the coefficient vector as a
# labelled one-column matrix so retained predictors can be read off.
bl <- data.matrix(nnls(condata2[, 2:40], condata[, 1])$x)
rownames(bl) <- c(
  "x1", "x2", "x7^2", "x7", "x17",
  paste0("x3=", 0:1), paste0("x4=", 0:6),
  paste0("x5=", 0:1), paste0("x6=", 0:1),  # x7 dummies omitted
  paste0("x8=", 0:1), paste0("x9=", 0:1), paste0("x10=", 0:1),
  paste0("x11=", 0:1), paste0("x12=", 0:1), paste0("x13=", 0:2),
  paste0("x14=", 0:3), paste0("x15=", 0:1), paste0("x16=", 0:1)
)
# Names of the predictors NNLS kept (non-zero coefficients).
ind <- rownames(bl)[which(bl != 0)]

# Label the user (hold-out) matrix with the same 39 predictor names,
# then score it with the NNLS coefficient vector.
user <- as.matrix(user)
colnames(user) <- c(
  "x1", "x2", "x7^2", "x7", "x17",
  paste0("x3=", 0:1), paste0("x4=", 0:6),
  paste0("x5=", 0:1), paste0("x6=", 0:1),  # x7 dummies omitted
  paste0("x8=", 0:1), paste0("x9=", 0:1), paste0("x10=", 0:1),
  paste0("x11=", 0:1), paste0("x12=", 0:1), paste0("x13=", 0:2),
  paste0("x14=", 0:3), paste0("x15=", 0:1), paste0("x16=", 0:1)
)
ypre <- user %*% bl


#AIC
# Backward stepwise selection under AIC on the full linear model.
# Note: data.frame() munges names like "x4=0" into "x4.0" via make.names().
l=lm(y~.,data=data.frame(condata2))
summary(l)
l.step=step(l,direction = "backward") 
summary(l.step)
# NOTE(review): drop() is an array operation and returns a non-array
# argument unchanged, so l.new is just l.step. Possibly drop1() was
# intended — confirm; later code relies on l.new being an lm object.
l.new=drop(l.step) 
summary(l.new)
beta=data.frame(l.new$coefficients)

# Refit the AIC-selected predictors under a non-negativity constraint.
# NOTE(review): assumes `condata` carries dummy columns named 'x4=0' etc.
# (original, unmunged names) — confirm against how condata was built.
x=cbind(condata[,'x1'],condata[,'x2'],condata[,'x4=0'],condata[,'x4=1'],condata[,'x4=2'],
        condata[,'x4=4'],condata[,'x10=0'],condata[,'x13=0'],condata[,'x13=1'],
        condata[,'x14=0'],condata[,'x15=0'],condata[,'x16=0'])
# lower = 0 is recycled over all 12 coefficients (x >= 0), matching nnls().
lsei(x,condata[,1], c=NULL, d=NULL, e=NULL, 
              f=NULL, lower=0, upper=Inf)
nnls(x,condata[,1])

#BIC
# Backward stepwise with penalty k = log(n), i.e. BIC instead of AIC.
l.step2=step(l,direction = "backward", k = log(nrow(condata2)))
# NOTE(review): drop() is a no-op on an lm object; l.new2 equals l.step2
# (see the matching note in the AIC section) — possibly drop1() intended.
l.new2=drop(l.step2)
summary(l.new2)

# l.new2=lm(y~ x1 + x2 + x17.2 + x17 + x3.0 + x4.1 + x4.2 + x4.3 + x4.5 + 
#             x10.1 + x13.1 + x13.2 + x14.1 + x15.1 + x16.1,data=data.frame(condata2))
# summary(l.new2)
# beta2=data.frame(l.new2$coefficients)
# 
# l.new3=lm(y~ x1 + x2 + x3.0 + x4.1 + x4.2 + x4.3 + x4.5 + 
#             x10.1 + x13.1 + x13.2 + x14.1 + x15.1 + x16.1,data=data.frame(condata2))
# summary(l.new3)
# beta3=data.frame(l.new3$coefficients)

#岭回归
# library(MASS)
# l.r=lm.ridge(y~.,data=data.frame(condata2),lambda = 0)
# summary(l.r)
# print(l.r)
# plot(lm.ridge(y~.,data=data.frame(condata2),lambda =seq(0,0.1,0.001)))
# select(lm.ridge(y ~ .,data=data.frame(condata2),lambda = seq(0,0.1,0.0001)))

# Ridge regression: cross-validated glmnet with alpha = 0.
# BUG FIX: library(glmnet) was first loaded only in the lasso section
# below, AFTER this first use of cv.glmnet() — load it here.
library(glmnet)
l.ridge <- cv.glmnet(x = condata2[, 2:40], y = condata2[, 1], alpha = 0,
                     family = "gaussian", type.measure = "mse")
summary(l.ridge)
print(l.ridge)
# Coefficients at the lambda minimizing cross-validated MSE.
beta.ridge <- data.matrix(coef(l.ridge, s = l.ridge$lambda.min))
#beta.ridge
#beta.ridge[which(beta.ridge != 0)] 
rownames(beta.ridge)[which(beta.ridge != 0)]

# Lasso: cross-validated glmnet with alpha = 1.
library(glmnet)
l.lasso <- cv.glmnet(x = condata2[, 2:40], y = condata2[, 1], alpha = 1,
                     family = "gaussian", type.measure = "mse")
summary(l.lasso)
print(l.lasso)
# Coefficients at the lambda minimizing cross-validated MSE.
beta.lasso <- data.matrix(coef(l.lasso, s = l.lasso$lambda.min))
rownames(beta.lasso)[which(beta.lasso != 0)]  # predictors kept
rownames(beta.lasso)[which(beta.lasso == 0)]  # predictors dropped

# #偏最小二乘回归
# library(pls)
# l.p=pls(y~., ncomp=1,data=data.frame(condata2),validation = "CV")

# #Elastic Net, alpha=k/p
# library(glmnet)
# p=100 #alpha精度为1/p
# mse=matrix(0,p,1)
# for (i in 0:p) {
#   l.en=cv.glmnet(x=condata2[,2:40],y=condata2[,1], alpha=i/p,family = "gaussian",
#                  type.measure="mse")
#   yhat=predict(l.en, s=l.en$lambda.1se, newx=condata2[,2:40],type="response")
#   mse[i]=mean((condata2[,1]-yhat)^2)
# }
# a=matrix(0,1,2)
# a[1]=which.min(mse) #取mse最小的alpha对应的第几次
# a[2]=min(mse)
# 
# l.en=cv.glmnet(x=condata2[,2:40],y=condata2[,1],alpha=a[1]/p,family = "gaussian",
#                type.measure="mse")
# beta.en=data.matrix(coef(l.en,s=l.en$lambda.min)) #确定的系数
# rownames(beta.en)[which(beta.en != 0)]

library(caret) 
library(MASS)
library(glmnet)

# Set-up for the cross-validated model comparison below.
# NOTE(review): `c` shadows base::c() — calls like c('y', ...) still
# resolve to the function, but the name is a footgun; consider renaming
# (kept here because the CV loop below references it).
c=10 # number of folds (10-fold cross-validation)
p=100 # alpha grid resolution: step 1/p
set.seed(1234)
# Per-fold mean absolute percentage error for each method.
r.en=matrix(0,c,1)
r.lasso=matrix(0,c,1)
r.aic=matrix(0,c,1)
r.bic=matrix(0,c,1)
# a[k, ] = (index of best alpha, its held-out MSE) for fold k.
a=matrix(0,c,2)
#lam=matrix(0,c,1)

# 10-fold CV comparison of elastic net, lasso, and the AIC/BIC-selected
# linear models. r.* collect the mean absolute percentage error per fold.
#
# BUG FIX 1: folds are created ONCE before the loop. The original called
# createFolds() inside the loop, re-randomizing the partition on every
# iteration, so the k test folds were not disjoint.
index <- createFolds(condata2[, 1], k = c, list = TRUE, returnTrain = FALSE)
# Shared column labels (identical to those already on condata2).
col_labels <- c("y", "x1", "x2", "x7^2", "x7", "x17",
                paste0("x3=", 0:1), paste0("x4=", 0:6),
                paste0("x5=", 0:1), paste0("x6=", 0:1),
                paste0("x8=", 0:1), paste0("x9=", 0:1), paste0("x10=", 0:1),
                paste0("x11=", 0:1), paste0("x12=", 0:1), paste0("x13=", 0:2),
                paste0("x14=", 0:3), paste0("x15=", 0:1), paste0("x16=", 0:1))
for (k in 1:c) {
  train <- condata2[-index[[k]], ]
  test  <- condata2[index[[k]], ]
  colnames(train) <- col_labels
  colnames(test)  <- col_labels

  # Elastic net: grid-search alpha in {0, 1/p, ..., 1} by held-out MSE.
  # BUG FIX 2: the original wrote mse[i] for i in 0:p; mse[0] is a silent
  # zero-length assignment, so the alpha = 0 fit was computed and then
  # discarded. Store at i + 1 and translate back with which.min() - 1 so
  # the downstream contract a[k, 1] / p == best alpha still holds.
  mse <- matrix(0, p + 1, 1)
  for (i in 0:p) {
    l.en <- cv.glmnet(x = train[, 2:40], y = train[, 1], alpha = i / p,
                      family = "gaussian", type.measure = "mse")
    yhat <- predict(l.en, s = l.en$lambda.1se, newx = test[, 2:40],
                    type = "response")
    mse[i + 1] <- mean((test[, 1] - yhat)^2)
  }
  a[k, 1] <- which.min(mse) - 1  # i such that alpha = i / p is best
  a[k, 2] <- min(mse)
  l.en1 <- cv.glmnet(x = train[, 2:40], y = train[, 1], alpha = a[k, 1] / p,
                     family = "gaussian", type.measure = "mse")
  #lam[k]=l.en1$lambda.min
  ytest.en <- predict(l.en1, s = l.en1$lambda.min, newx = test[, 2:40],
                      type = "response")
  r.en[k] <- mean(abs(ytest.en - test[, 1]) / test[, 1])

  # Lasso (alpha = 1), lambda chosen by inner CV.
  l.lasso1 <- cv.glmnet(x = train[, 2:40], y = train[, 1], alpha = 1,
                        family = "gaussian", type.measure = "mse")
  ytest.lasso <- predict(l.lasso1, s = l.lasso1$lambda.min,
                         newx = test[, 2:40], type = "response")
  r.lasso[k] <- mean(abs(ytest.lasso - test[, 1]) / test[, 1])

  # Linear model with the AIC-selected variables (data.frame() munges
  # "x4=0" to "x4.0", hence the dotted names in the formula).
  l.aic <- lm(y ~ x1 + x2 + x17 + x4.0 + x4.1 + x4.2 + x4.4 +
                x10.0 + x13.0 + x13.1 + x14.0 + x15.0 + x16.0,
              data = data.frame(train))
  ytest.aic <- predict(l.aic, newdata = data.frame(test[, 2:40]))
  r.aic[k] <- mean(abs(ytest.aic - test[, 1]) / test[, 1])

  # Linear model with the BIC-selected variables.
  l.bic <- lm(y ~ x1 + x17 + x4.0 + x4.4 + x10.0 + x14.0 + x15.0 + x16.0,
              data = data.frame(train))
  ytest.bic <- predict(l.bic, newdata = data.frame(test[, 2:40]))
  r.bic[k] <- mean(abs(ytest.bic - test[, 1]) / test[, 1])
}
# Mean MAPE of each method across the 10 folds.
mean(r.en)
mean(r.lasso)
mean(r.aic)
mean(r.bic)

# Final elastic-net model: take the alpha whose fold achieved the
# smallest held-out MSE, then refit by CV on the full data set.
alp <- a[which.min(a[, 2]), 1]
l.en <- cv.glmnet(x = condata2[, 2:40], y = condata2[, 1], alpha = alp / p,
                  family = "gaussian", type.measure = "mse")
print(l.en)
#min(lam)
# Coefficients at the CV-optimal lambda.
beta.en <- data.matrix(coef(l.en, s = l.en$lambda.min))
rownames(beta.en)[which(beta.en != 0)]  # predictors kept
rownames(beta.en)[which(beta.en == 0)]  # predictors dropped

# Score the user data with the AIC-selected model (l.new) and inspect
# the in-sample fit.
user <- as.matrix(user)
colnames(user) <- c(
  "x1", "x2", "x7^2", "x7", "x17",
  paste0("x3=", 0:1), paste0("x4=", 0:6),
  paste0("x5=", 0:1), paste0("x6=", 0:1),  # x7 dummies omitted
  paste0("x8=", 0:1), paste0("x9=", 0:1), paste0("x10=", 0:1),
  paste0("x11=", 0:1), paste0("x12=", 0:1), paste0("x13=", 0:2),
  paste0("x14=", 0:3), paste0("x15=", 0:1), paste0("x16=", 0:1)
)
yuser <- data.frame(predict(l.new, newdata = data.frame(user)))
y.pre <- data.frame(l.new$fitted.values)
plot(condata2[, "x4=0"], condata2[, "y"])

# Hand-tuned variant of the AIC model, refit on the full data and used
# to score the user set.
# NOTE(review): this formula uses x11.0 where the CV loop's AIC model
# used x10.0 — confirm which dummy variable was intended.
ll=lm(y ~ x1 + x2 + x17 + x4.0 + x4.1 + x4.2 + x4.4 + 
     x11.0 + x13.0 + x13.1 + x14.0 + x15.0 + x16.0, data = data.frame(condata2))
summary(ll)
yuser.ll=data.frame(predict(ll,newdata=data.frame(user)))

 

# (scraped blog footer removed from code path) You may also be interested in: (R)