# lm.R-20170905

#colnames(condata)=c('y','x1','x2','x17',paste("x",3:16,sep=""))
# Coerce the data to a matrix and label its 47 columns: the response y, the
# predictors x1, x2 and x17, then a "<factor>=<level>" column for every level
# of x3 ... x16 (presumably one 0/1 indicator per level -- confirm upstream).
condata <- as.matrix(condata)
colnames(condata) <- c(
  "y", "x1", "x2", "x17",
  paste0("x3=", 0:1), paste0("x4=", 0:6),
  paste0("x5=", 0:1), paste0("x6=", 0:1), paste0("x7=", 0:8),
  paste0("x8=", 0:1), paste0("x9=", 0:1), paste0("x10=", 0:1),
  paste0("x11=", 0:1), paste0("x12=", 0:1), paste0("x13=", 0:2),
  paste0("x14=", 0:3), paste0("x15=", 0:1), paste0("x16=", 0:1)
)
# cdata=as.matrix(condata)
# cdata=cdata[,c("y","x1","x2","x17","x3=0",paste("x4",0:5,sep="="),"x5=0","x6=0",paste("x7",0:7,sep="="),paste("x",8:12,"=",0,sep=""),
#               paste("x13","=",0:1,sep=""),paste("x14","=",0:2,sep=""),paste("x",15:16,"=",0,sep=""))]
# write.table(cdata, file = "/Users/vicky/Documents/code/R/cdata.csv")
#lm=lm(y~.,data=data.frame(cdata))
# lm=lm(y~.,data=data.frame(condata))
# summary(lm)

#cor.test(condata[,'x1'],condata[,'y'])
# library(lmtest)
# gqtest(lm) # test for heteroscedasticity

# Variable selection: permutation test (the author's label; the resampling
# that follows draws rows with replacement).
# Regress y on every other column of cdata2. data.frame() rewrites
# non-syntactic column names (e.g. "x3=0" becomes "x3.0"), so the coefficient
# names follow the rewritten form.
l <- lm(y ~ ., data = data.frame(cdata2))
summary(l)
# Observed t statistics as a one-column matrix, one row per coefficient.
# Note: the variable `t` shares its name with base::t(); calls to t() still
# resolve to the function.
t <- as.matrix(coef(summary(l))[, "t value"])

#nlr=loess(y~x1+x2+x17+x3,data=data.frame(oadata))
# datap=cbind(condata[,1],pc)
# colnames(datap)=c('y',paste("x",1:9,sep=""))
# l=lm(y~.,data=data.frame(datap))
# summary(l)
# t=as.matrix(summary(l)$coefficients[,'t value'])

# Bootstrap the coefficient t statistics and turn them into an empirical
# "p-value" per coefficient.
#
# Reads from the workspace:
#   cdata2 - data with a response column y (refitted with lm(y ~ .))
#   t      - one-column matrix of observed t statistics from the full fit,
#            one row per coefficient, row names = coefficient names
# Leaves in the workspace:
#   pt      - R x p matrix of bootstrap t statistics (NA where a resample
#             could not estimate a coefficient)
#   ptsort  - pt with every column sorted increasingly (NAs last)
#   ppvalue - 1 x p matrix: share of replicates whose |t| is below the
#             observed |t|
#
# NOTE(review): the bootstrap t statistics are compared directly with the
# observed ones; they are not re-centred under a null hypothesis, so this
# share is not a conventional p-value -- confirm the intended test.
set.seed(1000)
R <- 5000 # number of bootstrap replicates (sampling with replacement)
boot_size <- 1000 # rows drawn per replicate
coef_names <- rownames(t)
pt <- matrix(NA_real_, R, nrow(t))
for (i in seq_len(R)) {
  # Draw the bootstrap sample.
  k <- sample(seq_len(nrow(cdata2)), size = boot_size, replace = TRUE)
  pdata <- cdata2[k, ]
  pa <- lm(y ~ ., data = data.frame(pdata))
  boot_t <- summary(pa)$coefficients[, "t value"] # t values of the resample
  if (length(boot_t) == ncol(pt)) {
    pt[i, ] <- boot_t
  } else {
    # summary() omits coefficients that could not be estimated (e.g. an
    # indicator column that is constant in this resample). Align by name so
    # the remaining values stay in their own columns; the missing ones are NA.
    pt[i, ] <- boot_t[coef_names]
  }
}

ptsort <- matrix(NA_real_, nrow(pt), ncol(pt))
for (i in seq_len(ncol(pt))) {
  # na.last = TRUE keeps the column length when a replicate produced NA.
  ptsort[, i] <- sort(pt[, i], na.last = TRUE)
}

ppvalue <- matrix(0, 1, ncol(pt))
for (i in seq_len(ncol(pt))) {
  below <- abs(t[i]) > abs(ptsort[, i])
  # Share among the replicates that actually estimated this coefficient.
  ppvalue[i] <- sum(below, na.rm = TRUE) / sum(!is.na(below))
}

# Label the columns from the fitted model instead of a fixed name list, so the
# labels always match the number of coefficients. Coefficient names carry the
# data.frame()-rewritten form ("x3.0"); map them back to the original column
# names of cdata2 ("x3=0") where a match exists.
if (!is.null(coef_names)) {
  src_names <- colnames(cdata2)
  lookup <- stats::setNames(src_names, make.names(src_names, unique = TRUE))
  labels <- coef_names
  labels[labels == "(Intercept)"] <- "intercept"
  known <- labels %in% names(lookup)
  labels[known] <- lookup[labels[known]]
  colnames(ppvalue) <- labels
}
ppvalue
# Work on the named row so the printed results keep their labels.
sort(ppvalue[1, ], decreasing = TRUE)
which(ppvalue[1, ] > 0.05)

# Scatter plots
# par(mfrow=c(4,3))
# for (k in c("x3=0","x11=0","x9=0","x12=0","x4=1","x4=4","x17","x4=3","x4=5","x6=0")){
# plot(condata[,k],condata[,'y'],xlab=k,type='p')
# }
# 
# for (k in c("y","x1","x2","x17","x3=0",paste("x4",0:5,sep="="),"x5=0","x6=0",paste("x7",0:7,sep="="),
#             paste("x",8:12,"=",0,sep=""),paste("x13","=",0:1,sep=""),paste("x14","=",0:2,sep=""),paste("x",15:16,"=",0,sep=""))){
#   plot(condata[,k],condata[,'y'],xlab=k,type='p')
#   quartz()
# }

# The p-values of x3=0, x11=0, x9=0, x12=0, x4=1, x4=4, x4=3, x17, x4=3, x4=5
# and x6=0 are greater than 0.05.
# Drop the factors x3, x4, x6, x9, x11 and x17.
#cdata2=cdata2[,!(colnames(cdata2) %in% c("x3=0","x11=0","x9=0","x12=0","x4=1","x4=4","x17","x4=3","x4=5","x6=0"))]
# Reduced design: the response, x1, x2 and the retained indicator columns
# (the last level of each retained factor is left out).
cdata2 <- condata[, c(
  "y", "x1", "x2", "x5=0",
  paste0("x7=", 0:7),
  "x8=0", "x10=0", "x12=0",
  paste0("x13=", 0:1),
  paste0("x14=", 0:2),
  paste0("x", 15:16, "=0")
)]
# Repeated hold-out validation of the reduced linear model (cdata2) against
# an RBF-kernel SVM on all predictors (condata).
#
# Each repetition draws one 70/30 split and scores every model twice: fitted
# on the 70% part and scored on the 30% part, then the reverse. The score is
# the mean of |prediction - y| / y over the scored rows (this assumes y is
# strictly positive -- TODO confirm).
#
# createDataPartition() and ksvm() are not base R; they are expected to be
# attached already (presumably from caret and kernlab).
c <- 1000 # number of validation repetitions (shares its name with base::c())
set.seed(5000)
r1 <- matrix(0, c, 1) # lm: fitted on train, scored on test
r2 <- matrix(0, c, 1) # lm: fitted on test, scored on train
#r3=matrix(0,c,1)
r4 <- matrix(0, c, 1) # svm: fitted on train, scored on test
r5 <- matrix(0, c, 1) # svm: fitted on test, scored on train
#p=as.matrix(summary(lm)$coefficients[,'Pr(>|t|)'])
#p1=matrix(0,c,nrow(p))
#p2=matrix(0,c,nrow(p))
for (k in seq_len(c)) {
  # Same row split for both data sets; the argument is spelled out as `times`
  # rather than relying on partial matching of `time`.
  index <- createDataPartition(cdata2[, 1], times = 1, p = 0.7, list = FALSE)
  train <- cdata2[index, ]
  test <- cdata2[-index, ]
  train2 <- condata[index, ]
  test2 <- condata[-index, ]

  # Linear model fitted on the training part.
  lmtrain <- lm(y ~ ., data = data.frame(train))
  ytest <- predict(lmtrain, data.frame(test))
  resi1 <- abs(ytest - test[, 1]) / test[, 1]
  r1[k] <- mean(resi1) # error r1[k]
  # p1[k,]=as.matrix(summary(lmtrain)$coefficients[,'Pr(>|t|)']) #p-value

  # Cross-validation: swap the roles of the two parts.
  lmtest <- lm(y ~ ., data = data.frame(test))
  ytrain <- predict(lmtest, data.frame(train))
  resi2 <- abs(ytrain - train[, 1]) / train[, 1]
  r2[k] <- mean(resi2) # error r2[k]
  # p2[k,]=as.matrix(summary(lmtest)$coefficients[,'Pr(>|t|)']) #p-value

  # m=svm(train2[,2:47],train2[,1])
  # m.ytest=predict(m,test2[,2:47])
  # resi3=abs(m.ytest-test2[,1])/test2[,1]
  # r3[k]=mean(resi3) # error
  #
  # SVM with RBF kernel. Column 1 is the response; every other column is a
  # predictor (columns 2:47 of the 47-column condata).
  s <- ksvm(train2[, -1], train2[, 1], kernel = "rbfdot")
  s.ytest <- predict(s, test2[, -1], type = "response")
  r4[k] <- mean(abs(s.ytest - test2[, 1]) / test2[, 1])

  s_c <- ksvm(test2[, -1], test2[, 1], kernel = "rbfdot")
  s.ytrain <- predict(s_c, train2[, -1], type = "response")
  r5[k] <- mean(abs(s.ytrain - train2[, 1]) / train2[, 1])
}

# Average relative errors: each direction separately, then both pooled.
mean(r1)
mean(r2)
mean(cbind(r1, r2))
mean(r4)
mean(r5)
mean(cbind(r4, r5))

# Final models, fitted on the complete data.
# Linear model on the reduced design. The fitted object is stored under the
# name `lm`, as before; calls to lm() still resolve to the function. The
# expression lm$coefficients is kept as is because data.frame() derives the
# column name of `beta` from it.
lm <- lm(y ~ ., data = data.frame(cdata2))
summary(lm)
beta <- data.frame(lm$coefficients)

# SVM with RBF kernel on all predictors. Column 1 of condata is the response.
# The earlier range 2:18 dated from an 18-column layout and, in the current
# 47-column matrix, stopped in the middle of the x7 indicator block; using
# every non-response column matches the SVM evaluated in the validation loop.
sf <- ksvm(condata[, -1], condata[, 1], kernel = "rbfdot")
sf

 

# You may also be interested in: (R)