# assignment1.R - 2017-09-14

# Data preprocessing
data=data.frame(XY_train)
colnames(data)=c('y',paste("x",1:34,sep=""))
# Handle the categorical variables
data[,'y']=factor(data[,'y'])
data[,'x33']=factor(data[,'x33'])
data[,'x34']=factor(data[,'x34'])
# Recode the factor levels as numbers
levels(data[,'y'])=c(1,2,3) # L=1, M=2, s=3
levels(data[,'x33'])=c(1,2) # H=1, Z=2
levels(data[,'x34'])=c(1,2) # D=1, U=2
# Remove rows containing NA or infinite values.
# Note: data[-which(...),] deletes every row when which() matches nothing
# (indexing by -integer(0) keeps zero rows), so filter with is.finite() instead.
data=na.omit(data)
data=data[is.finite(data[,'x31']) & is.finite(data[,'x32']), ]
data=data.matrix(data)
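# Sanity check (added sketch): after the cleaning above, no NA or infinite
# values should remain anywhere in the numeric matrix.
stopifnot(all(is.finite(data)))
table(data[,'y']) # class distribution of the recoded response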

#1. Logistic regression with the Elastic Net, alpha = i/p for i = 0..p
library(MASS)
library(glmnet)
set.seed(1234)
p=10 # alpha grid resolution is 1/p
res=matrix(0,p+1,1) # one slot per alpha value, including alpha=0
for (i in 0:p) {
  l.en=cv.glmnet(x=data[,2:35],y=data[,1],
                 alpha=i/p,family="multinomial",type.measure="class")
  yhat=predict(l.en,s=l.en$lambda.1se,newx=data[,2:35],type="class")
  res[i+1]=mean(yhat==data[,1]) # training accuracy for this alpha
}
# Refit with the alpha that achieved the highest accuracy
l.en1=cv.glmnet(x=data[,2:35],y=data[,1],
                alpha=(which.max(res)-1)/p,family="multinomial",type.measure="class")
ypre.en=predict(l.en1,s=l.en1$lambda.min,newx=data[,2:35],type="class")
print(l.en1)
beta=coef(l.en1,s=l.en1$lambda.min) # fitted coefficients (exported below)
r.en=mean(ypre.en==data[,1]) # accuracy
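# Optional sketch: a confusion matrix exposes per-class errors that the overall
# accuracy hides. This assumes ypre.en uses the same 1/2/3 coding as data[,1].
print(table(predicted=as.vector(ypre.en),actual=data[,1]))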

#2. svm
# library(e1071)
# m1=svm(x=data[,2:35],y=as.factor(data[,'y']))
# summary(m1)
# print(m1)
# ypre.svm1=predict(m1,data[,2:35])
# r.svm1=mean(ypre.svm1==data[,1]) # accuracy
# svm with RBF kernel
library(kernlab)
m2=ksvm(x=data[,2:35],y=as.factor(data[,'y']),kernel="rbfdot")
summary(m2)
print(m2)
ypre.svm2=predict(m2,data[,2:35])
r.svm2=mean(ypre.svm2==data[,1]) # accuracy
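# Optional sketch: ksvm can estimate generalization error itself through its
# built-in k-fold cross-validation (the 'cross' argument and cross() accessor).
m2cv=ksvm(x=data[,2:35],y=as.factor(data[,'y']),kernel="rbfdot",cross=5)
cross(m2cv) # 5-fold cross-validation error estimate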

# #3. Random Forest
# library(randomForest)
# data1=data.frame(data)
# data1$y=factor(data1$y); data1$x33=factor(data1$x33); data1$x34=factor(data1$x34)
# rf=randomForest(y~.,data=data1,ntree=500)
# summary(rf)
# print(rf)
# ypre.rf=predict(rf,data1[,-1])
# r.rf=mean(ypre.rf==data1$y) # accuracy

#4. knn
library(class)
library(caret)
nfold=50 # try k = 1..nfold neighbors, each evaluated on its own held-out fold
set.seed(1234)
r.k=matrix(0,nfold,1)
index=createFolds(data[,1],k=nfold,list=TRUE,returnTrain=FALSE) # split into nfold folds
for (k in 1:nfold) {
  train=data[-index[[k]], ]
  test=data[index[[k]], ]
  # use only the predictor columns: keeping 'y' among the features leaks the label
  ypre.kn=knn(train=train[,2:35],test=test[,2:35],cl=train[,'y'],k=k)
  r.k[k]=mean(ypre.kn==test[,'y']) # accuracy with k neighbors
}
which.max(r.k) # best-performing number of neighbors
mean(r.k)
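# Note (sketch): knn is distance-based, so predictors on large scales dominate
# the neighbor search. A standardized copy of the feature columns, as below,
# could be used in the loop above instead of the raw matrix.
data.sc=data
data.sc[,2:35]=scale(data[,2:35]) # center and scale every predictor column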

#5. naiveBayes
library(e1071)
data1=data.frame(data)
data1$y=factor(data1$y)
data1$x33=factor(data1$x33)
data1$x34=factor(data1$x34)
nb=naiveBayes(y~.,data=data1)
summary(nb)
print(nb)
ypre.nb=predict(nb,newdata=data1[,-1],type='class')
r.nb=mean(ypre.nb==data1$y) # accuracy
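# Optional sketch: with rare factor levels, Laplace smoothing avoids zero
# conditional probabilities; 'laplace' is a standard naiveBayes argument.
nb.l=naiveBayes(y~.,data=data1,laplace=1)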

# #6. decision tree
# library(rpart)
# t=rpart(y~.,data=data1,method="class") # data1 as built in section 5
# summary(t)
# print(t)
# ypre.t=predict(t,data1[,-1],type='class')
# r.tree=mean(ypre.t==data1$y) # accuracy

# #7. k-means
# km=kmeans(data[,2:35],3) # cluster the predictors only, never the response
# print(km)
# ypre.km=km$cluster
# # Cluster labels are arbitrary permutations of 1:3, so a direct comparison
# # with the class labels understates the true agreement.
# r.km=mean(ypre.km==data[,1])
# 
# #8. Gradient Boosting Classifier
# library(caret)
# library(gbm)
# fitControl=trainControl(method="repeatedcv",number=4,repeats=4)
# gb=train(y~.,data=data1,method="gbm",trControl=fitControl,verbose=FALSE)
# ypre.gb=predict(gb,data1) # predicted classes
# 
# #Fisher LDA
# library(MASS)
# flda=lda(y~.,data=data1)
# ypre.flda=predict(flda,data1)$class

library(marray) # save the results; write.list() comes from Bioconductor's marray
write.list(l.en1,file = "/Users/vicky/Documents/code/R/高统期末报告-王小薇/LR.csv")
write.csv(beta,file = "/Users/vicky/Documents/code/R/高统期末报告-王小薇/LRbeta.csv")
write.csv(ypre.en,file = "/Users/vicky/Documents/code/R/高统期末报告-王小薇/LRypre.csv")
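# Alternative sketch: saveRDS() stores the whole fitted object in one file and
# readRDS() restores it, which is simpler than exporting pieces to CSV.
# The relative file name below is an assumption.
saveRDS(l.en1,file="LR_model.rds")
# l.en1=readRDS("LR_model.rds")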

 
