用R语言处理缺失数据并对比不同方式的效果+Caret包进行svm交叉验证

数据来源:

http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/ 
# (描述文件: http://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Original%29 )

 

# Load the Wisconsin breast-cancer data.
# The file has no header row: reading it with the default `header = TRUE`
# turns the first sample (id 1000025) into the column names and drops it from
# the data. Read with `header = FALSE` and assign those same names explicitly,
# so every later reference (X1.3, X2.1, ...) keeps working and no sample is
# lost.
breast <- read.csv(
  "breast-cancer-wisconsin.data.txt",
  header = FALSE,
  col.names = c(
    "X1000025", "X5", "X1", "X1.1", "X1.2", "X2",
    "X1.3", "X3", "X1.4", "X1.5", "X2.1"
  ),
  stringsAsFactors = FALSE
)

# Preview every column except the sample id.
# Reference: https://stackoverflow.com/questions/12868581/list-all-column-except-for-one-in-r
feature_cols <- setdiff(names(breast), "X1000025")
head(breast[, feature_cols])

# First of all, find which column contains missing data by listing the
# distinct values of each column.
lapply(breast[, feature_cols], unique)

# Column X1.3 uses "?" as its missing-value marker.
# Count the affected rows and their share of the data set.
# (The original run reported 16 missing values in 698 rows; with the first
# sample restored that is 16 of 699 rows, still about 2.29%.)
is_missing <- breast$X1.3 == "?"
sum(is_missing)
mean(is_missing)

# Recode the marker to NA first, so the integer conversion does not rely on a
# coercion warning to produce the NAs.
breast$X1.3[is_missing] <- NA
breast$X1.3 <- as.integer(breast$X1.3)

# Confirm the column types after the conversion.
vapply(breast, function(x) class(x)[1], character(1))


# 1. Use the mean/mode imputation method to impute values for the missing data.

# Work on a copy so `breast` keeps its NAs for the other imputation methods.
breast.mean <- breast

# Rounded mean of the observed X1.3 values (the column is integer-coded).
x13_mean <- round(mean(breast.mean$X1.3, na.rm = TRUE))
x13_mean

# Impute only X1.3. Indexing the whole data frame with is.na() would write
# the X1.3 mean into any NA of any other column as well.
breast.mean$X1.3[is.na(breast.mean$X1.3)] <- x13_mean

# Sanity checks: the imputed copy has no NA left in X1.3, while the original
# data still reports how many values were missing.
stopifnot(!anyNA(breast.mean$X1.3))
sum(is.na(breast$X1.3))
head(breast.mean)

names(breast.mean)
library(e1071)

# Fit a radial-kernel SVM classifier on the mean-imputed data.
# X2.1 is the response; the sample id column (X1000025) is deliberately left
# out of the formula. The SVM type is spelled out in full instead of relying
# on partial matching of the string "C".
model.mean <- svm(
  X2.1 ~ X5 + X1 + X1.1 + X1.2 + X2 + X1.3 + X3 + X1.4 + X1.5,
  data = breast.mean,
  type = "C-classification",
  kernel = "radial"
)

# Resubstitution check only: the model is scored on the same rows it was
# trained on, so this table is an optimistic estimate of its accuracy.
# Rows are the actual classes, columns the predicted classes.
pre_svm <- predict(model.mean, newdata = breast.mean)
table(breast.mean$X2.1, pre_svm, dnn = c("真实值", "预测值"))

# Let's do this properly: estimate performance with cross-validation.
library(caret)

# Make the random partition and the resampling folds reproducible.
set.seed(123)

# Predictors only. Leaving the class label (X2.1) or the sample id
# (X1000025) among the predictors would leak the answer into the model.
mean_predictors <- setdiff(names(breast.mean), c("X1000025", "X2.1"))

# caret fits a regression when the outcome is numeric, so convert the class
# to a factor. The level names must be valid R names because class
# probabilities are requested further down.
# NOTE(review): the coding 2 = benign, 4 = malignant is taken from the UCI
# data set description -- confirm against the data file.
mean_class <- factor(
  breast.mean$X2.1,
  levels = c(2, 4),
  labels = c("benign", "malignant")
)
stopifnot(!anyNA(mean_class))

# Stratified 75% / 25% split into training and test rows.
mean_inTrain <- createDataPartition(mean_class, p = 3 / 4, list = FALSE)
mean_trainx <- breast.mean[mean_inTrain, mean_predictors]
mean_testx <- breast.mean[-mean_inTrain, mean_predictors]
mean_trainy <- mean_class[mean_inTrain]
mean_testy <- mean_class[-mean_inTrain]
table(mean_trainy)
table(mean_testy)

# 10-fold cross-validation repeated 3 times, keeping all resampling results.
svmfitControl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 3,
  returnResamp = "all"
)

# Radial-kernel SVM tuned over caret's default grid. The previous `svmGrid`
# was never passed to train() and its columns (type, kernel) are not tuning
# parameters of "svmRadial", so it has been dropped.
svmFit1 <- train(
  mean_trainx,
  mean_trainy,
  method = "svmRadial",
  trControl = svmfitControl
)
svmFit1

# Score the tuned model on the held-out rows that took no part in training.
confusionMatrix(predict(svmFit1, newdata = mean_testx), mean_testy)

# Available values for train(method = ...); run `?train` interactively for
# the full help page.
names(getModelInfo())

# Linear SVM with plain 10-fold CV, keeping the hold-out predictions and the
# class probabilities. Argument names are written out in full rather than
# relying on partial matching, and the outcome is this data set's class
# column (the earlier formula referred to `Species`, which does not exist in
# `breast.mean`).
ctrl <- trainControl(method = "cv", savePredictions = TRUE, classProbs = TRUE)
mean_cv_data <- data.frame(breast.mean[, mean_predictors], Class = mean_class)
mod <- train(
  Class ~ .,
  data = mean_cv_data,
  method = "svmLinear",
  trControl = ctrl
)
head(mod$pred)

library("modeest")

# Observed (non-missing) values of X1.3. By this point the column is integer
# with NA for the missing entries, so comparing it against "?" yields NA
# indices instead of a usable filter; select on is.na() instead.
x13_observed <- breast$X1.3[!is.na(breast$X1.3)]

# Distribution of the observed values.
table(x13_observed)

# Most frequent value (mode), the candidate for mode imputation.
mfv(x13_observed)
#[1] 1   (output recorded in the original run)

# Mean of the observed values, the candidate for mean imputation.
mean(x13_observed)


 

你可能感兴趣的:(用R语言处理缺失数据并对比不同方式的效果+Caret包进行svm交叉验证)