R语言数据科学分类预测(一)

数据科学与机器学习案例之客户的信用风险与预测

数据科学与机器学习之信用卡欺诈识别(严重类失衡数据建模)

数据科学与机器学习案例之汽车目标客户销售策略研究

数据科学与机器学习案例之WiFi定位系统的位置预测

数据科学与机器学习案例之Stacking集成方法对鸢尾花进行分类

数据科学案例之生存分析与二手车定价

R数据科学分类预测

  • Logistic regression
    • all variable model
    • some selected feature model
  • randomForest
    • all variable model
    • best variable model
  • svm
    • all variable model and best variable model

Logistic regression

数据我们会在文章的末尾提供下载,我们使用此部分数据进行分类,此篇博客中我们使用了四种方法进行分类预测。
数据处理

# Project directory, stored in `.` so it can be restored later via setwd(.).
. <- 'E:\\浏览器下载\\Code\\Code\\机器学习'

# Load the prepared credit data; expects a data frame `df` inside the .RData
# file.  NOTE(review): load() resolves this path against the current working
# directory, not against `.` — confirm the script is launched from there.
load('数据.RData')
str(df) # inspect the loaded data frame

library(ggplot2)
library(Rmisc)
library(caret)
library(randomForest)
library(e1071)
library(class)
library(gmodels)
library(kernlab)
library(ROCR)
library(RColorBrewer)

# Partition column names: a column with at most 4 distinct values is treated
# as discrete (to be converted to a factor), everything else as continuous.
class.discrete <- c()
for (variable in colnames(df)) {
  if (length(unique(df[[variable]])) <= 4) {
    class.discrete <- c(class.discrete, variable)
  }
}

# Bug fix: the original printed length(class) — `class` is the base R
# function, so the count of discrete variables was never actually reported.
length(class.discrete)

class.continue <- colnames(df)[!colnames(df) %in% class.discrete]

# Convert every listed column of the global `df` to a factor.
# NOTE(review): both helpers mutate the global `df` via `<<-`, matching the
# original script's design; neither returns a useful value.
factor.discrete <- function(class.discrete) {
  for (variable in class.discrete) {
    df[[variable]] <<- as.factor(df[[variable]])
  }
}

# Centre and scale (z-score) every listed continuous column of the global
# `df`.  center/scale are spelled out as TRUE — the original used T, which
# is a reassignable binding rather than a reserved word.
scale.continue <- function(class.continue) {
  for (variable in class.continue) {
    df[[variable]] <<- scale(df[[variable]], center = TRUE, scale = TRUE)
  }
}

# Histograms of the three continuous variables, drawn before scaling so the
# x axes are still in original units.  (An unused brewer.pal palette in the
# original has been removed; the fills are hard-coded hex colours.)
p <- vector(mode = 'list', 3)
head(class.continue, 3)
apply(df[, class.continue], 2, range)  # value ranges of the continuous columns

p[[1]] <- ggplot(df, aes(x = age)) +
  geom_histogram(binwidth = 5, colour = 'black', fill = '#08306B')

p[[2]] <- ggplot(df, aes(x = credit.duration.months)) +
  geom_histogram(binwidth = 5, colour = 'black', fill = '#08519C')

p[[3]] <- ggplot(df, aes(x = credit.amount)) +
  geom_histogram(binwidth = 500, colour = 'black', fill = '#08519C')

# Two histograms on top; the credit.amount histogram spans the bottom row.
multiplot(plotlist = p[1:3], layout = matrix(c(1, 2, 3, 3), nrow = 2))

# Save the histogram panel as an SVG.  The original setwd()'d into the
# Pictures folder just to write a relative filename; writing to the full
# path produces the same file without touching the working directory.
svg(file = 'C:\\Users\\Pictures\\p.svg', height = 10, width = 10)
multiplot(plotlist = p[1:3], layout = matrix(c(1, 2, 3, 3), nrow = 2))
dev.off()

setwd(.)  # move to the project directory stored in `.` (as the original did)

# Apply the in-place conversions to the global df: discrete columns become
# factors, continuous columns are standardised.
factor.discrete(class.discrete)
scale.continue(class.continue)
str(df)

R语言数据科学分类预测(一)_第1张图片

all variable model

# Train/test split: 60% of rows train the models; the test set is drawn
# class-balanced from the REMAINING rows.  (Bug fix: the original sampled
# test rows from the whole data frame, so training rows leaked into the
# test set and the reported test metrics were optimistic.)
num <- sample(seq_len(nrow(df)), size = 0.6 * nrow(df))
train <- df[num, ]
table(df$credit.rating)
pool <- setdiff(seq_len(nrow(df)), num)  # rows not used for training
temp1 <- intersect(which(df$credit.rating == 0), pool)
temp2 <- intersect(which(df$credit.rating == 1), pool)
# Cap each class at what actually remains outside the training set.
temp1 <- sample(temp1, min(length(temp1), 0.4 * nrow(df) / 2))
temp2 <- sample(temp2, min(length(temp2), 0.4 * nrow(df) / 2))
num.test <- c(temp1, temp2)
test <- df[num.test, ]  # final test set

# Per-class classification report (akin to sklearn's classification_report).
# data: two-column data frame — column 1 actual labels, column 2 predictions.
# Returns a data frame with one row per class: F1, recall, precision.
fun <- function(data) {
  # Coerce both columns to factors over the union of observed labels so the
  # confusion matrix is always square.  (The original broke silently when a
  # class was never predicted: diag() then had the wrong length and the
  # recycled division produced nonsense.)
  lv <- sort(union(unique(data[, 1]), unique(data[, 2])))
  actual <- factor(data[, 1], levels = lv)
  predicted <- factor(data[, 2], levels = lv)
  cm <- table(actual, predicted)
  precision <- diag(cm) / colSums(cm)     # correct / predicted-as-class
  recall <- diag(cm) / rowSums(cm)        # correct / actually-in-class
  F1 <- 2 / (1 / precision + 1 / recall)  # harmonic mean
  data.frame(F1, recall, precision)
}

# Logistic regression on all predictors.
formula <- as.formula('credit.rating ~.')
lr.model.all <- glm(formula, train, family = 'binomial')
summary(lr.model.all)

# Training-set report: round() labels probabilities >= 0.5 as class 1.
# (The original pasted this predict/round/report sequence three times
# verbatim; predict() is deterministic, so one pass gives the same output.)
lr.model.all.train.predictions <- round(predict(lr.model.all, train, type = 'response'))
data <- data.frame(train[, 1], lr.model.all.train.predictions)
fun(data)  # training set
         F1    recall precision
0 0.6109325 0.5337079 0.7142857
1 0.8638920 0.9099526 0.8222698

# Test-set report for the all-variable logistic model.
lr.model.all.predictions <- round(predict(lr.model.all, test, type = 'response'))
data <- data.frame(test[, 1], lr.model.all.predictions)
fun(data)  # test set
         F1 recall precision
0 0.5615142  0.445 0.7606838
1 0.7122153  0.860 0.6077739

some selected feature model

# Logistic regression on a hand-picked feature subset.
formula.lr.best <- as.formula('credit.rating ~ account.balance + credit.duration.months + credit.amount + previous.credit.payment.status + savings + current.assets + age + guarantor + bank.credits + telephone')
lr.model.best <- glm(formula.lr.best, train, family = 'binomial')
summary(lr.model.best)

# Training-set report: round() labels probabilities >= 0.5 as class 1.
lr.model.best.train.predictions <- round(predict(lr.model.best, train, type = 'response'))
data <- data.frame(train[, 1], lr.model.best.train.predictions)
fun(data)  # training set
         F1    recall precision
0 0.5102041 0.4213483 0.6465517
1 0.8410596 0.9028436 0.7871901

# Test-set report for the reduced logistic model.
lr.model.best.predictions <- round(predict(lr.model.best, test, type = 'response'))
data <- data.frame(test[, 1], lr.model.best.predictions)
fun(data)  # test set
        F1 recall precision
0 0.5066667   0.38 0.7600000
1 0.7040000   0.88 0.5866667

ROC曲线

# ROC curves for the two logistic models.  Bug fix: the original referenced
# lr.model.all.perf / lr.model.best.perf without ever creating them, and
# passed rounded 0/1 labels to performance(); ROCR needs prediction objects
# built from predicted probabilities (see the SVM ROC section for the
# correct pattern).
lr.all.rocr <- prediction(predict(lr.model.all, test, type = 'response'), test[, 1])
lr.best.rocr <- prediction(predict(lr.model.best, test, type = 'response'), test[, 1])
lr.model.all.perf <- performance(lr.all.rocr, 'tpr', 'fpr')
lr.model.best.perf <- performance(lr.best.rocr, 'tpr', 'fpr')

plot(lr.model.all.perf, col = '#E64B35FF', lty = 1, lwd = 2, main = "ROC曲线", xlab = "fpr", ylab = 'tpr')
lines(x = lr.model.best.perf@x.values[[1]], y = lr.model.best.perf@y.values[[1]], col = '#00A087FF', lty = 2, lwd = 2)
abline(0, 1, col = 'black')
auc <- round(unlist(slot(performance(lr.all.rocr, 'auc'), 'y.values')), 2)
auc1 <- round(unlist(slot(performance(lr.best.rocr, 'auc'), 'y.values')), 2)
legend('topright', legend = c('auc', paste(c(auc, auc1))),
       col = c('#ffffff', '#E64B35FF', '#00A087FF'),
       lty = c(1, 1, 2))
       

R语言数据科学分类预测(一)_第2张图片

randomForest

all variable model

# Random forest on all predictors.  importance/proximity are spelled out as
# TRUE (the original used T, which is a reassignable binding).
formula.rf.all <- as.formula('credit.rating ~.')
rf.model.all <- randomForest(formula.rf.all, data = train, importance = TRUE, proximity = TRUE)

# Training-set report (in-sample predictions, hence the perfect scores in
# the transcript below).
rf.model.all.train.predictions <- predict(rf.model.all, train, type = 'class')
data <- data.frame(train[, 1], rf.model.all.train.predictions)
fun(data)  # training set
 F1 recall precision
0  1      1         1
1  1      1         1

# Test-set report for the all-variable random forest.
rf.model.all.predictions <- predict(rf.model.all, test, type = 'class')
data <- data.frame(test[, 1], rf.model.all.predictions)
fun(data)  # test set
         F1 recall precision
0 0.8034682  0.695 0.9520548
1 0.8502203  0.965 0.7598425

best variable model

# Random forest on the selected feature subset.  importance/proximity are
# spelled out as TRUE (the original used T, which is reassignable).
formula.rf.best <- as.formula('credit.rating ~ account.balance + credit.duration.months + credit.amount + previous.credit.payment.status + savings + current.assets + age + guarantor + bank.credits + telephone')
rf.model.best <- randomForest(formula.rf.best, train, importance = TRUE, proximity = TRUE)

# Training-set report (in-sample).
rf.model.best.train.predictions <- predict(rf.model.best, train, type = 'class')
data <- data.frame(train[, 1], rf.model.best.train.predictions)
fun(data)  # training set
 F1 recall precision
0  1      1         1
1  1      1         1

rf.model.best.predictions <- predict(rf.model.best, test, type = 'class')
# Bug fix: the original built this report from rf.model.all.predictions, so
# the "best variable" test metrics were just the all-variable ones repeated.
data <- data.frame(test[, 1], rf.model.best.predictions)
fun(data)  # test set
        F1 recall precision
0 0.8034682  0.695 0.9520548
1 0.8502203  0.965 0.7598425


# Grid search over nodesize and ntree (mtry fixed at 3) for the reduced
# random forest, using e1071's tune.randomForest cross-validation wrapper.
nodesize.vals <- c(2, 3, 4, 5)
ntree.vals <- c(200, 500, 1000, 2000)
tuning.results <- tune.randomForest(
  formula.rf.best,
  data = train,
  mtry = 3,
  nodesize = nodesize.vals,
  ntree = ntree.vals
)
print(tuning.results)

# Keep the model with the best cross-validated performance.
rf.model.best <- tuning.results$best.model

# Training-set report for the tuned random forest.
rf.model.best.train.predictions <- predict(rf.model.best, train, type = 'class')
data <- data.frame(train[, 1], rf.model.best.train.predictions)
fun(data)  # training set
        F1    recall precision
0 0.9317507 0.8820225 0.9874214
1 0.9733488 0.9952607 0.9523810

rf.model.best.predictions <- predict(rf.model.best, test, type = 'class')
# Bug fix: the original passed rf.model.all.predictions here, so the tuned
# model's test performance was never actually reported.
data <- data.frame(test[, 1], rf.model.best.predictions)
fun(data)  # test set
        F1 recall precision
0 0.8034682  0.695 0.9520548
1 0.8502203  0.965 0.7598425

ROC曲线

# ROC curves for the two random forests.  Bug fix: the *.perf objects were
# never created in the original, and performance() was fed factor class
# labels; ROCR needs prediction objects built from the class-1 probability
# column of predict(type = 'prob').
rf.all.rocr <- prediction(predict(rf.model.all, test, type = 'prob')[, 2], test[, 1])
rf.best.rocr <- prediction(predict(rf.model.best, test, type = 'prob')[, 2], test[, 1])
rf.model.all.perf <- performance(rf.all.rocr, 'tpr', 'fpr')
rf.model.best.perf <- performance(rf.best.rocr, 'tpr', 'fpr')

plot(rf.model.all.perf, col = 'red', lty = 1, lwd = 2, main = "ROC曲线", xlab = "fpr", ylab = 'tpr')
lines(x = rf.model.best.perf@x.values[[1]], y = rf.model.best.perf@y.values[[1]], col = 'blue', lty = 2, lwd = 2)
abline(0, 1, col = 'black')
auc <- round(unlist(slot(performance(rf.all.rocr, 'auc'), 'y.values')), 2)
auc1 <- round(unlist(slot(performance(rf.best.rocr, 'auc'), 'y.values')), 2)
legend('topright', legend = c('auc', paste(c(auc, auc1))),
       col = c('#ffffff', 'red', 'blue'),
       lty = c(1, 1, 2))

R语言数据科学分类预测(一)_第3张图片

svm

all variable model and best variable model

# Radial-kernel SVM on every predictor.  cost/gamma are fixed here; the
# reduced model is tuned over a grid later on.
formula.svm.all <- as.formula('credit.rating ~. ')
svm.model.all <- svm(formula.svm.all, train, kernel = "radial", cost = 100, gamma = 1)

# Training-set report.
svm.model.all.train.predictions <- predict(svm.model.all, train[, -1])
data <- data.frame(train[, 1], svm.model.all.train.predictions)
fun(data)  # training set
  F1 recall precision
0  1      1         1
1  1      1         1

# Test-set report for the all-variable SVM.
svm.model.predictions <- predict(svm.model.all, test[, -1])
data <- data.frame(test[, 1], svm.model.predictions)
fun(data)  # test set
         F1 recall precision
0 0.7138264  0.555 1.0000000
1 0.8179959  1.000 0.6920415


# Rank predictors with caret: fit a radial SVM under repeated 10-fold
# cross-validation, then extract variable importance.
formula.init <- as.formula("credit.rating ~ .")
control <- trainControl(method = "repeatedcv", number = 10, repeats = 2)
model <- train(formula.init, data = train, method = "svmRadial",
               trControl = control)
importance <- varImp(model, scale = FALSE)
plot(importance, cex.lab = 0.5)

# Names of the ranked predictors, used to build the reduced formula below.
rownames(importance$importance)

# Radial-kernel SVM on the caret-ranked feature subset.
formula.svm.best <- as.formula('credit.rating ~ account.balance + credit.duration.months + previous.credit.payment.status + credit.purpose + credit.amount + savings + employment.duration + installment.rate + marital.status + guarantor')
svm.model.best <- svm(formula.svm.best, train, kernel = "radial", cost = 100, gamma = 1)

# Training-set report.  Bug fix: the original built `data` here but never
# called fun(data), so the training metrics were silently skipped.
svm.model.best.train.predictions <- predict(svm.model.best, train[, -1])
data <- data.frame(train[, 1], svm.model.best.train.predictions)
fun(data)  # training set

# Test-set report.
svm.model.best.predictions <- predict(svm.model.best, test[, -1])
data <- data.frame(test[, 1], svm.model.best.predictions)
fun(data)  # test set

# Grid search over cost and gamma for the reduced radial SVM.
cost.weights <- c(0.1, 10, 100)
gamma.weights <- c(0.01, 0.25, 0.5, 1)
tuning.results <- tune(svm, formula.svm.best, data = train, kernel = "radial",
                       ranges = list(cost = cost.weights, gamma = gamma.weights))

# Keep the model with the best cross-validated performance.
svm.model.best <- tuning.results$best.model

# Training-set report for the tuned SVM.
svm.model.best.train.predictions <- predict(svm.model.best, train[, -1])
data <- data.frame(train[, 1], svm.model.best.train.predictions)
fun(data)  # training set
      F1    recall precision
0 0.5204461 0.3977273 0.7526882
1 0.8614393 0.9457547 0.7909270

# Test-set report for the tuned SVM.  The original passed type = 'prob',
# which predict.svm silently ignores (class probabilities require
# probability = TRUE both at fit time and in predict); the call returns
# class labels either way, so the misleading argument is dropped.
svm.model.best.predictions <- predict(svm.model.best, test[, -1])
data <- data.frame(test[, 1], svm.model.best.predictions)
fun(data)  # test set
        F1 recall precision
0 0.5460751  0.400 0.8602151
1 0.7376726  0.935 0.6091205

R语言数据科学分类预测(一)_第4张图片
ROC曲线

# Build ROCR inputs for the two SVMs from their decision values (signed
# distances to the separating hyperplane).
svm.model.all.predictions <- predict(svm.model.all, test[, -1], decision.values = TRUE)
svm.model.all.predictions.values <- attr(svm.model.all.predictions, 'decision.values')
svm.model.all.predictions <- prediction(svm.model.all.predictions.values, test[, 1])

svm.model.best.predictions <- predict(svm.model.best, test[, -1], decision.values = TRUE)
svm.model.best.predictions.values <- attr(svm.model.best.predictions, 'decision.values')
svm.model.best.predictions <- prediction(svm.model.best.predictions.values, test[, 1])

# True-positive-rate vs false-positive-rate curves for plotting.
svm.model.all.perf <- performance(svm.model.all.predictions, 'tpr', 'fpr')
svm.model.best.perf <- performance(svm.model.best.predictions, 'tpr', 'fpr')

# Overlay the two SVM ROC curves, add the chance diagonal, and annotate the
# legend with each model's AUC.
plot(svm.model.all.perf, col = 'red', lty = 1, lwd = 2, main = "ROC曲线", xlab = "fpr", ylab = 'tpr')
lines(x = svm.model.best.perf@x.values[[1]], y = svm.model.best.perf@y.values[[1]], col = 'blue', lty = 2, lwd = 2)
abline(0, 1, col = 'black')
auc <- round(unlist(slot(performance(svm.model.all.predictions, 'auc'), 'y.values')), 2)
auc1 <- round(unlist(slot(performance(svm.model.best.predictions, 'auc'), 'y.values')), 2)
legend('topright',
       legend = c('auc', paste(c(auc, auc1))),
       col = c('#ffffff', 'red', 'blue'),
       lty = c(1, 1, 2))

R语言数据科学分类预测(一)_第5张图片
总结:相较于线性分类算法,集成学习方法可以获得更好的预测效果。
数据下载:整理好会传到GitHub数据下载

你可能感兴趣的:(R语言,r语言,svm,逻辑回归,随机森林,分类预测)