数据科学与机器学习案例之客户的信用风险与预测
数据科学与机器学习之信用卡欺诈识别(严重类失衡数据建模)
数据科学与机器学习案例之汽车目标客户销售策略研究
数据科学与机器学习案例之WiFi定位系统的位置预测
数据科学与机器学习案例之Stacking集成方法对鸢尾花进行分类
数据科学案例之生存分析与二手车定价
数据我们会在文章的末尾提供下载,我们使用此部分数据进行分类,此篇博客中我们使用了四种方法进行分类预测。
数据处理
. <- 'E:\\浏览器下载\\Code\\Code\\机器学习'
load('数据.RData')
str(df) #查看数据
library(ggplot2)
library(Rmisc)
library(caret)
library(randomForest)
library(e1071)
library(class)
library(gmodels)
library(kernlab)
library(ROCR)
library(RColorBrewer)
class.discrete <- c()
for(variable in colnames(df)){
if(length(unique(df[[variable]])) <= 4) class.discrete <- c(class.discrete,variable)}
length(class)
class.continue <- colnames(df)[! colnames(df) %in% class.discrete]
factor.discrete <- function(class.discrete){
for(variable in class.discrete){
df[[variable]] <<- as.factor(df[[variable]])}}
scale.continue <- function(class.continue){
for(variable in class.continue){
df[[variable]] <<- scale(df[[variable]],T,T)}}
p <- vector(mode = 'list',3)
head(class.continue,3)
apply(df[,class.continue],2,range)
col <- brewer.pal(9, 'Blues')
p[[1]] <- ggplot(df,aes(x = age))+
geom_histogram(binwidth = 5,colour = 'black',fill = '#08306B')
p[[2]] <- ggplot(df,aes(x = credit.duration.months))+
geom_histogram(binwidth = 5,colour = 'black',fill = '#08519C')
p[[3]] <- ggplot(df,aes(x = credit.amount))+
geom_histogram(binwidth = 500,colour = 'black',fill = '#08519C')
multiplot(plotlist = p[1:3], layout = matrix(c(1,2,3,3), nrow = 2))
setwd('C:\\Users\\Pictures')
svg(file = 'p.svg',height = 10,width = 10)
multiplot(plotlist = p[1:3], layout = matrix(c(1,2,3,3), nrow = 2))
dev.off()
setwd(.)
factor.discrete(class.discrete)
scale.continue(class.continue)
str(df)
num <- sample(1:nrow(df),size = 0.6 * nrow(df))
train <- df[num,]
table(df$credit.rating)
temp1 <- which(df$credit.rating == 0)
temp2 <- which(df$credit.rating == 1)
temp1 <- sample(temp1,0.4 * nrow(df) / 2)
temp2 <- sample(temp2,0.4 * nrow(df) / 2)
num.test <- c(temp1,temp2)
test <- df[num.test,] #划分训练集与测试集
fun <- function(data){#自编类似于py中的classification_report函数
temp <- table(data[,1],data[,2])
precision <- diag(temp) / apply(temp,2,sum)
recall <- diag(temp) / apply(temp,1,sum)
F1 = 2 / (1 / precision + 1 / recall)
return(data.frame(F1,recall,precision))}
formula <- 'credit.rating ~.'
formula <- as.formula(formula)
lr.model.all <- glm(formula,train,family = 'binomial') #all variable lr model
summary(lr.model.all)
lr.model.all.train.predictions <- predict(lr.model.all,train,type = 'response')
lr.model.all.train.predictions <- round(lr.model.all.train.predictions)
data <- data.frame(train[,1],lr.model.all.train.predictions)
fun(data)
lr.model.all.train.predictions <- predict(lr.model.all,train,type = 'response')
lr.model.all.train.predictions <- round(lr.model.all.train.predictions)
data <- data.frame(train[,1],lr.model.all.train.predictions)
fun(data)
lr.model.all.train.predictions <- predict(lr.model.all,train,type = 'response')
lr.model.all.train.predictions <- round(lr.model.all.train.predictions)
data <- data.frame(train[,1],lr.model.all.train.predictions)
fun(data) #训练集
F1 recall precision
0 0.6109325 0.5337079 0.7142857
1 0.8638920 0.9099526 0.8222698
lr.model.all.predictions <- predict(lr.model.all,test,type = 'response')
lr.model.all.predictions <- round(lr.model.all.predictions)
data <- data.frame(test[,1],lr.model.all.predictions)
fun(data) #测试集
F1 recall precision
0 0.5615142 0.445 0.7606838
1 0.7122153 0.860 0.6077739
formula.lr.best <- 'credit.rating ~ account.balance + credit.duration.months + credit.amount + previous.credit.payment.status + savings + current.assets + age + guarantor + bank.credits + telephone'
formula.lr.best <- as.formula(formula.lr.best)
lr.model.best <- glm(formula.lr.best,train,family = 'binomial')
summary(lr.model.best)
lr.model.best.train.predictions <- predict(lr.model.best,train,type = 'response')
lr.model.best.train.predictions <- round(lr.model.best.train.predictions)
data <- data.frame(train[,1],lr.model.best.train.predictions)
fun(data) #训练集
F1 recall precision
0 0.5102041 0.4213483 0.6465517
1 0.8410596 0.9028436 0.7871901
lr.model.best.predictions <- predict(lr.model.best,test,type = 'response')
lr.model.best.predictions <- round(lr.model.best.predictions)
data <- data.frame(test[,1],lr.model.best.predictions)
fun(data) #测试集
F1 recall precision
0 0.5066667 0.38 0.7600000
1 0.7040000 0.88 0.5866667
ROC曲线
plot(lr.model.all.perf,col = '#E64B35FF',lty = 1,lwd = 2,main = "ROC曲线",xlab = "fpr",ylab = 'tpr')
lines(x = lr.model.best.perf@x.values[[1]],y = lr.model.best.perf@y.values[[1]],col = '#00A087FF',lty = 2,lwd = 2)
abline(0,1,col = 'black')
auc <- performance(lr.model.all.predictions,'auc')
auc <- unlist(slot(auc,'y.values'))
auc <- round(auc,2)
auc1 <- performance(lr.model.best.predictions,'auc')
auc1 <- unlist(slot(auc1,'y.values'))
auc1 <- round(auc1,2)
legend('topright',legend = c('auc',
paste(c(auc,auc1))),col = c('#ffffff','#E64B35FF','#00A087FF'),
lty = c(1,1,2))
formula.rf.all <- 'credit.rating ~.'
formula.rf.all <- as.formula(formula.rf.all)
rf.model.all <- randomForest(formula.rf.all,data = train,importance=T, proximity=T)
rf.model.all.train.predictions <- predict(rf.model.all,train,type = 'class')
data <- data.frame(train[,1],rf.model.all.train.predictions)
fun(data) #训练集
F1 recall precision
0 1 1 1
1 1 1 1
rf.model.all.predictions <- predict(rf.model.all,test,type = 'class')
data <- data.frame(test[,1],rf.model.all.predictions)
fun(data) #测试集
F1 recall precision
0 0.8034682 0.695 0.9520548
1 0.8502203 0.965 0.7598425
formula.rf.best <- 'credit.rating ~ account.balance + credit.duration.months + credit.amount + previous.credit.payment.status + savings + current.assets + age + guarantor + bank.credits + telephone'
formula.rf.best <- as.formula(formula.rf.best)
rf.model.best <- randomForest(formula.rf.best,train,importance=T, proximity=T)
rf.model.best.train.predictions <- predict(rf.model.best,train,type = 'class')
data <- data.frame(train[,1],rf.model.best.train.predictions)
fun(data) #训练集
F1 recall precision
0 1 1 1
1 1 1 1
rf.model.best.predictions <- predict(rf.model.best,test,type = 'class')
data <- data.frame(test[,1],rf.model.all.predictions)
fun(data) #测试集
F1 recall precision
0 0.8034682 0.695 0.9520548
1 0.8502203 0.965 0.7598425
nodesize.vals <- c(2, 3, 4, 5)
ntree.vals <- c(200, 500, 1000, 2000)
tuning.results <- tune.randomForest(formula.rf.best,
data = train,
mtry=3,
nodesize=nodesize.vals,
ntree=ntree.vals) #寻找最优参数
print(tuning.results)
rf.model.best <- tuning.results$best.model
rf.model.best.train.predictions <- predict(rf.model.best,train,type = 'class')
data <- data.frame(train[,1],rf.model.best.train.predictions)
fun(data) #训练集
F1 recall precision
0 0.9317507 0.8820225 0.9874214
1 0.9733488 0.9952607 0.9523810
rf.model.best.predictions <- predict(rf.model.best,test,type = 'class')
data <- data.frame(test[,1],rf.model.all.predictions)
fun(data)
F1 recall precision
0 0.8034682 0.695 0.9520548
1 0.8502203 0.965 0.7598425
ROC曲线
plot(rf.model.all.perf,col = 'red',lty = 1,lwd = 2,main = "ROC曲线",xlab = "fpr",ylab = 'tpr')
lines(x = rf.model.best.perf@x.values[[1]],y = rf.model.best.perf@y.values[[1]],col = 'blue',lty = 2,lwd = 2)
abline(0,1,col = 'black')
auc <- performance(rf.model.all.predictions,'auc')
auc <- unlist(slot(auc,'y.values'))
auc <- round(auc,2)
auc1 <- performance(rf.model.best.predictions,'auc')
auc1 <- unlist(slot(auc1,'y.values'))
auc1 <- round(auc1,2)
legend('topright',legend = c('auc',
paste(c(auc,auc1))),col = c('#ffffff','red','blue'),
lty = c(1,1,2))
formula.svm.all <- 'credit.rating ~. '
formula.svm.all <- as.formula(formula.svm.all)
svm.model.all <- svm(formula.svm.all,train,kernel="radial", cost=100, gamma=1)
svm.model.all.train.predictions <- predict(svm.model.all,train[,-1])
data <- data.frame(train[,1],svm.model.all.train.predictions)
fun(data)
F1 recall precision
0 1 1 1
1 1 1 1
svm.model.predictions <- predict(svm.model.all,test[,-1])
data <- data.frame(test[,1],svm.model.predictions)
fun(data)
F1 recall precision
0 0.7138264 0.555 1.0000000
1 0.8179959 1.000 0.6920415
formula.init <- "credit.rating ~ ."
formula.init <- as.formula(formula.init)
control <- trainControl(method="repeatedcv", number=10, repeats=2)
model <- train(formula.init, data=train, method="svmRadial",
trControl=control)
importance <- varImp(model, scale=FALSE)
plot(importance, cex.lab=0.5)
rownames(importance$importance)
formula.svm.best <- 'credit.rating ~ account.balance + credit.duration.months + previous.credit.payment.status + credit.purpose + credit.amount + savings + employment.duration + installment.rate + marital.status + guarantor'
formula.svm.best <- as.formula(formula.svm.best)
svm.model.best <- svm(formula.svm.best,train,kernel="radial", cost=100, gamma=1)
svm.model.best.train.predictions <- predict(svm.model.best,train[,-1])
data <- data.frame(train[,1],svm.model.best.train.predictions)
svm.model.best.predictions <- predict(svm.model.best,test[,-1])
data <- data.frame(test[,1],svm.model.best.predictions)
fun(data)
cost.weights <- c(0.1, 10, 100) #寻找最优参数
gamma.weights <- c(0.01, 0.25, 0.5, 1)
tuning.results <- tune(svm, formula.svm.best,
data = train, kernel="radial",
ranges=list(cost=cost.weights, gamma=gamma.weights))
svm.model.best <- tuning.results$best.model
svm.model.best.train.predictions <- predict(svm.model.best,train[,-1])
data <- data.frame(train[,1],svm.model.best.train.predictions)
fun(data)
F1 recall precision
0 0.5204461 0.3977273 0.7526882
1 0.8614393 0.9457547 0.7909270
svm.model.best.predictions <- predict(svm.model.best,test[,-1],type = 'prob')
data <- data.frame(test[,1],svm.model.best.predictions)
fun(data)
F1 recall precision
0 0.5460751 0.400 0.8602151
1 0.7376726 0.935 0.6091205
svm.model.all.predictions <- predict(svm.model.all,test[,-1],decision.values = T)
svm.model.all.predictions.values <- attributes(svm.model.all.predictions)$decision.values
svm.model.all.predictions <- prediction(svm.model.all.predictions.values,test[,1])
svm.model.best.predictions <- predict(svm.model.best,test[,-1],decision.values = T)
svm.model.best.predictions.values <- attributes(svm.model.best.predictions)$decision.values
svm.model.best.predictions <- prediction(svm.model.best.predictions.values,test[,1])
svm.model.all.perf <- performance(svm.model.all.predictions,'tpr','fpr')
svm.model.best.perf <- performance(svm.model.best.predictions,'tpr','fpr')
plot(svm.model.all.perf,col = 'red',lty = 1,lwd = 2,main = "ROC曲线",xlab = "fpr",ylab = 'tpr')
lines(x = svm.model.best.perf@x.values[[1]],y = svm.model.best.perf@y.values[[1]],col = 'blue',lty = 2,lwd = 2)
abline(0,1,col = 'black')
auc <- performance(svm.model.all.predictions,'auc')
auc <- unlist(slot(auc,'y.values'))
auc <- round(auc,2)
auc1 <- performance(svm.model.best.predictions,'auc')
auc1 <- unlist(slot(auc1,'y.values'))
auc1 <- round(auc1,2)
legend('topright',legend = c('auc',
paste(c(auc,auc1))),col = c('#ffffff','red','blue'),
lty = c(1,1,2))