R语言逻辑回归分析

Logistic regression为非线性模型,回归系数是通过极大似然估计方法计算所得。响应变量取值为1(事件发生)或0(事件不发生)。

1. 数据集载入和划分

### 数据集载入和划分
library(AER) 
data()  # 查看AER包里面的数据集
data(Affairs) # 载入Affairs数据集
# 查看数据集
summary(Affairs)
str(Affairs)
View(Affairs)

# 定义一个变量,作为y
Affairs$ynaffairs <- ifelse(Affairs$affairs>1,1,0)
Affairs$affairs <- NULL # 去掉affairs列

set.seed(110)
is_train <- sample(c(TRUE,FALSE),nrow(Affairs),replace=TRUE,prob=c(0.7,0.3))
train <- Affairs[is_train,] # training dataset
test <- Affairs[!is_train,] # test dataset

library(caret)
Train <- createDataPartition(Affairs$ynaffairs, p=0.7, list=FALSE)
train <- Affairs[Train, ]
test <- Affairs[-Train, ]

2. 逻辑回归模型构建和拟合

### 逻辑回归模型构建和拟合
glm.control(epsilon = 1e-8, maxit = 100, trace = FALSE)
aff_glm <- glm(ynaffairs ~ ., family = binomial(),
               data = train)
# aff_glm <- glm(ynaffairs ~ ., family = binomial,
#                data = train)
# aff_glm <- glm(ynaffairs ~ ., family = binomial(link="logit"),
#                data = train)

# family(object, ...)
# binomial(link = "logit")
# gaussian(link = "identity")
# Gamma(link = "inverse")
# inverse.gaussian(link = "1/mu^2")
# poisson(link = "log")
# quasi(link = "identity", variance = "constant")
# quasibinomial(link = "logit")
# quasipoisson(link = "log")

summary(aff_glm)
# 根据因变量的显着性水平,找出和结果结果变量显著相关的因变量,重新建模
reduce_glm <- glm(ynaffairs ~ yearsmarried+religiousness+rating, 
                  family = binomial(),data = train)
summary(reduce_glm)

# 模型是否有过度离势
reduce_glm2 <- glm(ynaffairs ~ yearsmarried+religiousness+rating, 
                   family = quasibinomial(),data = train)

pchisq(summary(reduce_glm2)$dispersion * reduce_glm$df.residual,
       reduce_glm$df.residual,lower = F)

#模型比较
anova(reduce_glm,reduce_glm2,test="Chisq")
anova(aff_glm,reduce_glm,test="Chisq")

3. 模型参数

### 模型参数
coef(aff_glm) # 模型系数
exp(coef(aff_glm)) # OR值(e^^β)
# OR:比值比,为实验组的事件发生几率(odds1)/对照组的事件发生几率(odds2)。
confint(aff_glm) # 模型系数95%置信区间

4. 预测

### 预测
# 预测rating的影响,其他因子取均值
test_data <-data.frame(yearsmarried = mean(Affairs$yearsmarried),
                       religiousness = mean(Affairs$religiousness),
                       rating=seq(1:5))

y_pre <- predict(reduce_glm,test_data,type='response')

# type参数 the type of prediction required. 
# The default is on the scale of the linear predictors; 
# the alternative "response" is on the scale of the response variable. 
# Thus for a default binomial model the default predictions 
# are of log-odds (probabilities on logit scale) and 
# type = "response" gives the predicted probabilities. 
# The "terms" option returns a matrix giving the fitted values 
# of each term in the model formula on the linear predictor scale.

train_x <- train[,-9]
train_y <- train$ynaffairs
train_pre <- predict(aff_glm,train_x,type='response') # 概率
train_pre_classes <- ifelse(train_pre > 0.5, 1, 0)

#names(test)
test_x <- test[,-9]
test_y <- test$ynaffairs
test_pre <- predict(aff_glm,test_x,type='response') # 概率
test_pre_classes <- ifelse(test_pre > 0.5, 1, 0)
## 计算模型预测准确性
accuracy <- table(test_pre_classes, test_y)
sum(diag(accuracy))/sum(accuracy)

## confusionMatrix {caret}
confusionMatrix(data=as.factor(test_pre_classes),
                reference=as.factor(test$ynaffairs))


# K折交叉验证模型
# trainControl {caret}
ctrl <- trainControl(method = "repeatedcv", number = 5, savePredictions = TRUE)
# train {generics}:Estimate model parameters.
mod_fit <- train(ynaffairs ~ ., data=train, method="glm", family="binomial",
                 trControl = ctrl, tuneLength = 5)
pred = predict(mod_fit, newdata=test_x)

pred <- ifelse(pred > 0.5, 1, 0)

confusionMatrix(data=as.factor(pred), 
                reference = as.factor(test$ynaffairs))

5. ROC作图

### ROC作图
library(pROC)
roc_train <- roc(train_y~train_pre, data=train)
roc_train
roc_test <- roc(test_y~test_pre, data=test)
roc_test

roc.test(roc_train,roc_test)
#par(mfcol=c(2,1)) 
plot(roc_train,col="red")
plot(roc_test,col="blue")
# ?plot.roc

## 画多条ROC曲线
plot(roc_train, col="red")
plot.roc(roc_test, add=TRUE, col="blue")

## 计算auc (area under curve)
library(ROCR)
# Compute AUC for predicting ynaffairs with the model
prob <- predict(aff_glm, newdata=test_x, type="response")
pred <- prediction(prob, test$ynaffairs)
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
plot(perf)
auc <- performance(pred, measure = "auc")
auc <- [email protected][[1]]

6. 绘制列线图(nomogram)

###  绘制列线图(nomogram)
par(mgp=c(1.6,0.6,0),mar=c(2,2,2,2))  ##设置画布
library(rms)

# 设定环境参数
ddist <- datadist(Affairs)
options(datadist='ddist')
# lrm和nomogram来自rms包
rms_fit <- lrm(ynaffairs ~ ., data = Affairs)
nomogram <- nomogram(rms_fit,fun=function(x) 1/(1+exp(-x)), ##逻辑回归计算公式
                     fun.at = c(0.001,0.01,0.05,seq(0.1,0.9,by=0.1),0.95,0.99,0.999),#风险轴刻度
                     funlabel = "Risk of affairs", #风险轴便签
                     lp=F,  ##是否显示系数轴
                     conf.int = F, ##每个得分的置信度区间,用横线表示,横线越长置信度越
                     abbrev = F #是否用简称代表因子变量
                     )
plot(nomogram)

注:如果要加入正则项,如lasso回归,ridge回归,弹性网等用glmnet包。

你可能感兴趣的:(r语言,逻辑回归)