Logistic regression为非线性模型,回归系数是通过极大似然估计方法计算所得。响应变量取值为1(事件发生)或0(事件不发生)。
### 数据集载入和划分
library(AER)
data() # 查看AER包里面的数据集
data(Affairs) # 载入Affairs数据集
# 查看数据集
summary(Affairs)
str(Affairs)
View(Affairs)
# 定义一个变量,作为y
Affairs$ynaffairs <- ifelse(Affairs$affairs>1,1,0)
Affairs$affairs <- NULL # 去掉affairs列
set.seed(110)
is_train <- sample(c(TRUE,FALSE),nrow(Affairs),replace=TRUE,prob=c(0.7,0.3))
train <- Affairs[is_train,] # training dataset
test <- Affairs[!is_train,] # test dataset
library(caret)
Train <- createDataPartition(Affairs$ynaffairs, p=0.7, list=FALSE)
train <- Affairs[Train, ]
test <- Affairs[-Train, ]
### 逻辑回归模型构建和拟合
glm.control(epsilon = 1e-8, maxit = 100, trace = FALSE)
aff_glm <- glm(ynaffairs ~ ., family = binomial(),
data = train)
# aff_glm <- glm(ynaffairs ~ ., family = binomial,
# data = train)
# aff_glm <- glm(ynaffairs ~ ., family = binomial(link="logit"),
# data = train)
# family(object, ...)
# binomial(link = "logit")
# gaussian(link = "identity")
# Gamma(link = "inverse")
# inverse.gaussian(link = "1/mu^2")
# poisson(link = "log")
# quasi(link = "identity", variance = "constant")
# quasibinomial(link = "logit")
# quasipoisson(link = "log")
summary(aff_glm)
# 根据因变量的显着性水平,找出和结果结果变量显著相关的因变量,重新建模
reduce_glm <- glm(ynaffairs ~ yearsmarried+religiousness+rating,
family = binomial(),data = train)
summary(reduce_glm)
# 模型是否有过度离势
reduce_glm2 <- glm(ynaffairs ~ yearsmarried+religiousness+rating,
family = quasibinomial(),data = train)
pchisq(summary(reduce_glm2)$dispersion * reduce_glm$df.residual,
reduce_glm$df.residual,lower = F)
#模型比较
anova(reduce_glm,reduce_glm2,test="Chisq")
anova(aff_glm,reduce_glm,test="Chisq")
### 模型参数
coef(aff_glm) # 模型系数
exp(coef(aff_glm)) # OR值(e^^β)
# OR:比值比,为实验组的事件发生几率(odds1)/对照组的事件发生几率(odds2)。
confint(aff_glm) # 模型系数95%置信区间
### 预测
# 预测rating的影响,其他因子取均值
test_data <-data.frame(yearsmarried = mean(Affairs$yearsmarried),
religiousness = mean(Affairs$religiousness),
rating=seq(1:5))
y_pre <- predict(reduce_glm,test_data,type='response')
# type参数 the type of prediction required.
# The default is on the scale of the linear predictors;
# the alternative "response" is on the scale of the response variable.
# Thus for a default binomial model the default predictions
# are of log-odds (probabilities on logit scale) and
# type = "response" gives the predicted probabilities.
# The "terms" option returns a matrix giving the fitted values
# of each term in the model formula on the linear predictor scale.
train_x <- train[,-9]
train_y <- train$ynaffairs
train_pre <- predict(aff_glm,train_x,type='response') # 概率
train_pre_classes <- ifelse(train_pre > 0.5, 1, 0)
#names(test)
test_x <- test[,-9]
test_y <- test$ynaffairs
test_pre <- predict(aff_glm,test_x,type='response') # 概率
test_pre_classes <- ifelse(test_pre > 0.5, 1, 0)
## 计算模型预测准确性
accuracy <- table(test_pre_classes, test_y)
sum(diag(accuracy))/sum(accuracy)
## confusionMatrix {caret}
confusionMatrix(data=as.factor(test_pre_classes),
reference=as.factor(test$ynaffairs))
# K折交叉验证模型
# trainControl {caret}
ctrl <- trainControl(method = "repeatedcv", number = 5, savePredictions = TRUE)
# train {generics}:Estimate model parameters.
mod_fit <- train(ynaffairs ~ ., data=train, method="glm", family="binomial",
trControl = ctrl, tuneLength = 5)
pred = predict(mod_fit, newdata=test_x)
pred <- ifelse(pred > 0.5, 1, 0)
confusionMatrix(data=as.factor(pred),
reference = as.factor(test$ynaffairs))
### ROC作图
library(pROC)
roc_train <- roc(train_y~train_pre, data=train)
roc_train
roc_test <- roc(test_y~test_pre, data=test)
roc_test
roc.test(roc_train,roc_test)
#par(mfcol=c(2,1))
plot(roc_train,col="red")
plot(roc_test,col="blue")
# ?plot.roc
## 画多条ROC曲线
plot(roc_train, col="red")
plot.roc(roc_test, add=TRUE, col="blue")
## 计算auc (area under curve)
library(ROCR)
# Compute AUC for predicting ynaffairs with the model
prob <- predict(aff_glm, newdata=test_x, type="response")
pred <- prediction(prob, test$ynaffairs)
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
plot(perf)
auc <- performance(pred, measure = "auc")
auc <- [email protected][[1]]
### 绘制列线图(nomogram)
par(mgp=c(1.6,0.6,0),mar=c(2,2,2,2)) ##设置画布
library(rms)
# 设定环境参数
ddist <- datadist(Affairs)
options(datadist='ddist')
# lrm和nomogram来自rms包
rms_fit <- lrm(ynaffairs ~ ., data = Affairs)
nomogram <- nomogram(rms_fit,fun=function(x) 1/(1+exp(-x)), ##逻辑回归计算公式
fun.at = c(0.001,0.01,0.05,seq(0.1,0.9,by=0.1),0.95,0.99,0.999),#风险轴刻度
funlabel = "Risk of affairs", #风险轴便签
lp=F, ##是否显示系数轴
conf.int = F, ##每个得分的置信度区间,用横线表示,横线越长置信度越
abbrev = F #是否用简称代表因子变量
)
plot(nomogram)
注:如果要加入正则项,如lasso回归,ridge回归,弹性网等用glmnet包。