# Data preprocessing: load the Affairs data set from the AER package and
# derive a binary outcome from the `affairs` column.
data(Affairs, package = "AER")
a <- Affairs
# sign() maps affairs > 0 to 1 and affairs == 0 to 0; any other value matches
# neither factor level and therefore becomes NA.
a$ynaffair <- factor(
  sign(a$affairs),
  levels = c(0, 1),
  labels = c("No", "Yes")
)
table(a$ynaffair) # check whether the two classes are balanced
# Fit a logistic regression containing every candidate predictor.
# The commented-out shorthand below is kept for reference only: `.` expands to
# all other columns of `a`, which would also include `affairs` itself.
# fit.full <- glm(ynaffair ~ ., data=a, family=binomial());fit.full;
fit.full <- glm(
  ynaffair ~ gender + age + yearsmarried + children + religiousness +
    education + occupation + rating,
  data = a,
  family = binomial()
)
# Two approaches to variable selection.
# 1. Stepwise regression: step() searches by AIC; the model refitted below
#    uses the predictors of the final (lowest-AIC) formula it reports.
step.test <- step(fit.full)
glm.step.test <- glm(
  ynaffair ~ gender + age + yearsmarried + religiousness + rating,
  data = a,
  family = "binomial"
)
# gender is not clearly significant in this summary; that is acceptable for
# now, since the models with and without gender are compared later via ROC
# curves and AUC.
summary(glm.step.test)
# 2. Significance-based selection: inspect the p-values of the full model and
#    keep only the significant predictors; gender is dropped here.
summary(fit.full)
fit.reduced <- glm(
  ynaffair ~ age + yearsmarried + religiousness + rating,
  family = binomial(),
  data = a
)
summary(fit.reduced) # the remaining predictors are all still significant

# Chi-squared analysis-of-deviance test between the two nested models: does
# including gender change the fit significantly?
# The original author reports p = 0.06, so the null hypothesis is not
# rejected: there is not enough evidence that adding gender makes a
# significant difference (the "no difference" hypothesis is accepted for now).
anova(fit.reduced, glm.step.test, test = "Chisq")
# 3. ROCR: compare ROC curves and AUC for the two feature-selection routes.
#    Fitted models: (1) glm.step.test from stepwise selection,
#                   (2) fit.reduced from significance-based selection.
#    Both are scored on the data they were fitted on (no newdata is passed to
#    predict()), using predicted probabilities (type = "response").
library(ROCR)
par(mfrow = c(2, 1)) # stack the two ROC plots vertically

# Model 1: glm.step.test
pre.step <- predict(glm.step.test, type = "response")
step.m <- prediction(pre.step, a$ynaffair)
plot(performance(step.m, "tpr", "fpr"))
abline(0, 1, lty = 8, col = "red") # diagonal reference line
auc <- performance(step.m, "auc")
# The AUC is stored in the y.values slot of the performance object.
# (The original line was mangled into an e-mail placeholder and did not parse.)
auc@y.values # 0.7067406

# Model 2: fit.reduced
pre.fit.reduced <- predict(fit.reduced, type = "response")
reduced.m <- prediction(pre.fit.reduced, a$ynaffair)
plot(performance(reduced.m, "tpr", "fpr"))
abline(0, 1, lty = 10, col = "green") # diagonal reference line
auc <- performance(reduced.m, "auc")
auc@y.values # 0.7038877

# The stepwise model has the slightly higher AUC, but the gap is small; this
# comparison is mainly a demonstration of technique. In practice stepwise
# selection and significance-based selection give nearly the same result here.
# For reference, the signature of step():
# step(object, scope, scale = 0,direction = c("both", "backward", "forward"), trace = 1, keep = NULL, steps = 1000, k = 2, ...)
# 4. Going further: pROC, a more feature-rich alternative to ROCR.
# Only the stepwise model's predictions (pre.step) are used for this demo.
library(pROC)
modelroc <- roc(a$ynaffair, pre.step)
plot(
  modelroc,
  print.auc = TRUE,
  auc.polygon = TRUE,
  grid = c(0.1, 0.2),
  grid.col = c("green", "red"),
  max.auc.polygon = TRUE,
  auc.polygon.col = "skyblue",
  print.thres = TRUE
)
# 5. Checking an assumption of the logistic model: overdispersion.
# Expected variance: data sampled from a binomial distribution have expected
#   variance n * p * (1 - p), where n is the number of observations and p is
#   the probability that the response equals 1.
# Overdispersion: the observed variance of the response exceeds the expected
#   binomial variance, which distorts standard errors and makes significance
#   tests inaccurate.
# Rule of thumb: dispersion = residual deviance / residual df; a value much
#   larger than 1 indicates overdispersion.
# Remedy: fit the model with the quasibinomial family instead.
# To test for overdispersion, fit the same model twice (the stepwise model is
# used as the example).
# First fit: binomial family.
glm.step.test <- glm(
  ynaffair ~ gender + age + yearsmarried + religiousness + rating,
  data = a,
  family = "binomial"
)
# Second fit: quasibinomial family, which estimates the dispersion parameter.
fit.step.quasi <- glm(
  ynaffair ~ gender + age + yearsmarried + religiousness + rating,
  data = a,
  family = "quasibinomial"
)
# H0: dispersion ratio = 1; H1: dispersion ratio != 1.
# The original author reports a p-value above 0.05, i.e. no overdispersion.
# lower.tail is spelled out in full (no partial argument matching) and uses
# FALSE rather than the reassignable shorthand F.
pchisq(
  summary(fit.step.quasi)$dispersion * glm.step.test$df.residual,
  fit.step.quasi$df.residual,
  lower.tail = FALSE
)
# If overdispersion were present, the quasibinomial fit (fit.step.quasi)
# would be the one to use.
summary(fit.step.quasi)$dispersion