R语言-逻辑回归

##### 3.3 回归分析 #####
rm(list = ls())  # 清空工作空间
#### 3.3.2 逻辑回归(logistic regression) ####
#额外备注:线性回归主要用来解决因变量是连续变量时的分析问题(比如薪酬、房价等)
#逻辑回归是处理二分类的因变量与自变量之间关系的模型
# 加载所需R包
# install.packages("ggplot2")
library(ggplot2)
# 读入数据
dat1 = read.csv("JuneTrain.csv")
dat2 = read.csv("JulyTest.csv")
head(dat1)  # 查看数据的前几行
# 重新命名数据以及排列
dat1 = dat1[, c("tenure", "expense", "COUNT", "perperson", "entropy", "chgexpense", "chgcount", "churn")]
colnames(dat1) = c("tenure", "expense", "count", "perperson", "entropy", "chgexpense", "chgcount", "churn")
dat2 = dat2[, c("tenure", "expense", "COUNT", "perperson", "entropy", "chgexpense", "chgcount", "churn")]
colnames(dat2) = c("tenure", "expense", "count", "perperson", "entropy", "chgexpense", "chgcount", "churn")
# churn:是否流失;tenure:在网时长;expense:当月话费;count:通话人数;
# perperson:人均通话时长;entropy:通话时长分布;chgexpense:花费变化率;chgcount:通话人数变化率
# 描述分析,用ggplot2画对比箱线图。
# 描述分析内容 "tenure"     "expense"    "count"      "perperson"  "entropy"    "chgexpense" "chgcount"
# tenure和是否流失
p1 = ggplot(dat1, aes(x = as.factor(churn), y = tenure, fill = as.factor(churn)))  + geom_boxplot() +
  guides(fill = FALSE) + theme_minimal() + xlab("是否流失") + ylab("在网时长") +
  theme(axis.title.x = element_text(size = 16, face = "bold"),
        axis.text.x = element_text(size = 12, face = "bold"),
        axis.title.y = element_text(size = 16, face = "bold"),
        axis.text.y = element_text(size = 12, face = "bold")) +
  scale_fill_hue(c = 45, l = 80)
# expense和是否流失
p2 = ggplot(dat1, aes(x = as.factor(churn), y = expense, fill = as.factor(churn)))  + geom_boxplot() +
  guides(fill = FALSE) + theme_minimal() + xlab("是否流失") + ylab("当月话费") +
  theme(axis.title.x = element_text(size = 16, face = "bold"),
        axis.text.x = element_text(size =12, face = "bold"),
        axis.title.y = element_text(size = 16, face = "bold"),
        axis.text.y = element_text(size = 12, face = "bold")) +
  scale_fill_hue(c = 45, l = 80)
#还有很多,具体看书和给出的例子,这里不写了
# 加载所需R包
# install.packages("gridExtra")
library(gridExtra)
# 将2张图并列输出
grid.arrange(p1,p2,ncol = 2) #因为ggplot()直接将两张图分别输出,而gridExtra中的grid.arrange()将2张图并列输出
#逻辑回归是解决分类问题的一种分类模型
glm()#逻辑回归建模最常用的是广义线性回归语句
#glm()与lm()不同之处就在于参数family
#逻辑回归的family=binomial(link=logit),表示引用了二项分布族binomial中的logit连接函数
#glm()也包含了泊松回归等,family取值不一样
# 建立模型
lm1 = glm(churn ~., data = dat1, family = binomial())
summary(lm1)
#解读Page200
#训练集,测试集
#对测试集进行预测
predict(object,newdata = NULL,type = c("link","response","terms"))
#object指所需的回归模型;newdata指用于测试的数据集
#type指选择预测的类型,由于是二分类变量,所以选择reponse,表示输出结果预测响应变量为1的概率
# 模型预测
Yhat = predict(lm1, newdata = dat2, type = "response")
ypre1 = 1 * (Yhat > 0.5)
table(ypre1, dat2$churn)
##       
##  ypre1     0     1
##      0 46001   447
ypre2 = 1 * (Yhat > mean(dat2$churn))
table(ypre2, dat2$churn)
##       
##  ypre2     0     1
##      0 27242   108
##      1 18759   339
# 覆盖率捕获率曲线
sub = seq(0, 1, 1 / 7)
tol = sum(dat2$churn)
catch = sapply(sub, function(s) {
  ss = quantile(Yhat, 1 - s)
  res = sum(dat2$churn[Yhat > ss]) / tol
  return(res)
})
plot(sub, catch, type = "l", xlab = "覆盖率", ylab = "捕获率")
# 系数图示
coef = lm1$coefficients[-1]
coef = sort(coef)
barplot(coef, col = rainbow(10), width = 1)
# 加载所需R包
# install.packages("pROC")
library(pROC)
# 生成ROC曲线
plot.roc(dat2$churn, Yhat, col = "red", lwd = 2, xaxs = "i", yaxs = "i")
# 比较ROC曲线优劣
lm2 = glm(churn ~ chgcount, data = dat1, family = binomial())
Yhat2 = predict(lm2, newdata = dat2, type = "response")
plot.roc(dat2$churn, Yhat2, col = "blue", lwd = 2, xaxs = "i", yaxs = "i")
lines.roc(dat2$churn, Yhat, col = "red", lwd = 2)
# auc曲线
auc(dat2$churn, Yhat)
###############################
#总结#
library(gridExtra)
grid.arrange()#因为ggplot()直接将两张图分别输出,而gridExtra中的grid.arrange()将图并列输出
glm()#逻辑回归建模最常用的是广义线性回归语句
predict()
library(pROC)
# 生成ROC曲线
plot.roc()
auc()
#############################

你可能感兴趣的:(R,逻辑回归)