# Diagnostic models: converting to a binary classification model
# Setup ----
# NOTE: `rm(list = ls())` was removed on purpose. It only deletes objects from
# the global environment and silently destroys the caller's workspace; restart
# the R session to get a clean state instead.

# Install only the packages that are not available yet, rather than
# re-installing all of them on every run.
required_pkgs <- c("pROC", "maxstat", "survminer", "survival", "rms")
is_installed <- vapply(
  required_pkgs, requireNamespace, logical(1),
  quietly = TRUE
)
if (!all(is_installed)) {
  install.packages(required_pkgs[!is_installed])
}

# Attach the packages used below.
library(pROC)
library(maxstat)
library(survminer)
library(survival)
library(rms)
# Read data ----
# Keep the raw import in `data_exercise` and work on the copy `data`.
data_exercise <- read.csv("data_exercise.csv")
data <- data_exercise

## 1. Variable transformation based on distribution
# 1.1 Check distributions
# NOTE(review): hist_outlier() is not defined in this part of the file; it is
# presumably a histogram helper provided elsewhere -- confirm it is available.
hist_outlier(data$C1) # normal
hist_outlier(data$C6) # log-normal
hist_outlier(data$C5) # shape unclear; a reflected log is tried below

# 1.2 Try transformations
# Natural log of C6.
# NOTE(review): this yields -Inf / NaN if C6 contains values <= 0 -- confirm
# that C6 is strictly positive.
data$C6_log <- log(data$C6)

# Reflect C5 around its maximum, then take the log: log(max + 1 - x).
# The "+ 1" keeps the argument >= 1, so the log is defined at the maximum.
# na.rm = TRUE prevents a single missing C5 from turning max() -- and with it
# the entire new column -- into NA; rows with missing C5 still stay NA.
c5_max <- max(data$C5, na.rm = TRUE)
data$C5_log_minus <- log(c5_max + 1 - data$C5)

# 1.3 Check distributions after transformation
hist_outlier(data$C6_log)
hist_outlier(data$C5_log_minus)
## 2. Variable transformation based on the relation Y ~ X
# 2.1 Turn a continuous variable into a categorical one using pre-defined
#     cut-offs. Intervals are right-closed (the cut() default):
#       (-1000, 36] -> "low", (36, 38] -> "normal", (38, 1000] -> "high".
#     Values outside (-1000, 1000] become NA.
x_breaks <- c(-1000, 36, 38, 1000)
x_labels <- c("low", "normal", "high")
data$X_group <- cut(data$X, breaks = x_breaks, labels = x_labels)
# 2.2 Use restricted cubic splines (rcs) to explore the relation Y ~ X
# Create a new variable D1 (with a U-shape relation):
# D1 is -C4 where C4 is positive and C3 otherwise (NA where C4 is NA).
data$D1 <- ifelse(data$C4 > 0, -data$C4, data$C3)

# Summarise the variable distributions for rms and register the summary by
# name so the rms fitting and prediction functions can find it.
dd <- datadist(data)
options(datadist = "dd")

# One Cox model per predictor, each with a 3-knot restricted cubic spline;
# the event is Status_death == 1. ref.zero = TRUE shows the curve relative to
# the predictor's reference value.
# plot() on a Predict object returns a lattice object, so it is print()-ed
# explicitly: auto-printing does not happen when the script is run through
# source(), and the figures would silently be skipped.
fit.C3 <- cph(Surv(Time_death, Status_death == 1) ~ rcs(C3, 3), data = data)
print(plot(Predict(fit.C3, C3, ref.zero = TRUE)))

fit.C4 <- cph(Surv(Time_death, Status_death == 1) ~ rcs(C4, 3), data = data)
print(plot(Predict(fit.C4, C4, ref.zero = TRUE)))

fit.C5 <- cph(Surv(Time_death, Status_death == 1) ~ rcs(C5, 3), data = data)
print(plot(Predict(fit.C5, C5, ref.zero = TRUE)))

fit.D1.cox <- cph(Surv(Time_death, Status_death == 1) ~ rcs(D1, 3), data = data)
print(plot(Predict(fit.D1.cox, D1, ref.zero = TRUE)))
# Change the reference value of D1 to 0
# The "Adjust to" row of dd$limits (its second row) holds the reference value;
# address it by name instead of by position.
dd$limits["Adjust to", "D1"] <- 0
options(datadist = "dd")

# Refit after changing the reference value, then redraw the curve.
# The plot objects are print()-ed explicitly so they are also drawn when the
# script is run through source(), where auto-printing does not happen.
fit.D1.cox <- cph(Surv(Time_death, Status_death == 1) ~ rcs(D1, 3), data = data)
print(plot(Predict(fit.D1.cox, D1, ref.zero = TRUE)))

# The same approach also applies to logistic regression.
# NOTE(review): Status_death is modelled as-is here, whereas the Cox models
# use Status_death == 1 as the event -- presumably it is coded 0/1; confirm.
fit.D1.logistic <- lrm(
  Status_death ~ rcs(D1, 3),
  data = data, x = TRUE, y = TRUE
)
print(plot(Predict(fit.D1.logistic, D1, ref.zero = TRUE)))

# ... and to ggplot-based plotting of the same predictions.
print(ggplot(Predict(fit.D1.cox, D1, ref.zero = TRUE)))
# 2.3 Transformation for a U-shaped relation
# Squared distance of D1 from a centre value.
# NOTE(review): the centre -0.1 is presumably the approximate bottom of the
# U-shaped D1 curve read off the spline plot -- confirm.
d1_centre <- -0.1
data$D1_sq <- (data$D1 - d1_centre)^2

# Rebuild the distribution summary so that it includes the new column D1_sq.
# NOTE(review): this recomputes every limit, so the manual D1 reference value
# of 0 set earlier in the script is no longer stored in `dd` -- confirm that
# this is intended.
dd <- datadist(data)
options(datadist = "dd")

# Cox model on the transformed variable. The plot is print()-ed explicitly so
# it is also drawn when the script is run through source().
fit.D1_sq.cox <- cph(
  Surv(Time_death, Status_death == 1) ~ rcs(D1_sq, 3),
  data = data
)
print(plot(Predict(fit.D1_sq.cox, D1_sq, ref.zero = TRUE)))
# 2.4 Categorise a continuous variable based on the shape of the relation
# Redraw the D1 spline curve used to choose the cut-offs. Printed explicitly
# so the figure is also drawn when the script is run through source().
print(plot(Predict(fit.D1.cox, D1, ref.zero = TRUE)))

# Right-closed intervals (the cut() default):
#   (-1000, -0.5] -> "low", (-0.5, 0.5] -> "normal", (0.5, 1000] -> "high".
# Values outside (-1000, 1000] become NA.
data$D1_group <- cut(
  data$D1,
  breaks = c(-1000, -0.5, 0.5, 1000),
  labels = c("low", "normal", "high")
)

# Cox model on the categorised variable, with "normal" as the reference level.
fit.D1_group.cox <- coxph(
  Surv(Time_death, Status_death == 1) ~ relevel(D1_group, ref = "normal"),
  data = data
)
# Printed explicitly: a bare summary() call shows nothing under source().
print(summary(fit.D1_group.cox))