############survival pipeline BY JIN
#load the data
# NOTE(review): hard-coded absolute working directory; the script only runs on
# a machine that has this exact path. Left unchanged because later steps may
# rely on the working directory.
setwd("C:/Users/Administrator/Desktop/mission/survival")
library(readxl)
mydata <- read_excel("mydata.xlsx")
#clean the data
# Dichotomise the laboratory values: 1 = below the cutoff, 2 = at or above it.
# ifelse() creates each column in one step (NA input stays NA), which avoids
# indexing into a column that does not exist yet.
# NOTE(review): assumes none of these derived columns already exists in the
# workbook - TODO confirm.
mydata$afp_new <- ifelse(mydata$before_AFP < 400, 1, 2)
mydata$ALB_new <- ifelse(mydata$ALB < 35, 1, 2)
mydata$ALT_new <- ifelse(mydata$ALT < 40, 1, 2)
mydata$AST_new <- ifelse(mydata$AST < 40, 1, 2)
mydata$TBIL_new <- ifelse(mydata$TBIL < 17.1, 1, 2)
# Study groups by max_size: 1 = max_size <= 5, 2 = max_size > 5.
mydata$group <- ifelse(mydata$max_size <= 5, 1, 2)
# Columns are picked by position, so this depends on the derived columns above
# being appended in exactly this order.
mydata_selected <- mydata[c(1, 2, 23, 27, 24, 9, 10, 12, 25, 26, 11, 28, 14)]
##external draft
# Moved below the data preparation: these two calls need `mydata` to be loaded
# and `group` to be derived, so they failed when they ran at the top of the
# script.
table(mydata$stage, mydata$group)
fisher.test(mydata$stage, mydata$group)
# Descriptive analysis: summary table of the selected variables, stratified by
# `group`.
library(tableone)
# Every selected column except the 12th is summarised (presumably the 12th is
# `group`, which is used as the stratum - verify against the column selection).
vars <- names(mydata_selected)[-12]
# Columns at these positions are summarised as categorical variables.
factor_vars <- names(mydata_selected)[c(2:10, 13)]
tableone <- CreateTableOne(
  vars = vars,
  strata = "group",
  factorVars = factor_vars,
  data = mydata_selected
)
print(tableone, noSpaces = TRUE, quote = TRUE)
# Test statistics for the comparison between the two groups.
# Chi-square test of every categorical variable against `group`; the variable
# name is printed before each result.
invisible(lapply(factor_vars, function(i) {
  print(i)
  print(chisq.test(mydata_selected[[i]], mydata_selected$group))
}))
# Two-sample t-tests (t.test defaults) of age and max_size, group 1 vs group 2.
t.test(
  mydata_selected$age[mydata_selected$group == 1],
  mydata_selected$age[mydata_selected$group == 2]
)
t.test(
  mydata_selected$max_size[mydata_selected$group == 1],
  mydata_selected$max_size[mydata_selected$group == 2]
)
# Kaplan-Meier curves of overall survival by group, with risk table and p-value.
library(survival)
library(survminer)
km_fit <- survfit(Surv(OS, OS_status) ~ group, data = mydata)
ggsurvplot(
  km_fit,
  main = "Survival curve",
  xlab = 'Overall Survival(months)',
  break.time.by = 10,
  risk.table = TRUE,
  pval = TRUE,
  legend.title = 'group',
  # Legend labels are Chinese for "small liver cancer" / "large liver cancer".
  legend.labs = c('小肝癌', '大肝癌')
)
# Log-rank test of the same survival comparison.
survdiff(Surv(OS, OS_status) ~ group, data = mydata)
# Coerce the categorical columns and `group` to numeric for the Cox models
# (the step was labelled "factorize", but the conversion is as.numeric()).
mydata_selected[factor_vars] <- lapply(mydata_selected[factor_vars], as.numeric)
mydata_selected$group <- as.numeric(mydata_selected$group)
#Univar COX
# Univariate Cox screening within group == 2: one single-covariate model per
# column of the selected data; covariates with Pvalue1 < 0.05 are kept for the
# multivariable model.
# which() drops rows whose group is NA, which a logical index would turn into
# all-NA rows.
group2_rows <- which(mydata_selected$group == 2)
time <- mydata$OS[group2_rows]
status <- mydata$OS_status[group2_rows]
colnames(mydata_selected)
dcl2 <- as.data.frame(mydata_selected[group2_rows, ])
# NOTE(review): dcl2 still contains `group`, which is constant (== 2) in this
# subset, so its model is expected to be degenerate; an NA p-value is removed by
# which() in the selection step below. TODO confirm whether it should be
# excluded from screening up front.
n_covariates <- ncol(dcl2)
result0 <- vector("list", n_covariates)
HR0 <- vector("list", n_covariates)
HRCOEF0 <- vector("list", n_covariates)
for (i in seq_len(n_covariates)) {
  # Fit each model once and reuse it for the anova p-value, the hazard ratio
  # with its confidence interval, and the coefficient row.
  fit_i <- coxph(Surv(time, status) ~ dcl2[, i], data = dcl2)
  fit_summary <- summary(fit_i)
  result0[[i]] <- anova(fit_i)$Pr[2]
  HR0[[i]] <- fit_summary$conf.int[1, ]
  HRCOEF0[[i]] <- fit_summary$coefficients[1, ]
}
# data_temp is only used to obtain the covariate names (row names of t(dcl2)).
data_temp <- t(dcl2)
pval0 <- unlist(result0)
gene0 <- rownames(data_temp)
# One row per covariate: first four entries of summary()$conf.int.
HRR0 <- data.frame(do.call(rbind, lapply(HR0, `[`, 1:4)))
names(HRR0) <- c("exp(coef)", "exp(-coef)", "lower .95", "upper .95")
head(HRR0)
# One row per covariate: first five entries of summary()$coefficients
# (coef, exp(coef), se(coef), z, Pr(>|z|)).
HRCOEFF0 <- data.frame(do.call(rbind, lapply(HRCOEF0, `[`, 1:5)))
names(HRCOEFF0) <- c("coef1", "exp(coef)1", "se(coef)1", "z1", "Pvalue1")
# Combined univariate Cox results.
resultsata0 <- data.frame(gene0, pval0, HRR0, HRCOEFF0)
# Selection uses Pvalue1 (the Pr(>|z|) column), not the anova p-value pval0.
selectresult0 <- resultsata0[which(resultsata0$Pvalue1 < 0.05), ]
rownames(selectresult0) <- selectresult0[, 1]
# drop = FALSE keeps a data frame with the original column name even when only
# one covariate passes the screening.
selected0 <- dcl2[, rownames(selectresult0), drop = FALSE]
dat2 <- selected0 #selected data
#multiple COX
# Multivariable Cox model on the covariates kept by the univariate screening,
# followed by stepwise selection with step().
# step() refits reduced models and stops with "number of rows in use has
# changed" when dropping a covariate with missing values changes the sample.
# Restricting every fit to rows complete across all candidates avoids this; the
# full model is unaffected because it would omit those rows anyway.
complete_rows <- if (ncol(dat2) > 0) {
  complete.cases(dat2)
} else {
  rep(TRUE, nrow(dat2))
}
surfit <- coxph(Surv(time, status) ~ ., data = dat2, subset = complete_rows)
summary(surfit)
surfit_f <- step(surfit)
summary(surfit_f)