生存分析至尊套餐

不同的cutoff 划分
随机森林生存分析
ROC阈值生存分析
outlier 去除

### use my own data
suppressMessages(library(TCGAbiolinks))
suppressMessages(library(survminer))
suppressMessages(library(survival))
suppressMessages(library(ggplot2))

fpkm<-read.table("~/ori_data/Merge_matrix.txt",header = T,row.names = 1,check.names = F)
row.names(fpkm)<-unlist(lapply(row.names(fpkm), FUN = function(x) {return(strsplit(x, split = ".",fixed = T)[[1]][1])}))
clinical<-read.table("~/ori_data/Clinical_matrix.txt",header = T,row.names = 1,check.names = F,sep="\t")
#Get normal tumor IDs using TCGAbiolinks
samplesNT <- TCGAquery_SampleTypes(barcode = colnames(fpkm),typesample = c("NT"))
samplesTP <- TCGAquery_SampleTypes(barcode = colnames(fpkm),typesample = c("TP"))
#get the gene expession matrix
gene_list<-read.csv("~/LIHC/results/DEGs/logFC0.5849625/LIHC_Humanized_DEGs_lnc.csv",header = T,row.names = 1)
gene_list<-as.character(gene_list[,1])
gexp <- fpkm[gene_list,samplesTP]
colnames(gexp) <-  sapply(strsplit(colnames(gexp),'-'),function(x) paste0(x[1:3],collapse="-"))
#normal vs tumor
suppressMessages(library(ggstatsplot))
library(ggplot2)
suppressMessages(library(ggpubr))
out_diff<-"~/LIHC/results/Normal_vs_tumor"
gexp_normal <- fpkm[gene_list,samplesNT]
gexp_tumor <- fpkm[gene_list,samplesTP]
gexp_new<-data.frame(t(cbind(gexp_normal,gexp_tumor)))
gexp_new[,"Group"]<-c(rep("Normal",length(samplesNT)),rep("Tumor",length(samplesTP)))
for (i in gene_list){
  x=gexp_normal[i,]
  y=gexp_tumor[i,]
  filename=paste0(out_diff,"/",i,"_LIHC_NASH_Group.pdf")
  pdf(filename,width=3,height=5)
  p <- ggboxplot(gexp_new, x = "Group", y = i,
                 color = "Group", palette =c("#DF3D8C","#0A5EB9"),
                 add = "jitter", shape = "Group")
  my_comparisons <- list( c("Normal", "Tumor" ))
  p<-p + stat_compare_means(comparisons = my_comparisons)
  plot(p)
  dev.off() 
}
#Grade
gene_list<-read.csv("~/LIHC/results/DEGs/logFC0.5849625/LIHC_Humanized_DEGs_lnc.csv",header = T,row.names = 1)
gene_list<-as.character(gene_list[,1])
gexp <- fpkm[gene_list,samplesTP]
colnames(gexp) <-  sapply(strsplit(colnames(gexp),'-'),function(x) paste0(x[1:3],collapse="-"))
com_ID<-intersect(colnames(gexp),row.names(clinical))
clinical<-read.table("~/ori_data/Clinical_matrix.txt",header = T,row.names = 1,check.names = F,sep="\t")
df<-cbind(t(gexp[,com_ID]),clinical[com_ID,c("gender","grade")])
out_grade="~/LIHC/results/Grade"
df<-df[df$grade!="",]
for (i in gene_list){
  filename=paste0(out_grade,"/",i,"_LIHC_Grade.pdf")
  pdf(filename,width=3.5,height=5)
  p <- ggboxplot(df, x = "grade", y = i,
                 color = "grade", palette =c("#727272", "#EBB208", "#0A5EB9","#DF3D8C"),
                 add = "jitter", shape = "grade")
  my_comparisons <- list( c("G1", "G2" ),c("G1","G3"),c("G1","G4"))
  p<-p + stat_compare_means(comparisons = my_comparisons)
  plot(p)
  dev.off() 
}

#Gender


###sur analysis
##prepare files
fpkm<-read.table("~/ori_data/Merge_matrix.txt",header = T,row.names = 1,check.names = F)
row.names(fpkm)<-unlist(lapply(row.names(fpkm), FUN = function(x) {return(strsplit(x, split = ".",fixed = T)[[1]][1])}))
clinical<-read.table("~/ori_data/Clinical_matrix.txt",header = T,row.names = 1,check.names = F,sep="\t")
#Get normal tumor IDs using TCGAbiolinks
samplesNT <- TCGAquery_SampleTypes(barcode = colnames(fpkm),typesample = c("NT"))
samplesTP <- TCGAquery_SampleTypes(barcode = colnames(fpkm),typesample = c("TP"))
#get the gene expession matrix
gene_list<-read.csv("~/LIHC/results/DEGs/logFC0.5849625/LIHC_Humanized_DEGs_lnc.csv",header = T,row.names = 1)
gene_list<-as.character(gene_list[,1])
gexp <- fpkm[gene_list,samplesTP]
colnames(gexp) <-  sapply(strsplit(colnames(gexp),'-'),function(x) paste0(x[1:3],collapse="-"))


##Regular analysis
#Quantile
Quantile<-function(x){
  ifelse(x>quantile(x,.75),"High",ifelse(xmedian(x),"High","Low")
}
clinical<-clinical[clinical$time!=0,]
com_ID<-intersect(row.names(gexp_levels),row.names(clinical))
gexp_levels<-t(apply(gexp[,com_ID],1,Median))
gexp_levels<-t(gexp_levels)

df<-cbind(gexp_levels[com_ID,],clinical[com_ID,c("status","time")])
out_sur="~/LIHC/results/Sur/plot"
for (i in gene_list){
  rt<-df[,c(i,"status","time")]
  rt[,"levels"]<-rt[,1]
  rt[,"time"]=rt[,"time"]/365
  myfit2<-coxph(Surv(time,status=="Dead") ~ levels,data=rt)
  pvalue <- anova(myfit2)$Pr[2]
  coxsummary<-summary(myfit2)
  HR<-coxsummary$coefficients[,2][1]
  filename=paste0(out_sur,"/",i,"_p=",as.character(pvalue ),"_HR=",as.character(HR),"_Median_sur.png")
  myfit1 <-survfit(Surv(time,status=="Dead") ~ levels,data=rt)
  p<-ggsurvplot(myfit1, pval = F, palette = c("#DF3D8C","#0A5EB9"),
                risk.table = TRUE, risk.table.col = "strata",pval.size=5,
                risk.table.height = 0.25#Useful when you have multiple groups
  )
  ggsave(filename,print(p),device="png")
}
#Mean
Mean<-function(x){
  ifelse(x>mean(x),"High","Low")
}
gexp_levels<-t(apply(gexp,1,Mean))
gexp_levels<-t(gexp_levels)
com_ID<-intersect(row.names(gexp_levels),row.names(clinical))
df<-cbind(gexp_levels[com_ID,],clinical[com_ID,c("status","time")])
out_sur="~/LIHC/results/Sur/plot/Mean"
for (i in gene_list){
  rt<-df[,c(i,"status","time")]
  rt[,"levels"]<-rt[,1]
  rt[,"time"]=rt[,"time"]/365
  myfit2<-coxph(Surv(time,status=="Dead") ~ levels,data=rt)
  pvalue <- anova(myfit2)$Pr[2]
  coxsummary<-summary(myfit2)
  HR<-coxsummary$coefficients[,2][1]
  filename=paste0(out_sur,"/",i,"_p=",as.character(pvalue ),"_HR=",as.character(HR),"_Mean_sur.png")
  myfit1 <-survfit(Surv(time,status=="Dead") ~ levels,data=rt)
  p<-ggsurvplot(myfit1, pval = F, palette = c("#DF3D8C","#0A5EB9"),
                risk.table = TRUE, risk.table.col = "strata",pval.size=5,
                risk.table.height = 0.25#Useful when you have multiple groups
  )
  ggsave(filename,print(p),device="png")
}
## Random Forests
library(randomForestSRC)
library(OIsurv)
library(survival)

our_dir="~/LIHC/results/Sur/plot/SRC"
Median<-function(x){
  ifelse(x>median(x),"High","Low")
}

gexp_levels<-apply(na.omit(t(gexp)),2,Median)
com_ID<-intersect(row.names(gexp_levels),row.names(clinical))
data<-cbind(gexp_levels[com_ID,],clinical[com_ID,c("status","time")])
data$"status"<-gsub("Dead",1,data$status)
data$"status"<-gsub("Alive",0,data$status)
write.csv(data,"~/LIHC/results/Sur/plot/SRC/data.csv")
data<-read.csv("~/LIHC/results/Sur/plot/SRC/data.csv",header = T,row.names = 1,check.names = F)
v.obj <- rfsrc(Surv(time, status) ~ ., data = data, ntree = 1000,splitrule = "logrank",method="vhVIMP",forest = T,proximity = T,var.used = "by.tree")
print.rfsrc(v.obj)
plot(v.obj)
plot.survival(v.obj)
pred <- predict(v.obj, data, OOB=TRUE, type="response")
pred$predicted
gene.vs<-var.select(object = v.obj,method="vh")
print(gene.vs)

v.obj<-rfsrc(Surv(time, status) ~ ., data=veteran)
#randomSurvivalForest use this
library(randomSurvivalForest)
com_ID<-intersect(colnames(gexp),row.names(clinical))
data<-cbind(t(gexp[,com_ID]),clinical[com_ID,c("status","time")])
data<-na.omit(data)
data<-data[data$time!=0,]
data$"time"<-data$time
v.obj <- rsf(Surv(time, status) ~ ., data = data, ntree = 2000,na.action = "na.impute",splitrule = "logrank",method="vhVIMP",forest = T,proximity = T,varUsed  = "by.tree")
print.rfsrc(v.obj)

#Roc to find the cutoff
library(survivalROC)
com_id<-intersect(row.names(clinical),colnames(gexp))
data<-cbind(t(gexp[gene_list,com_id]),clinical[com_id,c("status","time")])
data$"status"<-gsub("Dead",1,data$status)
data$"status"<-gsub("Alive",0,data$status)

out_sur="~/LIHC/results/Sur/plot/ROC_cutoff"
for (i in gene_list){
rt<-data[,c(i,"status","time")]
rt[,"time"]=rt[,"time"]/365
rt<-na.omit(rt)
rt<-rt[rt$time<5,]
SROC=survivalROC(Stime=rt$time,#生存时间
                 status=rt$status,#生存状态
                 marker=rt[,i],#需要分析对生存影响的数据
                 predict.time=5,#预期时间,一般是5年
                 method="KM")#生存模型KM
cut.op=SROC$cut.values[which.max(SROC$TP-SROC$FP)]
rt$"levels" <- cut(rt[,i], breaks=c(0, cut.op, Inf), labels=c("Low", "High"))
myfit2<-coxph(Surv(time,status=="1") ~ levels,data=rt)
pvalue <- anova(myfit2)$Pr[2]
coxsummary<-summary(myfit2)
HR<-coxsummary$coefficients[,2][1]
filename=paste0(out_sur,"/",i,"_p=",as.character(pvalue ),"_HR=",as.character(HR),"_ROCcutoff_sur.png")
myfit1 <-survfit(Surv(time,status=="1") ~ levels,data=rt)
p<-ggsurvplot(myfit1, pval = F, palette = c("#DF3D8C","#0A5EB9"),
              risk.table = TRUE, risk.table.col = "strata",pval.size=5,
              risk.table.height = 0.25#Useful when you have multiple groups
)
ggsave(filename,print(p),device="png")
}
#Remove outlier 
library(grid)
library(DMwR)
out_sur="~/LIHC/results/Sur/plot/Remove_outlier"
com_id<-intersect(colnames(gexp),row.names(clinical))
for (i in gene_list){
  rrt<-t(as.matrix(gexp[i,com_id]))
  rnub<-c(1:length(rrt[,1]))
  out <- boxplot.stats(rrt[,1])$out
  outlier<-c(which(rrt[,1] %in% out))
  row_without<-setdiff(rnub,outlier)
  new_rrt<-as.matrix(rrt[row_without,])
  colnames(new_rrt)<-i
  rrt_level<-apply(new_rrt,2,Median)
  rt<-cbind(rrt_level,clinical[row.names(rrt_level),c("status","time")])
  rt[,"levels"]<-rt[,1]
  rt[,"time"]=rt[,"time"]/365
  myfit2<-coxph(Surv(time,status=="Dead") ~ levels,data=rt)
  pvalue <- anova(myfit2)$Pr[2]
  coxsummary<-summary(myfit2)
  HR<-coxsummary$coefficients[,2][1]
  filename=paste0(out_sur,"/",i,"_p=",as.character(pvalue ),"_HR=",as.character(HR),"_Median_no_Outlier_sur.png")
  myfit1 <-survfit(Surv(time,status=="Dead") ~ levels,data=rt)
  p<-ggsurvplot(myfit1, pval = F, palette = c("#DF3D8C","#0A5EB9"),
                risk.table = TRUE, risk.table.col = "strata",pval.size=5,
                risk.table.height = 0.25#Useful when you have multiple groups
  )
  ggsave(filename,print(p),device="png")
}


#low than 5 years
Median<-function(x){
  ifelse(x>median(x),"High","Low")
}
gexp_levels<-t(apply(gexp,1,Median))
gexp_levels<-t(gexp_levels)
com_ID<-intersect(row.names(gexp_levels),row.names(clinical))
df<-cbind(gexp_levels[com_ID,],clinical[com_ID,c("status","time")])
out_sur="~/LIHC/results/Sur/plot"
df[,"time"]=df[,"time"]/365
df<-df[df[,"time"]<5,]
for (i in gene_list){
  rt<-df[,c(i,"status","time")]
  rt[,"levels"]<-rt[,1]
  myfit2<-coxph(Surv(time,status=="Dead") ~ levels,data=rt)
  pvalue <- anova(myfit2)$Pr[2]
  coxsummary<-summary(myfit2)
  HR<-coxsummary$coefficients[,2][1]
  filename=paste0(out_sur,"/",i,"_p=",as.character(pvalue ),"_HR=",as.character(HR),"_Median_sur_5_years.png")
  myfit1 <-survfit(Surv(time,status=="Dead") ~ levels,data=rt)
  p<-ggsurvplot(myfit1, pval = F, palette = c("#DF3D8C","#0A5EB9"),
                risk.table = TRUE, risk.table.col = "strata",pval.size=5,
                risk.table.height = 0.25#Useful when you have multiple groups
  )
  ggsave(filename,print(p),device="png")
}

#low than 2 years

Median<-function(x){
  ifelse(x>median(x),"High","Low")
}
gexp_levels<-t(apply(gexp,1,Median))
gexp_levels<-t(gexp_levels)
com_ID<-intersect(row.names(gexp_levels),row.names(clinical))
df<-cbind(gexp_levels[com_ID,],clinical[com_ID,c("status","time")])
out_sur="~/LIHC/results/Sur/plot"
df[,"time"]=df[,"time"]/365
df<-df[df[,"time"]<2,]
for (i in gene_list){
  rt<-df[,c(i,"status","time")]
  rt[,"levels"]<-rt[,1]
  myfit2<-coxph(Surv(time,status=="Dead") ~ levels,data=rt)
  pvalue <- anova(myfit2)$Pr[2]
  coxsummary<-summary(myfit2)
  HR<-coxsummary$coefficients[,2][1]
  filename=paste0(out_sur,"/",i,"_p=",as.character(pvalue ),"_HR=",as.character(HR),"_Median_sur_2_years.png")
  myfit1 <-survfit(Surv(time,status=="Dead") ~ levels,data=rt)
  p<-ggsurvplot(myfit1, pval = F, palette = c("#DF3D8C","#0A5EB9"),
                risk.table = TRUE, risk.table.col = "strata",pval.size=5,
                risk.table.height = 0.25#Useful when you have multiple groups
  )
  ggsave(filename,print(p),device="png")
}

##Male vs Female use the mdeian 07212019
out_male_sur="~/LIHC/results/Sur/plot/Gender/Male"
out_female_sur="~/LIHC/results/Sur/plot/Gender/Female"

male_id<-row.names(clinical[clinical$gender=="MALE",])
female_id<-row.names(clinical[clinical$gender=="FEMALE",])
male_id<-intersect(male_id,colnames(gexp))
female_id<-intersect(female_id,colnames(gexp))
gexp_male<-gexp[,male_id]
gexp_female<-gexp[,female_id]
Median<-function(x){
  ifelse(x>median(x),"High","Low")
}

gexp_male_levels<-t(apply(gexp_male,1,Median))
gexp_male_levels<-t(gexp_male_levels)
com_male_ID<-intersect(row.names(gexp_male_levels),row.names(clinical))
df_male<-cbind(gexp_male_levels[com_male_ID,],clinical[com_male_ID,c("status","time")])
df_male[,"time"]=df_male[,"time"]/365

gexp_female_levels<-t(apply(gexp_female,1,Median))
gexp_female_levels<-t(gexp_female_levels)
com_female_ID<-intersect(row.names(gexp_female_levels),row.names(clinical))
df_female<-cbind(gexp_female_levels[com_female_ID,],clinical[com_female_ID,c("status","time")])
df_female[,"time"]=df_female[,"time"]/365

for (i in gene_list){
  rt_male<-df_male[,c(i,"status","time")]
  rt_male[,"levels"]<-rt_male[,1]
  myfit2<-coxph(Surv(time,status=="Dead") ~ levels,data=rt_male)
  pvalue <- anova(myfit2)$Pr[2]
  coxsummary<-summary(myfit2)
  HR<-coxsummary$coefficients[,2][1]
  filename=paste0(out_male_sur,"/",i,"_p=",as.character(pvalue ),"_HR=",as.character(HR),"_Median_male.png")
  myfit1 <-survfit(Surv(time,status=="Dead") ~ levels,data=rt_male)
  p<-ggsurvplot(myfit1, pval = F, palette = c("#DF3D8C","#0A5EB9"),
                risk.table = TRUE, risk.table.col = "strata",pval.size=5,
                risk.table.height = 0.25#Useful when you have multiple groups
  )
  ggsave(filename,print(p),device="png")
  rt_female<-df_female[,c(i,"status","time")]
  rt_female[,"levels"]<-rt_female[,1]
  myfit2<-coxph(Surv(time,status=="Dead") ~ levels,data=rt_female)
  pvalue <- anova(myfit2)$Pr[2]
  coxsummary<-summary(myfit2)
  HR<-coxsummary$coefficients[,2][1]
  filename=paste0(out_female_sur,"/",i,"_p=",as.character(pvalue ),"_HR=",as.character(HR),"_Median_female.png")
  myfit1 <-survfit(Surv(time,status=="Dead") ~ levels,data=rt_female)
  g<-ggsurvplot(myfit1, pval = F, palette = c("#DF3D8C","#0A5EB9"),
                risk.table = TRUE, risk.table.col = "strata",pval.size=5,
                risk.table.height = 0.25#Useful when you have multiple groups
  )
  ggsave(filename,print(g),device="png")
}

你可能感兴趣的:(生存分析至尊套餐)