下载数据
TCGA官网下载TCGA-COAD projiect的HTSeq-counts 和clinical的manifest文件。
然后在电脑终端用gdc-client下载数据
得到clinical文件459个,mrna文件521个。
整理临床数据
setwd("D:\\COAD")
library(XML)
result <- xmlParse("./clinical/007b27c5-d36e-4bec-9321-b03b773fd3b8/nationwidechildrens.org_clinical.TCGA-A6-6651.xml")
rootnode <- xmlRoot(result)
rootsize <- xmlSize(rootnode)
print(rootnode[1])
#print(rootnode[2])
xmldataframe <- xmlToDataFrame(rootnode[2])
head(t(xmlToDataFrame(rootnode[2])))
# [,1]
#additional_studies ""
#tumor_tissue_site "Colon"
#histological_type "Colon Adenocarcinoma"
#other_dx "No"
#gender "FEMALE"
#vital_status "Alive"
xmls = dir("clinical/",pattern = "*.xml$",recursive = T)
td = function(x){
result <- xmlParse(file.path("clinical/",x))
rootnode <- xmlRoot(result)
xmldataframe <- xmlToDataFrame(rootnode[2])
return(t(xmldataframe))
}
cl = lapply(xmls,td)
cl_df <- t(do.call(cbind,cl))
cl_df[1:3,1:3]
# additional_studies tumor_tissue_site histological_type
#[1,] "" "Colon" "Colon Adenocarcinoma"
#[2,] "" "Colon" "Colon Adenocarcinoma"
#[3,] "" "Colon" "Colon Mucinous Adenocarcinoma"
clinical = data.frame(cl_df)
clinical[1:4,1:4]
# additional_studies tumor_tissue_site histological_type other_dx
#1 Colon Colon Adenocarcinoma No
#2 Colon Colon Adenocarcinoma No
#3 Colon Colon Mucinous Adenocarcinoma Yes
#4 Colon Colon Adenocarcinoma No
整理表达矩阵
探索数据:任选两个counts文件读取,观察geneid的顺序是否一致
x = read.table("mrna/02734d4d-fc8f-4ef7-ac82-1b4d7184cc5e/28004569-048d-4f8c-99aa-7a8c69a98dcc.htseq.counts.gz")
x2 = read.table("mrna/0826add5-571b-41be-b186-936959fc9d79/dfeb2d54-28da-4521-b488-bdac246bb59b.htseq.counts.gz")
identical(x$V1,x2$V1)
#[1] TRUE
geneid顺序是一致的,可以直接cbind,不会导致顺序错乱。
批量读取所有的counts.gz文件。
count_files = dir("mrna/",pattern = "*.htseq.counts.gz$",recursive = T)
ex = function(x){
result <- read.table(file.path("mrna/",x),row.names = 1,sep = "\t")
return(result)
}
head(ex("02734d4d-fc8f-4ef7-ac82-1b4d7184cc5e/28004569-048d-4f8c-99aa-7a8c69a98dcc.htseq.counts.gz"))
# V2
#ENSG00000000003.13 8357
#ENSG00000000005.5 37
#ENSG00000000419.11 2278
#ENSG00000000457.12 835
#ENSG00000000460.15 697
#ENSG00000000938.11 687
exp = lapply(count_files,ex)
exp <- do.call(cbind,exp)
dim(exp)
#60488 521 有6万多行,521个样本
exp[1:4,1:4]
# V2 V2.1 V2.2 V2.3
#ENSG00000000003.13 7501 5080 4721 8357
#ENSG00000000005.5 29 6 58 37
#ENSG00000000419.11 2108 1581 2203 2278
#ENSG00000000457.12 411 796 1001 835
这个数据框没有列名,需从TCGA官网上下载JSON文件(mrn对应的json文件!!!),将文件名与样本配对
#加入列名
meta <- jsonlite::fromJSON("metadata.cart.2020-08-09.json")
colnames(meta)
#[1] "md5sum" "associated_entities" "file_size" "annotations" "access" "data_format" "data_category"
# [8] "data_type" "file_id" "file_name" "submitter_id" "state"
ids <- meta$associated_entities;class(ids)
#[1] "list"
ids[[1]]
# entity_id entity_submitter_id entity_type case_id
#1 381cf7f3-cd7b-4660-8175-ff8062f7bf78 TCGA-AA-A00L case 381cf7f3-cd7b-4660-8175-ff8062f7bf78
class(ids[[1]][,2]) #并不是每一个数据都是第2列,需要自己手动检查
#[1] "character"
meta$associated_entities是个列表,这个列表里包含数据框,数据框的第一列内容就是tcga样本id。
ID = sapply(ids,function(x){x[,2]})#同样与上面对应
file2id = data.frame(file_name = meta$file_name,
ID = ID)
文件名与TCGA样本ID的对应关系已经得到,接下来是将其添加到表达矩阵中,成为行名。需要找到读取文件的顺序,一 一对应修改。
head(file2id$file_name)
#[1] "a8e396d2-9528-4d8d-8e22-f56be5397cc1.htseq.counts.gz" "8f9232c6-4ae9-4e3f-af06-335f107b09bc.htseq.counts.gz"
#[3] "dd9a3ab6-6c93-4b47-9b4e-62e56767054b.htseq.counts.gz" "31cf7af3-5576-4201-912d-5948fa2c0296.htseq.counts.gz"
#[5] "06c8dfda-b057-47cb-b435-9c2739694f6a.htseq.counts.gz" "d753b828-373f-4ebc-a300-598e0c6e79ba.htseq.counts.gz"
head(count_files)
#[1] "0096f6b3-94f5-49d9-aa43-66d9ab6b2b5c/17510ea6-0c97-4bd0-94a4-63dd76e5b7c7.htseq.counts.gz"
#[2] "01f5a228-a2cf-444f-8102-a9d4a2d9a27d/7324b2d6-dd30-4f99-a101-7c3d497bbcb8.htseq.counts.gz"
#[3] "02566c4e-e3d2-43eb-aca3-754c1bd5bd5f/1e50d68e-ac09-42ad-9678-bc0598328037.htseq.counts.gz"
#[4] "02734d4d-fc8f-4ef7-ac82-1b4d7184cc5e/28004569-048d-4f8c-99aa-7a8c69a98dcc.htseq.counts.gz"
#[5] "029ffdfa-c2ff-439e-be14-a7b68342fbdb/1a587b92-0a11-47f0-83c2-77a2b6de1088.htseq.counts.gz"
#[6] "02d4cebf-e7f6-4c13-aca1-af16fd6d69cb/2f6070c7-99d0-49bb-83ca-33c032650cde.htseq.counts.gz"
count_files2 = stringr::str_split(count_files,"/",simplify = T)[,2]
count_files2[1] %in% file2id$file_name
#[1] TRUE
count_files2的顺序就是列名的顺序,根据它来调整file2id的顺序。match函数需要了解一下(我懂意思,但我写不出来。)
file2id = file2id[match(count_files2,file2id$file_name),]
colnames(exp) = file2id$ID
exp[1:4,1:4]
# TCGA-AA-3979-01A-01R-1022-07 TCGA-AA-3655-01A-02R-1723-07 TCGA-D5-6535-01A-11R-1723-07 TCGA-CK-4950-01A-01R-1723-07
#ENSG00000000003.13 7501 5080 4721 8357
#ENSG00000000005.5 29 6 58 37
#ENSG00000000419.11 2108 1581 2203 2278
#ENSG00000000457.12 411 796 1001 835
过滤低表达样本
dim(exp)
#[1] 60488 521
exp = exp[apply(exp, 1, function(x) sum(x > 1) > 41), ]
dim(exp)
#[1] 35267 521
exp[1:4,1:4]
# TCGA-AA-3979-01A-01R-1022-07 TCGA-AA-3655-01A-02R-1723-07 TCGA-D5-6535-01A-11R-1723-07 TCGA-CK-4950-01A-01R-1723-07
#ENSG00000000003.13 7501 5080 4721 8357
#ENSG00000000005.5 29 6 58 37
#ENSG00000000419.11 2108 1581 2203 2278
#ENSG00000000457.12 411 796 1001 835
分组,根据样本ID的第14-15位,给样本分组(tumor和normal),<10是肿瘤,其他是癌旁组织
table(stringr::str_sub(colnames(exp),14,15))
# 01 02 06 11
#478 1 1 41
group_list = ifelse(as.numeric(substr(colnames(exp),14,15)) < 10,'tumor','normal')
group_list = factor(group_list,levels = c("normal","tumor"))
table(group_list)
#group_list
#normal tumor
# 41 480
save(exp,clinical,group_list,cancer_type,file = paste0(cancer_type,"gdc.Rdata"))
接下来从TCGA得到的表达矩阵中分别提取出mRNA和lncRNA矩阵。
大概路线:1.找到TCGA数据对应的参考基因组版本,2.下载该版本的参考基因组文件,找到mRNA和lncRNA对应的ensembl id 3.在表达矩阵中筛选。
下载gencode.v22.annotation.gtf.gz后解压
#step1:读取并探索gtf文件----
options(stringsAsFactors = F)
if(!file.exists("anno.Rdata"))
BiocManager::install("rtracklayer")
library(rtracklayer)
x = rtracklayer::import("gencode.v22.annotation.gtf")
class(x)
#[1] "GRanges"
#attr(,"package")
#[1] "GenomicRanges
x2 = as.data.frame(x);dim(x2)
#[1] 2563671 27
colnames(x2)
# [1] "seqnames" "start" "end" "width" "strand"
# [6] "source" "type" "score" "phase" "gene_id"
#[11] "gene_type" "gene_status" "gene_name" "level" "havana_gene"
#[16] "transcript_id" "transcript_type" "transcript_status" "transcript_name" "tag"
#[21] "transcript_support_level" "havana_transcript" "exon_number" "exon_id" "ont" #[26] "protein_id" "ccdsid"
tj = as.data.frame(table(x2$type));tj
# Var1 Freq
#1 gene 60483
#2 transcript 198442
#3 exon 1172082
#4 CDS 699443
#5 start_codon 82228
#6 stop_codon 74337
#7 UTR 276542
#8 Selenocysteine 114
tj2 = as.data.frame(table(x2$gene_type));tj2
#step2:先筛选出gene对应的行
nrow(x2);x2 = x2[x2$type=="gene",];nrow(x2)
#[1] 2563671
#[1] 60483
tj3 = as.data.frame(table(x2$gene_type));tj3
#step3:提取lnc和mRNA及其对应的ensambelid和symbol
lnc_bype = c("3prime_overlapping_ncRNA", "antisense", "bidirectional_promoter_lncRNA", "lincRNA", "macro_lncRNA", "non_coding", "processed_transcript", "sense_intronic" , "sense_overlapping")
table(x2$gene_type %in% lnc_bype)
#FALSE TRUE
#45657 14826
table(x2$gene_type == "protein_coding")
#FALSE TRUE
#40669 19814
lnc_anno = x2[x2$gene_type %in% lnc_bype,c("gene_name","gene_id","gene_type")]
mRNA_anno = x2[x2$gene_type == "protein_coding",c("gene_name","gene_id","gene_type")]
save(lnc_anno,mRNA_anno,file = "anno.Rdata")
load("anno.Rdata")
#step4:表达矩阵拆分和注释
load("TCGA-COADgdc.Rdata")
table(rownames(exp) %in% mRNA_anno$gene_id)
#FALSE TRUE
#17263 18004
mRNA_exp = exp[rownames(exp) %in% mRNA_anno$gene_id,]
tmp = data.frame(gene_id = rownames(exp))
x = dplyr::inner_join(tmp,mRNA_anno,by = "gene_id")
#inner_join不改变顺序,可以确认一下
identical(tmp$gene_id,rownames(exp))
#[1] TRUE
table(!duplicated(x$gene_name))
#FALSE TRUE
# 36 17968
mRNA_exp = mRNA_exp[!duplicated(x$gene_name),]
x = x[!duplicated(x$gene_name),]
rownames(mRNA_exp) = x$gene_name
mRNA_exp[1:4,1:4] #这里的mRNA是所有样本521个样本的mRNA
# TCGA-AA-3979-01A-01R-1022-07 TCGA-AA-3655-01A-02R-1723-07 TCGA-D5-6535-01A-11R-1723-07 TCGA-CK-4950-01A-01R-1723-07
#TSPAN6 7501 5080 4721 8357
#TNMD 29 6 58 37
#DPM1 2108 1581 2203 2278
#SCYL3 411 796 1001 835
mRNA_exp = na.omit(mRNA_exp)
#同样办法拆出lncRNA:也是所有样本的(521个样本)
lnc_exp = exp[rownames(exp) %in% lnc_anno$gene_id,]
tmp = data.frame(gene_id = rownames(exp))
x = dplyr::inner_join(tmp,lnc_anno,by = "gene_id")
identical(tmp$gene_id,rownames(exp))
#[1] TRUE
table(!duplicated(x$gene_name))
#FALSE TRUE
# 4 8714
lnc_exp = lnc_exp[!duplicated(x$gene_name),]
x = x[!duplicated(x$gene_name),]
rownames(lnc_exp) = x$gene_name
lnc_exp[1:4,1:4]
# TCGA-AA-3979-01A-01R-1022-07 TCGA-AA-3655-01A-02R-1723-07 TCGA-D5-6535-01A-11R-1723-07 TCGA-CK-4950-01A-01R-1723-07
#LINC01587 0 0 1 0
#AC000111.6 1 18 9 2
#XXbac-B461K10.4 3 3 9 2
#IGF2-AS 1 2 5 3
lnc_exp = na.omit(lnc_exp)
配对样本的lnc、mRNA 表达矩阵,供差异分析用
挑选样本,是选列,与行(基因)无关
1.选出与normal样本匹配的tumor样本,合并为新的表达矩阵
exp2 = rbind(mRNA_exp,lnc_exp) #mRNA和lncRNA合并,不改变顺序
identical(rownames(mRNA_exp),head(rownames(exp2),nrow(mRNA_exp)))
#[1] TRUE
table(stringr::str_sub(colnames(exp2),14,15)) #正常样本个数?41个大于10的那就是41个正常样本
# 01 02 06 11
#478 1 1 41
library(stringr)
exp2_nor = exp2[,str_sub(colnames(exp2),14,15)==11]
exp2_tum = exp2[,str_sub(colnames(exp2),14,15)!=11]
patient = str_sub(colnames(exp2_nor),1,12) #有配对样本的病人id
table(str_sub(colnames(exp2_tum),1,12) %in% str_sub(patient))
#FALSE TRUE
#434 46
# 看看肿瘤样本有重复吗:
exp2_tum = exp2_tum[,str_sub(colnames(exp2_tum),1,12) %in% str_sub(patient)]
exp2_tum = exp2_tum[,str_sort(colnames(exp2_tum))] #str_sort是排序
exp2_tum = exp2_tum[,!duplicated(str_sub(colnames(exp2_tum),1,12) )] #排序去重,这样样本有01A和01B可选时,就会留下01A。
#让正常样本顺序与肿瘤样本顺序一致,这样才方便写配对向量
tmp = match(str_sub(colnames(exp2_tum),1,12),str_sub(colnames(exp2_nor),1,12))
exp2_nor = exp2_nor[,tmp]
identical(str_sub(colnames(exp2_tum),1,12),str_sub(colnames(exp2_nor),1,12))
#TRUE
exp2_small = cbind(exp2_nor,exp2_tum)
拆分回lnc和mRNA
mRNA_exp_small = head(exp2_small,nrow(mRNA_exp))
lnc_exp_small = tail(exp2_small,nrow(lnc_exp))
总共得到了6个表达矩阵
dim(exp) #原始ensambel矩阵 35267 521
dim(exp2) #lncRNA和mRNA矩阵合并得到的symbol矩阵 26682 521
dim(lnc_exp) #lncRNA 8714 521
dim(mRNA_exp) #mRNA 17968 521
dim(lnc_exp_small) #配对的lncRNA矩阵 8714 82
dim(mRNA_exp_small)#配对的mRNA矩阵 17968 82
save(lnc_exp,mRNA_exp,file = paste0(cancer_type,"lmexp.Rdata"))
save(lnc_exp_small,mRNA_exp_small,file = paste0(cancer_type,"lmexp_small.Rdata"))
终于来到差异分析了!
if(!require(stringr))install.packages('stringr')
if(!require(ggplotify))install.packages("ggplotify")
if(!require(patchwork))install.packages("patchwork")
if(!require(cowplot))install.packages("cowplot")
if(!require(edgeR))BiocManager::install('edgeR')
用edgeR 进行lnc_RNA差异分析
rm(list = ls())
load("TCGA-COADgdc.Rdata")
load("TCGA-COADlmexp_small.Rdata")
group_list = rep(c("normal","tumor"),each = ncol(lnc_exp_small)/2)
pairinfo = rep(1:(ncol(lnc_exp_small)/2),times = 2)
table(group_list)
#group_list
#normal tumor
# 41 41
library(edgeR)
dge <- DGEList(counts=lnc_exp_small,group=group_list)
dge$samples$lib.size <- colSums(dge$counts)
dge <- calcNormFactors(dge)
design <- model.matrix(~pairinfo+group_list)
dge <- estimateDisp(dge,design)
fit <- glmQLFit(dge, design)
fit2 <- glmQLFTest(fit)
result = topTags(fit2)
DEG=topTags(fit2, n=nrow(exp))
DEG=as.data.frame(DEG)
#logFC_cutoff <- with(DEG,mean(abs(logFC)) + 2*sd(abs(logFC)) )
logFC_cutoff <- 1.5
DEG$change = as.factor(
ifelse(DEG$PValue < 0.05 & abs(DEG$logFC) > logFC_cutoff,
ifelse(DEG$logFC > logFC_cutoff ,'UP','DOWN'),'NOT')
)
head(DEG)
# logFC logCPM F PValue FDR change
#ELFN1-AS1 5.176298 9.166588 356.4789 3.946428e-33 2.711619e-29 UP
#RP11-474D1.3 9.110774 10.067577 352.0110 6.223591e-33 2.711619e-29 UP
#RP11-138J23.1 7.056960 5.148157 334.7920 3.764552e-32 1.093477e-28 UP
#MAFG-AS1 2.853802 8.371429 320.2209 1.829845e-31 3.986317e-28 UP
#RP11-474D1.4 7.542251 5.577248 271.9532 5.318818e-29 9.269636e-26 UP
#PVT1 2.464724 9.594802 267.0938 9.814251e-29 1.425356e-25 UP
table(DEG$change)
#DOWN NOT UP
# 685 7022 1007
lncRNA_DEG <- DEG
cg = rownames(DEG)[DEG$change!="NOT"]
cgexp = lnc_exp_small[cg,]
table(head(DEG[cg,"change"],580))
#DOWN NOT UP
# 265 0 315
if(!require(devtools))install.packages("devtools")
library("devtools")
if(!require(tinyplanet))devtools::install_github("xjsun1221/tinyplanet",upgrade = F)#小洁老师的R包,需从github上面安装
library("tinyplanet")
library("ggplot2")
lnc_exp_small[1:4,1:4]
# TCGA-A6-2671-11A-01R-A32Z-07 TCGA-A6-2675-11A-01R-1723-07 TCGA-A6-2678-11A-01R-A32Z-07 TCGA-A6-2679-11A-01R-A32Z-07
#LINC01587 0 1 3 0
#AC000111.6 1 2 4 1
#XXbac-B461K10.4 1 20 3 4
#IGF2-AS 0 2 2 0
#PCA
dat = log(lnc_exp_small+1)
pca.plot = draw_pca(dat,group_list);pca.plot
#热图火山图
x = lnc_exp_small[cg,]
library(tinyplanet)
h = draw_heatmap(x,group_list,scale_before = T)
v = draw_volcano(lncRNA_DEG,logFC_cutoff = 1.5,pkg = 2)
save(cg,file = paste0(cancer_type,"lnccg.Rdata"))
接下来是mRNA的差异分析
rm(list = ls())
load("TCGA-COADgdc.Rdata")
load("TCGA-COADlmexp_small.Rdata")
group_list = rep(c("normal","tumor"),each = ncol(mRNA_exp_small)/2)
pairinfo = rep(1:41,times = 2)
table(group_list)
#group_list
#normal tumor
# 41 41
library(edgeR)
dge <- DGEList(counts=mRNA_exp_small,group=group_list)
dge$samples$lib.size <- colSums(dge$counts)
dge <- calcNormFactors(dge)
design <- model.matrix(~pairinfo+group_list)
dge <- estimateDisp(dge,design)
fit <- glmQLFit(dge, design)
fit2 <- glmQLFTest(fit)
result = topTags(fit2)
DEG=topTags(fit2, n=nrow(exp))
DEG=as.data.frame(DEG)
#logFC_cutoff <- with(DEG,mean(abs(logFC)) + 2*sd(abs(logFC)) )
logFC_cutoff <- 1.5
DEG$change = as.factor(
ifelse(DEG$PValue < 0.05 & abs(DEG$logFC) > logFC_cutoff,
ifelse(DEG$logFC > logFC_cutoff ,'UP','DOWN'),'NOT')
)
head(DEG)
# logFC logCPM F PValue FDR change
#AJUBA 3.033725 3.826374 615.1156 3.157165e-40 3.984587e-36 UP
#KRT80 6.801211 4.460591 609.4493 4.435203e-40 3.984587e-36 UP
#ETV4 5.383569 5.963017 560.2076 9.619414e-39 5.761388e-35 UP
#CEMIP 5.292491 6.300332 546.5535 2.354213e-38 1.057512e-34 UP
#ESM1 5.521886 1.697701 478.7325 2.734995e-36 9.828479e-33 UP
#CLDN1 4.649788 5.536313 450.9748 2.265810e-35 6.785344e-32 UP
table(DEG$change)
# DOWN NOT UP
# 1611 14826 1531
lncRNA_DEG <- DEG
cg = rownames(DEG)[DEG$change!="NOT"]
cgexp = mRNA_exp_small[cg,]
table(head(DEG[cg,"change"],580))
#DOWN NOT UP
#272 0 308
library("ggplot2")
library("tinyplanet")
mRNA_exp_small[1:4,1:4]
# TCGA-A6-2671-11A-01R-A32Z-07 TCGA-A6-2675-11A-01R-1723-07 TCGA-A6-2678-11A-01R-A32Z-07 TCGA-A6-2679-11A-01R-A32Z-07
#TSPAN6 5884 2731 7881 4593
#TNMD 28 15 61 52
#DPM1 1355 912 1453 1148
#SCYL3 895 527 1141 825
#PCA
dat = log(mRNA_exp_small+1)
pca.plot = draw_pca(dat,group_list);pca.plot
#热图火山图
x = mRNA_exp_small[cg,]
library(tinyplanet)
h = draw_heatmap(x,group_list,scale_before = T)
v = draw_volcano(lncRNA_DEG,logFC_cutoff = 1.5,pkg = 2)
save(cg,file = paste0(cancer_type,"mRNAcg.Rdata"))
火山图没画出来,所以我接着mrna的尝试了一下
#版本一
this_tile <- paste0('Cutoff for logFC is ',round(logFC_cutoff,3),
'\nThe number of up gene is ',nrow(lncRNA_DEG[lncRNA_DEG$change =='UP',]) ,
'\nThe number of down gene is ',nrow(lncRNA_DEG[lncRNA_DEG$change =='DOWN',])
)
g = ggplot(data=lncRNA_DEG,
aes(x=logFC, y=-log10(PValue),
color=change)) +
geom_point(alpha=0.4, size=1.75) +
theme_set(theme_set(theme_bw(base_size=20)))+
xlab("log2 fold change") + ylab("-log10 p-value") +
ggtitle( this_tile ) + theme(plot.title = element_text(size=15,hjust = 0.5))+
scale_colour_manual(values = c('blue','black','red')) ## corresponding to the levels(res$change)
print(g)
ggsave(g,filename = paste0(3,'_volcano.png'))
dev.off()
版本二
#重点关注的基因
lncRNA_DEG$sign <- ifelse(lncRNA_DEG$PValue < 0.001 & abs(lncRNA_DEG$logFC) > 8,#可以改
rownames(lncRNA_DEG),
NA)
table(lncRNA_DEG$sign)
ggplot(data= lncRNA_DEG, aes(x = logFC, y = -log10(PValue), color = change)) +
geom_point(alpha=0.8, size = 1) +
theme_bw(base_size = 15) +
theme(plot.title=element_text(hjust=0.5), # 标题居中
panel.grid.minor = element_blank(),
panel.grid.major = element_blank()) + # 网格线设置为空白
geom_hline(yintercept=2 ,linetype=4) +
geom_vline(xintercept=c(-2,2) ,linetype=4 ) +
scale_color_manual(name = "",
values = c("red", "green", "black"),
limits = c("UP", "DOWN", "NOT")) +
geom_label_repel(aes(label=sign), # 防止标签过多重叠
fontface="bold",
color="grey50",
box.padding=unit(0.35, "lines"), # 文本框周边填充
point.padding=unit(0.5, "lines"), # 点周边填充
segment.colour = "grey50", # 连接点与标签的线段的颜色
force = T) +
labs(title = 'COAD DEG volcano')
#若不想标出基因就直接这样
ggplot(data= lncRNA_DEG, aes(x = logFC, y = -log10(PValue), color = change)) +
geom_point(alpha=0.8, size = 1) +
theme_bw(base_size = 15) +
theme(plot.title=element_text(hjust=0.5), # 标题居中
panel.grid.minor = element_blank(),
panel.grid.major = element_blank()) + # 网格线设置为空白
geom_hline(yintercept=2 ,linetype=4) +
geom_vline(xintercept=c(-2,2) ,linetype=4 ) +
scale_color_manual(name = "",
values = c("red", "green", "black"),
limits = c("UP", "DOWN", "NOT")) +
labs(title = 'COAD DEG volcano')
轮到生存分析了
生存分析只需要tumor数据,将normal去掉,新表达矩阵数据命名为exprSet;
clinical信息需要进一步整理,成为生存分析需要的格式,新临床信息数据命名为meta。
由于不同癌症的临床信息表格组织形式不同,这里的代码需要根据实际情况修改。
rm(list=ls())
options(stringsAsFactors = F)
load("TCGA-COADgdc.Rdata")
load("TCGA-COADlmexp.Rdata")
load("TCGA-COADlnccg.Rdata")
library(stringr)
#clinical通常有几十列,选出其中有用的几列。
tmp = data.frame(colnames(clinical))
clinical = clinical[,c(
'bcr_patient_barcode',
'vital_status',
'days_to_death',
'days_to_last_followup',
'race_list',
'age_at_initial_pathologic_diagnosis',
'gender' ,
'stage_event'
)]
dim(clinical)
#[1] 459 8
#rownames(clinical) <- NULL
rownames(clinical) <- clinical$bcr_patient_barcode
clinical[1:4,1:4]
# bcr_patient_barcode vital_status days_to_death days_to_last_followup
#TCGA-AA-3518 TCGA-AA-3518 Alive 31
#TCGA-A6-6651 TCGA-A6-6651 Alive 249
#TCGA-DM-A285 TCGA-DM-A285 Dead 179
#TCGA-AA-3495 TCGA-AA-3495 Alive 31
meta = clinical[clinical$days_to_death != 0,]
#简化meta的列名
colnames(meta)=c('ID','event','death','last_followup','race','age','gender','stage')
#表达矩阵处理
exprSet=lnc_exp[cg,group_list=='tumor']
table(str_sub(colnames(exprSet),1,12) %in% meta$ID)
#FALSE TRUE
# 7 473
exprSet = exprSet[,str_sub(colnames(exprSet),1,12) %in% meta$ID]
exprSet = exprSet[,str_sort(colnames(exprSet))]
exprSet = exprSet[,!duplicated(str_sub(colnames(exprSet),1,12) )]
#调整meta的ID顺序与exprSet列名一致
meta=meta[match(str_sub(colnames(exprSet),1,12),meta$ID),]
all(meta$ID==str_sub(colnames(exprSet),1,12))
#[1] TRUE
#整理生存分析的输入数据----
#1.1由随访时间和死亡时间计算生存时间(月)
is.empty.chr = function(x){
ifelse(stringr::str_length(x)==0,T,F)
}
is.empty.chr(meta[2,3])
meta[,3][is.empty.chr(meta[,3])]=0
meta[,4][is.empty.chr(meta[,4])]=0
meta$time=(as.numeric(meta[,3])+as.numeric(meta[,4]))/30
#1.2 根据生死定义event,活着是0,死的是1
meta$event=ifelse(meta$event=='Alive',0,1)
#1.3 年龄和年龄分组
meta$age=as.integer(meta$age)
meta$age_group=ifelse(meta$age>median(meta$age),'older','younger')
#1.4 stage
library(stringr)
meta$stage
tmp = str_split(meta$stage,' ',simplify = T)[,2]
str_count(tmp,"T")
str_locate(tmp,"T")[,1]
tmp = str_sub(tmp,1,str_locate(tmp,"T")[,1]-1)
table(tmp)
table(is.na(tmp))
#FALSE TRUE
# 438 11
meta$stage = tmp
#1.5 race
table(meta$race)
save(meta,exprSet,group_list,cancer_type,file = paste0(cancer_type,"sur_model.Rdata"))
下面就是生存分析的可视化:logrank批量生存分析和cox批量生存分析
1.logrank批量生存分析
对每个基因(lncRNA)求了p值,结果结果是10个基因的p<0.01,72个基因的p<0.05。
logrankfile = paste0(cancer_type,"log_rank_p.Rdata")
library(survival)
library(survminer)
if(!file.exists(logrankfile)){
mySurv=with(meta,Surv(time, event))
log_rank_p <- apply(exprSet , 1 , function(gene){
# gene=exprSet[1,]
meta$group=ifelse(gene>median(gene),'high','low')
data.survdiff=survdiff(mySurv~group,data=meta)
p.val = 1 - pchisq(data.survdiff$chisq, length(data.survdiff$n) - 1)
return(p.val)
})
log_rank_p=sort(log_rank_p)
save(log_rank_p,file = logrankfile)
}
load(logrankfile)
table(log_rank_p<0.01)
#FALSE TRUE
# 1682 10
table(log_rank_p<0.05)
#FALSE TRUE
# 1620 72
2.cox批量生存分析
coxfile = paste0(cancer_type,"cox.Rdata")
if(!file.exists(coxfile)){
mySurv=with(meta,Surv(time, event))
cox_results <-apply(exprSet , 1 , function(gene){
# gene= exprSet[1,]
group=ifelse(gene>median(gene),'high','low')
survival_dat <- data.frame(group=group,stage=meta$stage,age=meta$age,
gender=meta$gender,
stringsAsFactors = F)
m=coxph(mySurv ~ gender + age + stage + group, data = survival_dat)
beta <- coef(m)
se <- sqrt(diag(vcov(m)))
HR <- exp(beta)
HRse <- HR * se
#summary(m)
tmp <- round(cbind(coef = beta, se = se, z = beta/se, p = 1 - pchisq((beta/se)^2, 1),
HR = HR, HRse = HRse,
HRz = (HR - 1) / HRse, HRp = 1 - pchisq(((HR - 1)/HRse)^2, 1),
HRCILL = exp(beta - qnorm(.975, 0, 1) * se),
HRCIUL = exp(beta + qnorm(.975, 0, 1) * se)), 3)
return(tmp['grouplow',])
})
cox_results=t(cox_results)
save(cox_results,file = coxfile)
}
load(coxfile)
table(cox_results[,4]<0.01)
#FALSE TRUE
#1675 17
table(cox_results[,4]<0.05)
#FALSE TRUE
# 1611 81
lr = names(log_rank_p)[log_rank_p<0.05]
cox = rownames(cox_results)[cox_results[,4]<0.05]
length(intersect(lr,cox))
save(cox,lr,file = paste0(cancer_type,"cox_lr.Rdata"))
结果:17个基因的p<0.01,81个基因的p<0.05。
Lasso回归
1.载入数据
load("TCGA-COADsur_model.Rdata")
#load("TCGA-CHOLsur_model.Rdata")
load("TCGA-COADcox_lr.Rdata")
exprSet = exprSet[cox,]
2.构建lasso回归模型
lasso回归的输入数据是表达矩阵(仅含tumor样本)和对应的生死。
x=t(log2(exprSet+1))
y=meta$event
install.packages("glmnet")
library("glmnet")
model_lasso <- glmnet(x, y,nlambda=10, alpha=1)
print(model_lasso)
#: glmnet(x = x, y = y, alpha = 1, nlambda = 10)
# Df %Dev Lambda
#1 0 0.00 0.073980
#2 18 12.22 0.026590
#3 39 25.53 0.009555
#4 61 30.80 0.003434
#5 76 32.15 0.001234
#6 81 32.42 0.000444
#7 81 32.45 0.000159
#8 81 32.46 0.000057
#9 81 32.46 0.000021
#10 81 32.46 0.000007
这里是举例子,所以只计算了10个λ值,输出结果三列的含义如下:
-
Df
是自由度 -
%Dev
代表了由模型解释的残差的比例,对于线性模型来说就是模型拟合的R^2(R-squred)。它在0和1之间,越接近1说明模型的表现越好,如果是0,说明模型的预测结果还不如直接把因变量的均值作为预测值来的有效。 -
Lambda
是构建模型的重要参数。
解释的残差百分比越高越好,但是构建模型使用的基因的数量也不能太多,需要取一个折中值。
2.1挑选合适的λ值
计算1000个,画图,筛选表现最好的λ值
set.seed(13098)
cv_fit <- cv.glmnet(x=x, y=y, nlambda = 1000,alpha = 1)
plot(cv_fit)
纵坐标市MSE(均方误差),两条虚线分别指示了两个特殊的λ值,一个是lambda.min,一个是lambda.1se,这两个值之间的lambda都认为是合适的。lambda.1se构建的模型最简单,即使用的基因数量少,而lambda.min则准确率更高一点,使用的基因数量更多一点。
2.2 用这两个λ值重新建模
model_lasso_min <- glmnet(x=x, y=y, alpha = 1, lambda=cv_fit$lambda.min)
model_lasso_1se <- glmnet(x=x, y=y, alpha = 1, lambda=cv_fit$lambda.1se)
这两个值体现在参数lambda上。有了模型,可以将筛选的基因挑出来了。所有基因存放于模型的子集beta中,用到的基因有一个s0值,没用的基因只记录了“.”,所以可以用下面代码挑出用到的基因。
head(model_lasso_min$beta)
#6 x 1 sparse Matrix of class "dgCMatrix"
# s0
#RP11-474D1.3 .
#RP5-884M6.1 0.010229910
#RP11-167H9.4 .
#AC007182.6 0.004850337
#RP11-278H7.4 .
#RP11-109M17.2 .
choose_gene_min=rownames(model_lasso_min$beta)[as.numeric(model_lasso_min$beta)!=0]
choose_gene_1se=rownames(model_lasso_1se$beta)[as.numeric(model_lasso_1se$beta)!=0]
length(choose_gene_min)
#[1] 34
length(choose_gene_1se)
#[1] 1
load("TCGA-COADcox_lr.Rdata")
for_cox = intersect(choose_gene_min,lr)
save(for_cox,choose_gene_min,file = "for_cox.Rdata")
3.模型预测和评估
3.1自己预测自己
newx参数是预测对象。输出结果lasso.prob是一个矩阵,第一列是min的预测结果,第二列是1se的预测结果,预测结果是概率,或者说百分比,不是绝对的0和1。
将每个样本的生死和预测结果放在一起,直接cbind即可。
lasso.prob <- predict(cv_fit, newx=x , s=c(cv_fit$lambda.min,cv_fit$lambda.1se) )
re=cbind(y ,lasso.prob)
head(re)
# y 1 2
#TCGA-3L-AA1B-01A-11R-A37K-07 0 0.10521478 0.1158129
#TCGA-4N-A93T-01A-11R-A37K-07 0 0.38231417 0.1158129
#TCGA-4T-AA8H-01A-11R-A41B-07 0 0.09498643 0.1158129
#TCGA-5M-AAT4-01A-11R-A41B-07 1 0.37359793 0.1158129
#TCGA-5M-AAT6-01A-11R-A41B-07 1 0.50234844 0.1158129
#TCGA-5M-AATE-01A-11R-A41B-07 0 0.24833076 0.1158129
3.2 箱线图
对预测结果进行可视化。以实际的生死作为分组,画箱线图整体上查看预测结果。
re=as.data.frame(re)
colnames(re)=c('event','prob_min','prob_1se')
re$event=as.factor(re$event)
library(ggpubr)
p1 = ggboxplot(re, x = "event", y = "prob_min",
color = "event", palette = "jco",
add = "jitter")+ stat_compare_means()
p2 = ggboxplot(re, x = "event", y = "prob_1se",
color = "event", palette = "jco",
add = "jitter")+ stat_compare_means()
library(patchwork)
p1+p2
可以看到,真实结果是死(0)的样本,预测的值就小一点(靠近0),真实结果是活着(1)的样本,预测的值就大一点(靠近1),整体上趋势是对的,但不是完全准确,模型是可用的。
对比两个λ值构建的模型,1se的箱线图有点奇怪,后面再探究了
3.3 ROC曲线
计算AUC取值范围在0.5-1之间,越接近于1越好。可以根据预测结果绘制ROC曲线。
install.packages("ROCR")
install.packages("caret")
library(ROCR)
library(caret)
# 自己预测自己
#min
pred_min <- prediction(re[,2], re[,1])
auc_min = performance(pred_min,"auc")@y.values[[1]]
perf_min <- performance(pred_min,"tpr","fpr")
plot(perf_min,colorize=FALSE, col="blue")
lines(c(0,1),c(0,1),col = "gray", lty = 4 )
text(0.8,0.2, labels = paste0("AUC = ",round(auc_min,3)))
#1se
pred_1se <- prediction(re[,3], re[,1])
auc_1se = performance(pred_1se,"auc")@y.values[[1]]
perf_1se <- performance(pred_1se,"tpr","fpr")
plot(perf_1se,colorize=FALSE, col="red")
lines(c(0,1),c(0,1),col = "gray", lty = 4 )
text(0.8,0.2, labels = paste0("AUC = ",round(auc_1se,3)))
为什么一样的数据做出的图不一样?
cox-风险森林图
1.载入数据
load("TCGA-COADsur_model.Rdata")
load("for_cox.Rdata")
library(stringr)
1.挑选感兴趣的基因构建coxph模型
e=t(exprSet[for_cox,])
e=log2(e+1)
colnames(e)=str_replace(colnames(e),"-","_")
dat=cbind(meta,e)
dat$gender=factor(dat$gender)
dat$stage=factor(dat$stage)
colnames(dat)
#[1] "ID" "event" "death" "last_followup" "race" "age" "gender" "stage" "time"
#[10] "age_group" "RP5_884M6.1" "AC007182.6" "CTD_2023N9.1" "VAC14_AS1" "RP11_148K1.12" "RP11_742B18.1" "RP11_256I9.2" "AC079612.1"
#[19] "RP11_190J1.3" "LINC00402" "THRB_AS1" "CTD_2147F2.2" "WT1_AS" "AC007392.4" "RP11_309M7.1" "RP11_413M3.4" "RP11_334E6.3"
#[28] "WASIR1" "MAFA_AS1" "CTD_2013N17.4" "MACROD2_IT1"
用survival::coxph()函数构建模型
library(survival)
library(survminer)
s = as.formula(paste("Surv(time, event) ~ ",paste(colnames(e),collapse = "+")))
model <- coxph(s, data = dat )
# 去掉不显著的基因
summary(model)$concordance
# C se(C)
#0.85125719 0.02311193
summary(model)
#Call:
#coxph(formula = s, data = dat)
#n= 449, number of events= 52
# coef exp(coef) se(coef) z Pr(>|z|)
#RP5_884M6.1 0.112751 1.119353 0.094305 1.196 0.231854
#AC007182.6 0.127995 1.136548 0.099580 1.285 0.198672
#CTD_2023N9.1 -0.044700 0.956285 0.527983 -0.085 0.932531
#VAC14_AS1 -0.185046 0.831066 0.130355 -1.420 0.155738
#RP11_148K1.12 0.361166 1.435002 0.186106 1.941 0.052301 .
#RP11_742B18.1 0.108267 1.114345 0.094063 1.151 0.249728
#RP11_256I9.2 0.088944 1.093019 0.176557 0.504 0.614423
#AC079612.1 -0.146865 0.863410 0.179227 -0.819 0.412538
#RP11_190J1.3 0.006628 1.006650 0.097532 0.068 0.945817
#LINC00402 -0.217686 0.804378 0.128876 -1.689 0.091197 .
#THRB_AS1 -0.400994 0.669654 0.130617 -3.070 0.002141 **
#CTD_2147F2.2 -0.067850 0.934400 0.108002 -0.628 0.529851
#WT1_AS 0.014672 1.014780 0.081921 0.179 0.857863
#AC007392.4 -0.591876 0.553288 0.384834 -1.538 0.124048
#RP11_309M7.1 0.324670 1.383574 0.183396 1.770 0.076674 .
#RP11_413M3.4 -0.648309 0.522929 0.195152 -3.322 0.000894 ***
#RP11_334E6.3 0.521098 1.683876 0.198750 2.622 0.008745 **
#WASIR1 -0.115116 0.891263 0.127139 -0.905 0.365234
#MAFA_AS1 0.279864 1.322949 0.170886 1.638 0.101481
#CTD_2013N17.4 0.319699 1.376714 0.238104 1.343 0.179373
#MACROD2_IT1 0.840480 2.317479 0.269586 3.118 0.001823 **
#---
#Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# exp(coef) exp(-coef) lower .95 upper .95
#RP5_884M6.1 1.1194 0.8934 0.9305 1.3466
#AC007182.6 1.1365 0.8799 0.9350 1.3815
#CTD_2023N9.1 0.9563 1.0457 0.3398 2.6916
#VAC14_AS1 0.8311 1.2033 0.6437 1.0730
#RP11_148K1.12 1.4350 0.6969 0.9964 2.0666
#RP11_742B18.1 1.1143 0.8974 0.9267 1.3399
#RP11_256I9.2 1.0930 0.9149 0.7733 1.5449
#AC079612.1 0.8634 1.1582 0.6077 1.2268
#RP11_190J1.3 1.0067 0.9934 0.8315 1.2187
#LINC00402 0.8044 1.2432 0.6248 1.0355
#THRB_AS1 0.6697 1.4933 0.5184 0.8650
#CTD_2147F2.2 0.9344 1.0702 0.7561 1.1547
#WT1_AS 1.0148 0.9854 0.8643 1.1915
#AC007392.4 0.5533 1.8074 0.2602 1.1763
#RP11_309M7.1 1.3836 0.7228 0.9658 1.9820
#RP11_413M3.4 0.5229 1.9123 0.3567 0.7666
#RP11_334E6.3 1.6839 0.5939 1.1406 2.4859
#WASIR1 0.8913 1.1220 0.6947 1.1435
#MAFA_AS1 1.3229 0.7559 0.9464 1.8493
#CTD_2013N17.4 1.3767 0.7264 0.8633 2.1954
#MACROD2_IT1 2.3175 0.4315 1.3663 3.9309
#Concordance= 0.851 (se = 0.023 )
#Likelihood ratio test= 69.43 on 21 df, p=4e-07
#Wald test = 59.6 on 21 df, p=1e-05
#Score (logrank) test = 73.72 on 21 df, p=9e-08
tmp = summary(model)$coefficients[,5]
re = names(tmp[tmp<0.05])
#s=Surv(time, event) ~ RP5_884M6.1+RP1_79C4.4+AC004510.3
s = as.formula(paste("Surv(time, event) ~ ",paste(re,collapse = "+")))
model <- coxph(s, data = dat )
summary(model)
#Call:
#coxph(formula = s, data = dat)
#n= 449, number of events= 52
# coef exp(coef) se(coef) z Pr(>|z|)
#THRB_AS1 -0.2388 0.7875 0.1080 -2.211 0.027028 *
#RP11_413M3.4 -0.4353 0.6471 0.1524 -2.856 0.004284 **
#RP11_334E6.3 0.5853 1.7956 0.1758 3.329 0.000871 ***
#MACROD2_IT1 0.5153 1.6741 0.2090 2.466 0.013674 *
#---
#Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# exp(coef) exp(-coef) lower .95 upper .95
#THRB_AS1 0.7875 1.2698 0.6373 0.9732
#RP11_413M3.4 0.6471 1.5454 0.4800 0.8723
#RP11_334E6.3 1.7956 0.5569 1.2722 2.5343
#MACROD2_IT1 1.6741 0.5973 1.1115 2.5216
#Concordance= 0.72 (se = 0.05 )
#Likelihood ratio test= 23.4 on 4 df, p=1e-04
#Wald test = 26.56 on 4 df, p=2e-05
#Score (logrank) test = 27.44 on 4 df, p=2e-05
3.模型可视化-森林图
options(scipen=1)
ggforest(model, data =dat,
main = "Hazard ratio",
cpositions = c(0.10, 0.22, 0.4),
fontsize = 1.0,
refLabel = "1", noDigits = 4)
可以继续美化森林图
4.模型预测
#预测模型
fp <- predict(model,type = "risk")
boxplot(fp)
library(Hmisc)
options(scipen=200)
with(dat,rcorr.cens(fp,Surv(time, event))
# C Index Dxy S.D. n missing uncensored Relevant Pairs Concordant Uncertain
# 0.27953650 -0.44092699 0.09997376 449.00000000 0.00000000 52.00000000 13204.00000000 3691.00000000 187936.00000000
# 若要找到最佳模型,我们可以进行变量选择,可以采用逐步回归法进行分析
C-index用于计算生存分析中的COX模型预测值与真实之间的区分度(discrimination),也称为Harrell's concordanceindex。C-index在0.5-1之间。0.5为完全不一致,说明该模型没有预测作用,1为完全一致,说明该模型预测结果与实际完全一致。
5.划分高低风险并画生存分析图
names(fp) = rownames(dat)
ri = ifelse(fp
fp_dat=data.frame(patientid=1:length(fp),fp=as.numeric(sort(fp)))
sur_dat=data.frame(patientid=1:length(fp),
time=meta[names(sort(fp)),'time'] ,
event=meta[names(sort(fp )),'event'] )
sur_dat$event=ifelse(sur_dat$event==0,'alive','death')
sur_dat$event=factor(sur_dat$event,levels = c("death","alive"))
exp_dat=dat[names(sort(fp)),11:ncol(dat)]
###第一个图----
p1=ggplot(fp_dat,aes(x=patientid,y=fp))+geom_point()+theme_bw();p1
#第二个图
p2=ggplot(sur_dat,aes(x=patientid,y=time))+geom_point(aes(col=event))+theme_bw();p2
#第三个图
mycolors <- colorRampPalette(c("black", "green", "red"), bias = 1.2)(100)
tmp=t(scale(exp_dat[,re]))
tmp[tmp > 1] = 1
tmp[tmp < -1] = -1
#p3=pheatmap(tmp,col= mycolors,show_colnames = F,cluster_cols = T)
p3=pheatmap::pheatmap(tmp,
col= mycolors,
show_colnames = F,
cluster_cols = F)
#拼图实现三图联动
library(ggplotify)
plots = list(p1,p2,as.ggplot(as.grob(p3)))
library(gridExtra)
lay1 = rbind(c(rep(1,7),NA),
c(rep(2,7)),
c(rep(3,7))) #布局矩阵
grid.arrange(grobs = plots,
layout_matrix = lay1,
heigths = c(1, 2,3))
按照预测值划分高低风险,加上注释
fp_dat=data.frame(patientid=1:length(fp),
fp=as.numeric(sort(fp)),
ri= ri[order(fp)])
sur_dat=data.frame(patientid=1:length(fp),
time=meta[names(sort(fp)),'time'] ,
event=meta[names(sort(fp )),'event'] )
sur_dat$event=ifelse(sur_dat$event==0,'alive','death')
sur_dat$event=factor(sur_dat$event,levels = c("death","alive"))
exp_dat=dat[names(sort(fp)),11:ncol(dat)]
###第一个图----
p1=ggplot(fp_dat,aes(x=patientid,y=fp))+
geom_point(aes(color = ri))+
scale_color_manual(values = c("red","blue"))+
theme_bw();p1
#第二个图
p2=ggplot(sur_dat,aes(x=patientid,y=time))+
geom_point(aes(col=event))+
scale_color_manual(values = c("red","blue"))+
theme_bw();p2
#第三个图
mycolors <- colorRampPalette(c("black", "green", "red"), bias = 1.2)(100)
tmp=t(scale(exp_dat[,re]))
tmp[tmp > 1] = 1
tmp[tmp < -1] = -1
#p3=pheatmap(tmp,col= mycolors,show_colnames = F,cluster_cols = T)
annotation_col = data.frame(group = ri,
row.names = names(ri))
ann_colors = list(
group = c(lowrisk="blue", highrisk="red")
)
p3=pheatmap::pheatmap(tmp,
col= mycolors,
show_colnames = F,
cluster_cols = F,
annotation_col = annotation_col,
annotation_colors =ann_colors,
annotation_legend = F)
#拼图实现三图联动
library(ggplotify)
plots = list(p1,p2,as.ggplot(as.grob(p3)))
library(gridExtra)
lay1 = rbind(c(rep(1,7)),
c(rep(2,7)),
c(rep(3,7))) #布局矩阵
grid.arrange(grobs = plots,
layout_matrix = lay1,
heigths = c(1,2,3))
参考文章:1.TCGA文章复现——LncRNA-CRC预后
2.[小洁老师的TCGA系列文章](https://www.jianshu.com/u/c93ac360691a)