# 一、重要相关网站
# 1. http://www.oncolnc.org/
# 2. https://docs.gdc.cancer.gov/Encyclopedia/pages/TCGA_Barcode/
# 3. https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables
# 二、重要代码
# 1. Survival curve ----
# Gene | cancer-type survival analysis; the input table was exported from
# http://www.oncolnc.org/ (file name and plot title suggest TP53 / Entrez 7157,
# BRCA cohort, lower 50% vs upper 50% -- confirm against the export settings).
#
# Review notes:
# * The original text had lost everything between pairs of `$` signs
#   (e.g. `Surv(aStatus))`); those expressions are restored here.
# * `rm(list = ls())` and the global `options(stringsAsFactors = F)` were
#   replaced by an explicit `stringsAsFactors = FALSE` on the read, so the
#   script no longer wipes the workspace or changes session-wide options.

library(survival)
library(ggplot2)
# Console output seen when loading ggplot2 in the original session:
# Warning message:
# In dontCheck(fnname) : reached elapsed time limit
library(ggpubr)
library(magrittr)
library(survminer)

a <- read.csv("./BRCA_7157_50_50.csv", stringsAsFactors = FALSE)

# Look at the columns needed for the survival analysis.
head(a)
# Patient Days Status Expression Group
# 1 TCGA-AR-A24H 4894 Alive 193.89 Low
# 2 TCGA-A2-A0YE 554 Alive 198.99 Low
# 3 TCGA-AN-A0FL 231 Alive 205.44 Low
# 4 TCGA-A8-A08B 1156 Alive 219.08 Low
# 5 TCGA-B6-A0X1 7455 Dead 220.14 Low
# 6 TCGA-E9-A1RG 647 Alive 230.59 Low

# Where do "High" and "Low" come from? ----
# Patients below the median expression are "Low", all others "High"; the
# table below should therefore contain only TRUE.
# NOTE(review): the comparison operator was lost in the garbled original;
# `<` is assumed -- confirm if ties at the median matter.
tmp <- ifelse(a$Expression < median(a$Expression), "Low", "High")
table(tmp == a$Group)

# Recode Status into a numeric event indicator (0 = alive/censored,
# 1 = dead). The original notes listed a 1/2 recoding and a 0/1 recoding back
# to back on a character column; only the first could take effect and the
# column stayed character. Surv() accepts either 0/1 or 1/2, so a single
# numeric 0/1 mapping gives the same fit. Unexpected labels become NA instead
# of being silently kept.
status_code <- c(Alive = 0, Dead = 1)
a$Status <- unname(status_code[a$Status])

# Steps of the survival analysis ----
# Surv() tags each survival time with its status: when printed, a trailing
# "+" means alive (censored), no "+" means dead.
fit.surv <- Surv(a$Days, a$Status)

# survfit() turns this into the survival rate at each event time.
# The formula refers to columns of `a` so the fit always matches `data`.
km_2 <- survfit(Surv(Days, Status) ~ Group, data = a)

# Taking the High group as an example, the first two survival values of
# summary(km_2) can be reproduced by hand:
summary(km_2)
(469 - 1) / 469
# [1] 0.9978678
(469 - 1) / 469 * (463 - 1) / 463
# [1] 0.9957126

# Statistical test of the survival difference between the two groups ----
surv_pvalue(km_2, data = a)
# variable pval method pval.txt
# 1 Group 0.1394986 Log-rank p = 0.14

ggsurvplot(
  km_2,
  data = a,
  pval = TRUE,
  risk.table = TRUE,
  title = "TP53_BRCA_50_50"
)

ggsurvplot(
  km_2,
  data = a,
  palette = c("#E7B800", "#2E9FDF"),
  risk.table = TRUE,
  pval = TRUE,
  conf.int = TRUE,
  xlab = "Days",
  ggtheme = theme_light(),
  ncensor.plot = TRUE
)
# 2. Understanding the survival-curve code ----
# Rebuilds the Kaplan-Meier estimate of the "Low" group step by step.
#
# Review notes:
# * The original text had lost everything between pairs of `$` signs, which
#   removed the definitions of group_1_D, group_2_D and D_2 and truncated the
#   body of y(); they are restored here.
# * NOTE(review): the comparison inside y() was cut off in the original; a
#   strict `<` is assumed, matching the note "alive before this death's
#   time" -- confirm.

library(ggplot2)

a <- read.csv("./BRCA_7157_50_50.csv", stringsAsFactors = FALSE)

# 0 = alive (censored), 1 = dead (event); unexpected labels become NA.
status_code <- c(Alive = 0, Dead = 1)
a$Status <- unname(status_code[a$Status])

clin_group <- a[, c("Status", "Days")]
group <- a$Group
table(group)
# group
# High Low
# 503 503

# Clinical information of each group.
group_1 <- clin_group[group == "High", ]
group_2 <- clin_group[group == "Low", ]

# Split each group by Status (A = alive, D = dead). `%in%` keeps rows with a
# missing Status out of both subsets.
group_1_A <- group_1[group_1$Status %in% 0, ]
group_1_D <- group_1[group_1$Status %in% 1, ]
group_2_A <- group_2[group_2$Status %in% 0, ]
group_2_D <- group_2[group_2$Status %in% 1, ]

# Order the Status subsets of group 2 by Days.
A_2 <- group_2_A[order(group_2_A$Days), ]
D_2 <- group_2_D[order(group_2_D$Days), ]

# Sum the events (deaths) that share the same day. sort(unique()) and table()
# both return the distinct days in ascending order, so the columns line up.
D_2 <- data.frame(
  Days = as.numeric(sort(unique(D_2$Days))),
  event = as.numeric(table(D_2$Days))
)

# Sum the censored (alive) observations that share the same day.
A_2 <- data.frame(
  Days = as.numeric(sort(unique(A_2$Days))),
  censor = as.numeric(table(A_2$Days))
)

# Number at risk at a given day: group size minus the deaths and minus the
# censored patients that occurred strictly before that day. `x[1]` is the day,
# so y() works both on a single day and on a row of D_2.
y <- function(x) {
  nrow(group_2) -
    sum(D_2$event[D_2$Days < x[1]]) -
    sum(A_2$censor[A_2$Days < x[1]])
}
D_2$n.risk <- vapply(D_2$Days, y, numeric(1))

# Survival at each event time = (at risk - events) / at risk.
D_2$step.survival <- (D_2$n.risk - D_2$event) / D_2$n.risk

# Survival up to each event time = product of all step survivals so far.
# D_2 is sorted by Days with one row per day, so this is a cumulative product.
D_2$survival <- cumprod(D_2$step.survival)
# 3. Download brca_data ----
# Downloads TCGA-BRCA expression counts for triple-negative (TNBC) and
# non-TNBC patients plus the clinical table, and saves them to one .Rdata file.
#
# One-time setup, run interactively rather than on every execution. The
# original notes tried installing the latest TCGAbiolinks this way and
# recorded that it did not help ("still fails, nothing below can run"):
#   install.packages("devtools")
#   devtools::install_github("BioinformaticsFMRP/TCGAbiolinks")

library(TCGAbiolinks)
library(SummarizedExperiment)

# Read the IHC-related clinical data ----
a <- read.csv(
  "nationwidechildrens.org_clinical_patient_brca.txt",
  sep = "\t",
  stringsAsFactors = FALSE
)
# Drop the first two data rows.
# NOTE(review): presumably extra header/description rows of the clinical
# file -- confirm. A negative index also behaves for files shorter than
# three rows, unlike 3:nrow(a).
a <- a[-(1:2), ]
clin <- a[, c(
  "bcr_patient_barcode",
  "her2_status_by_ihc",
  "pr_status_by_ihc",
  "er_status_by_ihc"
)]

# Separate triple-negative from all other patients ----
# (may not match the grouping used by the original tutorial author)
# A patient is TNBC when HER2, PR and ER are all exactly "Negative".
is_tnbc <- clin$her2_status_by_ihc %in% "Negative" &
  clin$pr_status_by_ihc %in% "Negative" &
  clin$er_status_by_ihc %in% "Negative"
TNBC_barcode <- clin$bcr_patient_barcode[is_tnbc]
non_TNBC <- clin$bcr_patient_barcode[
  !clin$bcr_patient_barcode %in% TNBC_barcode
]
# An empty barcode vector must not reach GDCquery().
stopifnot(length(TNBC_barcode) > 0, length(non_TNBC) > 0)

# Download the expression matrices ----
# There are several ways to do this, see
# https://www.bilibili.com/video/av49363776?from=search&seid=860607939803005976
# A GDC download generally has 3 steps: GDCquery, GDCdownload and GDCprepare.
# Notes on GDCquery: https://www.jianshu.com/p/33bdb5ef7689
#
# The original notes report an error at the GDCquery step with
# workflow.type = "HTSeq - Counts". GDC replaced the HTSeq workflows with
# "STAR - Counts", which is the most likely cause.
# NOTE(review): confirm against the installed TCGAbiolinks / current GDC release.
workflow_type <- "STAR - Counts"

query <- GDCquery(
  project = "TCGA-BRCA",
  data.category = "Transcriptome Profiling",
  data.type = "Gene Expression Quantification",
  workflow.type = workflow_type,
  barcode = TNBC_barcode
)
GDCdownload(query)
BRC_DATA <- GDCprepare(query, save = FALSE)
BRC_TNBC <- assay(BRC_DATA)

# Same three steps for the non-TNBC patients. The original notes used
# `query1` and `BRC_TNBC` without ever defining them.
query1 <- GDCquery(
  project = "TCGA-BRCA",
  data.category = "Transcriptome Profiling",
  data.type = "Gene Expression Quantification",
  workflow.type = workflow_type,
  barcode = non_TNBC
)
GDCdownload(query1)
BRC_DATA2 <- GDCprepare(query1, save = FALSE)
BRC_NONTNBC <- assay(BRC_DATA2)

# Download the phenotype (clinical) information ----
clinic <- GDCquery_clinic(
  project = "TCGA-BRCA",
  type = "Clinical"
)

save(BRC_TNBC, BRC_NONTNBC, clinic, file = "result01_BRCA.Rdata")

# TODO (original note): keep looking for a solution if the query still fails.