TCGA数据下载系列之二:RTCGA

library(RTCGA)

library(RTCGA.clinical)

library(RTCGA.rnaseq)

library(RTCGA.mRNA)

library(RTCGA.mutations)

# Overview table of the cohorts/datasets reported by infoTCGA(),
# shown as an interactive table.
all_TCGA_cancers <- infoTCGA()
DT::datatable(data = all_TCGA_cancers)


# Extract microarray (mRNA) expression for a chosen set of genes from
# several cancer cohorts.
expr <- expressionsTCGA(
  BRCA.mRNA, OV.mRNA, LUSC.mRNA,
  extract.cols = c("GATA3", "PTEN", "XBP1", "ESR1", "MUC1")
)

# Simplify the cohort labels by dropping the ".mRNA" suffix.
# fixed = TRUE makes the dot literal instead of a regex wildcard.
expr$dataset <- gsub(".mRNA", "", expr$dataset, fixed = TRUE)

# Replace the barcodes with short labels "<cohort><running index>".
# The index is computed within each cohort from the data itself, so it does
# not rely on hard-coded sample counts (a wrong count would make paste0()
# silently recycle and mislabel samples).
expr$bcr_patient_barcode <- paste0(
  expr$dataset,
  ave(seq_along(expr$dataset), expr$dataset, FUN = seq_along)
)


# Box plot comparing the expression of one gene across cancer types.
library(ggpubr)

# x axis: dataset; y axis: GATA3 expression; outline colour is grouped by
# dataset and taken from the "jco" palette.
ggboxplot(
  expr,
  x = "dataset",
  y = "GATA3",
  title = "GATA3",
  ylab = "Expression",
  color = "dataset",
  palette = "jco"
)


这里的 palette 参数使用的是 ggsci 包提供的配色方案,可选值包括:“npg”、“aaas”、“lancet”、“jco”、“ucscgb”、“uchicago”、“simpsons” 和 “rickandmorty”。这些配色方案是针对常见 SCI 期刊的配色风格开发的。

# P-values for pairwise comparisons between cancer types can be added too.
my_comparisons <- list(
  c("BRCA", "OV"),
  c("OV", "LUSC")
)

ggboxplot(
  expr,
  x = "dataset",
  y = "GATA3",
  title = "GATA3",
  ylab = "Expression",
  color = "dataset",
  palette = "jco"
) +
  stat_compare_means(comparisons = my_comparisons)


# The same box plot can also be drawn with plain ggplot2.
library(ggplot2)

# Map columns by bare name so they are looked up in `expr` via the data
# argument; using expr$col inside aes() bypasses that lookup and breaks as
# soon as the plot data is changed or faceted.
p <- ggplot(expr, aes(x = dataset, y = GATA3))
p <- p + geom_boxplot(aes(fill = dataset))

# "1".."4" are placeholder texts for the x label, y label, plot title and
# legend title respectively.
p +
  xlab("1") +
  ylab("2") +
  ggtitle("3") +
  guides(fill = guide_legend(title = "4"))


# Extra tip: label only the points that satisfy a condition.
# The criteria string keeps points with y above 3.9 whose x value is
# 'BRCA' or 'OV'.
label.select.criteria <- list(
  criteria = "`y` > 3.9 & `x` %in% c('BRCA', 'OV')"
)

ggboxplot(
  expr,
  x = "dataset",
  y = c("GATA3", "PTEN", "XBP1"),
  combine = TRUE,
  color = "dataset",
  palette = "jco",
  ylab = "Expression",
  # Column holding the text used for point labels
  label = "bcr_patient_barcode",
  # Only the labels matching the criteria are displayed
  label.select = label.select.criteria,
  # Font of the labels
  font.label = list(size = 9, face = "italic"),
  # Keep label texts from overlapping each other
  repel = TRUE
)


# In practice several genes are usually shown together in one figure.
ggboxplot(
  expr,
  x = "dataset",
  y = c("GATA3", "PTEN", "XBP1"),
  ylab = "Expression",
  color = "dataset",
  palette = "jco",
  combine = TRUE
)


# Extract sequencing-based expression (rnaseq) for a chosen set of genes
# from several cancer cohorts.
# Column names here need both the gene symbol and the Entrez ID, written
# as "symbol|entrezID".
expr <- expressionsTCGA(
  BRCA.rnaseq, OV.rnaseq, LUSC.rnaseq,
  extract.cols = c(
    "GATA3|2625", "PTEN|5728", "XBP1|7494", "ESR1|2099", "MUC1|4582"
  )
)

# The column name contains "|", so it is passed wrapped in backticks.
ggboxplot(
  expr,
  x = "dataset",
  y = "`GATA3|2625`",
  title = "GATA3|2625",
  ylab = "Expression",
  color = "dataset",
  palette = "jco"
)



# Principal component analysis on the complete rnaseq expression data.
library(RTCGA.rnaseq)
# dplyr supplies the data-frame verbs (rename, filter) and the %>% pipe
# used below.
library(dplyr)

# Combine the three cohorts, rename `dataset` to `cohort`, and keep only
# rows whose barcode has "01" at characters 14-15
# (e.g. "TCGA-GM-A2DA-01A-11R-A18M-07").
# NOTE(review): "01" is presumably the TCGA tumour sample-type code — confirm.
BRCA.OV.LUSC.rnaseq.cancer <-
  expressionsTCGA(BRCA.rnaseq, OV.rnaseq, LUSC.rnaseq) %>%
  dplyr::rename(cohort = dataset) %>%
  filter(substr(bcr_patient_barcode, 14, 15) == "01")

# PCA with samples grouped by cohort, then draw the result.
pca_plot <- pcaTCGA(BRCA.OV.LUSC.rnaseq.cancer, "cohort")
plot(pca_plot)


# Survival analysis stratified by TP53 mutation status.
library(RTCGA.mutations)
library(survminer)
library(dplyr)

# TP53 mutation records from samples whose barcode has "01" at characters
# 14-15, with the barcode truncated to its first 12 characters so it can be
# matched against the clinical table.
# Only the BRCA and OV cohorts are loaded because only their clinical data
# is joined below; rows from any other cohort could never match.
# distinct() keeps one row per patient: the mutation table can hold several
# TP53 records for the same patient, and joining them as-is would duplicate
# that patient's survival record in the Kaplan-Meier estimate.
BRCA_OV.mutations <- mutationsTCGA(BRCA.mutations, OV.mutations) %>%
  filter(Hugo_Symbol == "TP53") %>%
  filter(substr(bcr_patient_barcode, 14, 15) == "01") %>%
  mutate(bcr_patient_barcode = substr(bcr_patient_barcode, 1, 12)) %>%
  distinct(bcr_patient_barcode, .keep_all = TRUE)

# Survival information plus the disease code, renamed to `disease`.
BRCA_OV.clinical <- survivalTCGA(
  BRCA.clinical, OV.clinical,
  extract.cols = "admin.disease_code"
) %>%
  dplyr::rename(disease = admin.disease_code)

# Attach the mutation records to the clinical table by patient barcode and
# add a TP53 column: "Mut" when a mutation record was matched
# (Variant_Classification is not NA), otherwise "WILDorNOINFO".
BRCA_OV.clinical_mutations <- BRCA_OV.clinical %>%
  left_join(BRCA_OV.mutations, by = "bcr_patient_barcode") %>%
  mutate(
    TP53 = ifelse(!is.na(Variant_Classification), "Mut", "WILDorNOINFO")
  )

# Keep only the columns needed for the survival plot.
BRCA_OV.2plot <- BRCA_OV.clinical_mutations %>%
  select(times, patient.vital_status, disease, TP53)

# Kaplan-Meier curves grouped by TP53 status and disease, with a p-value.
km_plot <- kmTCGA(
  BRCA_OV.2plot,
  explanatory.names = c("TP53", "disease"),
  break.time.by = 400,
  xlim = c(0, 2000),
  pval = TRUE
)

print(km_plot)



你可能感兴趣的:(TCGA数据下载系列之二:RTCGA)