照着老大的这篇教程来的TCGA

主要照着老大的这篇教程来的TCGA

https://mp.weixin.qq.com/s/JB_329LCWqo5dY6MLawfEA

生存分析的统计学探究 (老大的)

https://mp.weixin.qq.com/s?__biz=MzAxMDkxODM1Ng==&mid=2247483699&idx=1&sn=56ec5ceef6c8496f2daeb7e4a2412396&scene=21#wechat_redirect

Surv：用于创建生存数据对象

survfit：创建KM生存曲线或是Cox调整生存曲线

survdiff：用于不同组的统计检验

###先把这些代码保存下来
library(survival)
my.surv <- Surv(OS_MONTHS,OS_STATUS==’LIVING’)  ##这个生存对象是看看病人的总生存期与死亡状态的关系
##这个Surv函数第一个参数必须是数值型的时间，第二个参数是逻辑向量，1,0表示死亡与否
kmfit1 <- survfit(my.surv~1) ## 直接对生存对象拟合生存函数
summary(kmfit1)
plot(kmfit1)   ##画出生存曲线
survdiff(my.surv~type, data=dat)     ### 根据生存对象再加上一个分组因子来拟合生存函数，并且比较不同因子分组的生存效果。


library(cgdsr)
mycgds <- CGDS(‘http://www.cbioportal.org/public-portal/‘)
test(mycgds)
all_TCGA_studies <- getCancerStudies(mycgds)
all_tables <- getCaseLists(mycgds, ‘ov_tcga_pub’)
all_dataset<- getGeneticProfiles(mycgds, ‘ov_tcga_pub’)
BRCA1 <- getProfileData(mycgds, my_gene, my_dataset, my_table)
ov_tcga_pub_meth1<- getClinicalData(mycgds, all_tables[8,1])
ov_tcga_pub_meth2<- getClinicalData(mycgds, all_tables[9,1])
ov_tcga_pub_meth3<- getClinicalData(mycgds, all_tables[10,1])
ov_tcga_pub_meth4<- getClinicalData(mycgds, all_tables[11,1])


library(survival)
attach(ov_tcga_pub_meth1)
## 估计KM生存曲线
my.surv <- Surv(OS_MONTHS,OS_STATUS==’LIVING’)
kmfit1 <- survfit(my.surv~1)
summary(kmfit1)
plot(kmfit1)



kmfit2 <- survfit(my.surv~TUMOR_STAGE_2009)
summary(kmfit2)
plot(kmfit2,col = rainbow(length(unique(ov_tcga_pub_meth1[,13]))))
detach(ov_tcga_pub_meth1)


ov_tcga_pub_meth1$sample<- rownames(ov_tcga_pub_meth1)
ov_tcga_pub_meth2$sample<- rownames(ov_tcga_pub_meth2)
ov_tcga_pub_meth3$sample<- rownames(ov_tcga_pub_meth3)
ov_tcga_pub_meth4$sample<- rownames(ov_tcga_pub_meth4)
ov_tcga_pub_meth1$type<- ‘meth1′
ov_tcga_pub_meth2$type<- ‘meth2′
ov_tcga_pub_meth3$type<- ‘meth3′
ov_tcga_pub_meth4$type<- ‘meth4′
dat=rbind(ov_tcga_pub_meth1,ov_tcga_pub_meth2,ov_tcga_pub_meth3,ov_tcga_pub_meth4)
attach(dat)
## 根据分组type估计KM生存曲线
my.surv <- Surv(OS_MONTHS,OS_STATUS==’LIVING’)
kmfit3 <- survfit(my.surv~type)
summary(kmfit3)
plot(kmfit3,col = rainbow(4))


##由图中可以看到甲基化数据分成的4个组，生存率差异还是蛮大的
##用图形方法检验PH假设
plot(kmfit3,fun=’cloglog’,col = rainbow(4))
# 检验显著性
survdiff(my.surv~type, data=dat)
# 用strata来控制协变量的影响
survdiff(my.surv~type+strata(TUMOR_STAGE_2009),data=dat)

2.文章中提到：使用R包cgdsr来下载TCGA的数据http://www.bio-info-trainee.com/1257.html

getCancerStudies，getCaseLists，getGeneticProfiles，getProfileData

TCGA的28篇教程- 使用R语言的RTCGA包获取TCGA数据

https://mp.weixin.qq.com/s/JB_329LCWqo5dY6MLawfEA

1.指定任意基因从任意癌症里面获取芯片表达数据

image-20190110192530175

image-20190110192559307

image-20190110192722155

compare_means(c(GATA3, PTEN, XBP1) ~ dataset, data = expr)

image-20190110180537874

image-20190110184549281

image-20190110192815685

更多可视化见：http://www.sthda.com/english/articles/24-ggpubr-publication-ready-plots/77-facilitating-exploratory-data-visualization-application-to-tcga-genomic-data/STHDA网址

2.指定任意基因从任意癌症里面获取测序表达数据

image-20190110190332681

image-20190110193528686

3.用全部的rnaseq的表达数据来做主成分分析

image-20190110201803048

4.用5个基因在3个癌症的表达量做主成分分析

image-20190110203621191

5.用突变数据做生存分析

image-20190110210350353

6.多个基因在多种癌症的表达量热图

image-20190110211820768

==代码==

#1.指定任意基因从任意癌症里面获取芯片表达数据
options(BioC_mirror="https://mirrors.ustc.edu.cn/bioc/")
if (!requireNamespace("BiocManager", quietly = TRUE))
  install.packages("BiocManager")
library(BiocManager)
BiocManager::install("RTCGA.clinical") 
BiocManager::install("RTCGA.mRNA")
BiocManager::install('RTCGA.mutations')
library(RTCGA.mutations)
expr <- expressionsTCGA(BRCA.mRNA, OV.mRNA, LUSC.mRNA,
                        extract.cols = c("GATA3", "PTEN", "XBP1","ESR1", "MUC1"))
expr
head <- head(expr)
nb_samples <- table(expr$dataset)
nb_samples
expr$dataset <- gsub(pattern = ".mRNA", replacement = "",  expr$dataset)
expr$dataset
expr$bcr_patient_barcode <- paste0(expr$dataset, c(1:590, 1:561, 1:154))
expr$bcr_patient_barcode

library(ggpubr)
## Loading required package: ggplot2
## Loading required package: magrittr
# GATA3
ggboxplot(expr, x = "dataset", y = "GATA3",
          title = "GATA3", ylab = "Expression",
          color = "dataset", palette = "jco")

ggboxplot(expr, x = "dataset", y = "PTEN",
          title = "PTEN", ylab = "Expression",
          color = "dataset", palette = "jco")


my_comparisons <- list(c("BRCA", "OV"), c("OV", "LUSC"))#加上Pvalues
ggboxplot(expr, x = "dataset", y = "GATA3",
          title = "GATA3", ylab = "Expression",
          color = "dataset", palette = "jco")+
  stat_compare_means(comparisons = my_comparisons)


compare_means(c(GATA3, PTEN, XBP1) ~ dataset, data = expr)

label.select.criteria <- list(criteria = "`y` > 10 & `x` %in% c('BRCA', 'OV')")
ggboxplot(expr, x = "dataset",
          y = c("GATA3", "PTEN", "XBP1"),
          combine = TRUE,
          color = "dataset", palette = "jco",
          ylab = "Expression", 
          label = "bcr_patient_barcode",              # column containing point labels
          label.select = label.select.criteria,       # Select some labels to display
          font.label = list(size = 9, face = "italic"), # label font
          repel = TRUE                                # Avoid label text overplotting
)


ggboxplot(expr, x = "dataset", y = "GATA3",
          title = "GATA3", ylab = "Expression",
          color = "dataset", palette = "jco",
          rotate = TRUE)

#2.指定任意基因从任意癌症里面获取测序表达数据
library(RTCGA)
library(RTCGA.rnaseq)
expr <- expressionsTCGA(BRCA.rnaseq, OV.rnaseq, LUSC.rnaseq,
                        extract.cols = c("GATA3|2625", "PTEN|5728", "XBP1|7494","ESR1|2099", "MUC1|4582"))
expr


nb_samples <- table(expr$dataset)
nb_samples


library(ggpubr)
# ESR1|2099
ggboxplot(expr, x = "dataset", y = "`PTEN|5728`",
          title = "PTEN|5728", ylab = "Expression",
          color = "dataset", palette = "jco")

#3.用全部的rnaseq的表达数据来做主成分分析
library(RTCGA.rnaseq)
library(dplyr)

expressionsTCGA(BRCA.rnaseq, OV.rnaseq, HNSC.rnaseq) %>%
  dplyr::rename(cohort = dataset) %>%  
  filter(substr(bcr_patient_barcode, 14, 15) == "01") -> BRCA.OV.HNSC.rnaseq.cancer
pcaTCGA(BRCA.OV.HNSC.rnaseq.cancer, "cohort") -> pca_plot
plot(pca_plot)


save(pca_plot,file = "pca-png")

#4.用5个基因在3个癌症的表达量做主成分分析
BiocManager::install('DT')
library(DT)

expr %>%  
  filter(substr(bcr_patient_barcode, 14, 15) == "01") -> rnaseq.5genes.3cancers
DT::datatable(rnaseq.5genes.3cancers)
pcaTCGA(rnaseq.5genes.3cancers, "dataset") -> pca_plot
plot(pca_plot)

#5.用突变数据做生存分析
library(RTCGA.mutations)
# library(dplyr) if did not load at start
library(survminer)
mutationsTCGA(BRCA.mutations, OV.mutations) %>%
  filter(Hugo_Symbol == 'TP53') %>%
  filter(substr(bcr_patient_barcode, 14, 15) ==
           "01") %>% # cancer tissue
  mutate(bcr_patient_barcode =
           substr(bcr_patient_barcode, 1, 12)) ->
  BRCA_OV.mutations
library(RTCGA.clinical)
survivalTCGA(
  BRCA.clinical,
  OV.clinical,
  extract.cols = "admin.disease_code"
) %>%
  dplyr::rename(disease = admin.disease_code) ->
  BRCA_OV.clinical
BRCA_OV.clinical %>%
  left_join(
    BRCA_OV.mutations,
    by = "bcr_patient_barcode"
  ) %>%
  mutate(TP53 =
           ifelse(!is.na(Variant_Classification), "Mut","WILDorNOINFO")) ->
  BRCA_OV.clinical_mutations
BRCA_OV.clinical_mutations %>%
  select(times, patient.vital_status, disease, TP53) -> BRCA_OV.2plot
kmTCGA(
  BRCA_OV.2plot,
  explanatory.names = c("TP53", "disease"),
  break.time.by = 400,
  xlim = c(0,2000),
  pval = TRUE) -> km_plot

## Scale for 'colour' is already present. Adding another scale for
## 'colour', which will replace the existing scale.
## Scale for 'fill' is already present. Adding another scale for 'fill',
## which will replace the existing scale.
print(km_plot)

#6.多个基因在多种癌症的表达量热图
library(RTCGA.rnaseq)
# perfrom plot
# library(dplyr) if did not load at start
expressionsTCGA(
  ACC.rnaseq,
  BLCA.rnaseq,
  BRCA.rnaseq,
  OV.rnaseq,
  extract.cols = 
    c("MET|4233",
      "ZNF500|26048",
      "ZNF501|115560")
) %>%
  dplyr::rename(cohort = dataset,
                MET = `MET|4233`) %>%
  #cancer samples
  filter(substr(bcr_patient_barcode, 14, 15) ==
           "01") %>%
  mutate(MET = cut(MET,
                   round(quantile(MET, probs = seq(0,1,0.25)), -2),
                   include.lowest = TRUE,
                   dig.lab = 5)) -> ACC_BLCA_BRCA_OV.rnaseq
ACC_BLCA_BRCA_OV.rnaseq %>%
  select(-bcr_patient_barcode) %>%
  group_by(cohort, MET) %>%
  summarise_each(funs(median)) %>%
  mutate(ZNF500 = round(`ZNF500|26048`),
         ZNF501 = round(`ZNF501|115560`)) ->
  ACC_BLCA_BRCA_OV.rnaseq.medians
## `summarise_each()` is deprecated.
## Use `summarise_all()`, `summarise_at()` or `summarise_if()` instead.
## To map `funs` over all variables, use `summarise_all()`
heatmapTCGA(ACC_BLCA_BRCA_OV.rnaseq.medians,
            "cohort", "MET", "ZNF500",
            title = "Heatmap of ZNF500 expression")