下载并预处理TCGA数据

本文介绍TCGA数据的下载方法,并将数据整理为以基因名为行名的数据结构

方法一

#数据下载的网站,下载下来并命名为HNSC_RSEM_genes_normalized.txt
#http://gdac.broadinstitute.org/runs/stddata__2016_01_28/data/HNSC/20160128/gdac.broadinstitute.org_HNSC.Merge_rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.Level_3.2016012800.0.0.tar.gz.md5
library(stringr)
# Clean the Firehose RSEM "normalized genes" table for HNSC:
# gene symbols become row names, sample barcodes are cut to 16 characters,
# and the result is written to hnsc_clean_data.csv.
hnsc <- read.table("your_dir/HNSC_RSEM_genes_normalized.txt",
                   header = TRUE, check.names = FALSE, sep = "\t",
                   stringsAsFactors = FALSE)

# Drop the first data row.
# NOTE(review): presumably the secondary "gene_id / normalized_count" header
# line of the Firehose file -- confirm against the downloaded file.
hnsc <- hnsc[-1, ]

# Identifiers look like "SYMBOL|ID"; keep only the part before the first "|".
hnsc[, 1] <- sub("[|].*$", "", as.character(hnsc[, 1]))

# Keep the first occurrence of every symbol so it can serve as a row name.
hnsc <- hnsc[!duplicated(hnsc[, 1]), ]
row.names(hnsc) <- as.character(hnsc[, 1])
hnsc <- hnsc[, -1, drop = FALSE]

# The dropped row was parsed together with the values, so (assuming it held
# text) the expression columns arrive as character; store them as numbers.
hnsc[] <- lapply(hnsc, function(column) as.numeric(as.character(column)))

# Shorten every sample barcode to its first 16 characters.
colnames(hnsc) <- substr(colnames(hnsc), 1, 16)

write.csv(hnsc, "your_dir/hnsc_clean_data.csv")

方法二

乳腺癌PAM50

# Attach TCGAbiolinks without its start-up messages.
suppressMessages(
  library(TCGAbiolinks)
)
# Fetch the subtype annotation table that TCGAbiolinks provides for "brca".
BRCA_path_subtypes <- TCGAquery_subtype("brca")

另一种方法,生存分析

我觉得这确实是目前最好的方法:它是官方提供、实时更新的数据下载方式,但很容易出现报错(Error: $ operator is invalid for atomic vectors)。解决方式是改用 GitHub 源安装 TCGAbiolinks,这一点很重要。
具体代码如下

#https://bioconductor.org/packages/release//workflows/vignettes/SingscoreAMLMutations/inst/doc/workflow_transcriptional_mut_sig_chinese.html
# Install biomaRt and TCGAbiolinks from GitHub instead of the release
# repositories; biomaRt is installed first, then TCGAbiolinks.
# NOTE(review): the "Bioconductor-mirror" GitHub organisation may no longer be
# maintained -- confirm this source still provides a current biomaRt.
library(devtools)
devtools::install_github("Bioconductor-mirror/biomaRt")
devtools::install_github("BioinformaticsFMRP/TCGAbiolinks")#IF Error: $ operator is invalid for atomic vectors should run this


# Settings for a single-project download.
library(stringr)
cancer_type <- "TCGA-BRCA"

# Ask GDC which data release is currently served. The call is namespace-
# qualified because TCGAbiolinks is only attached later, inside Download_TCGA().
gdc_info <- TCGAbiolinks::getGDCInfo()
Release <- as.character(gdc_info$data_release)

# Keep the part before the " - <date>" suffix (e.g. "Data Release 20.0") so the
# label stays correct when the version number is shorter or longer.
# NOTE(review): assumes the "<label> - <date>" layout of data_release; if the
# separator is absent the whole string is used.
release <- trimws(sub("\\s+-\\s+.*$", "", Release))

###download BRCA counts data

#' Download the TCGA data of one GDC project and export it as CSV files.
#'
#' Uses TCGAbiolinks to query, download and prepare: HTSeq counts and FPKM
#' expression (all samples, plus "NT" and "TP" subsets), clinical data
#' (index table and a survival table built from the XML patient records),
#' copy-number segments, legacy DNA methylation and miRNA quantification.
#' Every table is written to the current working directory; file names are
#' built from `cancer_type`, a label and `release`, joined with "-".
#'
#' NOTE(review): the workflow names ("HTSeq - Counts", "HTSeq - FPKM") and the
#' `legacy = TRUE` argument depend on the GDC release / TCGAbiolinks version in
#' use -- confirm they are still accepted before relying on this function.
#'
#' @param cancer_type GDC project id handed to `GDCquery()`, e.g. "TCGA-BRCA".
#' @param release Release label embedded in the output file names.
#' @return `NULL`, invisibly. Called for its side effects.
Download_TCGA <- function(cancer_type, release) {
  suppressMessages(library(TCGAbiolinks))
  suppressMessages(library(SummarizedExperiment))
  suppressMessages(library(dplyr))
  # DT is still attached as before, although nothing below needs it any more.
  suppressMessages(library(DT))

  # Build an output file name with the original "-"-joined layout.
  out_file <- function(label, suffix = ".csv") {
    paste(cancer_type, label, release, suffix, sep = "-")
  }

  # Return the assay of a SummarizedExperiment; hand back anything else
  # (GDCprepare() yields plain tables for some data types) unchanged.
  unwrap_assay <- function(prepared) {
    if (methods::is(prepared, "SummarizedExperiment")) {
      assay(prepared)
    } else {
      prepared
    }
  }

  # Rename the columns of `df` found in `map` (old name -> new name);
  # names in `map` that `df` does not have are ignored.
  rename_known <- function(df, map) {
    hit <- names(df) %in% names(map)
    names(df)[hit] <- unname(map[names(df)[hit]])
    df
  }

  # Download one expression workflow and write the full matrix plus the
  # solid-tissue-normal ("NT") and primary-tumour ("TP") column subsets.
  export_expression <- function(workflow, label) {
    query <- GDCquery(project = cancer_type,
                      data.category = "Transcriptome Profiling",
                      data.type = "Gene Expression Quantification",
                      workflow.type = workflow)
    GDCdownload(query)
    expr <- as.data.frame(assay(GDCprepare(query = query)))
    write.csv(expr, file = out_file(paste0(label, "_")))
    normal_ids <- TCGAquery_SampleTypes(barcode = colnames(expr),
                                        typesample = c("NT"))
    tumor_ids <- TCGAquery_SampleTypes(barcode = colnames(expr),
                                       typesample = c("TP"))
    # drop = FALSE keeps the barcode header when only one sample matches.
    write.csv(expr[, normal_ids, drop = FALSE],
              file = out_file(paste0(label, "_normal_")))
    write.csv(expr[, tumor_ids, drop = FALSE],
              file = out_file(paste0(label, "_tumor_")))
  }

  ### expression: counts and FPKM
  export_expression("HTSeq - Counts", "Counts")
  export_expression("HTSeq - FPKM", "FPKM")

  ### clinical: indexed table, written as delivered
  clinical_index <- GDCquery_clinic(project = cancer_type, type = "clinical")
  write.csv(clinical_index, "clinical_full.csv")

  ### clinical: patient records parsed from the XML files
  clc_query <- GDCquery(project = cancer_type,
                        data.category = "Clinical",
                        file.type = "xml")
  GDCdownload(clc_query)
  patient <- GDCprepare_clinic(clc_query, clinical.info = "patient")

  wanted_cols <- c("bcr_patient_barcode", "gender", "vital_status",
                   "days_to_death", "days_to_last_followup", "race_list",
                   "person_neoplasm_cancer_status",
                   "stage_event_pathologic_stage",
                   "stage_event_tnm_categories")
  required_cols <- c("bcr_patient_barcode", "vital_status",
                     "days_to_death", "days_to_last_followup")
  missing_cols <- setdiff(required_cols, names(patient))
  if (length(missing_cols) > 0) {
    stop("Clinical data for ", cancer_type, " lacks required column(s): ",
         paste(missing_cols, collapse = ", "), call. = FALSE)
  }
  # Descriptive columns are kept only when present, so a project without,
  # say, staging information does not abort the whole export.
  kept_cols <- intersect(wanted_cols, names(patient))
  clinical_trait <- patient[, kept_cols, drop = FALSE] %>%
    distinct(bcr_patient_barcode, .keep_all = TRUE)

  # Age/Grade are only renamed if such columns are present; the selection
  # above does not currently keep them.
  column_labels <- c(bcr_patient_barcode = "Barcode",
                     gender = "Gender",
                     vital_status = "OS",
                     race_list = "Race",
                     person_neoplasm_cancer_status = "cancer_status",
                     age_at_initial_pathologic_diagnosis = "Age",
                     neoplasm_histologic_grade = "Grade",
                     stage_event_pathologic_stage = "Stage",
                     stage_event_tnm_categories = "TNM")

  # Survival rows for one vital status: `time_col` becomes OS.Time (in years),
  # `unused_col` is dropped, OS is 1 for "Dead" and 0 otherwise.
  survival_arm <- function(wanted_status, time_col, unused_col) {
    arm <- dplyr::filter(clinical_trait, vital_status == wanted_status)
    arm[[unused_col]] <- NULL
    names(arm)[names(arm) == time_col] <- "OS.Time"
    arm <- rename_known(arm, column_labels)
    arm$OS <- ifelse(arm$OS == "Dead", 1, 0)
    # Day counts may arrive as text/factor; coerce before converting to years.
    arm$OS.Time <- as.numeric(as.character(arm$OS.Time)) / 365
    arm
  }
  dead_patient <- survival_arm("Dead", "days_to_death",
                               "days_to_last_followup")
  alive_patient <- survival_arm("Alive", "days_to_last_followup",
                                "days_to_death")
  survival_data <- rbind(dead_patient, alive_patient)
  write.csv(survival_data, file = out_file("clinical_"))

  ### copy number segments
  cnv_query <- GDCquery(project = cancer_type,
                        data.category = "Copy Number Variation",
                        data.type = "Copy Number Segment")
  GDCdownload(cnv_query)
  cnv_table <- as.data.frame(unwrap_assay(GDCprepare(query = cnv_query)))
  write.csv(cnv_table, file = out_file("Copy-Number-Variation_"))

  ### DNA methylation (legacy archive)
  meth_query <- GDCquery(project = cancer_type,
                         legacy = TRUE,
                         data.category = "DNA methylation")
  GDCdownload(meth_query)
  meth_matrix <- unwrap_assay(GDCprepare(query = meth_query))
  write.csv(meth_matrix, file = out_file("methylation_", "2.csv"))

  ### miRNA quantification
  mir_query <- GDCquery(project = cancer_type,
                        data.category = "Transcriptome Profiling",
                        data.type = "miRNA Expression Quantification",
                        workflow.type = "BCGSC miRNA Profiling")
  GDCdownload(mir_query)
  mir_table <- as.data.frame(GDCprepare(query = mir_query))
  # First column holds the miRNA identifiers: use them as row names.
  row.names(mir_table) <- as.character(mir_table[[1]])
  mir_table <- mir_table[, -1, drop = FALSE]
  # Column names are "<measure>_TCGA<rest of barcode>"; collect the barcodes.
  barcode_tail <- vapply(
    strsplit(colnames(mir_table), split = "TCGA", fixed = TRUE),
    function(parts) parts[2],
    character(1)
  )
  barcode_tail <- unique(barcode_tail[!is.na(barcode_tail)])
  rpkm_names <- paste0("reads_per_million_miRNA_mapped_TCGA", barcode_tail)
  count_names <- paste0("read_count_TCGA", barcode_tail)
  write.csv(mir_table[, rpkm_names, drop = FALSE],
            file = out_file("miRNAs_RPKM"))
  write.csv(mir_table[, count_names, drop = FALSE],
            file = out_file("miRNAs_"))

  invisible(NULL)
}
Download_TCGA(cancer_type, release)

三、多线程批量下载所有TCGA

调用19线程

#!/usr/bin/env Rscript
library(stringr)
library(parallel)

# projects.csv lists the GDC projects to fetch; its project_id column holds
# the ids handed to Download_TCGA().
cancerType <- read.csv("projects.csv", header = TRUE)
if (!"project_id" %in% names(cancerType)) {
  stop("projects.csv must contain a 'project_id' column", call. = FALSE)
}
cancer_type <- as.character(cancerType$project_id)


###download counts data

#' Download the TCGA data of one GDC project into its own directory.
#'
#' Creates `<out_root>/<cancer_type>`, works inside it, and uses TCGAbiolinks
#' to query, download and prepare: HTSeq counts and FPKM expression (all
#' samples, plus "NT" and "TP" subsets), clinical data (index table and a
#' survival table built from the XML patient records), copy-number segments,
#' legacy DNA methylation and miRNA quantification. File names are built from
#' `cancer_type`, a label and the current GDC release label, joined with "-".
#' All helpers are defined inside the function so it can be shipped to
#' `parallel` workers as a single object.
#'
#' NOTE(review): the workflow names ("HTSeq - Counts", "HTSeq - FPKM") and the
#' `legacy = TRUE` argument depend on the GDC release / TCGAbiolinks version in
#' use -- confirm they are still accepted before relying on this function.
#'
#' @param cancer_type GDC project id handed to `GDCquery()`, e.g. "TCGA-BRCA".
#' @param out_root Directory under which one sub-directory per project is
#'   created. Defaults to the location the script originally hard-coded.
#' @return `NULL`, invisibly. Called for its side effects.
Download_TCGA <- function(cancer_type, out_root = "~/Desktop/tcga_test") {
  suppressMessages(library(TCGAbiolinks))
  suppressMessages(library(SummarizedExperiment))
  suppressMessages(library(dplyr))
  # DT is still attached as before, although nothing below needs it any more.
  suppressMessages(library(DT))

  out_dir <- paste0(out_root, "/", cancer_type)
  dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)
  # Work inside the project directory, and always restore the previous one.
  previous_dir <- setwd(out_dir)
  on.exit(setwd(previous_dir), add = TRUE)

  # Label of the GDC release being served: the part before " - <date>".
  # NOTE(review): assumes the "<label> - <date>" layout of data_release.
  gdc_info <- getGDCInfo()
  Release <- as.character(gdc_info$data_release)
  release <- trimws(sub("\\s+-\\s+.*$", "", Release))

  # Build an output file name with the original "-"-joined layout.
  out_file <- function(label, suffix = ".csv") {
    paste(cancer_type, label, release, suffix, sep = "-")
  }

  # Return the assay of a SummarizedExperiment; hand back anything else
  # (GDCprepare() yields plain tables for some data types) unchanged.
  unwrap_assay <- function(prepared) {
    if (methods::is(prepared, "SummarizedExperiment")) {
      assay(prepared)
    } else {
      prepared
    }
  }

  # Rename the columns of `df` found in `map` (old name -> new name);
  # names in `map` that `df` does not have are ignored.
  rename_known <- function(df, map) {
    hit <- names(df) %in% names(map)
    names(df)[hit] <- unname(map[names(df)[hit]])
    df
  }

  # Download one expression workflow and write the full matrix plus the
  # solid-tissue-normal ("NT") and primary-tumour ("TP") column subsets.
  export_expression <- function(workflow, label) {
    query <- GDCquery(project = cancer_type,
                      data.category = "Transcriptome Profiling",
                      data.type = "Gene Expression Quantification",
                      workflow.type = workflow)
    GDCdownload(query)
    expr <- as.data.frame(assay(GDCprepare(query = query)))
    write.csv(expr, file = out_file(paste0(label, "_")))
    normal_ids <- TCGAquery_SampleTypes(barcode = colnames(expr),
                                        typesample = c("NT"))
    tumor_ids <- TCGAquery_SampleTypes(barcode = colnames(expr),
                                       typesample = c("TP"))
    # drop = FALSE keeps the barcode header when only one sample matches.
    write.csv(expr[, normal_ids, drop = FALSE],
              file = out_file(paste0(label, "_normal_")))
    write.csv(expr[, tumor_ids, drop = FALSE],
              file = out_file(paste0(label, "_tumor_")))
  }

  ### expression: counts and FPKM
  export_expression("HTSeq - Counts", "Counts")
  export_expression("HTSeq - FPKM", "FPKM")

  ### clinical: indexed table, written as delivered
  clinical_index <- GDCquery_clinic(project = cancer_type, type = "clinical")
  write.csv(clinical_index, "clinical_full.csv")

  ### clinical: patient records parsed from the XML files
  clc_query <- GDCquery(project = cancer_type,
                        data.category = "Clinical",
                        file.type = "xml")
  GDCdownload(clc_query)
  patient <- GDCprepare_clinic(clc_query, clinical.info = "patient")

  wanted_cols <- c("bcr_patient_barcode", "gender", "vital_status",
                   "days_to_death", "days_to_last_followup", "race_list",
                   "person_neoplasm_cancer_status",
                   "stage_event_pathologic_stage",
                   "stage_event_tnm_categories")
  required_cols <- c("bcr_patient_barcode", "vital_status",
                     "days_to_death", "days_to_last_followup")
  missing_cols <- setdiff(required_cols, names(patient))
  if (length(missing_cols) > 0) {
    stop("Clinical data for ", cancer_type, " lacks required column(s): ",
         paste(missing_cols, collapse = ", "), call. = FALSE)
  }
  # Descriptive columns are kept only when present, so a project without,
  # say, staging information does not abort the whole export.
  kept_cols <- intersect(wanted_cols, names(patient))
  clinical_trait <- patient[, kept_cols, drop = FALSE] %>%
    distinct(bcr_patient_barcode, .keep_all = TRUE)

  # Age/Grade are only renamed if such columns are present; the selection
  # above does not currently keep them.
  column_labels <- c(bcr_patient_barcode = "Barcode",
                     gender = "Gender",
                     vital_status = "OS",
                     race_list = "Race",
                     person_neoplasm_cancer_status = "cancer_status",
                     age_at_initial_pathologic_diagnosis = "Age",
                     neoplasm_histologic_grade = "Grade",
                     stage_event_pathologic_stage = "Stage",
                     stage_event_tnm_categories = "TNM")

  # Survival rows for one vital status: `time_col` becomes OS.Time (in years),
  # `unused_col` is dropped, OS is 1 for "Dead" and 0 otherwise.
  survival_arm <- function(wanted_status, time_col, unused_col) {
    arm <- dplyr::filter(clinical_trait, vital_status == wanted_status)
    arm[[unused_col]] <- NULL
    names(arm)[names(arm) == time_col] <- "OS.Time"
    arm <- rename_known(arm, column_labels)
    arm$OS <- ifelse(arm$OS == "Dead", 1, 0)
    # Day counts may arrive as text/factor; coerce before converting to years.
    arm$OS.Time <- as.numeric(as.character(arm$OS.Time)) / 365
    arm
  }
  dead_patient <- survival_arm("Dead", "days_to_death",
                               "days_to_last_followup")
  alive_patient <- survival_arm("Alive", "days_to_last_followup",
                                "days_to_death")
  survival_data <- rbind(dead_patient, alive_patient)
  write.csv(survival_data, file = out_file("clinical_"))

  ### copy number segments
  cnv_query <- GDCquery(project = cancer_type,
                        data.category = "Copy Number Variation",
                        data.type = "Copy Number Segment")
  GDCdownload(cnv_query)
  cnv_table <- as.data.frame(unwrap_assay(GDCprepare(query = cnv_query)))
  write.csv(cnv_table, file = out_file("Copy-Number-Variation_"))

  ### DNA methylation (legacy archive)
  meth_query <- GDCquery(project = cancer_type,
                         legacy = TRUE,
                         data.category = "DNA methylation")
  GDCdownload(meth_query)
  meth_matrix <- unwrap_assay(GDCprepare(query = meth_query))
  write.csv(meth_matrix, file = out_file("methylation_", "2.csv"))

  ### miRNA quantification
  mir_query <- GDCquery(project = cancer_type,
                        data.category = "Transcriptome Profiling",
                        data.type = "miRNA Expression Quantification",
                        workflow.type = "BCGSC miRNA Profiling")
  GDCdownload(mir_query)
  # Written as delivered: the prepared miRNA result is handled as a plain
  # table here, with assay() applied only if it is a SummarizedExperiment.
  mir_table <- unwrap_assay(GDCprepare(query = mir_query))
  write.csv(mir_table, file = out_file("miRNAs_", "2.csv"))

  message(paste0(cancer_type, " Download Finished!"))
  invisible(NULL)
}
# Run the downloads on a pool of worker processes: at most 19, and never more
# workers than there are projects.
n_workers <- max(1L, min(19L, length(cancer_type)))
cl <- makeCluster(n_workers)
download_results <- tryCatch(
  {
    # The wrapper below looks Download_TCGA up by name on the worker, so the
    # function has to be exported to every worker first.
    clusterExport(cl, "Download_TCGA")
    parLapply(cl, cancer_type, function(project) {
      # Isolate failures: one broken project must not abort the other
      # projects queued on the same worker.
      tryCatch(
        Download_TCGA(project),
        error = function(e) {
          paste0(project, " failed: ", conditionMessage(e))
        }
      )
    })
  },
  # Shut the workers down even when the run itself fails.
  finally = stopCluster(cl)
)
# Successful downloads return NULL; failures come back as a message string.
failed_downloads <- Filter(is.character, download_results)
if (length(failed_downloads) > 0) {
  warning(paste(unlist(failed_downloads), collapse = "\n"), call. = FALSE)
}

你可能感兴趣的:(下载并预处理TCGA数据)