将TCGA下载的数据合并成矩阵

脚本文件:TCGA.R

查看帮助信息:

/bin/Rscript TCGA.R --help

Options:
        --my_dir=MY_DIR
                设置文件根目录

        --mainfest=MAINFEST
                MAINFEST.txt文件路径

        --clinical_file=CLINICAL_FILE
                clinical文件路径

        --sample_sheet=SAMPLE_SHEET
                samplesheet文件路径

        --exclude_file_pattern=EXCLUDE_FILE_PATTERN
                排除的文件正则模式

        --output_dir=OUTPUT_DIR
                设置输出文件路径

        -h, --help
                Show this help message and exit


#示例
/bin/Rscript TCGA.R  --my_dir /home/whq/myproject/zjl20211220/code/analysis/otherfile/TCGAfile/ --mainfest ../TCGA_dowloadfile/gdc_manifest_20220117_113651.txt --clinical_file ../TCGA_dowloadfile/clinical.tsv --sample_sheet ../TCGA_dowloadfile/gdc_sample_sheet.2022-01-17.tsv --exclude_file_pattern star_gene_counts.tsv.gz --output_dir ../

具体代码

if(T){
  library(dplyr,quietly = T)
  library(stringr,quietly = T)
  library(openxlsx,quietly = T)
  library(optparse,quietly = T)
  library(plyr,quietly = T)
}

# Command-line interface of the script. The `help` strings are printed verbatim
# by `--help`, so they are kept exactly as they were.
option_list <- list(
  make_option("--my_dir", type = "character", default = "./",
              help = "设置文件根目录"),
  make_option("--mainfest", type = "character",
              help = "MAINFEST.txt文件路径"),
  make_option("--clinical_file", type = "character",
              help = "clinical文件路径"),
  make_option("--sample_sheet", type = "character",
              help = "samplesheet文件路径"),
  make_option("--exclude_file_pattern", type = "character",
              default = "xxxxxxxxxxxxx", help = "排除的文件正则模式"),
  make_option("--output_dir", type = "character", default = "./",
              help = "设置输出文件路径")
)

args <- parse_args(OptionParser(option_list = option_list))

# The three input-file options have no default, so they evaluate to NULL when
# omitted. Fail here with an explicit message instead of letting a later
# read.csv(NULL) abort with an unrelated-looking error.
required_opts <- c("mainfest", "clinical_file", "sample_sheet")
missing_opts <- required_opts[
  vapply(required_opts, function(opt) is.null(args[[opt]]), logical(1))
]
if (length(missing_opts) > 0) {
  stop(
    "Missing required option(s): ",
    paste0("--", missing_opts, collapse = ", "),
    call. = FALSE
  )
}

# Example parameter values for interactive debugging.
# The branch is disabled on purpose and is never executed.
if (FALSE) {
  my_dir <- "/home/whq/myproject/zjl20211220/code/analysis/otherfile/TCGAfile/"
  mainfest <- "../TCGA_dowloadfile/gdc_manifest_20220117_113651.txt"
  clinical_file <- "../TCGA_dowloadfile/clinical.tsv"
  sample_sheet <- "../TCGA_dowloadfile/gdc_sample_sheet.2022-01-17.tsv"
  exclude_file_pattern <- "star_gene_counts.tsv.gz"
  output_dir <- "../"
}

# Set parameters: copy each parsed command-line option into the script-level
# variable of the same name used by the rest of the script.
my_dir <- args[["my_dir"]]
mainfest <- args[["mainfest"]]
clinical_file <- args[["clinical_file"]]
sample_sheet <- args[["sample_sheet"]]
exclude_file_pattern <- args[["exclude_file_pattern"]]
output_dir <- args[["output_dir"]]


# Switch to the data root; every relative path used after this point
# (input tables, per-file directories, output directory) is resolved from here.
setwd(my_dir)

# Load the folder/metadata information from the three tab-separated tables.
# read.delim() has the same defaults as read.csv(sep = "\t").
tcga_mainfest <- read.delim(mainfest)
tcga_clinical <- read.delim(clinical_file)
tcga_sample_sheet <- read.delim(sample_sheet)


# Build the expression matrix.

# Add the relative path "<File.ID>/<File.Name>" of every sample-sheet row and
# flag the rows whose path matches the exclusion regular expression.
tcga_sample_sheet_filter <- mutate(
  tcga_sample_sheet,
  file_dir = paste(File.ID, File.Name, sep = "/"),
  exclud = str_detect(file_dir, exclude_file_pattern)
)

# Keep only the rows that were not flagged.
tcga_mat_name_filter <- filter(tcga_sample_sheet_filter, !exclud)

# Read one headerless tab-separated file; the first column becomes the row
# names and the remaining column is labelled with the case ID.
# NOTE(review): a single name is supplied, so this presumably expects exactly
# one data column per file - confirm against the downloaded file format.
read_sample_column <- function(file_dir, Case.ID) {
  sample_counts <- read.delim(file_dir, row.names = 1, header = FALSE)
  setNames(sample_counts, Case.ID)
}

# mlply() calls the reader once per row, matching columns to arguments by name.
tcga_mat <- mlply(
  dplyr::select(tcga_mat_name_filter, file_dir, Case.ID),
  read_sample_column
)

# Row names present in every sample.
tcga_mat_rownames <- Reduce(intersect, lapply(tcga_mat, rownames))

# Align every sample on the shared rows, then bind the samples column-wise.
tcga_mat <- do.call(
  cbind,
  lapply(tcga_mat, function(sample_df) {
    sample_df[tcga_mat_rownames, , drop = FALSE]
  })
)

# Collect the annotation tables.

# List every file whose name matches "annotation" inside each File.ID
# directory. The per-directory results are flattened into one character vector
# so that a directory holding several matching files contributes each of them
# separately; passing a multi-element path vector to read.csv() is an error.
# NOTE(review): this walks the File.ID column of tcga_sample_sheet_filter, i.e.
# all sample-sheet rows - confirm whether rows flagged by `exclud` should be
# skipped here as well.
anno_files <- unlist(lapply(
  as.character(tcga_sample_sheet_filter$File.ID),
  function(file_id) {
    list.files(file_id, pattern = "annotation", full.names = TRUE)
  }
))

# Read each annotation file (tab-separated, with header) and stack the rows.
# When no annotation file exists the list is empty and tcga_anno is NULL.
tcga_anno <- do.call(
  rbind,
  lapply(anno_files, function(anno_path) {
    read.csv(anno_path, header = TRUE, sep = "\t")
  })
)

# Integrate annotation and clinical information.
if (is.null(tcga_anno)) {
  # The annotation step produced no table; merge() cannot join against NULL,
  # so the clinical table is exported on its own.
  tcga_intagrated_info <- tcga_clinical
} else {
  # Left join: keep every clinical row, attach annotations where
  # case_id == entity_id.
  tcga_intagrated_info <- merge(
    tcga_clinical, tcga_anno,
    by.x = "case_id", by.y = "entity_id", all.x = TRUE
  )
}

# file.path() inserts the path separator, so --output_dir works with or
# without a trailing slash (paste0() silently produced e.g. "outtcga_mat.csv").
write.csv(
  tcga_mat,
  file.path(output_dir, "tcga_mat.csv"),
  quote = FALSE, row.names = TRUE
)
write.xlsx(
  tcga_intagrated_info,
  file.path(output_dir, "tcga_intagrated_info.xlsx"),
  overwrite = TRUE
)

你可能感兴趣的:(将TCGA下载的数据合并成矩阵)