TCGA.R
/bin/Rscript TCGA.R --help
Options:
--my_dir=MY_DIR
设置文件根目录
--mainfest=MAINFEST
MAINFEST.txt文件路径
--clinical_file=CLINICAL_FILE
clinical文件路径
--sample_sheet=SAMPLE_SHEET
samplesheet文件路径
--exclude_file_pattern=EXCLUDE_FILE_PATTERN
排除的文件正则模式
--output_dir=OUTPUT_DIR
设置输出文件路径
-h, --help
Show this help message and exit
#示例
/bin/Rscript TCGA.R --my_dir /home/whq/myproject/zjl20211220/code/analysis/otherfile/TCGAfile/ --mainfest ../TCGA_dowloadfile/gdc_manifest_20220117_113651.txt --clinical_file ../TCGA_dowloadfile/clinical.tsv --sample_sheet ../TCGA_dowloadfile/gdc_sample_sheet.2022-01-17.tsv --exclude_file_pattern star_gene_counts.tsv.gz --output_dir ../
具体代码
if(T){
library(dplyr,quietly = T)
library(stringr,quietly = T)
library(openxlsx,quietly = T)
library(optparse,quietly = T)
library(plyr,quietly = T)
}
# Command-line interface.
# The help strings below are what `--help` prints. The default of
# --exclude_file_pattern is a placeholder pattern; it is presumably chosen so
# that it matches no real file path, i.e. nothing is excluded by default.
option_list <- list(
  make_option("--my_dir", type = "character", default = "./", help = "设置文件根目录"),
  make_option("--mainfest", type = "character", help = "MAINFEST.txt文件路径"),
  make_option("--clinical_file", type = "character", help = "clinical文件路径"),
  make_option("--sample_sheet", type = "character", help = "samplesheet文件路径"),
  make_option("--exclude_file_pattern", type = "character", default = "xxxxxxxxxxxxx", help = "排除的文件正则模式"),
  make_option("--output_dir", type = "character", default = "./", help = "设置输出文件路径")
)
opt_parser <- OptionParser(option_list = option_list)
args <- parse_args(opt_parser)

# These three options have no default, so they are absent from `args` when the
# user omits them. Fail here with a clear message instead of letting a later
# file read fail on a NULL path.
required_opts <- c("mainfest", "clinical_file", "sample_sheet")
missing_opts <- required_opts[
  vapply(required_opts, function(opt) is.null(args[[opt]]), logical(1))
]
if (length(missing_opts) > 0) {
  print_help(opt_parser)
  stop(
    "Missing required option(s): ",
    paste0("--", missing_opts, collapse = ", "),
    call. = FALSE
  )
}
# Example argument values, kept for reference / interactive debugging.
# The condition is constant FALSE, so nothing in this block is ever executed.
if (FALSE) {
  my_dir <- "/home/whq/myproject/zjl20211220/code/analysis/otherfile/TCGAfile/"
  mainfest <- "../TCGA_dowloadfile/gdc_manifest_20220117_113651.txt"
  clinical_file <- "../TCGA_dowloadfile/clinical.tsv"
  sample_sheet <- "../TCGA_dowloadfile/gdc_sample_sheet.2022-01-17.tsv"
  exclude_file_pattern <- "star_gene_counts.tsv.gz"
  output_dir <- "../"
}
# Set parameters: copy each parsed command-line option into a script-level
# variable of the same name.
my_dir <- args[["my_dir"]]
mainfest <- args[["mainfest"]]
clinical_file <- args[["clinical_file"]]
sample_sheet <- args[["sample_sheet"]]
exclude_file_pattern <- args[["exclude_file_pattern"]]
output_dir <- args[["output_dir"]]
# Switch to the data root first: every relative path used from here on
# (input files, per-file directories, output directory) is resolved against it.
setwd(my_dir)

# Load the input tables. All three are tab-separated files with a header row.
tcga_mainfest <- read.delim(mainfest)
tcga_clinical <- read.delim(clinical_file)
tcga_sample_sheet <- read.delim(sample_sheet)
# Build the expression matrix.

# Add the relative path "<File.ID>/<File.Name>" of every sample-sheet entry and
# flag the entries whose path matches the exclusion pattern.
tcga_sample_sheet_filter <- mutate(
  tcga_sample_sheet,
  file_dir = paste(File.ID, File.Name, sep = "/"),
  exclud = str_detect(file_dir, exclude_file_pattern)
)

# Keep only the entries that are not flagged for exclusion.
tcga_mat_name_filter <- filter(tcga_sample_sheet_filter, !exclud)

# Read one table per remaining file. mlply() calls the function once per row,
# passing the columns by name, so the argument names must match the column
# names. The first column of each file becomes the row names and the data
# column is named after the Case.ID.
tcga_mat <- mlply(
  dplyr::select(tcga_mat_name_filter, file_dir, Case.ID),
  function(file_dir, Case.ID) {
    sample_table <- read.csv(file_dir, sep = "\t", row.names = 1, header = FALSE)
    setNames(sample_table, Case.ID)
  }
)

# Row names present in every table.
tcga_mat_rownames <- Reduce(intersect, lapply(tcga_mat, rownames))

# Restrict every table to the shared rows (same order) and bind them
# column-wise into one table.
tcga_mat <- do.call(
  cbind,
  lapply(tcga_mat, function(sample_table) {
    sample_table[tcga_mat_rownames, , drop = FALSE]
  })
)
# Collect the annotation tables.
# For every File.ID directory in the sample sheet (excluded entries included),
# list the files whose name matches "annotation". The per-directory results
# are flattened into one character vector so that directories with no match
# contribute nothing and directories with several matches contribute each file
# separately (read.csv() accepts only one path at a time).
tcga_anno_files <- unlist(
  lapply(tcga_sample_sheet_filter$File.ID, function(file_id) {
    list.files(file_id, pattern = "annotation", full.names = TRUE)
  }),
  use.names = FALSE
)

# Read every annotation file (tab-separated, with header) and stack the rows.
# When no annotation file was found, the result is NULL.
tcga_anno <- do.call(
  rbind,
  lapply(tcga_anno_files, function(anno_path) {
    read.csv(anno_path, header = TRUE, sep = "\t")
  })
)
# Combine annotation and clinical information.
# Left join: every clinical row is kept and annotation columns are added where
# clinical `case_id` equals annotation `entity_id`. When there is no annotation
# table at all (tcga_anno is NULL), merge() would fail, so the clinical table
# is used unchanged.
tcga_intagrated_info <- if (is.null(tcga_anno)) {
  tcga_clinical
} else {
  merge(tcga_clinical, tcga_anno, by.x = "case_id", by.y = "entity_id", all.x = TRUE)
}

# Output file names are appended to output_dir. Add the path separator only
# when a non-empty output_dir does not already end in one, so "out" and "out/"
# both write inside the directory.
output_prefix <- if (nzchar(output_dir) && !grepl("[/\\\\]$", output_dir)) {
  paste0(output_dir, "/")
} else {
  output_dir
}

# Expression matrix as CSV (row names kept, no quoting) and the combined
# clinical/annotation table as an Excel workbook (existing file overwritten).
write.csv(tcga_mat, paste0(output_prefix, "tcga_mat.csv"), quote = FALSE, row.names = TRUE)
write.xlsx(tcga_intagrated_info, paste0(output_prefix, "tcga_intagrated_info.xlsx"), overwrite = TRUE)