R语言 2022 TCGA数据库提取 新版TCGA 表格提取

点击看新版本

下面为历史旧版本

# Interactive look at the working directory: entries whose name contains
# ".tsv", the plain listing, the listing including hidden entries, and the
# current path.
dir(pattern = "\\.tsv")
list.files()

list.files(all.files = TRUE)

list.files()
getwd()


# Build the list of files to copy: for every entry of the working directory,
# take the first file found inside it and record its full path.
# NOTE(review): presumably each entry is one download folder holding a single
# expression table — confirm against the actual download layout.
entries <- list.files()

# list.files() on a regular file (or an empty folder) returns character(0),
# so `[1]` yields NA for entries that contain nothing to copy.
first_file <- vapply(
  entries,
  function(entry) list.files(entry)[1],
  character(1),
  USE.NAMES = FALSE
)

# All candidate paths, kept as a one-column data frame named `filename`.
filename <- data.frame(
  filename = file.path(getwd(), entries, first_file),
  stringsAsFactors = FALSE
)
filename

# Keep only the entries that really contain a file. The NA is tested directly
# (instead of searching the pasted path for the text "NA"), so paths whose
# folder or file name happens to contain the letters "NA" are no longer lost,
# and the number of entries is no longer fixed at 20.
filename2 <- filename$filename[!is.na(first_file)]
filename2


# Copy the collected files into one common folder.
getwd()
# Create the target directory.
dir.create(path = "C:/Users/shao/Desktop/TCGAxin/data")

# The destination is relative to the current working directory.
file.copy(from = filename2, to = "data")


# Extract the information stored in the JSON metadata file.
library(rjson)
# Read it first, before the working directory is changed below.
result <- fromJSON(
  file = "metadata.cart.2022-05-04.json"
)

# Switch to the data directory.

setwd(dir = "C:/Users/shao/Desktop/TCGAxin/data")




# Flatten the parsed metadata list `result` into a data frame with one row per
# record and three columns: file_name, case_id and entity_submitter_id.
# The last two are taken from the first element of "associated_entities".
#
# Every record in `result` is processed (the row count is no longer fixed at
# 1000), so a shorter list does not fail with "subscript out of bounds" and a
# longer one is not silently truncated.
first_entity_field <- function(entry, field) {
  entry[["associated_entities"]][[1]][[field]]
}

Metadata <- data.frame(
  file_name = vapply(
    result, function(entry) entry[["file_name"]],
    character(1), USE.NAMES = FALSE
  ),
  case_id = vapply(
    result, first_entity_field, character(1),
    field = "case_id", USE.NAMES = FALSE
  ),
  entity_submitter_id = vapply(
    result, first_entity_field, character(1),
    field = "entity_submitter_id", USE.NAMES = FALSE
  ),
  stringsAsFactors = FALSE
)
names(Metadata)


# How many records share a case_id with an earlier record.
table(duplicated(Metadata$case_id))
write.csv(Metadata, "metadataID1.csv")

# One case has no control sample.

# [Changed using Linux]

# Read every file whose name ends in ".tsv" into its own data frame.
# The dot is escaped and the pattern anchored, so only a real ".tsv" suffix
# matches, and only that trailing suffix is stripped from the name.
lf <- list.files(pattern = "\\.tsv$")
# File names without the ".tsv" suffix; used as the variable names below.
files <- sub("\\.tsv$", "", lf)
files
for (i in seq_along(files)) {
  # One variable per file, named after the file, because the later steps refer
  # to the tables by those names.
  assign(files[i], read.table(lf[i], sep = "\t", header = TRUE))
}


# Keep only the needed part of one sample table: drop the first four rows and
# keep the "gene_id" and "fpkm_unstranded" columns.
# NOTE(review): the first four rows are presumably summary rows rather than
# genes — confirm against the input files.
TCGA_CN_5360_01A_01R_1436_07 <- TCGA_CN_5360_01A_01R_1436_07[
  -(1:4),
  c("gene_id", "fpkm_unstranded")
]
# Label the value column with the sample name.
colnames(TCGA_CN_5360_01A_01R_1436_07)[2] <- "TCGA_CN_5360_01A_01R_1436_07"



# [Extracted with the help of Excel]


# Merge several data frames into one.
#
# Args:
#   dat: list of data frames (defaults to an empty list).
#   ...: further arguments forwarded to every merge() call.
#
# Returns:
#   as.data.frame(dat) when `dat` has fewer than two elements; otherwise the
#   elements merged pairwise from left to right with all = TRUE, so rows that
#   are missing from some inputs are kept.
multimerge <- function(dat = list(), ...) {
  if (length(dat) >= 2) {
    # Fold the list: merge the running result with the next element.
    merge_keep_all <- function(merged, nxt) merge(merged, nxt, all = TRUE, ...)
    return(Reduce(merge_keep_all, dat))
  }
  as.data.frame(dat)
}


# Merge the per-sample tables listed below into one table with multimerge().
# The tables are looked up by variable name, in the order given.
dataALL <- multimerge(mget(c(
  "TCGA_CN_A497_01A_11R_A24H_07",
  "TCGA_CV_5430_01A_02R_1686_07",
  "TCGA_CV_6962_01A_11R_1915_07",
  "TCGA_CV_7177_01A_11R_2016_07",
  "TCGA_CV_7248_01A_11R_2016_07",
  "TCGA_CV_7410_01A_21R_2081_07",
  "TCGA_D6_6517_01A_11R_1873_07",
  "TCGA_F7_A50I_01A_11R_A28V_07",
  "TCGA_UF_A7JH_01A_21R_A34R_07"
)))


######
# Code for the new variables: export it, then copy it into a new Untitled
# script and run it there.
# [Generate new tax]
# For every name in `files`, build one line of R code from the template by
# replacing each "W" with that name. Each line is also stored in q1, q2, ...
a <- vector("list", length(files))
for (i in seq_along(files)) {
  a[[i]] <- cbind(
    assign(
      paste0("q", i),
      gsub("W", files[i], "W$tax <- paste(W$name,'=',W$taxID)")
    )
  )
}
a
# Flatten to a one-column data frame and export it.
a2 <- unlist(a)
a2 <- as.data.frame(a2)
write.csv(a2, file = "w1.csv")

## [Keep two variables]
# Template of the generated statement:
# b <- b[,c('tax','numUniqueReads')]
# For every name in `files`, build one line of R code that drops the first
# four rows of that table and keeps its "gene_id" and "fpkm_unstranded"
# columns. Each line is also stored in q1, q2, ...
b <- list()
for (i in seq_along(files)) {
  # The template holds double quotes, so it is written as a single-quoted
  # string; inside a double-quoted string they would end the literal early
  # and the script would not parse.
  b[[i]] <- cbind(assign(
    paste0("q", i),
    paste0(
      files[i], "<-", files[i],
      '[-c(1:4),c("gene_id","fpkm_unstranded")]'
    )
  ))
}
b
# Flatten to a one-column data frame and export it.
b2 <- unlist(b)
b2 <- as.data.frame(b2)
write.csv(b2, "w2.csv")

# Rename.
# Template of the generated statement:
# names(a)[2] <- 'Control105A'
# For every name in `files`, build one line of R code that sets the second
# column name of that table to the table's own name. The closing quote is
# appended at the end, after a trailing "_report" has been removed from the
# label. Each unfinished line is also stored in q1, q2, ...
c <- vector("list", length(files))
for (i in seq_along(files)) {
  c[[i]] <- cbind(
    assign(
      paste0("q", i),
      paste0('names(', files[i], ')[2] <- ', '"', files[i])
    )
  )
}
c
c2 <- unlist(c)
c2 <- as.data.frame(c2)
c2
# Strip a trailing "_report" from the label.
c2[, 1] <- sub("_report$", "\\1", c2[, 1])
c2
# Close the string literal of each generated statement.
c2[, 1] <- paste0(c2[, 1], '"')
c2
write.csv(c2, file = "w3.csv")
######







# Load the clinical table, keeping only its first three columns.
M1 <- read.csv(file = "clinical.csv")[, 1:3]
library(tidyverse)
names(M1)
head(M1, n = 5)

# Full join of the file metadata with the clinical table on case_id, so rows
# without a match on either side are kept.
Metadata2 <- full_join(x = Metadata, y = M1, by = "case_id")

write.csv(Metadata2, file = "metadataID.csv")













##
# getwd()
# 




# #正则提取
# library(inferregex)
# zhengzhe <- read.csv("正则提取.csv")
# #提取正则
# library(inferregex)
# #s <- zhengzhe[7,1]
# #infer_regex(s)$regex
# 
# #zhengzhe2 <- grep(pattern="^\\s{6}\"[a-z]{4}_[a-z]{2}\":",zhengzhe[,1])
# zhengzhe2 <- grep(pattern="case_id",zhengzhe[,1])
# zhengzhe3 <- as.data.frame(zhengzhe[zhengzhe2,])
# 
# #s <- zhengzhe[11,1]
# #infer_regex(s)$regex
# 
# zhengzhe2_2 <- grep(pattern="file_name",zhengzhe[,1])
# zhengzhe2_2 <- as.data.frame(zhengzhe[zhengzhe2_2,])
# zhengzhe2_2_2 <- grep(pattern=".tsv",zhengzhe2_2[,1])
# zhengzhe2_2_2 <- as.data.frame(zhengzhe2_2[zhengzhe2_2_2,])
# 
# dim(zhengzhe3)
# dim(zhengzhe2_2_2)
# 
# s <- zhengzhe[1,1]
# infer_regex(s)$regex
# zhengzhe4 <- grep(pattern="^\\s{2}\"[a-z]{4}_[a-z]{6}\":\\s\"[A-Z]{3}\",\\s$",zhengzhe[,1])
# zhengzhe4 <- as.data.frame(zhengzhe[zhengzhe4,])
# dim(zhengzhe4)








你可能感兴趣的:(r语言)