# The code below is an old (historical) version of this script.
# Inspect the working directory before collecting file paths.
getwd()
dir(all.files = TRUE)
list.files(pattern = "\\.tsv")

# For every entry of the working directory, look up the first file it
# contains and build its full path. Entries that contain nothing (plain
# files, empty folders) yield NA here and are filtered out below.
entries <- list.files()
first_files <- vapply(
  entries,
  function(entry) list.files(entry)[1],
  character(1),
  USE.NAMES = FALSE
)

# `filename` keeps one row per entry, including the ".../NA" placeholders.
filename <- data.frame(
  filename = file.path(getwd(), entries, first_files),
  stringsAsFactors = FALSE
)
filename

# `filename2` is the character vector of paths that really exist. Filtering
# on is.na() (instead of pattern-matching "NA" in the path text) keeps valid
# paths that merely contain the letters "NA".
filename2 <- filename$filename[!is.na(first_files)]
filename2
# Copy the collected files into one common folder.
# The same absolute path is used for creating the folder, copying into it
# and switching to it, so the steps cannot drift apart when the working
# directory is not the parent of "data".
data_dir <- "C:/Users/shao/Desktop/TCGAxin/data"
getwd()
if (!dir.exists(data_dir)) {
  dir.create(data_dir) # create the target directory
}
file.copy(filename2, data_dir)

# Extract the JSON metadata (read relative to the current working
# directory, i.e. before switching to the data directory below).
library(rjson)
result <- fromJSON(file = "metadata.cart.2022-05-04.json")

# Switch to the data directory.
setwd(data_dir)
# Extract the parsed JSON records into a data frame with one row per record.
# Every record of `result` is processed (no hard-coded record count), and
# each column is built in one pass instead of growing the data frame row by
# row. For each record, the case id and submitter id are taken from the
# first element of its "associated_entities" list.
Metadata <- data.frame(
  file_name = vapply(
    result,
    function(entry) entry[["file_name"]],
    character(1),
    USE.NAMES = FALSE
  ),
  case_id = vapply(
    result,
    function(entry) entry[["associated_entities"]][[1]][["case_id"]],
    character(1),
    USE.NAMES = FALSE
  ),
  entity_submitter_id = vapply(
    result,
    function(entry) {
      entry[["associated_entities"]][[1]][["entity_submitter_id"]]
    },
    character(1),
    USE.NAMES = FALSE
  ),
  stringsAsFactors = FALSE
)
names(Metadata)

# Count how many case ids occur more than once.
table(duplicated(Metadata$case_id))
write.csv(Metadata, "metadataID1.csv")
# One case has no control.
# [Changed using Linux]
# Read all tables in a loop (final version).
# Match only real ".tsv" files: the dot is escaped and the pattern is
# anchored at the end of the file name.
lf <- list.files(pattern = "\\.tsv$")
# Strip the trailing ".tsv" suffix only; the remaining names are used as
# variable names in the loop below.
files <- sub("\\.tsv$", "", lf)
files

# Load every table into a global variable named after its file.
for (i in seq_along(files)) {
  assign(files[i], read.table(lf[i], sep = "\t", header = TRUE))
}

# Extract the required part: drop the first four rows and keep the gene id
# and FPKM columns.
# NOTE(review): the first four rows are presumably summary rows rather than
# genes -- confirm against the input files.
TCGA_CN_5360_01A_01R_1436_07 <-
  TCGA_CN_5360_01A_01R_1436_07[-c(1:4), c("gene_id", "fpkm_unstranded")]
# Name the value column after the sample.
names(TCGA_CN_5360_01A_01R_1436_07)[2] <- "TCGA_CN_5360_01A_01R_1436_07"
# [Extracted with the help of Excel]
# Merge several data frames into one.
#
# dat: a list of data frames.
# ...: further arguments forwarded to every merge() call (e.g. `by`).
#
# Returns the left-to-right fold of merge(all = TRUE, ...) over `dat`, so
# rows without a match are kept. A list with fewer than two elements is
# returned as as.data.frame(dat).
multimerge <- function(dat = list(), ...) {
  if (length(dat) < 2) {
    return(as.data.frame(dat))
  }
  Reduce(
    function(merged_so_far, next_table) {
      merge(all = TRUE, merged_so_far, next_table, ...)
    },
    dat
  )
}
# Combine the per-sample tables listed below into one table; multimerge()
# keeps rows that are missing from some of the samples.
dataALL <- multimerge(
  dat = list(
    TCGA_CN_A497_01A_11R_A24H_07,
    TCGA_CV_5430_01A_02R_1686_07,
    TCGA_CV_6962_01A_11R_1915_07,
    TCGA_CV_7177_01A_11R_2016_07,
    TCGA_CV_7248_01A_11R_2016_07,
    TCGA_CV_7410_01A_21R_2081_07,
    TCGA_D6_6517_01A_11R_1873_07,
    TCGA_F7_A50I_01A_11R_A28V_07,
    TCGA_UF_A7JH_01A_21R_A34R_07
  )
)
######
# Code for new variables: export it, then copy it into a new Untitled
# script and run it there.
# [Generate the new tax column]
# One generated line of R code per table name in `files`: every "W" in the
# template is replaced by the table name.
a <- vapply(
  files,
  function(table_name) {
    gsub("W", table_name, "W$tax <- paste(W$name,'=',W$taxID)")
  },
  character(1),
  USE.NAMES = FALSE
)
a

# Export the generated lines as a one-column table.
a2 <- data.frame(a2 = a, stringsAsFactors = FALSE)
write.csv(a2, "w1.csv")
## [Keep two variables]
# b <- b[,c('tax','numUniqueReads')]
# One generated line of R code per table name in `files`; each line drops
# the first four rows of the table and keeps the two listed columns.
# The code fragment is single-quoted because it contains double quotes.
b <- vapply(
  files,
  function(table_name) {
    paste0(
      table_name, "<-", table_name,
      '[-c(1:4),c("gene_id","fpkm_unstranded")]'
    )
  },
  character(1),
  USE.NAMES = FALSE
)
b

# Export the generated lines as a one-column table.
b2 <- data.frame(b2 = b, stringsAsFactors = FALSE)
write.csv(b2, "w2.csv")
# Rename
# names(a)[2] <- 'Control105A'
# One generated line of R code per table name in `files`; each line renames
# the second column of the table to the table name with a trailing
# "_report" removed.
# NOTE(review): the variable name `c` shadows base::c(); it is kept in case
# other code relies on it -- consider renaming.
c <- vapply(
  files,
  function(table_name) {
    paste0(
      "names(", table_name, ')[2] <- "',
      sub("_report$", "", table_name), '"'
    )
  },
  character(1),
  USE.NAMES = FALSE
)
c

# Export the generated lines as a one-column table.
c2 <- data.frame(c2 = c, stringsAsFactors = FALSE)
c2
write.csv(c2, "w3.csv")
######
# Attach clinical information to the file metadata.
library(tidyverse)

# Only the first three columns of the clinical table are used.
M1 <- read.csv("clinical.csv")[, 1:3]
names(M1)
head(M1, n = 5)

# Full join on case_id keeps rows that appear in only one of the two tables.
Metadata2 <- full_join(x = Metadata, y = M1, by = "case_id")
write.csv(Metadata2, file = "metadataID.csv")
##
# getwd()
#
# #正则提取
# library(inferregex)
# zhengzhe <- read.csv("正则提取.csv")
# #提取正则
# library(inferregex)
# #s <- zhengzhe[7,1]
# #infer_regex(s)$regex
#
# #zhengzhe2 <- grep(pattern="^\\s{6}\"[a-z]{4}_[a-z]{2}\":",zhengzhe[,1])
# zhengzhe2 <- grep(pattern="case_id",zhengzhe[,1])
# zhengzhe3 <- as.data.frame(zhengzhe[zhengzhe2,])
#
# #s <- zhengzhe[11,1]
# #infer_regex(s)$regex
#
# zhengzhe2_2 <- grep(pattern="file_name",zhengzhe[,1])
# zhengzhe2_2 <- as.data.frame(zhengzhe[zhengzhe2_2,])
# zhengzhe2_2_2 <- grep(pattern=".tsv",zhengzhe2_2[,1])
# zhengzhe2_2_2 <- as.data.frame(zhengzhe2_2[zhengzhe2_2_2,])
#
# dim(zhengzhe3)
# dim(zhengzhe2_2_2)
#
# s <- zhengzhe[1,1]
# infer_regex(s)$regex
# zhengzhe4 <- grep(pattern="^\\s{2}\"[a-z]{4}_[a-z]{6}\":\\s\"[A-Z]{3}\",\\s$",zhengzhe[,1])
# zhengzhe4 <- as.data.frame(zhengzhe[zhengzhe4,])
# dim(zhengzhe4)