清洗TCGA数据

接续上一节《从官网上下载TCGA数据》(https://www.jianshu.com/p/406bcb11411c),我们得到了如下文件夹,每个文件夹对应一个样本的测序数据。

清洗TCGA数据_第1张图片

1. 获得并读取路径下所有的测序数据的表达谱

# Build one merged count table from every sample folder under `path`.
# Result: `count`, a data frame whose first column `V1` holds the gene ids
# and whose remaining columns are named after the sample folders.
path <- "/Users/baiyunfan/desktop/TCGA"
folders <- list.files(path)
# Keep only directories; a stray file (e.g. a manifest) next to the sample
# folders would otherwise make the read below fail.
folders <- folders[dir.exists(file.path(path, folders))]
if (length(folders) == 0) {
  stop("No sample folders found under ", path, call. = FALSE)
}

# Read the single HTSeq count file inside folder `fd`.
# Returns a two-column data frame: `V1` (gene id) and a count column
# named after the folder.
read_htseq_counts <- function(fd) {
  files_name <- list.files(file.path(path, fd))
  # fixed = TRUE: match the literal ".htseq.counts", not a regex where
  # "." means "any character".
  files_name_gz <- files_name[grepl(".htseq.counts", files_name, fixed = TRUE)]
  if (length(files_name_gz) != 1) {
    stop(
      "Expected exactly one .htseq.counts file in ", file.path(path, fd),
      ", found ", length(files_name_gz),
      call. = FALSE
    )
  }
  mydata <- read.table(
    gzfile(file.path(path, fd, files_name_gz)),
    stringsAsFactors = FALSE
  )
  names(mydata)[2] <- fd
  mydata
}

# Merge all samples on the gene id column. Reduce() also handles the
# single-folder case, where `folders[2:length(folders)]` would have
# iterated over c(NA, folders[1]).
count <- Reduce(
  function(x, y) merge(x, y, by = "V1"),
  lapply(folders, read_htseq_counts)
)

# Drop the HTSeq summary rows (ids starting with "__", such as
# "__no_feature") by name. Removing "the first five rows" only works when
# the locale happens to sort "_" before letters.
count <- count[!startsWith(as.character(count$V1), "__"), ]

2. 读取metadata文件,将文件夹名与样本名一一对应

# Rename the sample columns of `count` from folder names to the
# `entity_submitter_id` recorded in the metadata JSON.
# rjson is a CRAN package; the old biocLite.R installer script is retired,
# so install from CRAN only when the package is missing.
if (!requireNamespace("rjson", quietly = TRUE)) {
  install.packages("rjson")
}
library(rjson)
library(tidyr)
library(dplyr)

r <- fromJSON(file = "/Users/baiyunfan/desktop/TCGA2/metadata.cart.2019-06-29.json")

# One entry per metadata record: `a` is the file_id, `b` is the submitter id
# of the record's first associated entity. vapply() fails loudly if a record
# lacks either field instead of silently returning a list.
a <- vapply(r, function(I) I$file_id, character(1))
b <- vapply(
  r,
  function(I) I$associated_entities[[1]]$entity_submitter_id,
  character(1)
)
result <- data.frame(a = a, b = b, stringsAsFactors = FALSE)

# Look up each sample column of `count` (folder name) among the file ids.
# NOTE(review): this assumes the folder names are the metadata `file_id`
# values - confirm against the download layout.
temp <- data.frame(a = colnames(count)[-1], stringsAsFactors = FALSE)
temp1 <- left_join(temp, result, by = "a")
if (anyNA(temp1[, 2])) {
  warning(
    "No metadata entry for folder(s): ",
    paste(temp1[is.na(temp1[, 2]), 1], collapse = ", "),
    call. = FALSE
  )
}
# The first column keeps its original label "ENSMBL" so that any later code
# referring to it by name keeps working.
colnames(count) <- c("ENSMBL", as.character(temp1[, 2]))

3. 将Ensembl ID转换为gene symbol,并对对应同一gene symbol的多行取平均值

# Replace the Ensembl ids in the first column of `count` with gene symbols,
# then collapse rows sharing a symbol by averaging. Result: `test`, with
# the same column names as `count`, one row per gene symbol, numeric counts.
library("org.Hs.eg.db")

# Strip the version suffix (everything from the first "."), as the
# annotation lookup below is keyed on unversioned ids.
ensids_id <- sub("[.].*$", "", as.character(count[, 1]))

cols <- c("SYMBOL", "GENENAME")
# Namespace-qualified because dplyr also exports select(); query each id once.
change <- AnnotationDbi::select(
  org.Hs.eg.db,
  keys = unique(ensids_id),
  columns = cols,
  keytype = "ENSEMBL"
)
# Keep the first symbol when an id maps to several.
change1 <- change[!duplicated(change[, 1]), ]
# Align by id rather than by position, so repeated ids after version
# stripping cannot shift the symbols against the rows of `count`.
count[, 1] <- change1[match(ensids_id, change1[, 1]), 2]

# Rows without a symbol cannot be grouped; drop them instead of producing
# an NA/NaN row.
has_symbol <- !is.na(count[, 1])
symbols <- as.character(count[has_symbol, 1])
expr <- as.matrix(count[has_symbol, -1, drop = FALSE])
storage.mode(expr) <- "double" # avoid integer overflow while summing

# Per-symbol mean = per-symbol sum / number of rows with that symbol.
# reorder = FALSE keeps symbols in order of first appearance.
sums <- rowsum(expr, group = symbols, reorder = FALSE)
n_per_symbol <- tabulate(match(symbols, rownames(sums)), nbins = nrow(sums))
means <- sums / n_per_symbol

# check.names = FALSE keeps the sample names (which may contain "-") intact.
test <- data.frame(
  rownames(means),
  means,
  check.names = FALSE,
  stringsAsFactors = FALSE
)
colnames(test) <- colnames(count)
rownames(test) <- NULL

4. 编辑分组信息

# Label every sample column of `test` as "normal" or "cancer" from the
# fourth "-"-separated field of its name. Result: `group1`, a two-column
# data frame (sample name, label); `control` holds the normal-sample indices.
group <- strsplit(colnames(test)[-1], "[-]")
# Fourth field of each sample name.
# NOTE(review): presumably the TCGA barcode sample/vial field such as
# "11A" or "01A" - confirm against the barcode format.
class <- vapply(group, function(i) i[4], character(1))
is_normal <- grepl("^11", class)
control <- which(is_normal)
group1 <- as.data.frame(colnames(test)[-1])
# Assign from the logical mask in one step. Indexing with -which(...)
# selects zero rows when there are no normal samples, which would leave
# every sample unlabeled.
group1[, 2] <- ifelse(is_normal, "normal", "cancer")

然后就得到了清洗好的TCGA数据啦
代码没加注释,就将就看吧 TAT

你可能感兴趣的:(清洗TCGA数据)