2018-09-06

提起lncRNA

options(stringsAsFactors = F)
expr_df_nopoint=read.table("RNA_count_data.txt", sep = "\t",header = T)
names(expr_df_nopoint)[1]="gene_id"

首先我定义了一个非编码RNA的集合,这个每个人的标准不一样,但是我的原则是,多多益善,这样出来以后会有个问题,就是编码基因转录出非编码基因会无法从基因名称上区分,可以在运行时把geneid换成转录本id,必须要记在心里。

详见:http://www.bioinfo-scrounger.com/archives/323

lncRNA <- c("3prime_overlapping_ncRNA","antisense_RNA","bidirectional_promoter_lncRNA","lincRNA","macro_lncRNA","non_coding","processed_transcript","sense_intronic","sense_overlapping")

读入GTF数据

library(dplyr)
library(rtracklayer)
AnnoData <- rtracklayer::import('E:/GTF_R/Homo_sapiens.GRCh38.90.chr.gtf')
AnnoData1 =data.frame(AnnoData )

用 gene_biotype来筛选lncRNA

LncRNA_exprSet <- AnnoData1 %>%
filter(gene_biotype %in% lncRNA) %>% #注意这里是gene_biotype
select(c(gene_name,gene_id,gene_biotype)) %>%
distinct() %>% #删除多余行
inner_join(expr_df_nopoint,by ="gene_id")%>%
select(-gene_id,-gene_biotype)

LncRNA_exprSet2<-LncRNA_exprSet%>%
distinct(LncRNA_exprSetgene_name`)

LncRNA_exprSet2gene_name`

Metadata 分组文件

save(LncRNA_exprSet2,file = "LncRNA_exprSet2.Rda")

write.table(LncRNA_exprSet2, "LNC_RNA_count_data.txt", row.names = F, sep = "\t", quote = F)

metadata <- data.frame(names(LncRNA_exprSet2)[-1])
for (i in 1:length(metadata[,1]))
{
num <- as.numeric(substr(metadata[i,1],14,15))
if (num %in% seq(1,9)) {metadata[i,2] <- "T"}
if (num %in% seq(10,29)) {metadata[i,2] <- "N"}
}

加个名称

names(metadata) <- c("TCGA_id","sample")

将sample转化成factor

metadatasample)
table(metadatasample)

我们可总结一下,有44例normal,502例肿瘤组织

metadata %>% dplyr::group_by(sample) %>% summarise(n())

############# 说明所有的样本都是肿瘤样本

制作countData

Lnc_RNA_mycounts <- LncRNA_exprSet2

row.names(Lnc_RNA_mycounts)<-Lnc_RNA_mycounts$gene_name
Lnc_RNA_mycounts=Lnc_RNA_mycounts[-1]
str(Lnc_RNA_mycounts)

############## log2(count+1) 归一化 #############
tt=head(Lnc_RNA_mycounts)

my_normalize<-function(x){
x=log2(x+1)

}

tt2=apply(tt,2,my_normalize)

Lnc_RNA_mycounts_norm<-apply(Lnc_RNA_mycounts,2,my_normalize)

save(Lnc_RNA_mycounts_norm,file = "lncRNA_norm.Rda")

你可能感兴趣的:(2018-09-06)