fasta文件更改header

把它转化为data.frame后面问题就迎刃而解
gtf文件是为了提取chr, start , end 等信息,gtf文件读取需要安装rtracklayer包

rm(list = ls())
modify_fa<-function(fa_dir,gtf_dir,out_dir){
  suppressMessages(library(Biostrings))
  suppressMessages(library(tidyverse))
  gtf <- rtracklayer::import(gtf_dir)
  gtf_df=as.data.frame(gtf) 
  gtf_df$seq_name<-paste0(gtf_df$transcript_id,"_",gtf_df$gene_id)
  gtf_df<- gtf_df %>% filter(type=="transcript")
  gtf_df$new_strand<-as.factor(ifelse(gtf_df$strand=="+",1,-1))
  
  gtf_df$new_name<-paste0(gtf_df$seq_name," ","transcript chromosome:GRCh38:",
                          gtf_df$seqnames,":",gtf_df$start,":",gtf_df$end,":",gtf_df$new_strand,
                          " ","gene:",gtf_df$gene_id)
  
  gtf_df<-gtf_df %>% select(seq_name,new_name)
  
  fa<-readDNAStringSet(fa_dir)
  seq_name = names(fa)
  sequence = paste(fa)
  df <- data.frame(seq_name, sequence)
  rt<-df %>% left_join(gtf_df,by="seq_name") %>% select(new_name,sequence) 
  rt$new_name<-paste0(">",rt$new_name)
  
  D <- do.call(rbind, lapply(seq(nrow(rt)), function(i) t(rt[i, ])))
  write.table(D,out_dir, row.names = FALSE, col.names = FALSE, quote = FALSE)
}

human_fa_dir="all.human.isoforms.fa"
human_gtf_dir="all.human.isoforms.gtf"
human_out_dir="all.human.isoforms.new.fa"

modify_fa(fa_dir=human_fa_dir,gtf_dir=human_gtf_dir,out_dir = human_out_dir)

你可能感兴趣的:(fasta文件更改header)