2023-04-10批量获取所有基因的启动子序列

输入基因组文件(fasta)及其对应的注释文件(gff/gff3/gtf),得到这个注释文件中所有基因的启动子序列:
参数介绍:

  • fa 输入的基因组文件,fasta格式

-g 基因组文件对应的注释文件(gff/gff3/gtf)

-out 输出的结果(fasta格式)

-n 获取基因前多少bp 的序列,默认1000,建议500-2000

-detail 加上这个参数,则在结果fasta文件头部展示详细信息,不加的话,头部只有基因名称信息

python scripts/getPromoter.py -fa genome.fa -g geonme.gff3 -n 2000 -out test.fa -detail

脚本如下

# -*- coding: utf-8 -*-

import sys
from Bio import SeqIO

if len(sys.argv)==1:
    print ("//\ntype Python getPromoter.py -h for details\n//")
    sys.exit()

inputGenome = ""
inputgff = ""
outputfile = ""
name_detail = False
n = 1000

for i in range(1,len(sys.argv)):
    if sys.argv[i] == "-fa":
        inputGenome = sys.argv[i+1]
    elif sys.argv[i] == "-g":
        inputgff = sys.argv[i+1]
    elif sys.argv[i] == "-out":
        outputfile = sys.argv[i+1]
    elif sys.argv[i] == "-n":
        n = int(sys.argv[i+1])
    elif sys.argv[i] == "-detail":
        name_detail = True
    elif sys.argv[i] == "-h":
        print ("\n// designed by zby\n")
        print ("python getPromoter.py [-fa genome.fa] [-gff cdna.gff3] [-out promoter.fa] [-n 2000]")
        print ("-fa  Genome File. FASTA format")
        print ("-g  Gene Annotation File")
        print ("-out  Export promoter sequence")
        print ("-n  Promoter sequence length")
        print ("-detail  whether fasta file's header contains detailed information. DEFAULT:False")
        print ("-h Print this page")
        print ("//")

        sys.exit()

if inputGenome=="" or inputgff=="" or outputfile=="":
    print ("//\nthe -fa -gff -out are required parameter.\n//")
    sys.exit()

gfftype = inputgff.split(".")[-1]

print ("Reading gff file ...")
gff_file = {}
for line in open(inputgff):
    if line.startswith("#") or line.strip()=="":
        pass
    else:
        l = line.strip().split("\t")
        if l[2] == "gene":
            ID = ""
            if gfftype=="gff" or gfftype=="gff3":
                ID = l[-1].split(";")[0].split("=")[1]
            elif gfftype=="gtf":
                ID = l[-1].split(";")[0].strip().split()[1][1:-1]
            else:
                print ("Please keep the input gff file ending with gff/gff3/gtf")
                sys.exit()
            if ID not in gff_file.keys():
                start = int(l[3])-1
                end = int(l[4])
                gff_file[ID] = [l[0],start,end,l[6]]

print ("Reading genome file ...")
seq_file = {}
for query in SeqIO.parse(inputGenome,'fasta'):
    chr = query.id
    seq = str(query.seq)
    seq_file[chr] = seq

def rev(seq):
    base_trans = {"A":"T","C":"G","T":"A","G":"C","a":"t","t":"a","c":"g","g":"c","n":"n","N":"N","R":"R"}
    rev_seq_list = [base_trans[k] for k in seq]
    rev_seq = "".join(rev_seq_list)
    return rev_seq

print ("Intercept the sequence and output ...")
output_fasta = open(outputfile,"w")
for key,value in gff_file.items():
    s = value[1]-n
    e = s+n
    if s<0:
        s = 0
    else:
        s = s
    if name_detail:
        head = ">"+key+" ["+value[0]+" "+str(s)+" "+str(e)+" "+value[3]+" "+str(n)+"bp"+"]"
    else:
        head = ">"+key

    seq = seq_file[value[0]][s:e]
    if value[3] == "+":
        seq = seq
    elif value[3] == "-":
        seq = rev(seq)
    output_fasta.write(head+"\n"+seq+"\n")
output_fasta.close()
print ("finished!")

你可能感兴趣的:(2023-04-10批量获取所有基因的启动子序列)