最近为了给平台加上一个将DNA序列翻译为蛋白序列的工具,写了一个任何生信玩家初学时都会写的代码。看了一些别人的翻译工具之后,我也想尽量把代码写得完整一点。在这个过程中我首次接触并使用了Biopython,目前看起来还是很好用的。
#!/bin/python3
from Bio.Seq import translate, reverse_complement
from Bio import SeqIO
from Bio.SeqIO.FastaIO import SimpleFastaParser
import os
import sys
import getopt
import warnings
import time
warnings.filterwarnings("ignore")
def _write_fasta(path, records):
    '''Write (header, sequence_text) pairs to path as two-line FASTA records.'''
    with open(path, "w") as handle:
        for header, sequence_text in records:
            handle.write('>' + str(header) + '\n')
            handle.write(sequence_text + '\n')


def translate2(input_file, translated_file, corr_nucl_file, corr_prot_file, summary_file, read_start, rev_comp, no_stop, genetic_code):
    '''
    Translate every record of a nucleotide FASTA file and write four output files.

    input_file: nucleic acid sequence fasta file
    translated_file: output file 1, protein sequence fasta file (every record)
    corr_nucl_file: output file 2, nucleotide sequences (after the read_start
        trim, never reverse-complemented) whose translation has no stop codon
    corr_prot_file: output file 3, the translations that contain no stop codon
    summary_file: output file 4, record count and total length of the input
        file plus the number of translations containing a stop codon
    read_start: 1-based position where translation starts (int or numeric
        string). The trim is applied to the input sequence before any
        reverse complementing.
    rev_comp: 'no' translates the trimmed sequence as given; any other value
        translates its reverse complement.
    no_stop: 'no' cuts each protein in translated_file at its first stop
        codon; any other value keeps stop codons as '*'.
    genetic_code: passed unchanged to Bio.Seq.translate as ``table``.

    Raises ValueError when read_start is smaller than 1.
    '''
    read_start = int(read_start)
    if read_start < 1:
        # A zero or negative start would turn the slice below into a
        # from-the-end slice and silently translate the wrong region.
        raise ValueError("read_start must be >= 1, got %d" % read_start)
    offset = read_start - 1

    # Key on the full description so whitespace in FASTA headers is kept.
    nucl_dict = SeqIO.to_dict(SeqIO.parse(input_file, "fasta"), key_function=lambda rec: rec.description)
    prot_dict = {}
    corr_nucl_dict = {}
    corr_prot_dict = {}
    stop_sign = 0  # count how many sequences contain stop codon
    for key, record in nucl_dict.items():
        nucl_seq = record.seq[offset:]
        if rev_comp == 'no':
            template = nucl_seq
        else:
            template = reverse_complement(nucl_seq)
        prot_seq = translate(template, table=genetic_code)
        prot_dict[key] = prot_seq
        if '*' in prot_seq:
            stop_sign += 1
        else:
            corr_nucl_dict[key] = nucl_seq
            corr_prot_dict[key] = prot_seq

    truncate_at_stop = no_stop == 'no'
    _write_fasta(
        translated_file,
        (
            # split('*', 1)[0] is the text before the first stop codon, or the
            # whole protein when there is none.
            (key, str(seq).split('*', 1)[0] if truncate_at_stop else str(seq))
            for key, seq in prot_dict.items()
        ),
    )
    _write_fasta(corr_nucl_file, ((key, str(seq)) for key, seq in corr_nucl_dict.items()))
    _write_fasta(corr_prot_file, ((key, str(seq)) for key, seq in corr_prot_dict.items()))

    # Fasta file info: counted on the untrimmed input sequences.
    count = 0
    total_len = 0
    with open(input_file) as in_handle:
        for _title, seq in SimpleFastaParser(in_handle):
            count += 1
            total_len += len(seq)
    with open(summary_file, "w") as summary:
        summary.write("%i sequences with total length %i" % (count, total_len))
        summary.write('\n')
        summary.write("%i sequence(s) detected with stop codon during translation!" % stop_sign)
        summary.write('\n')
def main(argv):
    """Parse command-line options and run translate2 with them.

    argv: option list without the program name (e.g. ``sys.argv[1:]``).

    Exits with status 2 when an option cannot be parsed or when one of the
    five file options (-i, -o, -a, -p, -s) is missing. ``-h``/``--help``
    prints the option descriptions and exits normally.
    """
    usage = "Usage: translate.py -i -o -a -p -s -f -r -n -t "
    long_opts = [
        "help", "input=", "translated=",
        "corr-nucl=", "corr-nucl-file=",
        "corr-prot=", "corr-prot-file=",
        "summary=", "read-start=", "rev-comp=", "no-stop=", "table=",
    ]
    try:
        opts, _args = getopt.getopt(argv, "hi:o:a:p:s:f:r:n:t:", long_opts)
    except getopt.GetoptError as err:
        print(err, file=sys.stderr)
        print(usage)
        sys.exit(2)

    # Option spelling -> settings key. getopt reports a long option under the
    # name it was declared with, so both the declared "--corr-nucl" /
    # "--corr-prot" and the "-file" spellings are mapped here.
    targets = {
        '-i': 'input', '--input': 'input',
        '-o': 'translated', '--translated': 'translated',
        '-a': 'corr_nucl', '--corr-nucl': 'corr_nucl', '--corr-nucl-file': 'corr_nucl',
        '-p': 'corr_prot', '--corr-prot': 'corr_prot', '--corr-prot-file': 'corr_prot',
        '-s': 'summary', '--summary': 'summary',
        '-f': 'read_start', '--read-start': 'read_start',
        '-r': 'rev_comp', '--rev-comp': 'rev_comp',
        '-n': 'no_stop', '--no-stop': 'no_stop',
        '-t': 'table', '--table': 'table',
    }
    settings = {
        'input': '', 'translated': '', 'corr_nucl': '', 'corr_prot': '',
        'summary': '', 'read_start': 1, 'rev_comp': 'no', 'no_stop': 'yes',
        'table': 1,
    }

    for opt, arg in opts:
        if opt in ('-h', '--help'):
            print(usage)
            print("input_file: nucleic acid sequence fasta file")
            print("translated_file: output file 1, protein sequence fasta file")
            print("corr-nucl-file: output file 2, stop-codon-free-nucleic acid sequences fasta file")
            print("corr-prot-file: output file 3, stop-codon-free-protein sequences fasta file")
            print("summary_file: output file 4, input file information and stop codon sequence")
            print("read_start: 1(default). The position where translation starts. You should set this argument to 1, 2 or 3")
            print("rev_comp: no(default). Do not use reverse complemetary sequences for translation.")
            print("non_stop: yes(default). Show stop codon as * in translated sequence. If 'no' is selected, translation will halt on stop codon.")
            print("table: Genetic code (NCBI)")
            sys.exit()
        elif opt in targets:
            settings[targets[opt]] = arg

    # All five paths are opened unconditionally by translate2, so fail early
    # with a usage error instead of a traceback from open('').
    required = (('-i', 'input'), ('-o', 'translated'), ('-a', 'corr_nucl'),
                ('-p', 'corr_prot'), ('-s', 'summary'))
    missing = [flag for flag, name in required if not settings[name]]
    if missing:
        print("Missing required option(s): " + ", ".join(missing), file=sys.stderr)
        print(usage)
        sys.exit(2)

    translate2(
        settings['input'], settings['translated'], settings['corr_nucl'],
        settings['corr_prot'], settings['summary'], settings['read_start'],
        settings['rev_comp'], settings['no_stop'], settings['table'],
    )
if __name__ == '__main__':
    # Run the command-line entry point and report the wall-clock duration
    # of the whole run in seconds.
    began = time.time()
    main(sys.argv[1:])
    elapsed = time.time() - began
    print('Time Elasped: ' + str(elapsed))
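这个代码使用getopt实现了通过命令行参数指定输入:
Usage: translate.py -i <input_file> -o <translated_file> -a <corr_nucl_file> -p <corr_prot_file> -s <summary_file> -f <read_start> -r <rev_comp> -n <no_stop> -t <table>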
该代码通过-f参数选择起始翻译的阅读框位置,可以选1、2、3(原序列的前0、1、2个核苷酸将被去除)。通过-r参数可以选择是否使用当前序列的反向互补序列。通过-n参数可以选择输出的蛋白序列是否在遇到终止密码子的时候停下:如果该参数选no,输出的蛋白序列将在第一个终止密码子处截断;如果选yes(默认),序列中的终止密码子将以*的形式显示。-t参数提供了不同的遗传密码规则,即密码子表,该参数的取值可以参考NCBI的The Genetic Codes。
另外,该代码输出的corr-nucl-file和corr-prot-file分别是去掉了翻译结果中含终止密码子的序列之后,剩余的核酸序列及其对应的蛋白序列。