DNA-蛋白翻译过程的Python实现

引言

最近为了给平台加上一个将DNA序列翻译为蛋白序列的工具,写了一个任何生信玩家初学时都会写的代码。看了一些别人的翻译工具之后,我也想尽量把代码写得完整一点。在这个过程中我首次接触并使用了BioPython,目前看起来还是很好用的。

代码

#!/bin/python3

from Bio.Seq import translate, reverse_complement
from Bio import SeqIO
from Bio.SeqIO.FastaIO import SimpleFastaParser
import os
import sys
import getopt
import warnings
import time

# Silence every warning category for the whole process.
# NOTE(review): presumably intended to hide Biopython's partial-codon warning
# for sequences whose length is not a multiple of three - confirm. As written
# it also hides unrelated warnings from any other library.
warnings.filterwarnings("ignore")

def _write_fasta(path, records):
	'''Write (title, sequence) pairs to path as two-line FASTA records.'''
	with open(path, "w") as handle:
		for title, sequence in records:
			handle.write('>' + str(title) + '\n' + str(sequence) + '\n')


def translate2(input_file, translated_file, corr_nucl_file, corr_prot_file, summary_file, read_start, rev_comp, no_stop, genetic_code):
	'''
	Translate every record of a nucleotide FASTA file and write four outputs.

	input_file: nucleic acid sequence FASTA file.
	translated_file: output 1, protein FASTA with one record per input record.
	corr_nucl_file: output 2, nucleotide sequences (after the read_start
		offset) of the records whose translation has no stop codon.
	corr_prot_file: output 3, protein sequences of those same records.
	summary_file: output 4, record count, total input length and the number
		of records whose translation contains a stop codon.
	read_start: 1-based position where translation starts (int or numeric
		string). The offset is applied to the sequence as read from the file,
		i.e. before any reverse complement is taken.
	rev_comp: 'no' translates the sequence as given; any other value
		translates its reverse complement.
	no_stop: 'no' truncates each protein in translated_file at its first stop
		codon; any other value keeps stop codons as '*'.
	genetic_code: codon table, passed to Bio.Seq.translate as table.

	Raises ValueError if read_start is smaller than 1, or (from
	SeqIO.to_dict) if two records share the same header line.
	'''
	read_start = int(read_start)
	if read_start < 1:
		# A value of 0 or below would turn the slice below into a
		# "count from the end" slice and silently translate the wrong region.
		raise ValueError("read_start must be a positive 1-based position, got %d" % read_start)

	# Key on the full description so whitespace in the FASTA header is kept.
	nucl_dict = SeqIO.to_dict(SeqIO.parse(input_file, "fasta"), key_function=lambda rec: rec.description)

	translated = []  # (header, protein as it will be written to translated_file)
	corr_nucl = []   # (header, nucleotide sequence) for stop-codon-free records
	corr_prot = []   # (header, protein sequence) for stop-codon-free records
	stop_sign = 0    # number of records whose translation contains a stop codon
	total_len = 0    # total length of the untrimmed input sequences

	for key, record in nucl_dict.items():
		total_len += len(record.seq)
		nucl_seq = record.seq[read_start-1:]
		template = nucl_seq if rev_comp == 'no' else reverse_complement(nucl_seq)
		# Drop a trailing partial codon explicitly. Bio.Seq.translate ignores
		# it anyway but emits a warning that may become an error in future.
		template = template[:len(template) - len(template) % 3]
		prot_seq = str(translate(template, table=genetic_code))

		if '*' in prot_seq:
			stop_sign += 1
			if no_stop == 'no':
				translated.append((key, prot_seq[:prot_seq.index('*')]))
			else:
				translated.append((key, prot_seq))
		else:
			translated.append((key, prot_seq))
			corr_nucl.append((key, nucl_seq))
			corr_prot.append((key, prot_seq))

	_write_fasta(translated_file, translated)
	_write_fasta(corr_nucl_file, corr_nucl)
	_write_fasta(corr_prot_file, corr_prot)

	# Input statistics come from the records already parsed above, so the
	# input file does not have to be read a second time.
	with open(summary_file, "w") as summary:
		summary.write("%i sequences with total length %i" % (len(nucl_dict), total_len))
		summary.write('\n')
		summary.write("%i sequence(s) detected with stop codon during translation!" % stop_sign)
		summary.write('\n')


def main(argv):
	'''
	Parse command-line options and run translate2.

	argv: list of command-line arguments without the program name.

	Exits with status 2 (after printing usage) when an option is unknown or
	when one of the five required file options (-i, -o, -a, -p, -s) is
	missing; exits with the default status after printing help for -h/--help.
	'''
	usage = "Usage: translate.py -i  -o  -a  -p  -s  -f  -r  -n  -t "
	inputfile = ''
	proteinfile = ''
	cnf = ''
	cpf = ''
	summaryfile = ''
	readstart = 1
	rev_comp, no_stop, table = 'no', 'yes', 1
	try:
		opts, args = getopt.getopt(argv, "hi:o:a:p:s:f:r:n:t:", ["help","input=","translated=","corr-nucl=","corr-prot=","summary=","read-start=", "rev-comp=", "no-stop=", "table="])
	except getopt.GetoptError:
		print(usage)
		sys.exit(2)
	for opt, arg in opts:
		if opt in ('-h', '--help'):
			print(usage)
			print("input_file: nucleic acid sequence fasta file")
			print("translated_file: output file 1, protein sequence fasta file")
			print("corr-nucl-file: output file 2, stop-codon-free-nucleic acid sequences fasta file")
			print("corr-prot-file: output file 3, stop-codon-free-protein sequences fasta file")
			print("summary_file: output file 4, input file information and stop codon sequence")
			print("read_start: 1(default). The position where translation starts. You should set this argument to 1, 2 or 3")
			print("rev_comp: no(default). Do not use reverse complemetary sequences for translation.")
			print("non_stop: yes(default). Show stop codon as * in translated sequence. If 'no' is selected, translation will halt on stop codon.")
			print("table: Genetic code (NCBI)")
			sys.exit()
		elif opt in ('-i', '--input'):
			inputfile = arg
		elif opt in ('-o', '--translated'):
			proteinfile = arg
		# getopt reports long options under the names registered above
		# ("corr-nucl", "corr-prot"), so those are the names to match here.
		elif opt in ('-a', '--corr-nucl'):
			cnf = arg
		elif opt in ('-p', '--corr-prot'):
			cpf = arg
		elif opt in ('-s', '--summary'):
			summaryfile = arg
		elif opt in ('-f', '--read-start'):
			readstart = arg
		elif opt in ('-r', '--rev-comp'):
			rev_comp = arg
		elif opt in ('-n', '--no-stop'):
			no_stop = arg
		elif opt in ('-t', '--table'):
			table = arg

	# Fail early with a readable message instead of letting an empty path
	# reach open() halfway through the run.
	required = (
		('-i/--input', inputfile),
		('-o/--translated', proteinfile),
		('-a/--corr-nucl', cnf),
		('-p/--corr-prot', cpf),
		('-s/--summary', summaryfile),
	)
	missing = [name for name, value in required if not value]
	if missing:
		print("Missing required option(s): " + ", ".join(missing))
		print(usage)
		sys.exit(2)

	translate2(inputfile, proteinfile, cnf, cpf, summaryfile, readstart, rev_comp, no_stop, table)
if __name__ == '__main__':
	# Script entry point: run main() on the command-line arguments and report
	# how long the run took. perf_counter() is a monotonic clock, so the
	# reported duration cannot be distorted by system clock adjustments.
	start_time = time.perf_counter()
	main(sys.argv[1:])
	end_time = time.perf_counter()
	print('Time Elasped: ' + str(end_time-start_time))

参数

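这个代码使用getopt实现了命令行参数的输入:
Usage: translate.py -i INPUT_FILE -o TRANSLATED_FILE -a CORR_NUCL_FILE -p CORR_PROT_FILE -s SUMMARY_FILE -f READ_START -r REV_COMP -n NO_STOP -t TABLE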

该代码通过-f参数选择起始翻译的阅读框位置,可以选1、2、3(原序列的前0、1、2个核苷酸将被去除)。通过-r参数可以选择是否使用当前序列的反向互补序列(默认为no,即不使用)。通过-n参数可以选择输出的蛋白序列如何处理终止密码子:取yes(默认)时,序列中的终止密码子将以*的形式显示;取no时,输出的蛋白序列会在遇到第一个终止密码子的时候停下。-t参数提供了不同的遗传密码规则,即密码子表(NCBI编号,默认为1),该参数的取值可以参考The Genetic Codes。

另外,该代码输出的corr-nucl-file和corr-prot-file,分别是去掉了翻译结果中含终止密码子的序列之后,剩余的核酸序列及其对应的蛋白序列。

你可能感兴趣的:(杂七杂八的Python小代码,生物信息学,python)