Either strand of a DNA double helix can serve as the coding strand for RNA transcription. Hence, a given DNA string implies six total reading frames, or ways in which the same region of DNA can be translated into amino acids: three reading frames result from reading the string itself, whereas three more result from reading its reverse complement.
An open reading frame (ORF) is a reading frame that starts with a start codon and ends with a stop codon, with no other stop codons in between. Thus, a candidate protein string is derived by translating an open reading frame into amino acids until a stop codon is reached.
Given: A DNA string of length at most 1 kbp in FASTA format.
Return: Every distinct candidate protein string that can be translated from ORFs of the given DNA string. Strings can be returned in any order.
DNA双螺旋的任一链均可充当RNA转录的编码链。因此,给定的DNA字符串意味着共有六个阅读框,或将DNA的相同区域翻译成氨基酸的方式:三个阅读框来自阅读字符串本身,而另外三个则来自阅读其反向互补序列。
开放阅读框(ORF)是指以起始密码子开始、以终止密码子结束,且中间不含任何其他终止密码子的阅读框。因此,将开放阅读框翻译成氨基酸,直到遇到终止密码子为止,即可得到候选蛋白质字符串。
给定:一个长度至多为1 kbp的FASTA格式DNA字符串。
返回值:可以从ORF转换的每个不同的候选蛋白质字符串。字符串可以以任何顺序返回。
>Rosalind_99 AGCCATGTAGCTAACTCAGGTTACATGGGGATGACCCCGCGACTTGGATTAGAGTCTCTTTTGGAATAAGCCTGAATGATCCGAGTAGCATCTCAG
MLLGSFRLIPKETLIQVAGSSPCNLS M MGMTPRLGLESLLE MTPRLGLESLLE
'''
Rosalind Problems: [ORF] Open Reading Frames
'''
def rev_comp(fa):
    '''
    Build the two RNA strands that are later scanned for reading frames.

    The input is read from its last character to its first and every
    character is replaced by its RNA complement (A->U, T->A, C->G; anything
    else falls through to 'C', which is the correct pairing for G).  The
    same reverse-and-complement pass is then run over that result, which
    for plain A/C/G/T input reproduces the input with T written as U.

    Returns a tuple (fa_transcript, fa_rc): the reverse-complement strand
    first, then the reverse complement of that strand.
    '''
    # Lookup tables for the explicit cases; the fallback value 'C' is
    # supplied at lookup time so unknown characters behave like G.
    dna_pairing = {'A': 'U', 'T': 'A', 'C': 'G'}
    rna_pairing = {'A': 'U', 'U': 'A', 'C': 'G'}

    fa_transcript = ''.join(
        dna_pairing.get(base, 'C') for base in reversed(fa)
    )
    fa_rc = ''.join(
        rna_pairing.get(base, 'C') for base in reversed(fa_transcript)
    )
    return fa_transcript, fa_rc
def find_all(s,substring):
    '''
    Locate every occurrence of substring inside s, overlaps included.

    Returns the list of start indexes in ascending order, or -1 when there
    is no occurrence at all (mirroring what find() reports for a miss).
    '''
    hits = []
    search_from = 0
    while True:
        position = s.find(substring, search_from)
        if position == -1:
            # find() signals "no further match" with -1
            break
        hits.append(position)
        # resume one character later so overlapping matches are found too
        search_from = position + 1
    return hits if hits else -1
def orf(mrna):
    '''
    Collect the RNA sequence of every open reading frame in mrna.

    Each position holding the start codon AUG is taken as a frame start.
    From there the string is read three characters at a time; when a stop
    codon (UAA, UAG or UGA) is met, the text from the start codon up to,
    but not including, that stop codon is recorded.  A start codon with no
    stop codon in its frame contributes nothing.

    Returns the recorded sequences as a list, ordered by start position.
    The same sequence appears more than once if it occurs more than once.
    '''
    start_codon = 'AUG'
    stop_codons = ('UAA', 'UAG', 'UGA')
    # Positions past this bound cannot hold a complete three-letter codon.
    codon_limit = len(mrna) - 2
    frames = []
    for begin in range(codon_limit):
        if mrna[begin:begin + 3] != start_codon:
            continue
        for cursor in range(begin, codon_limit, 3):
            if mrna[cursor:cursor + 3] in stop_codons:
                frames.append(mrna[begin:cursor])
                break
    return frames
def translate(rnaseq):
    '''
    Translate an RNA string into a protein string.

    The input is read in consecutive three-character codons starting at
    index 0.  Each codon is looked up in the codon table and its amino-acid
    letter is appended to the result; reading ends at the first stop codon
    or at the end of the input.  A chunk that is not in the table (for
    example a trailing piece shorter than three characters) raises
    KeyError.
    '''
    codon_table = {
        'UUU': 'F', 'CUU': 'L', 'AUU': 'I', 'GUU': 'V',
        'UUC': 'F', 'CUC': 'L', 'AUC': 'I', 'GUC': 'V',
        'UUA': 'L', 'CUA': 'L', 'AUA': 'I', 'GUA': 'V',
        'UUG': 'L', 'CUG': 'L', 'AUG': 'M', 'GUG': 'V',
        'UCU': 'S', 'CCU': 'P', 'ACU': 'T', 'GCU': 'A',
        'UCC': 'S', 'CCC': 'P', 'ACC': 'T', 'GCC': 'A',
        'UCA': 'S', 'CCA': 'P', 'ACA': 'T', 'GCA': 'A',
        'UCG': 'S', 'CCG': 'P', 'ACG': 'T', 'GCG': 'A',
        'UAU': 'Y', 'CAU': 'H', 'AAU': 'N', 'GAU': 'D',
        'UAC': 'Y', 'CAC': 'H', 'AAC': 'N', 'GAC': 'D',
        'UAA': 'Stop', 'CAA': 'Q', 'AAA': 'K', 'GAA': 'E',
        'UAG': 'Stop', 'CAG': 'Q', 'AAG': 'K', 'GAG': 'E',
        'UGU': 'C', 'CGU': 'R', 'AGU': 'S', 'GGU': 'G',
        'UGC': 'C', 'CGC': 'R', 'AGC': 'S', 'GGC': 'G',
        'UGA': 'Stop', 'CGA': 'R', 'AGA': 'R', 'GGA': 'G',
        'UGG': 'W', 'CGG': 'R', 'AGG': 'R', 'GGG': 'G',
    }
    residues = []
    for offset in range(0, len(rnaseq), 3):
        amino = codon_table[str(rnaseq[offset:offset + 3])]
        if amino == 'Stop':
            # a stop codon ends translation; it adds no residue
            break
        residues.append(amino)
    return ''.join(residues)
# dna = 'TATACATCACTCCAGGCATCAGAAAATCATGAGAAAGTCTGTGCGCGTAGCGAGAAGGTAGGCTCATTTGTTACCCTTGGACAACTACTGCCGCGTCTGGGCCTCCAAATCGGCTGGTCTTTTTCAGCTCCGTCTTAGGTATCGCGAAATGGACGGGAGGACCATAACTTACCTCCTCTTCTTTTGGCAGTCAGGCTATGACCACGTTTTGTCGGTTACAGATCACCTACCGCGGCGTAACACTGGTGCATATAGCTTGGTTGGGTTGCCTCTCCGCCTTCTCTGACTGGCGAGTGTACGGTAGGAACGCCGGTTCAATTGCATGCTCTGACCTTCTCAGGTAGAATTTCCAGACGAGTTGACAGACTCATCGTTACGCGGGCGGCGGTTCCAAAGCTCCTTACTAGAGATAGACAAGCGCCTAAATGGTTGCTTCCCGAGACGTTCATTAGCTAATGAACGTCTCGGGAAGCAACCATCATATCGATCCCGTGAATCCCTGCCCGTATGCCCCACAGGATAAGGATACACCAGTGACTGAACCTCTGCAATAGTCAGAGATCAGGGTGCTCTTTCATAGCTAATAGCTAGGCCGCGTACTTTAAGTTGTAACACTAACTGCTATGTGGTGAGCTTGAACGCGCGAAGCTGCCCCACAAGATGAAATATGGCCTTCGGAAAGATCACATTCTTGACCTCTGGGGTGTCACTTAAAATTGGCGAAGGTCGGAAAACTCTTTCTATTGCCCGCAAGGCTAAATGGTTCCAACCCCGATGTGTATTTCTCAAACTTTTCAGGTTTTTCTGAGTTACGAACAAGGGCTCGAGCGTGGGAATAGTTTAAATGAACTGTAGATTGAAGTATCGCAAGGAGGAAGTATTCTCTATCAGACGCTTGGTCACG'
dna = 'AGCCATGTAGCTAACTCAGGTTACATGGGGATGACCCCGCGACTTGGATTAGAGTCTCTTTTGGAATAAGCCTGAATGATCCGAGTAGCATCTCAG'

# rev_comp returns two RNA strands; open reading frames are searched on both.
mrna1, mrna2 = rev_comp(dna)
orf_list1 = orf(mrna1)
orf_list2 = orf(mrna2)

# Distinct ORF sequences across both strands, in first-seen order.
# dict.fromkeys drops repeats, including repeats inside a single strand's
# list, which the previous "append if missing from the other list" merge
# let through.
orf_list = list(dict.fromkeys(orf_list1 + orf_list2))

# The task asks for distinct *protein* strings.  Different ORF sequences can
# translate to the same protein, so de-duplicate again after translation.
proteins = list(dict.fromkeys(translate(seq) for seq in orf_list))

for protein in proteins:
    print(protein)