After identifying the exons and introns of an RNA string, we only need to delete the introns and concatenate the exons to form a new string ready for translation.
Given: A DNA string s (of length at most 1 kbp) and a collection of substrings of s acting as introns. All strings are given in FASTA format.
Return: A protein string resulting from transcribing and translating the exons of s. (Note: Only one solution will exist for the dataset provided.)
识别RNA字符串的外显子和内含子后,我们只需要删除内含子并连接外显子即可形成一个新的字符串,准备翻译。
给定:一个DNA字符串 s(长度至多为1 kbp)以及 s 的一组作为内含子的子串。所有字符串均以FASTA格式给出。
返回值:对 s 的外显子进行转录和翻译后得到的蛋白质字符串。(注意:对于提供的数据集,仅存在一种解决方案。)
>Rosalind_10
ATGGTCTACATAGCTGACAAACAGCACGTAGCAATCGGTCGAATCTCGAGAGGCATATGGTCACATGATCGGTCGAGCGTGTTTCAAAGTTTGCGCCTAG
>Rosalind_12
ATCGGTCGAA
>Rosalind_15
ATCGGTCGAGCGTGT
MVYIADKQHVASREAYGHMFKVCA
# RNA codon table: maps each three-base RNA codon to its one-letter amino
# acid code; the three terminating codons map to the marker string 'Stop'.
# Entries are grouped by the codon's second base and listed in the order
# U, C, A, G for the first base within each row.
codon_table = {
    # second base U
    'UUU': 'F', 'CUU': 'L', 'AUU': 'I', 'GUU': 'V',
    'UUC': 'F', 'CUC': 'L', 'AUC': 'I', 'GUC': 'V',
    'UUA': 'L', 'CUA': 'L', 'AUA': 'I', 'GUA': 'V',
    'UUG': 'L', 'CUG': 'L', 'AUG': 'M', 'GUG': 'V',
    # second base C
    'UCU': 'S', 'CCU': 'P', 'ACU': 'T', 'GCU': 'A',
    'UCC': 'S', 'CCC': 'P', 'ACC': 'T', 'GCC': 'A',
    'UCA': 'S', 'CCA': 'P', 'ACA': 'T', 'GCA': 'A',
    'UCG': 'S', 'CCG': 'P', 'ACG': 'T', 'GCG': 'A',
    # second base A
    'UAU': 'Y', 'CAU': 'H', 'AAU': 'N', 'GAU': 'D',
    'UAC': 'Y', 'CAC': 'H', 'AAC': 'N', 'GAC': 'D',
    'UAA': 'Stop', 'CAA': 'Q', 'AAA': 'K', 'GAA': 'E',
    'UAG': 'Stop', 'CAG': 'Q', 'AAG': 'K', 'GAG': 'E',
    # second base G
    'UGU': 'C', 'CGU': 'R', 'AGU': 'S', 'GGU': 'G',
    'UGC': 'C', 'CGC': 'R', 'AGC': 'S', 'GGC': 'G',
    'UGA': 'Stop', 'CGA': 'R', 'AGA': 'R', 'GGA': 'G',
    'UGG': 'W', 'CGG': 'R', 'AGG': 'R', 'GGG': 'G',
}
def readfasta(lines):
    """Parse FASTA-formatted lines into parallel lists of headers and sequences.

    A line containing '>' is treated as a header; its text, with every '>'
    and the line ending removed, becomes the record label. All following
    non-header lines are concatenated into that record's sequence.

    Args:
        lines: Iterable of text lines, e.g. the result of ``file.readlines()``.
            Both "\\n" and "\\r\\n" line endings are accepted.

    Returns:
        A tuple ``(index, seq)`` where ``index`` is the list of record labels
        and ``seq`` is the list of sequences in the same order. Sequence data
        appearing before the first header is discarded; a header with no
        sequence lines yields an empty string.
    """
    index = []
    seq = []
    chunks = None  # pieces of the current record; None until a header is seen
    for line in lines:
        # Strip carriage returns as well as newlines so that files with
        # Windows line endings do not leave '\r' inside labels or sequences.
        text = line.replace("\n", "").replace("\r", "")
        if '>' in text:
            if chunks is not None:
                seq.append("".join(chunks))
            index.append(text.replace(">", ""))
            chunks = []
        elif chunks is not None:
            chunks.append(text)
    # Flush the final record, which has no following header to trigger it.
    if chunks is not None:
        seq.append("".join(chunks))
    return index, seq
def translation(seq, table=None):
    """Translate an RNA string into a protein string.

    Codons are read in frame from the start of ``seq``. Translation ends at
    the first codon that maps to 'Stop' (the stop codon is not included in
    the result) or when fewer than three bases remain; a trailing partial
    codon is ignored.

    Args:
        seq: RNA string made of codons present in the codon table.
        table: Optional mapping from three-letter codon to amino-acid letter,
            with terminating codons mapped to 'Stop'. Defaults to the
            module-level ``codon_table``.

    Returns:
        The protein string, one letter per translated codon.

    Raises:
        KeyError: If a complete codon is not present in the table.
    """
    if table is None:
        table = codon_table
    residues = []
    # range stops at len(seq) - 2 so only complete three-base codons are read.
    for start in range(0, len(seq) - 2, 3):
        residue = table[seq[start:start + 3]]
        if residue == 'Stop':
            break
        residues.append(residue)
    return "".join(residues)
# Read the dataset: the first FASTA record is the full DNA string and every
# remaining record is an intron to be removed from it.
with open('rosalind_splc.txt', 'r') as f:
    lines = f.readlines()
(index, seq) = readfasta(lines)
totlaseq = seq[0]
introns = seq[1:]

# Splice out the introns. str.replace removes every non-overlapping
# occurrence, including occurrences that directly follow one another, which
# a scan that advances its index after each deletion would skip.
for line in introns:
    totlaseq = totlaseq.replace(line, '')

# Transcribe the remaining exons (T -> U) and translate them to protein.
rnaseq = totlaseq.replace('T', 'U')
protein = translation(rnaseq)
print(protein)

# Also store the answer in a file for submission.
with open('output.txt', 'w') as f:
    f.write(protein)