rosalind-python

RNA

AUGGCCAUGGCGCCCAGAACUGAGAUCAAUAGUACCCGUAUUAACGGGUGA

按照密码子表,将 RNA 序列翻译为蛋白质(protein)序列

MAMAPRTEINSTRING

代码

# Translate an RNA string into a protein string using a codon table.
def load_codon_table(lines):
    """Build a codon -> translation mapping from the lines of a codon table.

    Each line holds whitespace-separated tokens that alternate between a
    three-letter codon and its translation (a one-letter amino acid or the
    word "Stop"), e.g. ``UUU F      CUU L      UAA Stop``.

    Args:
        lines: an iterable of text lines (an open file works).

    Returns:
        A dict mapping each codon to its translation.  A codon that is not
        followed by a translation maps to the empty string.
    """
    table = {}
    codon = None
    for line in lines:
        # split() without arguments already discards newlines and padding.
        for token in line.split():
            if len(token) == 3:
                codon = token
                table[codon] = ""
            elif codon is not None:
                table[codon] = token
    return table


def translate(rna, table):
    """Translate *rna* codon by codon into a protein string.

    Translation ends at the first codon that maps to "Stop"; the stop codon
    itself produces no output.  A trailing fragment shorter than three
    letters is ignored.

    Args:
        rna: the RNA string, read in groups of three letters.
        table: codon -> translation mapping, as built by load_codon_table.

    Returns:
        The concatenated translations of the codons before the first stop.

    Raises:
        KeyError: if a complete codon is missing from *table*.
    """
    protein = []
    usable = len(rna) - len(rna) % 3  # drop an incomplete trailing codon
    for start in range(0, usable, 3):
        amino = table[rna[start:start + 3]]
        if amino == "Stop":
            break
        protein.append(amino)
    return "".join(protein)


if __name__ == "__main__":
    with open('dd.txt') as table_file:
        codon_table = load_codon_table(table_file)
    with open('gg.txt') as rna_file:
        # strip() also removes a Windows "\r", which would otherwise be
        # treated as part of the last codon.
        rna_string = rna_file.readline().strip()
    print(translate(rna_string, codon_table))

统计各基因段每一列中 A、C、G、T 出现的次数,并输出每一列出现次数最多的碱基(共识序列)

# Count, column by column, how often one base occurs across the gene segments.
def FindGen(word, str1):
    """Count occurrences of *word* at each column position and print them.

    Args:
        word: the single-character base to count, e.g. "A".
        str1: the gene segments; each one is either a string or a list of
            line fragments, which are concatenated before counting.

    Returns:
        A list whose j-th entry is the number of segments that have *word*
        at position j.  Its length is that of the longest segment, and it is
        empty when *str1* is empty.
    """
    segments = ["".join(parts) for parts in str1]
    # Size the counters by the longest segment so that neither an empty input
    # nor segments of unequal length can index out of range.
    res = [0] * max((len(segment) for segment in segments), default=0)
    for segment in segments:
        for j, base in enumerate(segment):
            if base == word:
                res[j] += 1
    # One profile row, e.g. "A: 5 1 0 0".
    print("{}: {}".format(word, " ".join(map(str, res))))
    return res
# Read the gene segments from a FASTA file, print the per-base column counts
# and then the most frequent base of every column.
def parse_fasta(lines):
    """Collect FASTA records into a dict mapping record name -> sequence.

    A line starting with ">" opens a new record named by the rest of that
    line; every following line is appended to that record's sequence.
    Blank lines are skipped.

    Args:
        lines: an iterable of text lines (an open file works).

    Returns:
        A dict mapping each record name to its sequence as a single string.

    Raises:
        ValueError: if sequence data appears before the first ">" header.
    """
    records = {}
    name = None
    for raw in lines:
        line = raw.strip()
        if not line:
            continue
        if line.startswith(">"):
            name = line[1:]
            records[name] = []
        elif name is None:
            raise ValueError("sequence data before the first '>' header")
        else:
            records[name].append(line)
    # Join the fragments of multi-line records into one string each.
    return {key: "".join(parts) for key, parts in records.items()}


def consensus(counts, bases="ACGT"):
    """Return the string made of the most frequent base of every column.

    Args:
        counts: one list of per-column counts for each base, given in the
            same order as *bases*.
        bases: the base letters the count lists correspond to.

    Returns:
        One letter per column.  When several bases tie for the maximum, the
        one listed first in *bases* is chosen.
    """
    return "".join(
        bases[column.index(max(column))] for column in zip(*counts)
    )


if __name__ == "__main__":
    with open("gg.txt") as fasta_file:
        sequences = list(parse_fasta(fasta_file).values())
    base_order = "ACGT"
    # FindGen prints one profile row and returns that base's column counts.
    profile = [FindGen(base, sequences) for base in base_order]
    print(consensus(profile, base_order))

找出所有满足条件的基因段对:前一个基因段的最后三个字符与后一个基因段的最初三个字符相同

# Read the gene segments from a FASTA file and print every ordered pair of
# segments in which the last three characters of the first segment equal the
# first three characters of the second.
def parse_fasta(lines):
    """Collect FASTA records into a dict mapping record name -> sequence.

    A line starting with ">" opens a new record named by the rest of that
    line; every following line is appended to that record's sequence.
    Blank lines are skipped.

    Args:
        lines: an iterable of text lines (an open file works).

    Returns:
        A dict mapping each record name to its sequence as a single string.

    Raises:
        ValueError: if sequence data appears before the first ">" header.
    """
    records = {}
    name = None
    for raw in lines:
        line = raw.strip()
        if not line:
            continue
        if line.startswith(">"):
            name = line[1:]
            records[name] = []
        elif name is None:
            raise ValueError("sequence data before the first '>' header")
        else:
            records[name].append(line)
    # Join the fragments of multi-line records into one string each.
    return {key: "".join(parts) for key, parts in records.items()}


def overlap_edges(records, k=3):
    """Find pairs of records whose ends overlap by *k* characters.

    Args:
        records: dict mapping record name -> sequence string.
        k: number of characters that must match (at least 1).

    Returns:
        A list of (tail, head) name pairs, where the last *k* characters of
        tail's sequence equal the first *k* characters of head's sequence and
        the two names differ.  Pairs follow the iteration order of *records*.
        A sequence shorter than *k* never takes part in a pair.

    Raises:
        ValueError: if *k* is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")
    edges = []
    for tail, tail_seq in records.items():
        if len(tail_seq) < k:
            continue
        suffix = tail_seq[-k:]
        for head, head_seq in records.items():
            if head != tail and head_seq.startswith(suffix):
                edges.append((tail, head))
    return edges


if __name__ == "__main__":
    with open("gg.txt") as fasta_file:
        fasta_records = parse_fasta(fasta_file)
    for tail_name, head_name in overlap_edges(fasta_records):
        print(tail_name, head_name)

找出所有基因段共有的最长公共碱基子串

# Read the gene segments from a FASTA file and print the longest substring
# that occurs in every segment.
def parse_fasta(lines):
    """Collect FASTA records into a dict mapping record name -> sequence.

    A line starting with ">" opens a new record named by the rest of that
    line; every following line is appended to that record's sequence.
    Blank lines are skipped.

    Args:
        lines: an iterable of text lines (an open file works).

    Returns:
        A dict mapping each record name to its sequence as a single string.

    Raises:
        ValueError: if sequence data appears before the first ">" header.
    """
    records = {}
    name = None
    for raw in lines:
        line = raw.strip()
        if not line:
            continue
        if line.startswith(">"):
            name = line[1:]
            records[name] = []
        elif name is None:
            raise ValueError("sequence data before the first '>' header")
        else:
            records[name].append(line)
    # Join the fragments of multi-line records into one string each.
    return {key: "".join(parts) for key, parts in records.items()}


def longest_common_substring(sequences):
    """Return the longest substring contained in every sequence.

    Candidates are taken from the first sequence, longest first and from
    left to right, so among several substrings of maximal length the
    leftmost one in the first sequence is returned.

    Args:
        sequences: a list of strings.

    Returns:
        The common substring, or "" when the list is empty or the sequences
        share no character.
    """
    if not sequences:
        return ""
    first, others = sequences[0], sequences[1:]
    # A common substring cannot be longer than the shortest sequence.
    longest_possible = min(len(sequence) for sequence in sequences)
    for size in range(longest_possible, 0, -1):
        for start in range(len(first) - size + 1):
            candidate = first[start:start + size]
            if all(candidate in other for other in others):
                return candidate
    return ""


if __name__ == "__main__":
    with open("gg.txt") as fasta_file:
        gene_sequences = list(parse_fasta(fasta_file).values())
    motif = longest_common_substring(gene_sequences)
    # Print nothing when the segments have no substring in common.
    if motif:
        print(motif)

数组全排列和输出到文件

import itertools


# Generate every permutation of a sequence and write them out one per line,
# without the brackets and commas of the default list representation.
def permutation_rows(items):
    """Return one text line per permutation of *items*.

    Args:
        items: any iterable of values.

    Returns:
        A list of strings in the order produced by itertools.permutations.
        Each string holds the str() of the permuted elements separated by
        single spaces and ends with a newline.
    """
    return [
        " ".join(map(str, permutation)) + "\n"
        for permutation in itertools.permutations(items)
    ]


def write_permutations(items, out):
    """Write every permutation of *items* to the writable text stream *out*."""
    out.writelines(permutation_rows(items))


if __name__ == "__main__":
    # Example run; replace the sample values and the output path as needed.
    with open("permutations.txt", "w") as out_file:
        write_permutations([1, 2, 3], out_file)

你可能感兴趣的:(rosalind-python)