Python学习的第四天,有点懈怠了没坚持每天一更。
1. RNA序列转换成蛋白序列
codon_table = {
'GCU':'A', 'GCC':'A', 'GCA':'A', 'GCG':'A', 'CGU':'R', 'CGC':'R',
'CGA':'R', 'CGG':'R', 'AGA':'R', 'AGG':'R', 'UCU':'S', 'UCC':'S',
'UCA':'S', 'UCG':'S', 'AGU':'S', 'AGC':'S', 'AUU':'I', 'AUC':'I',
'AUA':'I', 'UUA':'L', 'UUG':'L', 'CUU':'L', 'CUC':'L', 'CUA':'L',
'CUG':'L', 'GGU':'G', 'GGC':'G', 'GGA':'G', 'GGG':'G', 'GUU':'V',
'GUC':'V', 'GUA':'V', 'GUG':'V', 'ACU':'T', 'ACC':'T', 'ACA':'T',
'ACG':'T', 'CCU':'P', 'CCC':'P', 'CCA':'P', 'CCG':'P', 'AAU':'N',
'AAC':'N', 'GAU':'D', 'GAC':'D', 'UGU':'C', 'UGC':'C', 'CAA':'Q',
'CAG':'Q', 'GAA':'E', 'GAG':'E', 'CAU':'H', 'CAC':'H', 'AAA':'K',
'AAG':'K', 'UUU':'F', 'UUC':'F', 'UAU':'Y', 'UAC':'Y', 'AUG':'M',
'UGG':'W',
'UAG':'STOP', 'UGA':'STOP', 'UAA':'STOP'
} #读取密码子表
rna = ''
for line in open('A06662-RNA.fasta'): #读取RNA序列文件
if not line.startswith('>'):
rna = rna + line.strip()
print(rna)
UGGGACCAGUCAGCAGAGGCAGCGUGUGUGCGCGUGCGUGUGCGUGUGUGUGCGUGUGUGUGUGUACGCUUGCAUUUGUGUCGGGUGGGUAAGGAGAUAGAGAUGGGCGGGCAGUAGGCCCAGGUCCCGAAGGCCUUGAACCCACUGGUUUGGAGUCUCCUAAGGGCAAUGGGGGCCAUUGAGAAGUCUGAACAGGGCUGUGUCUGAAUGUGAGGUCUAGAAGGAUCCUCCAGAGAAGCCAGCUCUAAAGCUUUUGCAAUCAUCUGGUGAGAGAACCCAGCAAGGAUGGACAGGCAGAAUGGAAUAGAGAUGAGUUGGCAGCUGAAGUGGACAGGAUUUGGUACUAGCCUGGUUGUGGGGAGCAAGCAGAGGAGAAUCUGGGACUCUGGUGGUCUGGCCUGGGGCAGACGGGGGUGUCUCAGGGGCUGGGAGGGAUGAGAGUAGGAUGAUACAUGGUGGUGUCUGGCAGGAGGCGGGCAAGGAUGACUAUGUGAAGGCACUGCCCGGGCAACUGAAGCCUUUUGAGACCCUGCUGUCCCAGAACCAGGGAGGCAAGACCUUCAUUGUGGGAGACCAGGUGAGCAUCUGGCC
for frame in range(3):
prot = ''
print('Reading frame ' + str(frame + 1))
for i in range(frame, len(rna), 3):
codon = rna[i:(i + 3)]
if codon in codon_table:
if codon_table[codon] == 'STOP':
prot = prot + '*'
else:
prot = prot + codon_table[codon]
else:
prot = prot + '-'
i = 0
while i < len(prot):
print(prot[i:i + 48])
i = i + 48
Reading frame 1
WDQSAEAACVRVRVRVCACVCVRLHLCRVGKEIEMGGQ*AQVPKALNP
LVWSLLRAMGAIEKSEQGCV*M*GLEGSSREASSKAFAIIW*ENPARM
DRQNGIEMSWQLKWTGFGTSLVVGSKQRRIWDSGGLAWGRRGCLRGWE
G*E*DDTWWCLAGGGQG*LCEGTARATEAF*DPAVPEPGRQDLHCGRP
GEHLA
Reading frame 2
GTSQQRQRVCACVCVCVRVCVYACICVGWVRR*RWAGSRPRSRRP*TH
WFGVS*GQWGPLRSLNRAVSECEV*KDPPEKPALKLLQSSGERTQQGW
TGRME*R*VGS*SGQDLVLAWLWGASRGESGTLVVWPGADGGVSGAGR
DESRMIHGGVWQEAGKDDYVKALPGQLKPFETLLSQNQGGKTFIVGDQ
VSIW-
Reading frame 3
GPVSRGSVCARACACVCVCVCTLAFVSGG*GDRDGRAVGPGPEGLEPT
GLESPKGNGGH*EV*TGLCLNVRSRRILQRSQL*SFCNHLVREPSKDG
QAEWNRDELAAEVDRIWY*PGCGEQAEENLGLWWSGLGQTGVSQGLGG
MRVG*YMVVSGRRRARMTM*RHCPGN*SLLRPCCPRTREARPSLWETR
*ASG-
2. 从fasta序列文件中查找insulin序列
swissprot = open("SwissProt.fasta")
insulin_ac = 'P61981'
result = None
while result == None:
line = next(swissprot)
if line.startswith('>'):
ac = line.split('|')[1]
if ac == insulin_ac:
result = line.strip()
print(result)
>sp|P61981|1433G_HUMAN 14-3-3 protein gamma OS=Homo sapiens GN=YWHAG PE=1 SV=2
3. 从fasta序列文件中查找不含有">"的行
for line in open("A06662-RNA.fasta"):
if not line.startswith(">"):
rna = rna + line.strip()
print(rna)
UGGGACCAGUCAGCAGAGGCAGCGUGUGUGCGCGUGCGUGUGCGUGUGUGUGCGUGUGUGUGUGUACGCUUGCAUUUGUGUCGGGUGGGUAAGGAGAUAGAGAUGGGCGGGCAGUAGGCCCAGGUCCCGAAGGCCUUGAACCCACUGGUUUGGAGUCUCCUAAGGGCAAUGGGGGCCAUUGAGAAGUCUGAACAGGGCUGUGUCUGAAUGUGAGGUCUAGAAGGAUCCUCCAGAGAAGCCAGCUCUAAAGCUUUUGCAAUCAUCUGGUGAGAGAACCCAGCAAGGAUGGACAGGCAGAAUGGAAUAGAGAUGAGUUGGCAGCUGAAGUGGACAGGAUUUGGUACUAGCCUGGUUGUGGGGAGCAAGCAGAGGAGAAUCUGGGACUCUGGUGGUCUGGCCUGGGGCAGACGGGGGUGUCUCAGGGGCUGGGAGGGAUGAGAGUAGGAUGAUACAUGGUGGUGUCUGGCAGGAGGCGGGCAAGGAUGACUAUGUGAAGGCACUGCCCGGGCAACUGAAGCCUUUUGAGACCCUGCUGUCCCAGAACCAGGGAGGCAAGACCUUCAUUGUGGGAGACCAGGUGAGCAUCUGGCC
4. 在fasta序列文件中查找是否含有特定的字符
bases = ['A','C','T','G'] #ATCG四种碱基
seq = "CAGGCCATTRKGL" #含有不同字符的seq序列,检测其中不是碱基序列的字符
for i in seq:
if i not in bases:
print(i, "is not a nucleotide")
R is not a nucleotide
K is not a nucleotide
L is not a nucleotide
5. 读取fasta文件并将其存入字典
{'GCU': 'A', 'CGA': 'R'} 这个就是一个字典,以{}将一个或多个key:value(键:值)形式的字符串封装起来
sequences = {}
ac = ''
seq = ''
for line in open("SwissProt.fasta"):
if line.startswith('>') and seq != '':
sequences[ac] = seq
seq = ''
if line.startswith('>'):
ac = line.split('|')[1]
else:
seq = seq + line.strip()
sequences[ac] = seq
print(sequences.keys())
dict_keys(['P31946', 'P62258', 'Q04917', 'P61981', 'P31947', 'P27348', 'P63104', 'P30443'])
print(sequences['P62258'])
MDDREDLVYQAKLAEQAERYDEMVESMKKVAGMDVELTVEERNLLSVAYKNVIGARRASWRIISSIEQKEENKGGEDKLKMIREYRQMVETELKLICCDILDVLDKHLIPAANTGESKVFYYKMKGDYHRYLAEFATGNDRKEAAENSLVAYKAASDIAMTELPPTHPIRLGLALNFSVFYYEILNSPDRACRLAKAAFDDAIAELDTLSEESYKDSTLIMQLLRDNLTLWTSDMQGDGEEQNKEALQDVEDENQ
print(sequences)
{'P31946': 'MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSSWRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFYLKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFYYEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGDAGEGEN', 'P62258': 'MDDREDLVYQAKLAEQAERYDEMVESMKKVAGMDVELTVEERNLLSVAYKNVIGARRASWRIISSIEQKEENKGGEDKLKMIREYRQMVETELKLICCDILDVLDKHLIPAANTGESKVFYYKMKGDYHRYLAEFATGNDRKEAAENSLVAYKAASDIAMTELPPTHPIRLGLALNFSVFYYEILNSPDRACRLAKAAFDDAIAELDTLSEESYKDSTLIMQLLRDNLTLWTSDMQGDGEEQNKEALQDVEDENQ', 'Q04917': 'MGDREQLLQRARLAEQAERYDDMASAMKAVTELNEPLSNEDRNLLSVAYKNVVGARRSSWRVISSIEQKTMADGNEKKLEKVKAYREKIEKELETVCNDVLSLLDKFLIKNCNDFQYESKVFYLKMKGDYYRYLAEVASGEKKNSVVEASEAAYKEAFEISKEQMQPTHPIRLGLALNFSVFYYEIQNAPEQACLLAKQAFDDAIAELDTLNEDSYKDSTLIMQLLRDNLTLWTSDQQDEEAGEGN', 'P61981': 'MVDREQLVQKARLAEQAERYDDMAAAMKNVTELNEPLSNEERNLLSVAYKNVVGARRSSWRVISSIEQKTSADGNEKKIEMVRAYREKIEKELEAVCQDVLSLLDNYLIKNCSETQYESKVFYLKMKGDYYRYLAEVATGEKRATVVESSEKAYSEAHEISKEHMQPTHPIRLGLALNYSVFYYEIQNAPEQACHLAKTAFDDAIAELDTLNEDSYKDSTLIMQLLRDNLTLWTSDQQDDDGGEGNN', 'P31947': 'MERASLIQKAKLAEQAERYEDMAAFMKGAVEKGEELSCEERNLLSVAYKNVVGGQRAAWRVLSSIEQKSNEEGSEEKGPEVREYREKVETELQGVCDTVLGLLDSHLIKEAGDAESRVFYLKMKGDYYRYLAEVATGDDKKRIIDSARSAYQEAMDISKKEMPPTNPIRLGLALNFSVFHYEIANSPEEAISLAKTTFDEAMADLHTLSEDSYKDSTLIMQLLRDNLTLWTADNAGEEGGEAPQEPQS', 'P27348': 'MEKTELIQKAKLAEQAERYDDMATCMKAVTEQGAELSNEERNLLSVAYKNVVGGRRSAWRVISSIEQKTDTSDKKLQLIKDYREKVESELRSICTTVLELLDKYLIANATNPESKVFYLKMKGDYFRYLAEVACGDDRKQTIDNSQGAYQEAFDISKKEMQPTHPIRLGLALNFSVFYYEILNNPELACTLAKTAFDEAIAELDTLNEDSYKDSTLIMQLLRDNLTLWTSDSAGEECDAAEGAEN', 'P63104': 'MDKNELVQKAKLAEQAERYDDMAACMKSVTEQGAELSNEERNLLSVAYKNVVGARRSSWRVVSSIEQKTEGAEKKQQMAREYREKIETELRDICNDVLSLLEKFLIPNASQAESKVFYLKMKGDYYRYLAEVAAGDDKKGIVDQSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFYYEILNSPEKACSLAKTAFDEAIAELDTLSEESYKDSTLIMQLLRDNLTLWTSDTQGDEAEAGEGGEN', 'P30443': 'MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQKMEPRAPWIEQEGPEYWDQETRNMKAHSQTDRANLGTLRGYYNQSEDGSHTIQIMYGCDVGPDGRFLRGYRQDAYDGKDYIALNEDLRSWTAADMAAQITKRKWEAVHAAEQRRVYLEGRCVDGLRRYLENGKETLQRTDPPKTHMTHHPISDHEATLRCWALGFYPAEITLTWQRDGEDQTQDTELVETRPAGDGTFQKWAAVVVPSGEEQRYTCHVQHEGLPKPLTLRWELSSQPTIPIVGIIAGLVLLGAVITGAVVAAVMWRRKSSDRKGGSYTQAASSDSAQGSDVSLTACKV'}
6. 根据氨基酸序列评估蛋白的protein disorder
每个氨基酸都特定的propensity values,一个蛋白的propensity values能反映出protein disorder。
#一个每个氨基酸propensity values的字典
propensities = {
'N': 0.2299, 'P': 0.5523, 'Q':-0.18770, 'A':-0.2615,
'R':-0.1766, 'S': 0.1429, 'C':-0.01515, 'T': 0.0089,
'D': 0.2276, 'E':-0.2047, 'V':-0.38620, 'F':-0.2256,
'W':-0.2434, 'G': 0.4332, 'H':-0.00120, 'Y':-0.2075,
'I':-0.4222, 'K':-0.1001, 'L': 0.33793, 'M':-0.2259
}
#设置了propensity values的阈值为0.3
threshold = 0.3
#输入的氨基酸序列
input_seq = "IVGGYTCGANTVPYQVSLNSGYHFCGGSLINSQWVVSAAHCYKSG\
IQVRLGEDNINVVEGNEQFISASKSIVHPSYNSNTLNNDIMLIKLKSAASLNSR\
VASISLPTSCASAGTQCLISGWGNTKSSGTSYPDVLKCLKAPILSDSSCKSAYP\
GQITSNMFCAGYLEGGKDSCQGDSGGPVVCSGKLQGIVSWGSGCAQKNKPGVYT\
KVCNYVSWIKQTIASN"
output_seq = ""
for res in input_seq:
if res in propensities:
if propensities[res] >= threshold: #如果氨基酸的propensity values大于等于0.3,则该氨基酸以大写字母表示
output_seq += res.upper()
else:
output_seq += res.lower() #如果氨基酸的propensity values小于0.3,则该氨基酸以小写字母表示
else:
print("unrecognized character:", res)
break
print(output_seq)
ivGGytcGantvPyqvsLnsGyhfcGGsLinsqwvvsaahcyksGiqvrLGedninvveGneqfisasksivhPsynsntLnndimLikLksaasLnsrvasisLPtscasaGtqcLisGwGntkssGtsyPdvLkcLkaPiLsdsscksayPGqitsnmfcaGyLeGGkdscqGdsGGPvvcsGkLqGivswGsGcaqknkPGvytkvcnyvswikqtiasn
7. 从PDB文件中读取序列
aa_codes = {
'ALA':'A', 'CYS':'C', 'ASP':'D', 'GLU':'E',
'PHE':'F', 'GLY':'G', 'HIS':'H', 'LYS':'K',
'ILE':'I', 'LEU':'L', 'MET':'M', 'ASN':'N',
'PRO':'P', 'GLN':'Q', 'ARG':'R', 'SER':'S',
'THR':'T', 'VAL':'V', 'TYR':'Y', 'TRP':'W'}
seq = ""
for line in open("1TLD.pdb"):
if line[0:6] == "SEQRES":
columns = line.split()
for resname in columns[4:]:
seq = seq + aa_codes[resname]
i = 0
print(">1TLD")
>1TLD
while i < len(seq):
print(seq[i:(i+64)])
i = i +64
IVGGYTCGANTVPYQVSLNSGYHFCGGSLINSQWVVSAAHCYKSGIQVRLGEDNINVVEGNEQF
ISASKSIVHPSYNSNTLNNDIMLIKLKSAASLNSRVASISLPTSCASAGTQCLISGWGNTKSSG
TSYPDVLKCLKAPILSDSSCKSAYPGQITSNMFCAGYLEGGKDSCQGDSGGPVVCSGKLQGIVS
WGSGCAQKNKPGVYTKVCNYVSWIKQTIASN
print(seq)
IVGGYTCGANTVPYQVSLNSGYHFCGGSLINSQWVVSAAHCYKSGIQVRLGEDNINVVEGNEQFISASKSIVHPSYNSNTLNNDIMLIKLKSAASLNSRVASISLPTSCASAGTQCLISGWGNTKSSGTSYPDVLKCLKAPILSDSSCKSAYPGQITSNMFCAGYLEGGKDSCQGDSGGPVVCSGKLQGIVSWGSGCAQKNKPGVYTKVCNYVSWIKQTIASN