读入DNA序列
# NOTE(review): Bio.Alphabet was removed in Biopython 1.78, so this snippet
# needs an older release; on newer versions drop the IUPAC import and the
# alphabet argument. Confirm against the installed version.
from Bio import Seq
from Bio.Alphabet import IUPAC

# Read the raw sequence text. The context manager closes the file handle
# as soon as the content is in memory (the bare open(...).read() leaked it).
with open("data/hemoglobin-gene.txt") as dna_file:
    raw_dna = dna_file.read().strip()

# Wrap the plain string in a Seq object tagged as unambiguous DNA.
dna = Seq.Seq(raw_dna, IUPAC.unambiguous_dna)
print(dna)
运行结果:
ATGGTGCTGTCTCCTGCCGACAAGACCAACGTCAAGGCCGCCTGGGGTAAGGTCGGCGCGCACGCTGGCGAGTATGGTGCGGAGGCCCTGGAGAGGATGTTCCTGTCCTTCCCCACCACCAAGACCTACTTCCCGCACTTCGACCTGAGCCACGGCTCTGCCCAGGTTAAGGGCCACGGCAAGAAGGTGGCCGACGCGCTGACCAACGCCGTGGCGCACGTGGACGACATGCCCAACGCGCTGTCCGCCCTGAGCGACCTGCACGCGCACAAGCTTCGGGTGGACCCGGTCAACTTCAAGCTCCTAAGCCACTGCCTGCTGGTGACCCTGGCCGCCCACCTCCCCGCCGAGTTCACCCCTGCGGTGCACGCCTCCCTGGACAAGTTCCTGGCTTCTGTGAGCACCGTGCTGACCTCCAAATACCGTTAA
翻译成蛋白质序列
# NOTE(review): Bio.Alphabet was removed in Biopython 1.78, so this snippet
# needs an older release; on newer versions drop the IUPAC import and the
# alphabet argument. Confirm against the installed version.
from Bio import Seq
from Bio.Alphabet import IUPAC
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

# Read the input sequence; the context manager guarantees the file handle
# is closed instead of being left to the garbage collector.
with open("data/hemoglobin-gene.txt") as dna_file:
    raw_dna = dna_file.read().strip()
dna = Seq.Seq(raw_dna, IUPAC.unambiguous_dna)

# Transcribe DNA -> mRNA, then translate mRNA -> protein.
mrna = dna.transcribe()
protein = mrna.translate()
print(protein)
运行结果:
MVLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHGKKVADALTNAVAHVDDMPNALSALSDLHAHKLRVDPVNFKLLSHCLLVTLAAHLPAEFTPAVHASLDKFLASVSTVLTSKYR*
写入fasta文件
"""
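Import four modules from Bio:
Seq       - creates sequence objects
IUPAC     - defines the biological alphabet used by a sequence object
SeqRecord - creates a sequence record holding an ID, annotations, a description, etc.
SeqIO     - provides functions for reading and writing formatted sequence files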
"""
# NOTE(review): Bio.Alphabet was removed in Biopython 1.78, so this snippet
# needs an older release; on newer versions drop the IUPAC import and the
# alphabet argument. Confirm against the installed version.
from Bio import Seq
from Bio.Alphabet import IUPAC
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

# Read the input sequence; the handle is closed as soon as the text is read.
with open("data/hemoglobin-gene.txt") as dna_file:
    raw_dna = dna_file.read().strip()
dna = Seq.Seq(raw_dna, IUPAC.unambiguous_dna)

# Transcribe DNA -> mRNA, then translate mRNA -> protein.
mrna = dna.transcribe()
protein = mrna.translate()

# Wrap the protein in a record so the FASTA header carries an ID and a
# description.
protein_record = SeqRecord(
    protein,
    id='sp|P69905.2|HBA_HUMAN',
    description="Hemoglobin subunit alpha, Homo sapiens",
)

# Write the protein sequence to a file. Using a context manager ensures the
# output is flushed and closed even if SeqIO.write raises.
with open("data/HBA_HUMAN.fasta", "w") as outfile:
    SeqIO.write(protein_record, outfile, "fasta")
运行后查看结果文件 HBA_HUMAN.fasta:
>sp|P69905.2|HBA_HUMAN Hemoglobin subunit alpha, Homo sapiens
MVLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHG
KKVADALTNAVAHVDDMPNALSALSDLHAHKLRVDPVNFKLLSHCLLVTLAAHLPAEFTP
AVHASLDKFLASVSTVLTSKYR*
from Bio import Seq

# Build a Seq object directly from a string literal and display it.
my_seq = Seq.Seq("AGCATCGTAGCATGCAC")
print(my_seq)
运行结果:
AGCATCGTAGCATGCAC
# By default the sequence is treated as the coding strand, so it can be
# transcribed directly.
from Bio import Seq

my_seq = Seq.Seq("AGCATCGTAGCATGCAC")
rna = my_seq.transcribe()
print(rna)
运行结果:
AGCAUCGUAGCAUGCAC
# If the sequence is the template strand instead, take the reverse
# complement first and transcribe afterwards.
# NOTE(review): Bio.Alphabet was removed in Biopython 1.78, so this snippet
# needs an older release -- confirm against the installed version.
from Bio import Seq
from Bio.Alphabet import IUPAC

dna = Seq.Seq("AGCATCGTAGCATGCAC", IUPAC.unambiguous_dna)

# Step 1: reverse complement of the template strand.
cdna = dna.reverse_complement()
print(cdna)

# Step 2: transcribe the reverse-complemented strand into mRNA.
mrna = cdna.transcribe()
print(mrna)
运行结果:
GTGCATGCTACGATGCT
GUGCAUGCUACGAUGCU
# Indexing, slicing, splitting, conversion, upper/lower case and counting
# characters: a Seq supports many of the usual string operations.
from Bio import Seq

dna = Seq.Seq("AGCATCGTAGCATGCAC GCATGCAC")

print(dna[0])            # single character by index
print(dna[0:3])          # slice of the first three characters
print(dna.split("T"))    # list of Seq pieces separated by 'T'

# Absolute number of 'A' characters, then their fraction of the whole
# sequence length.
print(dna.count("A"))
print(dna.count("A") / float(len(dna)))

print(dna.find("CGTA"))  # position of the first match of the sub-sequence
运行结果:
A
AGC
[Seq('AGCA', Alphabet()), Seq('CG', Alphabet()), Seq('AGCA', Alphabet()), Seq('GCAC GCA', Alphabet()), Seq('GCAC', Alphabet())]
7
0.2692307692307692
5
from Bio import SeqIO

# Walk through every record in the FASTA file. The context manager closes
# the handle even if parsing raises part-way through the file.
with open("data/Uniprot.fasta", "r") as fasta_file:
    for seq_record in SeqIO.parse(fasta_file, "fasta"):
        # The three prints must be inside the loop body so that they run
        # once per record.
        print(seq_record.id)
        print(repr(seq_record.seq))
        print(len(seq_record))
运行结果:
sp|P03372|ESR1_HUMAN
Seq('MTMTLHTKASGMALLHQIQGNELEPLNRPQLKIPLERPLGEVYLDSSKPAVYNY...ATV', SingleLetterAlphabet())
595
sp|P62333|PRS10_HUMAN
Seq('MADPRDKALQDYRKKLLEHKEIDGRLKELREQLKELTKQYEKSENDLKALQSVG...KPV', SingleLetterAlphabet())
389
sp|P62509|ERR3_MOUSE
Seq('MDSVELCLPESFSLHYEEELLCRMSNKDRHIDSSCSSFIKTEPSSPASLTDSVN...AKV', SingleLetterAlphabet())
458
from Bio import SeqIO

# Load all FASTA entries into a list and show the first one.
uniprot_iterator = SeqIO.parse("data/Uniprot.fasta", "fasta")
records = list(uniprot_iterator)
first_record = records[0]
print(first_record.id)
print(first_record.seq)

print('-' * 40)

# Load the same entries into a dictionary keyed by record id. A fresh
# iterator is created because the previous one has already been consumed
# by list().
uniprot_iterator = SeqIO.parse("data/Uniprot.fasta", "fasta")
records = SeqIO.to_dict(uniprot_iterator)
esr1_record = records['sp|P03372|ESR1_HUMAN']
print(esr1_record.id)
print(esr1_record.seq)
运行结果:
sp|P03372|ESR1_HUMAN
MTMTLHTKASGMALLHQIQGNELEPLNRPQLKIPLERPLGEVYLDSSKPAVYNYPEGAAYEFNAAAAANAQVYGQTGLPYGPGSEAAAFGSNGLGGFPPLNSVSPSPLMLLHPPPQLSPFLQPHGQQVPYYLENEPSGYTVREAGPPAFYRPNSDNRRQGGRERLASTNDKGSMAMESAKETRYCAVCNDYASGYHYGVWSCEGCKAFFKRSIQGHNDYMCPATNQCTIDKNRRKSCQACRLRKCYEVGMMKGGIRKDRRGGRMLKHKRQRDDGEGRGEVGSAGDMRAANLWPSPLMIKRSKKNSLALSLTADQMVSALLDAEPPILYSEYDPTRPFSEASMMGLLTNLADRELVHMINWAKRVPGFVDLTLHDQVHLLECAWLEILMIGLVWRSMEHPGKLLFAPNLLLDRNQGKCVEGMVEIFDMLLATSSRFRMMNLQGEEFVCLKSIILLNSGVYTFLSSTLKSLEEKDHIHRVLDKITDTLIHLMAKAGLTLQQQHQRLAQLLLILSHIRHMSNKGMEHLYSMKCKNVVPLYDLLLEMLDAHRLHAPTSRGGASVEETDQSHLATAGSTSSHSLQKYYITGEAEGFPATV
----------------------------------------
sp|P03372|ESR1_HUMAN
MTMTLHTKASGMALLHQIQGNELEPLNRPQLKIPLERPLGEVYLDSSKPAVYNYPEGAAYEFNAAAAANAQVYGQTGLPYGPGSEAAAFGSNGLGGFPPLNSVSPSPLMLLHPPPQLSPFLQPHGQQVPYYLENEPSGYTVREAGPPAFYRPNSDNRRQGGRERLASTNDKGSMAMESAKETRYCAVCNDYASGYHYGVWSCEGCKAFFKRSIQGHNDYMCPATNQCTIDKNRRKSCQACRLRKCYEVGMMKGGIRKDRRGGRMLKHKRQRDDGEGRGEVGSAGDMRAANLWPSPLMIKRSKKNSLALSLTADQMVSALLDAEPPILYSEYDPTRPFSEASMMGLLTNLADRELVHMINWAKRVPGFVDLTLHDQVHLLECAWLEILMIGLVWRSMEHPGKLLFAPNLLLDRNQGKCVEGMVEIFDMLLATSSRFRMMNLQGEEFVCLKSIILLNSGVYTFLSSTLKSLEEKDHIHRVLDKITDTLIHLMAKAGLTLQQQHQRLAQLLLILSHIRHMSNKGMEHLYSMKCKNVVPLYDLLLEMLDAHRLHAPTSRGGASVEETDQSHLATAGSTSSHSLQKYYITGEAEGFPATV
from Bio import SeqIO

# Convert a GenBank file to FASTA. Both handles are managed by a single
# `with` so that the input file (never closed before) and the output file
# are closed even if parsing or writing fails.
with open("data/AY810830.gbk", "r") as genbank_file, \
        open("data/AY810830.fasta", "w") as output_file:
    # The parsed records are handed straight to SeqIO.write, so the
    # conversion happens while both files are still open.
    records = SeqIO.parse(genbank_file, "genbank")
    SeqIO.write(records, output_file, "fasta")