>gi|2765658|emb|Z78533.1|CIZ78533 C.irapeanum 5.8S rRNA gene and ITS1 and ITS2 DNA
CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGGAATAAACGATCGAGTG
AATCCGGAGGACCGGTGTACTCAGCTCACCGGGGGCATTGCTCCCGTGGTGACCCTGATTTGTTGTTGGG
...
这种文件其实用简单的文本处理也可以解析,但是 Bio 库为我们提供了更简单的解决方法。
from Bio import SeqIO
# Iterate over every record in a FASTA file.
# Supported file formats are listed at https://biopython.org/wiki/SeqIO
for seq_record in SeqIO.parse("ls_orchid.fasta", "fasta"):
    print(seq_record.id)  # record identifier taken from the ">" header line
    print(seq_record.seq)  # the sequence itself, i.e. the bases
    print(seq_record)  # the whole record
    print(len(seq_record))  # sequence length in bases; the header is not counted
# GenBank files are parsed the same way; only the file name and format change.
for seq_record in SeqIO.parse("ls_orchid.gbk", "genbank"):
    print(seq_record.id)
# Everything above reads files holding many records; SeqIO.read() is for a
# FASTA file that holds a single record.
SeqIO.read("filename", "fasta")
SeqIO.parse() 返回的是一个迭代器对象,可以逐条取出记录:
# SeqIO.parse() hands back an iterator, so records can be pulled one at a time.
record_iterator = SeqIO.parse("ls_orchid.fasta", "fasta")
# The built-in next() works on Python 2.6+ and Python 3, whereas the
# iterator's .next() method exists only on Python 2.
first_record = next(record_iterator)
print(first_record.id)
print(first_record.description)
second_record = next(record_iterator)
print(second_record.id)
print(second_record.description)
from Bio import SeqIO
# Load every record of the FASTA file into a dictionary.
orchid_dict = SeqIO.to_dict(SeqIO.parse("ls_orchid.fasta", "fasta"))
# list(...) keeps the printed form a plain list on both Python 2 and Python 3.
print(list(orchid_dict.keys()))
Bio.SeqIO.write() 的写入:
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import generic_protein
# Build three protein records by hand, then write them to one FASTA file.
# NOTE(review): generic_protein comes from Bio.Alphabet, which exists only in
# Biopython releases before 1.78 - confirm the installed version.
rec1 = SeqRecord(
    Seq(
        "MMYQQGCFAGGTVLRLAKDLAENNRGARVLVVCSEITAVTFRGPSETHLDSMVGQALFGD"
        "GAGAVIVGSDPDLSVERPLYELVWTGATLLPDSEGAIDGHLREVGLTFHLLKDVPGLISK"
        "NIEKSLKEAFTPLGISDWNSTFWIAHPGGPAILDQVEAKLGLKEEKMRATREVLSEYGNM"
        "SSAC",
        generic_protein,
    ),
    id="gi|14150838|gb|AAK54648.1|AF376133_1",
    description="chalcone synthase [Cucumis sativus]",
)
rec2 = SeqRecord(
    Seq(
        "YPDYYFRITNREHKAELKEKFQRMCDKSMIKKRYMYLTEEILKENPSMCEYMAPSLDARQ"
        "DMVVVEIPKLGKEAAVKAIKEWGQ",
        generic_protein,
    ),
    id="gi|13919613|gb|AAK33142.1|",
    description="chalcone synthase [Fragaria vesca subsp. bracteata]",
)
rec3 = SeqRecord(
    Seq(
        "MVTVEEFRRAQCAEGPATVMAIGTATPSNCVDQSTYPDYYFRITNSEHKVELKEKFKRMC"
        "EKSMIKKRYMHLTEEILKENPNICAYMAPSLDARQDIVVVEVPKLGKEAAQKAIKEWGQP"
        "KSKITHLVFCTTSGVDMPGCDYQLTKLLGLRPSVKRFMMYQQGCFAGGTVLRMAKDLAEN"
        "NKGARVLVVCSEITAVTFRGPNDTHLDSLVGQALFGDGAAAVIIGSDPIPEVERPLFELV"
        "SAAQTLLPDSEGAIDGHLREVGLTFHLLKDVPGLISKNIEKSLVEAFQPLGISDWNSLFW"
        "IAHPGGPAILDQVELKLGLKQEKLKATRKVLSNYGNMSSACVLFILDEMRKASAKEGLGT"
        "TGEGLEWGVLFGFGPGLTVETVVLHSVAT",
        generic_protein,
    ),
    id="gi|13925890|gb|AAK49457.1|",
    description="chalcone synthase [Nicotiana tabacum]",
)
my_records = [rec1, rec2, rec3]
from Bio import SeqIO
# Write all three records to my_example.faa in FASTA format.
SeqIO.write(my_records, "my_example.faa", "fasta")
下面我们学习IUPAC,它可以让我们明确传入的序列是DNA、RNA还是氨基酸序列,并提供检查的作用:
from Bio.Alphabet import IUPAC
# Declare explicitly that this sequence is unambiguous DNA.
my_seq = Seq("AGTACACTGGT", IUPAC.unambiguous_dna)
my_seq.alphabet # IUPACUnambiguousDNA()
from Bio.SeqUtils import GC # computes GC content; not strictly needed, a small hand-written function would do
count = SeqIO.convert("ls_orchid.gbk", "genbank", "my_example.fasta", "fasta") # file format conversion
# ls_orchid.gbk is an existing GenBank file; it is converted into my_example.fasta
得到序列的反向互补、互补核苷酸序列:
my_seq = Seq("GATCGATGGGCCTATATAGGATCGAAAATCGC", IUPAC.unambiguous_dna)
my_seq.complement() # complementary sequence
my_seq.reverse_complement() # reverse complement; the reversal step alone could also be done by slicing
下一步是转录:
coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG", IUPAC.unambiguous_dna) # the original coding DNA; next we derive the messenger RNA
template_dna = coding_dna.reverse_complement() # template strand (DNA, not RNA): reverse complement of the coding strand
messenger_rna = coding_dna.transcribe() # transcription
messenger_rna.translate(to_stop=True)
# Translation; to_stop=True means the stop codon is not translated.
# NOTE(review): the original note says cds=True is required when a non-standard
# start codon is used - confirm against the Seq.translate documentation.
当你想要改变Seq时:事实上,Seq 和元组类似,是不可变的,直接修改会报错,但是可以通过下面的方法:
my_seq = Seq("GCCATTGTAATGGGCCGCTGAAAGGGTGCCCGA", IUPAC.unambiguous_dna)
mutable_seq = my_seq.tomutable() # convert to a mutable sequence
mutable_seq[5] = "C"
new_seq = mutable_seq.toseq() # convert back to an immutable sequence
# NOTE(review): tomutable()/toseq() are absent from newer Biopython releases,
# where MutableSeq(my_seq) / Seq(mutable_seq) are used instead - confirm
# against the installed version.
这是非常重要的知识点,在处理复杂的碱基序列中用得着:
新建 SeqRecord:
# Wrap a bare Seq in a SeqRecord.
simple_seq = Seq("GATC")
from Bio.SeqRecord import SeqRecord
simple_seq_r = SeqRecord(simple_seq)
还可以通过初始化函数给 id、name 和 description 赋值;否则它们会被设为默认值(例如 "<unknown id>"),之后仍可修改:
# Fill in the record's fields after it has been created.
simple_seq_r.id = "AC12345"
simple_seq_r.description = "Made up sequence I wish I could write a paper about"
# The same values can instead be supplied when the SeqRecord is created:
SeqRecord(simple_seq, id="AC12345")
用切片从父序列截取5:18,然后取反向互补序列
from Bio.SeqFeature import SeqFeature, FeatureLocation
example_parent = Seq("ACCGAGACGGCAAAGGCTAGCATAGGTATGAGACTTCCTTCCTGCCAGTGCTGAGGAACTGGGAGCCTAC")
# The strand is set on the location itself: the strand= keyword of SeqFeature
# is a legacy spelling that newer Biopython releases no longer accept, while
# FeatureLocation(..., strand=...) works on old and new releases alike.
example_feature = SeqFeature(FeatureLocation(5, 18, strand=-1), type="gene")
# Strand -1 makes extract() return the reverse complement of parent[5:18].
feature_seq = example_feature.extract(example_parent)
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import generic_protein
# Build one protein record and print it as FASTA-formatted text.
# NOTE(review): generic_protein comes from Bio.Alphabet, which exists only in
# Biopython releases before 1.78 - confirm the installed version.
record = SeqRecord(
    Seq(
        "MMYQQGCFAGGTVLRLAKDLAENNRGARVLVVCSEITAVTFRGPSETHLDSMVGQALFGD"
        "GAGAVIVGSDPDLSVERPLYELVWTGATLLPDSEGAIDGHLREVGLTFHLLKDVPGLISK"
        "NIEKSLKEAFTPLGISDWNSTFWIAHPGGPAILDQVEAKLGLKEEKMRATREVLSEYGNM"
        "SSAC",
        generic_protein,
    ),
    id="gi|14150838|gb|AAK54648.1|AF376133_1",
    description="chalcone synthase [Cucumis sativus]",
)
# print() as a function call works on Python 2 and Python 3; the bare print
# statement is a syntax error on Python 3.
print(record.format("fasta"))
Bio.AlignIO.read() 只能读取一个多序列比对,而 Bio.AlignIO.parse() 可以依次读取多个序列比对数据:
from Bio import AlignIO
# Read the single alignment stored in this Stockholm-format file.
alignment = AlignIO.read("PF05371_seed.sth", "stockholm")