相信 Entrez 的强大是有目共睹的,BioPython 将它几乎所有操作都封装为方法,使我们可以更加方便的利用这个强悍工具。对于分析比对多个序列文件时的工作量说多了都是泪。比如,老板让你比对自己测定序列与 NCBI 库中序列,并构建相应的进化树,而这个序列需要大于100条。我想你的心情不会和下载一条序列时那么平静,那么,接下来通过BioPython提供的接口来实现快速的自动化序列下载。
from Bio import Entrez,SeqIO
# 参数设置
Entrez.email = "[email protected]"
Entrez.tool = "exampleScript"
# 查询 oct4 基因的在 Nucleotide 中的总数
hd_egquery = Entrez.egquery(term="oct4")
read_egquery = Entrez.read(hd_egquery)
total = 0
for ele in read_egquery["eGQueryResult"]:
if ele["MenuName"] == "Nucleotide":
total = ele["Count"]
# 得到查询 id 列表
hd_esearch = Entrez.esearch(db="nucleotide", term="oct4", retmax=total)
read_esearch = Entrez.read(hd_esearch)
# 这里我们只取前两个序列
ids = read_esearch["IdList"][:2]
# 用得到的 id 列表去下载每一条 fasta 文件,并合并,以便后续分析使用(比如进化树构建)
hd_efetch_fa = Entrez.efetch(db='nucleotide', id=ids, rettype='fasta')
read_efetch_fa = hd_efetch_fa.read()
with open("res/oct4.fasta","w") as file:
file.write(read_efetch_fa)
# 同理你可以得到 xml 格式的序列信息
hd_efetch_xml = Entrez.efetch(db="nucleotide", id=ids, retmode="xml")
read_efetch_xml = Entrez.read(hd_efetch_xml)
print(read_efetch_xml)
hd_efetch_gb = Entrez.efetch(db="nuccore", id=ids, rettype="gb", retmode="text")
# 这里读取的是文本文件,保存为本地数据
read_efetch_gb = hd_efetch_gb.read()
with open("res/oct4.gb","w") as file:
file.write(read_efetch_gb)
# 如果需要提取其中一些信息,可以按照以下步骤, 这里需要注意需要重新得到 efetch 句柄
hd_efetch_gb = Entrez.efetch(db="nuccore", id=ids, rettype="gb", retmode="text")
parse_efetch_gb = SeqIO.parse(hd_efetch_gb, "gb")
# 这里可以保存为 xls 或者 csv 格式
for ele in parse_efetch_gb:
print(ele.name, ele.annotations['molecule_type'], ele.seq)
还记得上一篇教程中提到的历史记录吗?
利用这个特性,不仅可以减轻 Entrez 服务器的负载,更可以同时获取多条数据,节省大量时间精力
from Bio import Entrez
# 参数设置
Entrez.email = "[email protected]"
Entrez.tool = "exampleScript"
hd_search = Entrez.esearch(db="nucleotide", term="oct4", usehistory="y")
read_search = Entrez.read(hd_search)
webenv = read_search["WebEnv"]
query_key = read_search["QueryKey"]
# 使用历史记录特性来进行搜索。
# Entrez 将会提前进行缓冲,提高查询效率
step = 5
total = 10
with open("res/res_env_oct4.fasta", "w") as res_file:
for start in range(0, total, step):
end = min(total, start+step)
print("Download record %i to %i" % (start+1, end))
hd_fetch = Entrez.efetch(db="nucleotide", rettype="fasta", retmode="text", retstart=start, retmax=step, webenv=webenv, query_key=query_key)
records = hd_fetch.read()
res_file.write(records)
from Bio import Entrez
from Bio import Medline
# 参数设置
Entrez.email = "[email protected]"
Entrez.tool = "exampleScript"
# 用 esearch 在 pubmed 库中搜索关键字为 "mouse" 的文章
# RetMax 这个参数为每次返回的最大个数,因此如果把Count的值赋给RetMax就会获取全部的mouse的文章,这里为实例设置为100
hd_esearch = Entrez.esearch(db="pubmed", term="mouse", RetMax="100")
read_esearch = Entrez.read(hd_esearch)
idlist = read_esearch["IdList"]
print ("Total: ", read_esearch["Count"])
# 用 efetch下载
hd_efetch = Entrez.efetch(db="pubmed", id=idlist, rettype="medline", retmode="text", )
# 用 Medline 来解析
parse_medline = Medline.parse(hd_efetch)
with open("res/mouse_pubmed.xls", "w") as file:
file.write("title\tauthors\tsource\tPubMed\n")
for i, ele in enumerate(list(parse_medline)):
line = ele['TI'] + "\t" + ",".join(ele['AU']) + "\t" + ele['SO'] + "\t" + ele['PMID'] + "\n"
file.write(line)
print (i, line)
from Bio import Entrez
# 参数设置
Entrez.email = "[email protected]"
Entrez.tool = "exampleScript"
# 搜索
hd_esearch = Entrez.esearch(db="pubmed", term="Sus scrofa", reldate=365, ptyp="Review", usehistory="y")
read_esearch = Entrez.read(hd_esearch)
total = int(read_esearch["Count"])
webenv = read_esearch["WebEnv"]
query_key = read_esearch["QueryKey"]
# 这里演示设定total为 10
total = 10
step = 5
print("Result items: ", total)
with open("res/recent_review_sus_scrofa.txt", "w") as file:
for start in range(0, total, step):
print("Download record %i to %i" % (start + 1, int(start+step)))
hd_efetch = Entrez.efetch(db="pubmed", retstart=start, retmax=step, webenv=webenv, query_key=query_key, rettype="medline", retmode="text")
file.write(hd_efetch.read())
NCBI 提供了很多生物相关数据库,用法几乎差不多,可以根据自身研究或者感兴趣的方向自行选择。
下面的例子是利用NCBI中的分类库 Taxonomy 来查询我们人类在分类学中的位置。
# =====查看物种谱系=====
from Bio import Entrez
# 参数设置
Entrez.email = "[email protected]"
Entrez.tool = "exampleScript"
# 在 Taxonomy 库中搜索 Homo sapiens
hd_esearch = Entrez.esearch(db="Taxonomy", term="Homo sapiens")
read_esearch = Entrez.read(hd_esearch)
id = read_esearch["IdList"][0]
hd_eftech = Entrez.efetch(db="Taxonomy", id=id, retmode="xml")
read_eftech = Entrez.read(hd_eftech)
print(read_eftech[0]["Lineage"])