之前用常规的爬虫思路(`import requests`、`from bs4 import BeautifulSoup`)下载文章的题目、作者、来源等信息时,偶尔会出现各种问题。与其把时间花在调试上,不如基于 Biopython 快速写一个脚本,简单好用。
#!/usr/bin/env python
# encoding:utf-8
from Bio import Entrez
from Bio import Medline
# Contact address that Biopython attaches to Entrez requests.
# NOTE(review): this value looks like a redaction placeholder -- confirm and
# replace it with a real address before running.
Entrez.email = '[email protected]'
# Shared output file written to by downref(); mode 'w+' truncates any
# existing ref.txt when the module is loaded.
ref = open('ref.txt','w+')
def downref(pmid):
    """Fetch the MEDLINE record(s) for ``pmid`` from PubMed and log them.

    For each record returned, the title, at most the first three authors and
    the source line are printed, and one tab-separated line
    (``title<TAB>authors<TAB>source``) is written to the module-level ``ref``
    file. A field that is missing from a record is reported as ``"?"``.

    Args:
        pmid: PubMed identifier, passed unchanged to ``Entrez.efetch`` as
            its ``id`` argument.
    """
    handle = Entrez.efetch(db="pubmed", id=pmid, rettype="medline", retmode="text")
    try:
        # Medline.parse yields records lazily and can only be consumed once,
        # so materialise them into a list while the handle is still open.
        records = list(Medline.parse(handle))
    finally:
        # Always release the network handle, even if parsing fails.
        handle.close()

    for record in records:
        title = record.get("TI", "?")
        source = record.get("SO", "?")
        # Slicing keeps at most the first three authors and leaves shorter
        # lists untouched; the "?" default passes through the join as "?".
        author = ','.join(record.get("AU", "?")[0:3])

        # Single-argument print calls produce the same output on Python 2
        # and Python 3.
        print("title: %s" % title)
        print("authors: %s" % author)
        print("source: %s" % source)

        ref.write(title + '\t' + author + '\t' + source + '\n')
if __name__=="__main__":
    # PubMed IDs to download; these could instead be read from a file.
    ids = ['21142915','25007187','27574448','16330681','24404132','22868256','23314736','18221820','24338217','17537913','19016765','22124095','18551042','17194903','22546611','22304580','22838950','21642870','19376514','20665488','24647007','23603345','23816762','19203783','25495407','21057378','22329723','21902499','21635146','24215845','22188361','19786980','21362365','21449681','24193570','17457342','19307503','20634689','17700593','23736036','20385995','25331073','24732178','20638924','21605004','22134350','17602053','20226083','18538455','18538445','24361227','16609362','18294295','21995462','26641474','18347181','26696550','19052714','18357466','24533712','23588304','19898482','2953022','26857783','20562211','19450125','15007088','16636344','23860526','20125120','24061601','25069034','20078613','19620936','27498158','25592234','24446315','19362955','16875718','22821704','17352537','18377430','19214144','23222202','24300978','22838949','23230131','16551864','24980784','27150640','25084203','22843789','20212519','16890579','26763541','22441531','21521023']
    try:
        # One request per PMID; `pmid` avoids shadowing the builtin `id`.
        for pmid in ids:
            downref(pmid)
    finally:
        # Close the shared output file so buffered lines reach disk even
        # when one of the fetches raises.
        ref.close()
参考:https://blog.csdn.net/Cassiel60/article/details/90664558