ncbi网站爬虫源代码(上一篇博客内容)

from bs4 import BeautifulSoup
import requests
from lxml import html
# Search-results page for "Journal of medicinal chemistry" on PubMed.
start_url = 'https://www.ncbi.nlm.nih.gov/pubmed/?term=Journal+of+medicinal+chemistry'
# Detail-page template; filled with a PMID by get_Detail().
url = 'https://www.ncbi.nlm.nih.gov/pubmed/{}'
# Request headers sent with the results-page fetch.
# BUG FIX: the original dict was missing a comma after the 'Cookie' value,
# so ' xxx ' and 'www.ncbi.nlm.nih.gov' were implicitly concatenated into one
# string and the 'Host' key was silently lost.
header = {
    'Cookie': ' xxx ',  # placeholder — replace with a real session cookie
    'Host': 'www.ncbi.nlm.nih.gov',
    'Referer': start_url,
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:60.0) Gecko/20100101 Firefox/60.0',
}
def get_Detail():
    """Fetch the PubMed search-results page and scrape each listed article.

    Pulls the PMID out of every '.rprtid' element (up to the first 20),
    builds the article's detail URL from the ``url`` template, and hands it
    to get_con() for printing. No return value; output is printed.
    """
    res = requests.get(start_url, headers=header)
    soup = BeautifulSoup(res.text, 'html.parser')
    # Slice instead of range(0, 20): the original indexed [i] and raised
    # IndexError whenever the page listed fewer than 20 results.
    for tag in soup.select('.rprtid')[:20]:
        # lstrip strips the character set {P,M,I,D,:}, not the literal
        # prefix — harmless here because PMIDs are purely numeric.
        pmid = tag.text.lstrip('PMID:').strip()
        get_con(url.format(pmid))
def get_con(each_url):
    """Fetch one article detail page and print its fields to stdout.

    Prints title, authors, abstract, PMID and DOI, each preceded by a
    Chinese label. Fields missing from the page print a single space
    instead of raising, matching the original's handling of the abstract.

    :param each_url: full URL of a PubMed article detail page.
    """
    # Send the same headers as the listing request — the original
    # inconsistently fetched detail pages without them.
    res = requests.get(each_url, headers=header)
    tree = html.fromstring(res.content)

    print("标题:")
    title = tree.xpath('//h1/text()')
    # Guard every [0] access: the original only guarded the abstract and
    # crashed with IndexError when a page lacked one of the other nodes.
    print(title[0] if title else " ", end='')

    print("\n作者:")
    auther = tree.xpath('//div[@class="auths"]/a/text()')
    for auter in auther:
        print(auter, end=",")

    print("\n摘要:")
    abstract = tree.xpath('//div[@class=""]/p/text()')
    if abstract:
        print(abstract[0], end="")
    else:
        print(" ")

    print("\nPMID:")
    pmid = tree.xpath('//dl[@class="rprtid"]/dd/text()')
    print(pmid[0] if pmid else " ", end="")

    print("\nDOI:")
    doi = tree.xpath('//dl[@class="rprtid"]/dd/a/text()')
    print(doi[0] if doi else " ")


if __name__ == '__main__':
    # Script entry point: scrape the results page and print each article.
    # (Fixed: the call was not indented under the guard in the original.)
    get_Detail()

你可能感兴趣的:(ncbi网站爬虫源代码(上一篇博客内容))