import urllib.request
import urllib.parse
from lxml import etree
def query(url):
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
'Referer':'https://www.baidu.com/link?url=_A6PAPRyw5Gs2ITJuAiY91laVo3xjjRUzUOTRV7K_7ObZuWV-LshlI9xBVBPWYIgIVhfRH13NxGwDDG3i-93L48AFaO0Smrj7GzaKhCOMQTHl6Wfc1Cjhm4DGcU8M99lHu5G6YURVZJwShNyZ5pCNLQs_mPRhrlzIZfsIIsuDmO&wd=&eqid=c7d9761f0002125100000004627c655d' }
req = urllib.request.Request(url=url, headers=headers, method='GET')
response = urllib.request.urlopen(req)
text = response.read().decode('utf-8')
html = etree.HTML(text)
sen_list = html.xpath('//div[contains(@class,"lemma-summary") or contains(@class,"lemmaWgt-lemmaSummary")]//text()')
sen_list_after_filter = [item.strip('\n') for item in sen_list]
return ''.join(sen_list_after_filter)
if __name__ =="__main__":
url = 'https://baike.baidu.com/item/%E7%BE%8E%E5%9B%BD%E8%81%94%E9%82%A6%E8%B0%83%E6%9F%A5%E5%B1%80/297801?fr=aladdin'
result = query(url)
print('结果:' + result)