# Crawler that downloads the images embedded in a WeChat Official Account article
from urllib.parse import parse_qs, urlsplit

import requests
from lxml import etree
# Article page to scrape, and the request headers sent with it
# (the User-Agent identifies the client as desktop Firefox on Linux).
test_url = 'https://mp.weixin.qq.com/s/AH7tdFR_QGTMg2wxaj0sog'
headers = {
    'host': 'mp.weixin.qq.com',
    'User-Agent': (
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) '
        'Gecko/20100101 Firefox/117.0'
    ),
}
def download_pic(url, timeout=10):
    """Download one image and save it in the current working directory.

    The file is named ``<second-to-last path segment>.<format>``. The format
    is read from the ``wx_fmt`` query parameter when the URL carries one;
    otherwise the text after the last ``=`` in the URL is used.

    Any failure is reported with ``print`` and never raised, so one bad image
    does not stop the caller from trying the remaining ones.

    Args:
        url: Absolute URL of the image.
        timeout: Seconds to wait for the server before giving up.
    """
    print('download pic:' + url)
    try:
        # Derive the name from the part before the query string, so a '/'
        # inside a query value cannot shift which segment is picked.
        pic_name = url.partition('?')[0].split('/')[-2]
        # Look the format up by key: trailing parameters such as
        # "...?wx_fmt=png&from=appmsg" must not end up as the extension.
        wx_fmt = parse_qs(urlsplit(url).query).get('wx_fmt')
        fmt = wx_fmt[0] if wx_fmt else url.split('=')[-1]
        img_resp = requests.get(url, timeout=timeout)
        # Do not save an HTTP error page under an image file name.
        img_resp.raise_for_status()
        with open(pic_name + '.' + fmt, 'wb') as f:
            f.write(img_resp.content)
    except Exception as reason:
        # Deliberately broad: downloading is best-effort, report and move on.
        print(str(reason))
def get_pic(content):
    """Download every image referenced by an ``<img data-src="...">`` attribute.

    Args:
        content: Parsed document exposing ``xpath()``; the script passes the
            result of ``etree.HTML``.
    """
    for src in content.xpath('//img/@data-src'):
        download_pic(src)
if __name__ == '__main__':
    # Fetch the article page. The timeout keeps a stalled server from hanging
    # the script, and the status check stops an HTTP error page from being
    # parsed as if it were the article (which would silently yield no images).
    page = requests.get(url=test_url, headers=headers, timeout=10)
    page.raise_for_status()
    res = page.text
    html = etree.HTML(res)
    get_pic(html)
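# Note: compared with the other parsing approaches, XPath parsing turns out to
# be the fastest and the most broadly applicable. XPath itself is only a query
# language; the speed comes from lxml, whose XPath engine is built on the
# C library libxml2.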