安装库:按 Windows + R,输入 cmd 打开命令提示符,然后依次执行下面这两行命令:
pip install requests
pip install lxml
import requests
from lxml import etree
import os
废话不多说,直接上源码。代码复制后就可以直接运行:
def one_url(url, timeout=10):
    """Download ``url`` and return its body decoded as GB2312 text.

    Args:
        url: Address of the page to fetch.
        timeout: Value forwarded to ``requests.get`` as its ``timeout``
            argument, so a server that stops responding raises an error
            instead of blocking the script indefinitely.

    Returns:
        The response body as ``str``, decoded with the ``gb2312`` encoding.
    """
    resp = requests.get(url, timeout=timeout)
    # Force the decoding rather than relying on what requests guesses
    # (presumably the target site serves GB2312 pages - confirm if reused).
    resp.encoding = 'gb2312'
    return resp.text
# Parse the category listing and save every template it links to
_REQUEST_TIMEOUT = 10  # passed to requests.get so a stalled server cannot hang the crawl


def _fetch_tree(page_url, encoding='gb2312'):
    """Download ``page_url`` and return it parsed as an lxml HTML tree.

    ``encoding`` overrides the response's text encoding before decoding;
    pass ``None`` to keep the encoding requests selected on its own.
    """
    resp = requests.get(page_url, timeout=_REQUEST_TIMEOUT)
    if encoding is not None:
        resp.encoding = encoding
    return etree.HTML(resp.text)


def analysis(xpath, url):
    """Walk the front page's category lists and download the linked templates.

    Prompts for a download directory, then for each of the seven ``ul``
    category lists creates ``<category>/<sub-category>/`` folders below it and
    saves every template archive found as ``<title>.zip`` in the matching
    sub-category folder.

    Args:
        xpath: HTML text of the site's front page (despite the name, this is
            the page source, not an XPath expression).
        url: Site root used as a prefix for the relative links found in the
            pages; it is concatenated directly, so it must end with ``/``.

    Raises:
        OSError: If the entered download path cannot be switched to
            (raised by ``os.chdir``).
    """
    input_pwd = input("请输入下载路径:")
    # Keep the original side effect (and its error on a bad path), then work
    # with absolute paths so nothing below depends on the current directory.
    os.chdir(input_pwd)
    base_dir = os.getcwd()

    # Parsed once and never rebound: the category loop below must keep
    # querying the front page, not whichever sub-page was fetched last.
    home_tree = etree.HTML(xpath)

    for e in range(1, 8):
        for category in home_tree.xpath(f'/ html / body / div[5] / div / ul[{e}] '):
            titles = category.xpath('./ li[1]/text()')
            if not titles:
                # No heading text to name the folder after; skip this list.
                continue
            category_dir = os.path.join(base_dir, titles[0])
            # exist_ok lets the script be re-run into the same directory.
            os.makedirs(category_dir, exist_ok=True)

            link_names = category.xpath('./ li/ a/text()')
            link_hrefs = category.xpath('./ li/ a/@href')
            nuw = 1  # running count of downloads within this category
            for n, h in zip(link_names, link_hrefs):
                type_dir = os.path.join(category_dir, n)
                os.makedirs(type_dir, exist_ok=True)

                type_tree = _fetch_tree(url + h.strip('/'))
                for u in type_tree.xpath('/html/body/div[5]/dl/dd/ul/li/a/@href'):
                    ppt_tree = _fetch_tree(url + u.strip('/'))
                    xpath_ppt = ppt_tree.xpath(
                        '/html/body/div[4]/div[1]/dl/dd/ul[1]/li/a/@href')
                    xpath_name = ppt_tree.xpath(
                        '/html/body/div[4]/div[1]/dl/dd/div[1]/h1/text()')
                    for x_url, x_name in zip(xpath_ppt, xpath_name):
                        # The download page is parsed with the encoding
                        # requests chose, as in the original code.
                        html_download = _fetch_tree(
                            url + x_url.strip('/'), encoding=None)
                        xpath_download = html_download.xpath(
                            '/html/body/dl/dd/ul[2]/li[1]/a/@href')
                        for download in xpath_download:
                            resp_download = requests.get(
                                download, timeout=_REQUEST_TIMEOUT)
                            target = os.path.join(type_dir, f'{x_name}.zip')
                            with open(target, mode='wb') as f:
                                f.write(resp_download.content)
                            print(x_name + '下载完成', nuw)
                            nuw += 1
# Entry point
def main():
    """Fetch the site's front page and pass it to ``analysis`` for downloading."""
    site_root = 'https://www.ppt.com/'
    front_page = one_url(site_root)
    analysis(front_page, site_root)
# Start the crawl only when this file is executed directly, not on import
if __name__ == '__main__':
    main()
拿到PPT模板分类标签,然后创建目录
切换到创建好的目录中,开始下载PPT
保存PPT模板到本地
如果你有更简单的爬取方法,欢迎在下面评论。