Python爬虫练习-Xpath解析批量爬取PPT

分页批量爬取 sc.chinaz.com 上的免费 PPT,并使用 XPath 解析每个列表页。

import os
import requests
from lxml import etree

if __name__ == '__main__':
    # Script entry point: fetch the first two listing pages of free PPT
    # templates and select the candidate entries on each page with XPath.

    # Send a browser-like User-Agent (UA spoofing).
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.70 Safari/537.36'
    }
    # URL template for the listing pages; %d is replaced by the page number.
    url = 'https://sc.chinaz.com/ppt/free_%d.html'
    # Paginated crawl: range(1, 3) covers pages 1 and 2.
    for pageNum in range(1, 3):
        # Page 1 is served without a numeric suffix, so it does not use the
        # template.
        if pageNum == 1:
            new_url = 'https://sc.chinaz.com/ppt/free.html'
        else:
            new_url = url % pageNum
        # Send the request. requests applies no timeout by default, so an
        # explicit one keeps a stalled connection from blocking the crawl
        # forever.
        response = requests.get(url=new_url, headers=headers, timeout=10)
        # Set the response encoding by hand before reading the body as text.
        response.encoding = 'utf-8'
        page_text = response.text
        # Parse the HTML and select the matching <div> elements with XPath.
        tree = etree.HTML(page_text)
        ppt_list = tree.xpath('//div[@class="container clearfix"]/div[5]/div')
        # Check the folder
   

你可能感兴趣的:(Python爬虫,爬虫,python,开发语言)