Python example: scraping resume templates

import requests
import os
from lxml import etree

if __name__ == '__main__':
    if not os.path.exists('./sucai'):
        os.mkdir('./sucai')
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36 Edg/96.0.1054.62'
    }
    url = 'https://sc.chinaz.com/jianli/free_%d.html'
    # paginate through the listing pages
    for pageNum in range(1, 50):
        if pageNum == 1:
            new_url = 'https://sc.chinaz.com/jianli/free.html'
        else:
            new_url = url % pageNum
        # fetch the listing page
        response = requests.get(url=new_url, headers=headers)
        response.encoding = 'utf-8'
        page_text = response.text
        # parse the page into an element tree
        tree = etree.HTML(page_text)
        # locate the tag for each resume detail page; //* matches elements anywhere in the document
        div_list = tree.xpath('//*[@id="container"]/div')
        print('Starting data scrape....')
        for div in div_list:
            # url of the detail page
            detail_url = 'https:' + div.xpath('./a/@href')[0]
            # file name for the resume archive
            page_name = div.xpath('./a/img/@alt')[0] + '.rar'
            # fetch the detail page
            # timeout guards against a server that is slow to respond
            detail_response = requests.get(url=detail_url, headers=headers, timeout=(3, 5))
            detail_response.encoding = detail_response.apparent_encoding
            detail_data = detail_response.text
            tree2 = etree.HTML(detail_data)  # parse the detail page
            # pick download mirror 1, i.e. li[1]
            download_url = tree2.xpath('//div[@id="down"]/div[2]/ul/li[1]/a/@href')[0]
            # timeout again, since the file server may also be slow to respond
            download_response = requests.get(url=download_url, headers=headers, timeout=(3, 10))
            # the .rar archive is binary, so read the raw bytes
            download_data = download_response.content
            filepath = 'sucai/' + page_name
            with open(filepath, 'wb') as fp:
                fp.write(download_data)
            print(page_name, 'downloaded successfully!')
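
One caveat: the timeout values above only bound how long each request is allowed to wait; if a request does time out, requests raises an exception and the whole loop aborts. Below is a minimal sketch of a retry wrapper, assuming a hypothetical helper name fetch_with_retry and a fixed retry count that are not part of the original script:

import time
import requests

def fetch_with_retry(url, headers, timeout, retries=3):
    # hypothetical helper: retry a GET a few times before giving up
    for attempt in range(retries):
        try:
            return requests.get(url=url, headers=headers, timeout=timeout)
        except requests.exceptions.RequestException:
            # brief back-off before the next attempt
            time.sleep(2)
    return None

A call such as fetch_with_retry(detail_url, headers, (3, 5)) could then stand in for the bare requests.get calls above, with a None check so that one failed download does not abort the whole crawl.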
