Web Crawler Case Study: Scraping Resume Templates

import os

import requests
from lxml import etree

url = 'https://sc.chinaz.com/jianli/free.html'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
}

# Create the output directory if it does not exist yet
if not os.path.exists('./jianli'):
    os.makedirs('./jianli')

page_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(page_text)

# Each <a> under the container div points to a template's detail page
href_list = tree.xpath('//div[@id="container"]/div/a')
for href in href_list:
    hrefs = href.xpath('./@href')[0]  # got the detail-page link
    response = requests.get(url=hrefs, headers=headers)
    response.encoding = 'utf-8'  # decode the detail page as UTF-8 to avoid garbled Chinese
    tree_href = etree.HTML(response.text)
    down_href = tree_href.xpath('//ul[@class="clearfix"]/li[1]/a/@href')[0]  # got the download link
    name_href = tree_href.xpath('//div[@class="bgwhite"]//h1/text()')[0] + '.rar'  # templates download as .rar archives
    # name_href = name_href.encode("ISO-8859-1").decode("gbk")  # general fix for garbled Chinese (method two)
    down_text = requests.get(url=down_href, headers=headers).content
    rar_path = 'jianli/' + name_href
    with open(rar_path, 'wb') as fp:
        fp.write(down_text)
        print(name_href + " saved!")
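
The commented-out line in the loop is an alternative fix for garbled Chinese filenames: when requests decodes a GBK page as ISO-8859-1, re-encoding the string back to ISO-8859-1 bytes and decoding those bytes as GBK recovers the original text. A minimal standalone sketch of the trick (the sample string is just an illustration, not taken from the site):

# Simulate the mojibake: GBK bytes wrongly decoded as ISO-8859-1
garbled = '个人简历'.encode('gbk').decode('ISO-8859-1')
# Undo it: re-encode to recover the original bytes, then decode with the right charset
fixed = garbled.encode('ISO-8859-1').decode('gbk')
print(fixed)  # -> 个人简历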

During testing, only one page of resumes was scraped. To grab every resume template, just change url = 'https://sc.chinaz.com/jianli/free.html' to take the appropriate page parameter and wrap the whole thing in a loop, as in the sketch below.
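
A minimal sketch of that loop, assuming the later list pages follow a free_2.html, free_3.html naming pattern (both the URL pattern and the page count of 5 are assumptions to verify against the site):

# Hypothetical pagination sketch: page 1 is free.html, later pages are
# assumed to be free_2.html, free_3.html, ...
for page in range(1, 6):  # the upper bound of 5 pages is an arbitrary example
    if page == 1:
        url = 'https://sc.chinaz.com/jianli/free.html'
    else:
        url = f'https://sc.chinaz.com/jianli/free_{page}.html'
    page_text = requests.get(url=url, headers=headers).text
    # ... parse page_text and download each template exactly as shown above ...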
