51job爬取招聘信息(python)

51job爬取招聘信息

2020.09.04

爬虫技术路线:requests库+bs4+xlwt
程序中分为三个函数:
spider(url)函数提取网页,返回整个网页源码
jiexi(html, info)函数解析网页,并且提取信息,参数html是网页源码,info是存放信息的列表
save(data)函数是保存数据的,将提取出来的数据进行逐一的保存至excel文件中去

# -*- coding: utf-8 -*-
# Author : YRH
# Date : 
# Project : 
# Tool : PyCharm
import ast
import json

import requests
import xlwt
from bs4 import BeautifulSoup


def spider(url, timeout=10):
    """Download a page and return its decoded text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the whole crawl.

    Returns:
        The response body as text, or None when the request failed
        (connection error, timeout, or an HTTP error status).
    """
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36"
    }
    try:
        rep = requests.get(url, headers=headers, timeout=timeout)
        rep.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP failures are expected here; anything else
        # (e.g. KeyboardInterrupt) should not be silently swallowed.
        print("解析失败")
        return None
    # Let requests guess the charset from the body rather than trusting
    # the (possibly missing) Content-Type header.
    rep.encoding = rep.apparent_encoding
    return rep.text


def _parse_search_payload(script_text):
    """Convert a ``name = {...}`` script body into a Python object.

    The text after the first ``=`` is parsed as JSON. If that fails, it is
    parsed as a Python literal with ``ast.literal_eval``. Neither parser
    executes code, unlike ``eval()``, which must never see remote content.
    """
    payload = str(script_text).split("=", 1)[1].strip().rstrip(";")
    try:
        return json.loads(payload)
    except json.JSONDecodeError:
        return ast.literal_eval(payload)


def _clean_field(record, key):
    """Return ``record[key]`` with backslashes removed, or " " if unusable."""
    try:
        return record[key].replace("\\", "")
    except (KeyError, AttributeError, TypeError):
        # Missing key, non-string value, or a record that is not a mapping.
        return " "


def jiexi(html, info):
    """Extract job postings from a search-result page and append them to info.

    Args:
        html: Page source as returned by ``spider``. A falsy value (None or
            an empty string, i.e. a failed download) is skipped.
        info: List that receives one
            ``[job_name, company_name, workarea_text, providesalary_text,
            company_href]`` row per posting; it is mutated in place.

    Raises:
        IndexError: If the page has fewer than three
            ``<script type="text/javascript">`` tags or the script has no
            ``=`` assignment.
        KeyError: If the payload has no ``engine_search_result`` entry.
        ValueError, SyntaxError: If the payload is neither valid JSON nor a
            Python literal.
    """
    if not html:
        return

    soup = BeautifulSoup(html, "lxml")
    # The search data is read from the third text/javascript script tag.
    text = soup.find_all("script", type="text/javascript")[2].string
    data = _parse_search_payload(text)["engine_search_result"]
    for d in data:
        job_name = _clean_field(d, "job_name")  # job title
        company_href = _clean_field(d, "company_href")  # company page link
        company_name = _clean_field(d, "company_name")  # company name
        providesalary_text = _clean_field(d, "providesalary_text")  # salary
        workarea_text = _clean_field(d, "workarea_text")  # work location
        info.append([job_name, company_name, workarea_text, providesalary_text, company_href])


def save(data, filename="招聘信息.xls"):
    """Write the collected rows to an .xls workbook.

    Args:
        data: Iterable of rows; each row is a sequence of cell values in the
            order job, company name, work location, salary, link.
        filename: Path of the workbook to create. Defaults to the name the
            script has always used.
    """
    print("save.....")
    workbook = xlwt.Workbook(encoding="utf-8")  # create the workbook object
    sheet = workbook.add_sheet("sheet1")  # create the worksheet

    # Header row.
    head = ["岗位", "公司名称", "工作地点", "薪资", "招聘网站"]
    for col, title in enumerate(head):
        sheet.write(0, col, title)  # args: row, column, value

    # One spreadsheet row per record, starting below the header.
    for row, record in enumerate(data, start=1):
        print("成功保存:" + str(row))
        for col, value in enumerate(record):
            sheet.write(row, col, value)

    workbook.save(filename)  # write the workbook to disk


def main():
    """Prompt for a job keyword and page count, crawl, and save the results."""
    name = input("请输入岗位名称")
    # int() rather than eval(): eval would execute whatever the user types.
    page = int(input("请输入爬取页数"))
    info = []
    for i in range(1, page + 1):
        url = f"https://search.51job.com/list/000000,000000,0000,00,9,99,{name},2,{i}.html?"
        html = spider(url)
        if html is None:
            # Download failed (spider already reported it); skip this page
            # instead of handing None to the parser.
            continue
        jiexi(html, info)
    save(info)


if __name__ == '__main__':
    main()

注:禁止用于商业用途

你可能感兴趣的:(python爬虫,python,大数据,爬虫)