【网络爬虫】Python爬取51job职位相关信息到EXCEL表

【网络爬虫】Python爬取51job职位相关信息到EXCEL表

       运行代码后,直接输入要搜索的岗位关键字并回车,即可生成Excel表格;可根据需要提取每个职位URL页面中的信息。同时,可根据要搜索的范围更改职位列表的页数,其中每一页有50个职位。

(图1:【网络爬虫】Python爬取51job职位相关信息到EXCEL表)

完整代码如下:

# -*- coding: utf-8 -*-
# @Time : 2021/1/20 22:05
# @Author : 陈良兴
# @File : 51job_to_excel.py
# @Software : PyCharm

from bs4 import BeautifulSoup                       #网页解析,获取数据
from urllib import parse
import requests
import re                                           #正则表达式,进行文字匹配
import urllib.request,urllib.error                  #制定URL,获取网页数据
import xlwt                                         #进行excel操作


# Module-level accumulators shared by main() and saveData().
jobData = list()                                    # scraped job fields, collected across all pages
job_href = list()                                   # detail-page URL of every job found

# Example search URL:
# https://search.51job.com/list/040000,000000,0000,00,9,99,%25E6%259C%25BA%25E6%25A2%25B0,2,1.html
kw = input("请输入你要搜索的岗位关键字:")
# Percent-encode the keyword twice: the example URL above carries its keyword
# in that doubly-quoted form (note the "%25" prefixes).
keyword = parse.quote(string=parse.quote(string=kw))

# Column keys in the exact order saveData() consumes them. main() appends one
# single-key dict per key, per job URL, so the flat list stays row-aligned.
_JOB_FIELDS = (
    "工作岗位", "公司", "工作经验", "学历", "薪资",
    "招聘人数", "职能类别", "职位信息", "地址", "公司信息",
)


def _first_title(elements):
    """Return the ``title`` attribute of the first element, or "" if absent."""
    if not elements:
        return ""
    return elements[0].get("title", "")


def _split_job_summary(summary):
    """Split a "|"-separated summary into (site, experience, education, headcount).

    A five-part summary is read as site | experience | education | headcount |
    date. Any other length is treated as having no education part, in which
    case education falls back to "学历不限". Missing parts become "" instead of
    raising IndexError.
    """
    parts = [part.strip() for part in summary.split("|")]

    def part_at(index):
        return parts[index] if index < len(parts) else ""

    site = part_at(0)
    # Drop the last two characters, as the original code did
    # (presumably the "经验" suffix - TODO confirm against live pages).
    work_experience = part_at(1)[0:-2]
    if len(parts) == 5:
        education = part_at(2)
        recruitment = part_at(3)
    else:
        education = "学历不限"
        recruitment = part_at(2)
    return site, work_experience, education, recruitment


def _parse_job_page(html):
    """Extract every _JOB_FIELDS value from one job detail page.

    Returns a dict with exactly the keys in _JOB_FIELDS. Anything that cannot
    be found in ``html`` (including an empty page after a failed download) is
    returned as "" so that every job still yields a full row.
    """
    bs = BeautifulSoup(html, "html.parser")

    summary_nodes = bs.select(".ltype")
    if summary_nodes:
        site, work_experience, education, recruitment = _split_job_summary(
            summary_nodes[0].get("title", "")
        )
    else:
        site = work_experience = education = recruitment = ""

    # Use the element text rather than str() of the tag list, so no HTML
    # markup leaks into the spreadsheet cell.
    salary_nodes = bs.select(".cn > strong")
    salary = salary_nodes[0].text.strip() if salary_nodes else ""

    # Read the category names from the link texts directly instead of
    # regex-matching the stringified tag list.
    job_category = "、".join(
        link.text.strip() for link in bs.select(".mt10 > p.fp > a.el.tdn")
    )

    job_information = "".join(p.text for p in bs.select(".job_msg > p"))
    company_information = "".join(
        node.text for node in bs.select(".tmsg.inbox")
    )

    return {
        "工作岗位": _first_title(bs.select(".cn > h1")),
        "公司": _first_title(bs.select(".catn")),
        "工作经验": work_experience,
        "学历": education,
        "薪资": salary,
        "招聘人数": recruitment,
        "职能类别": job_category,
        "职位信息": job_information.replace("\xa0", ""),
        "地址": site,
        "公司信息": company_information.replace("\xa0", ""),
    }


def main():
    """Crawl the 51job search results for ``keyword`` and export them to Excel.

    Step 1 downloads each search-result page and collects the job detail URLs
    into the module-level ``job_href`` list. Step 2 downloads every detail
    page, parses it, and appends exactly one single-key dict per column to the
    module-level ``jobData`` list. Step 3 hands ``jobData`` to saveData().

    A detail page that cannot be downloaded is reported on stdout and written
    as a blank row, so the data stays aligned with ``job_href``.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3861.400 QQBrowser/10.5.4313.400"
    }
    timeout = 10                                    # seconds; keeps a stalled connection from hanging the crawl

    # Number of result pages to fetch; widen the range to crawl more pages
    # (the original author notes 50 listings per page).
    for page in range(1, 2):
        url = "https://search.51job.com/list/040000,000000,0000,00,9,99," + keyword + ",2," + str(page) + ".html"
        page_text = requests.get(url=url, headers=headers, timeout=timeout).text
        job_href.extend(re.findall(r'"job_href":"(.*?)\?', page_text, re.S))

    # Visit each job's detail page and extract its fields.
    for href in job_href:
        pageurl = href.replace("\\", "")            # hrefs are captured with backslash escapes
        html = ""
        try:
            request = urllib.request.Request(url=pageurl, headers=headers)
            with urllib.request.urlopen(request, timeout=timeout) as response:
                # Decode as GBK like the original; replace undecodable bytes
                # so a single bad byte does not abort the crawl.
                html = response.read().decode("gbk", errors="replace")
        except (OSError, ValueError) as e:
            # OSError covers URLError/HTTPError and socket timeouts;
            # ValueError is raised by Request() for a malformed URL.
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print(e.reason)
            else:
                print(e)

        fields = _parse_job_page(html)
        jobData.extend({key: fields[key]} for key in _JOB_FIELDS)

    saveData(jobData,".\\51job最新职位表.xls")


#保存数据
def saveData(jobData,savepath):
    """Write the scraped job fields to an .xls workbook at ``savepath``.

    ``jobData`` is a flat list of single-key dicts laid out row by row: each
    job contributes ``len(col)`` consecutive entries, in the order of ``col``
    below. The number of rows is taken from ``jobData`` itself, so the
    function no longer depends on the module-level ``job_href`` list. A
    trailing incomplete row, if any, is ignored.
    """
    print("\n\033[34;1m正在保存,请稍等......\033[0m")
    book = xlwt.Workbook(encoding="utf-8",style_compression=0)               # create the workbook
    sheet = book.add_sheet('51job最新50条职位表',cell_overwrite_ok=True)        # create the worksheet
    # Columns: job title, company, work experience, education, salary,
    # headcount, job category, job description, location, company profile.
    col = ("工作岗位","公司","工作经验","学历","薪资","招聘人数","职能类别","职位信息","地址","公司信息")
    width = len(col)

    for column, title in enumerate(col):
        sheet.write(0, column, title)                                       # header row

    for row in range(len(jobData) // width):
        print("第%d条" %(row+1))
        entries = jobData[row * width:(row + 1) * width]
        for column, (key, entry) in enumerate(zip(col, entries)):
            # .get() leaves the cell empty when an entry holds another key.
            sheet.write(row + 1, column, entry.get(key))

    book.save(savepath)                                                     # write the workbook to disk


if __name__ == "__main__":
    # Run the crawl, then print the two completion notices
    # (wrapped in ANSI bold-red escape codes).
    main()
    completion_notices = (
        "\n\033[31;1m爬取完毕!!!\033[0m",
        "\n\033[31;1m已生成Excel表,请打开查看!!!\033[0m",
    )
    for notice in completion_notices:
        print(notice)

你可能感兴趣的:(python,python)