# 运行代码后,直接输入要搜索的岗位关键字并回车,即可生成Excel表格;可根据需要提取每个职位URL页面中的信息,也可根据要搜索的范围修改职位列表的页数(每一页有50个职位)。
# -*- coding: utf-8 -*-
# @Time : 2021/1/20 22:05
# @Author : 陈良兴
# @File : 51job_to_excel.py
# @Software : PyCharm
from bs4 import BeautifulSoup #网页解析,获取数据
from urllib import parse
import requests
import re #正则表达式,进行文字匹配
import urllib.request,urllib.error #制定URL,获取网页数据
import xlwt #进行excel操作
jobData = [] # flat list of single-key dicts; main() appends to it and saveData() reads it
job_href = [] # job-detail URLs collected from the search-result pages
# Example search URL (the keyword segment is percent-encoded twice):
# https://search.51job.com/list/040000,000000,0000,00,9,99,%25E6%259C%25BA%25E6%25A2%25B0,2,1.html
kw = input("请输入你要搜索的岗位关键字:")
keyword = parse.quote(parse.quote(kw)) # quote twice so the keyword matches the double-encoded form in the URL above
def _fetch_job_page(pageurl, headers):
    """Return the decoded HTML of one job-detail page, or "" if the request fails."""
    request = urllib.request.Request(url=pageurl, headers=headers)
    try:
        # The context manager closes the connection even if read() fails.
        with urllib.request.urlopen(request) as response:
            # Pages are decoded as GBK; undecodable bytes are replaced
            # instead of aborting the whole run.
            return response.read().decode("gbk", errors="replace")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return ""


def _split_job_summary(title):
    """Split the "|"-separated summary into (site, experience, education, recruitment)."""
    if not title.strip():
        # No summary at all (e.g. the page could not be fetched): leave blank.
        return "", "", "", ""
    info = [part.strip() for part in title.split("|")]
    if len(info) == 5:
        site, work_experience, education, recruitment = info[:4]
    else:
        # Without the education field the remaining fields shift left by one.
        # Pad so that a short summary cannot raise IndexError.
        info += [""] * (3 - len(info))
        site, work_experience, recruitment = info[:3]
        education = "学历不限"
    # Drop the two-character suffix of the experience field, as before.
    return site, work_experience[0:-2], education, recruitment


def _parse_job_page(html):
    """Extract one job's fields from *html*.

    Always returns exactly ten single-key dicts, in the column order that
    saveData() expects, so every URL maps to one complete spreadsheet row
    even when the page is empty or an element is missing.
    """
    bs = BeautifulSoup(html, "html.parser")

    def first_title(selector):
        # "title" attribute of the first match, or "" when nothing matches.
        nodes = bs.select(selector)
        return nodes[0].get("title", "") if nodes else ""

    def joined_text(selector):
        # Concatenated text of every element matching the selector.
        return "".join(node.text for node in bs.select(selector))

    site, work_experience, education, recruitment = _split_job_summary(
        first_title(".ltype")
    )
    # Use the anchors' text directly; the previous regex matched only empty strings.
    job_category = "、".join(
        anchor.text.strip() for anchor in bs.select(".mt10 > p.fp > a.el.tdn")
    )
    return [
        {"工作岗位": first_title(".cn > h1")},
        {"公司": first_title(".catn")},
        {"工作经验": work_experience},
        {"学历": education},
        # Text only, so the cell no longer contains the surrounding markup.
        {"薪资": joined_text(".cn > strong")},
        {"招聘人数": recruitment},
        {"职能类别": job_category},
        {"职位信息": joined_text(".job_msg > p").replace("\xa0", "")},
        {"地址": site},
        {"公司信息": joined_text(".tmsg.inbox").replace("\xa0", "")},
    ]


def main(page_count=1):
    """Scrape 51job search results for the module-level ``keyword`` and export them.

    The function first reads ``page_count`` search-result pages and appends
    every job-detail URL found to the module-level ``job_href`` list. It then
    downloads each URL, appends that job's ten fields to the module-level
    ``jobData`` list, and finally writes everything to an ``.xls`` file via
    saveData().

    Args:
        page_count: Number of search-result pages to read, starting from
            page 1. Defaults to 1, which matches the previous hard-coded range.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3861.400 QQBrowser/10.5.4313.400"
    }
    pat_job_href = r'"job_href":"(.*?)\?'

    # Step 1: collect the job-detail URLs from each search-result page.
    for page in range(1, page_count + 1):
        url = "https://search.51job.com/list/040000,000000,0000,00,9,99," + keyword + ",2," + str(page) + ".html"
        page_text = requests.get(url=url, headers=headers).text
        job_href.extend(re.findall(pat_job_href, page_text, re.S))

    # Step 2: visit every job page and append its ten fields.
    for href in job_href:
        # The URLs are embedded in the page with escaped slashes.
        pageurl = href.replace("\\", "")
        jobData.extend(_parse_job_page(_fetch_job_page(pageurl, headers)))

    saveData(jobData, ".\\51job最新职位表.xls")
# Save the collected data
def saveData(jobData, savepath):
    """Write the scraped job fields to an ``.xls`` workbook at *savepath*.

    Args:
        jobData: Flat list of single-key dicts, one group per job, each group
            holding one dict per column in the order of ``col`` below. A
            trailing incomplete group is ignored.
        savepath: Path of the Excel file to create.
    """
    print("\n\033[34;1m正在保存,请稍等......\033[0m")
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)  # create the workbook
    sheet = book.add_sheet('51job最新50条职位表', cell_overwrite_ok=True)  # create the worksheet
    # Column order: job title, company, work experience, education, salary,
    # head count, job category, job description, location, company profile.
    col = ("工作岗位","公司","工作经验","学历","薪资","招聘人数","职能类别","职位信息","地址","公司信息")
    width = len(col)

    # Header row.
    for column_index, heading in enumerate(col):
        sheet.write(0, column_index, heading)

    # The row count comes from the data actually passed in, not from the
    # module-level URL list, so a mismatch between them cannot raise IndexError.
    for row_index in range(len(jobData) // width):
        print("第%d条" % (row_index + 1))
        fields = jobData[row_index * width:(row_index + 1) * width]
        for column_index, (heading, field) in enumerate(zip(col, fields)):
            # .get() leaves the cell blank if the entry holds a different key.
            sheet.write(row_index + 1, column_index, field.get(heading))

    book.save(savepath)  # write the workbook to disk
if __name__ == "__main__":
    # Script entry point: run the scraper, then print the two completion notices.
    main()
    for notice in (
        "\n\033[31;1m爬取完毕!!!\033[0m",
        "\n\033[31;1m已生成Excel表,请打开查看!!!\033[0m",
    ):
        print(notice)