# -*- coding:utf-8 -*-
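"""Scrape a single company's job listings from jobs.51job.com and save them to 51job.xlsx.

Listing pages are fetched by POSTing page numbers to the company URL; each posting's
detail page is then parsed with BeautifulSoup for the job description, contact
address and department information.
"""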
import requests
import re
import random
import time
import pandas as pd
from bs4 import BeautifulSoup
import lxml  # parser backend used by BeautifulSoup below
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)  # suppress SSL certificate warnings (verify=False is used below)
# Strip whitespace and other invalid characters from a string
def clear(val):
    illegal_char = [' ', '\n', '\u3000', '\r', '\t', '\f']  # includes the full-width space
    for ch in illegal_char:
        val = re.sub(ch, '', val)  # remove every occurrence of the character
    return val
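# Example: clear(' Shanghai \r\n') -> 'Shanghai'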
class job(object):
    def __init__(self, url):
        self.url = url
        self.s = requests.session()  # create a session object shared by all requests
        headers = {
            'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding':'gzip, deflate, br',
            'Accept-Language':'zh-CN,zh;q=0.9',
            'Cache-Control':'max-age=0',
            'Connection':'keep-alive',
            'Host':'jobs.51job.com',
            'Upgrade-Insecure-Requests':'1',
            'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.15 Safari/537.36',
        }
        self.s.headers.update(headers)  # set default request headers on the session
    def getdata(self):  # fetch the company's internal job listing pages
        all_jobs = pd.DataFrame()
        # page count and hidTotal are hard-coded for this particular company page
        for i in range(1, 16):
            time.sleep(0.3)  # throttle requests between pages
            p = str(i)
            # pagination form data for the POST request
            data = {
                'pageno': p,
                'hidTotal': '1060',
                'type': 'undefined',
                'code': 'undefined',
            }
            req = self.s.post(url=self.url, data=data, verify=False).text
            title = re.findall('title="(.*?)">', req)         # job title
            href = re.findall('href="(https?://.*?)"', req)   # detail-page link
            t2 = re.findall('class="t2">(.*?)<', req)          # requirements
            t3 = re.findall('class="t3">(.*?)<', req)          # location
            t4 = re.findall('class="t4">(.*?)<', req)          # salary
            t5 = re.findall('class="t5">(.*?)<', req)          # date updated
            page = {
                '职位': title,
                '链接': href,
                '要求': t2,
                '地区': t3,
                '薪酬': t4,
                '更新日期': t5
            }
            df = pd.DataFrame(page)
            all_jobs = pd.concat([all_jobs, df], ignore_index=True)
        return all_jobs
    def getdetails(self, url):  # fetch and parse the detail page for one job posting
        req = self.s.get(url=url, verify=False).text
        html = BeautifulSoup(req, 'lxml')
        tCompany_main = html.find(class_='tCompany_main')
        tBorderTop_box = tCompany_main.find_all(class_='tBorderTop_box')
        if len(tBorderTop_box) >= 4:  # page includes a separate department block
            job_information = clear(tBorderTop_box[0].get_text().strip())  # job description
            address = clear(tBorderTop_box[1].get_text().strip())          # contact address
            departmental = clear(tBorderTop_box[2].get_text().strip())     # department info
            company = clear(tBorderTop_box[3].get_text().strip())          # company info
        else:  # only three blocks: this page has no department section
            job_information = clear(tBorderTop_box[0].get_text().strip())  # job description
            address = clear(tBorderTop_box[1].get_text().strip())          # contact address
            departmental = ''                                               # no department info
            company = clear(tBorderTop_box[2].get_text().strip())          # company info
        return job_information, address, departmental, company
    def load(self):
        df = self.getdata()
        # add empty columns for the detail-page fields
        df.loc[:, "职位信息"] = ''
        df.loc[:, "地址"] = ''
        df.loc[:, "部门信息"] = ''
        urls = df['链接'].values.tolist()
        for i in range(len(urls)):
            details = self.getdetails(urls[i])
            df.at[i, '职位信息'] = details[0]
            df.at[i, '地址'] = details[1]
            df.at[i, '部门信息'] = details[2]
        df.to_excel('51job.xlsx', index=False)  # writing .xlsx requires openpyxl or xlsxwriter
if __name__ == '__main__':
    url = 'https://jobs.51job.com/all/co4148005.html#syzw'  # company page to crawl
    wy = job(url)
    wy.load()