import requests
import math
import pandas as pd
import time
from lxml import etree
def get_json(url, num, i):
    """Fetch one page of Lagou "数据分析" search results and return the parsed JSON.

    A session first GETs the HTML listing page for the city and then POSTs
    the query to ``positionAjax.json``. The body of the first response is
    discarded; the request is made only so that the session carries any
    cookies the site sets (presumably required by the Ajax endpoint).

    Args:
        url: Listing-page URL supplied by the caller. It is not used here
            (the request URLs are rebuilt from ``i``) and is kept only for
            interface compatibility.
        num: Page number, sent as the ``pn`` form field.
        i: URL-encoded city name, interpolated into the request URLs and
            the ``Referer`` headers.

    Returns:
        The decoded JSON body of the Ajax response.

    Raises:
        requests.RequestException: On connection failure or timeout.
        ValueError: If the response body is not valid JSON.
    """
    # Without a timeout a stalled connection would block the crawl forever.
    timeout = 30
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
    referer = 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90?px=default&city={}'.format(i)

    # Browser-like headers used for every request made through the session.
    session_headers = {
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': user_agent,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Referer': referer,
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7',
    }
    params = (
        ('labelWords', ''),
        ('fromSearch', 'true'),
        ('suginput', ''),
    )
    # Extra headers for the XHR-style POST; they are merged over the session headers.
    ajax_headers = {
        'Origin': 'https://www.lagou.com',
        'X-Anit-Forge-Code': '0',
        'User-Agent': user_agent,
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Referer': referer,
        'X-Requested-With': 'XMLHttpRequest',
        'X-Anit-Forge-Token': 'None',
    }
    data = {
        'first': 'true',
        'pn': num,
        'kd': '数据分析',
    }

    # The context manager closes the session's pooled connections on exit,
    # including when a request raises.
    with requests.Session() as s:
        s.headers.update(session_headers)
        # Load the listing page first so the session picks up its cookies.
        s.get(referer + '#filterBox', params=params, timeout=timeout)
        r = s.post(
            'https://www.lagou.com/jobs/positionAjax.json?px=default&city={}&needAddtionalResult=false'.format(i),
            data=data,
            headers=ajax_headers,
            timeout=timeout,
        )
        # The response holds the dictionary of job postings.
        r.encoding = 'utf-8'
        page = r.json()
        print(r.text)
    return page
def get_page_num(count, page_size=15, max_pages=30):
    """Return how many result pages need to be fetched for ``count`` postings.

    Args:
        count: Total number of postings reported by the search.
        page_size: Postings per page. The default of 15 is the value the
            original code assumed for Lagou.
        max_pages: Upper bound on the result. The default of 30 is the
            maximum number of pages the original code says Lagou displays.

    Returns:
        ``ceil(count / page_size)`` capped at ``max_pages`` and never
        negative.

    Raises:
        ValueError: If ``page_size`` is not positive.
    """
    if page_size <= 0:
        raise ValueError('page_size must be positive')
    # Round up so a partially filled last page is still fetched.
    pages = math.ceil(count / page_size)
    # Clamp to [0, max_pages]; a negative count yields no pages.
    return max(0, min(pages, max_pages))
# Parse the job-description text of a single posting.
def get_detail_ifo(position_id):
    """Download a posting's detail page and extract its description text.

    Args:
        position_id: Posting identifier; it is formatted into the detail
            page URL ``https://www.lagou.com/jobs/<position_id>.html``.

    Returns:
        The list of text nodes matched by the XPath
        ``//*[@id="job_detail"]/dd[2]/div/p/text()`` (possibly empty), or
        the string ``'NAN'`` if anything goes wrong. Callers must cope
        with both return types.
    """
    # Bound each request so one stalled page cannot block the whole crawl.
    timeout = 30
    # A fixed browser-like header set is used for every posting.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'DNT': '1',
        'Host': 'www.lagou.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
    }
    try:
        # The context manager releases the session's connections on exit.
        with requests.Session() as s:
            # Per the original author's note, the site returns the login
            # page when these headers are not set.
            s.headers.update(headers)
            # Visit the home page first so the session obtains cookies.
            s.get('http://www.lagou.com', timeout=timeout)
            r = s.get('https://www.lagou.com/jobs/%s.html' % str(position_id), timeout=timeout)
        html = etree.HTML(r.text)
        return html.xpath('//*[@id="job_detail"]/dd[2]/div/p/text()')
    except Exception as e:
        # Deliberately best-effort: a failed detail page must not abort the
        # crawl, so report the error and return a placeholder instead.
        print(e)
        return 'NAN'
def get_page_info(jobs_list, delay=10):
    """Flatten one page of job dictionaries into rows of field values.

    For each job the fixed set of fields below is read in order, then the
    posting's detail text is fetched with ``get_detail_ifo`` and appended
    as the last element of the row.

    Args:
        jobs_list: Iterable of job dictionaries. Each must contain every
            key listed in ``fields`` plus ``'positionId'``.
        delay: Seconds to sleep after each detail request. The default of
            10 matches the original hard-coded pause.

    Returns:
        A list with one 14-element list per job, in input order.

    Raises:
        KeyError: If a job dictionary lacks one of the expected keys.
    """
    # Keys are read in this order, which fixes the order of the row values.
    fields = (
        'city',
        'companyFullName',
        'companyShortName',
        'companySize',
        'financeStage',
        'district',
        'positionName',
        'workYear',
        'education',
        'salary',
        'positionAdvantage',
        'jobNature',
        'industryField',
    )
    page_info_list = []
    for job in jobs_list:
        print(job)
        job_info = [job[key] for key in fields]
        position_id = job['positionId']
        job_info.append(get_detail_ifo(position_id))
        # Pause between detail requests to pace the crawl.
        time.sleep(delay)
        page_info_list.append(job_info)
    return page_info_list
def main():
    """Crawl the "数据分析" listings for each city and append them to one CSV.

    For every city the first result page is fetched to learn the total
    number of postings, the page count is derived from it, each page is
    parsed into rows, and the rows are appended to
    ``lagou_job汇总最终.csv``. The header row is written only when the
    file does not exist yet or is empty, so the combined file contains a
    single header.
    """
    import os  # local import: only needed for the header check below

    # First-tier and "new first-tier" cities, as URL-encoded names.
    # Ningbo used to appear twice in this list, which scraped and appended
    # that city's postings twice; the duplicate entry has been removed.
    citylist = [
        '%E6%AD%A6%E6%B1%89',  # Wuhan
        '%E5%AE%81%E6%B3%A2',  # Ningbo
        '%E5%8C%97%E4%BA%AC',  # Beijing
        '%E4%B8%8A%E6%B5%B7',  # Shanghai
        '%E5%B9%BF%E5%B7%9E',  # Guangzhou
        '%E6%B7%B1%E5%9C%B3',  # Shenzhen
        '%E6%88%90%E9%83%BD',  # Chengdu
        '%E6%9D%AD%E5%B7%9E',  # Hangzhou
        '%E9%87%8D%E5%BA%86',  # Chongqing
        '%E8%A5%BF%E5%AE%89',  # Xi'an
        '%E8%8B%8F%E5%B7%9E',  # Suzhou
        '%E5%A4%A9%E6%B4%A5',  # Tianjin
        '%E5%8D%97%E4%BA%AC',  # Nanjing
        '%E9%95%BF%E6%B2%99',  # Changsha
        '%E9%83%91%E5%B7%9E',  # Zhengzhou
        '%E4%B8%9C%E8%8E%9E',  # Dongguan
        '%E9%9D%92%E5%B2%9B',  # Qingdao
        '%E6%B2%88%E9%98%B3',  # Shenyang
        '%E6%98%86%E6%98%8E',  # Kunming
    ]
    output_path = 'lagou_job汇总最终.csv'
    # The first label previously read 'p公司省市'; the stray 'p' was a typo.
    columns = ['公司省市', '公司全名', '公司简称', '公司规模', '融资阶段', '区域', '职位名称',
               '工作经验', '学历要求', '工资', '职位福利', '工作类型', '企业领域', '职位要求']

    for i in citylist:
        url = 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90?px=default&city={}#filterBox'.format(i)
        # Request page 1 first to learn the total number of postings.
        page_1 = get_json(url, 1, i)
        total_count = page_1['content']['positionResult']['totalCount']
        num = get_page_num(total_count)
        total_info = []
        time.sleep(20)
        print('职位总数:{},页数:{}'.format(total_count, num))
        for n in range(1, num + 1):
            # Page 1 is already in hand; only later pages need a new request.
            page = page_1 if n == 1 else get_json(url, n, i)
            jobs_list = page['content']['positionResult']['result']
            page_info = get_page_info(jobs_list)
            total_info += page_info
            print('已经抓取第{}页, 职位总数:{}'.format(n, len(total_info)))
            time.sleep(30)
        # Convert this city's rows to a DataFrame and append them to the CSV.
        df = pd.DataFrame(data=total_info, columns=columns)
        # In append mode pandas would repeat the header for every city, so
        # emit it only when the file is new or still empty.
        write_header = not os.path.exists(output_path) or os.path.getsize(output_path) == 0
        df.to_csv(output_path, index=False, mode='a', header=write_header)
        print(i, '已完成')
# Run the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
# 最终输出的 CSV 包含‘公司省市’、‘公司全名’、‘公司简称’、‘公司规模’、‘融资阶段’、‘区域’、‘职位名称’、‘工作经验’、‘学历要求’、‘工资’、‘职位福利’、‘工作类型’、‘企业领域’、‘职位要求’等信息。代码写得比较笨重,欢迎批评指正。