Some of Lagou's anti-scraping mechanisms, for example:
1. Job listings are loaded via ajax; fetching the HTML directly returns a "页面加载中" (page loading) placeholder, so you have to inspect the ajax request instead.
2. Header checks: requests without the right headers get the response "您操作太频繁,请稍后再访问" (you are operating too frequently, please try again later).
3. Frequent requests get the IP banned; spacing requests out with time.sleep() works around this.
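Beyond a fixed time.sleep(), a randomized delay is a common refinement that spreads requests out less predictably. A minimal sketch (the 3–8 second bounds are arbitrary choices, not values the site requires):

import random
import time

def polite_sleep(low=3, high=8):
    # Sleep a random number of seconds so requests don't arrive at a fixed rhythm.
    time.sleep(random.uniform(low, high))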
Complete code:
import json
import requests
from bs4 import BeautifulSoup
import time
import pandas


def position_detail(position_id):
    # Fetch one position's detail page and return the text of its
    # <dd class="job_bt"> job-description block (None if absent).
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.168 Safari/537.36',
        'Host': 'www.lagou.com',
        'Referer': 'https://www.lagou.com/jobs/4070834.html',
        'Upgrade-Insecure-Requests': '1',
        'Cookie': '_ga=GA1.2.925224148.1520249382; _gid=GA1.2.1215509072.1520249382; user_trace_token=20180305192941-7f1b8d57-2068-11e8-b126-5254005c3644; LGUID=20180305192941-7f1b9199-2068-11e8-b126-5254005c3644; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=0; index_location_city=%E5%85%A8%E5%9B%BD; JSESSIONID=ABAAABAAAIAACBIDC6D0022285C6CAF595664653AED3310; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1520249382,1520258022,1520300598; LGSID=20180306094319-bf8c1061-20df-11e8-9d87-525400f775ce; PRE_UTM=; PRE_HOST=; PRE_SITE=; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist_%25E6%2595%25B0%25E6%258D%25AE%25E5%2588%2586%25E6%259E%2590%3Fpx%3Ddefault%26city%3D%25E5%25B9%25BF%25E5%25B7%259E; _putrc=81382530AB4F106A; SEARCH_ID=e2307ebf7bf348bd80fc22d967cc5f9c; _gat=1; login=true; unick=%E4%BA%8E%E6%9D%B0; gate_login_token=b4130f20f835c543e0404ca5e937a6c7ad8b2df841a6502c; LGRID=20180306101243-daaf0d82-20e3-11e8-b126-5254005c3644; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1520302363'
    }
    url = 'https://www.lagou.com/jobs/%s.html' % position_id
    result = requests.get(url, headers=headers)
    time.sleep(5)  # slow down to avoid the IP ban
    soup = BeautifulSoup(result.content, 'html.parser')
    position_content = soup.find('dd', class_='job_bt')
    if position_content is None:
        # Some postings have no job_bt block; skip them.
        return
    return position_content.text


def main():
    url = 'https://www.lagou.com/jobs/positionAjax.json?city=%E6%B7%B1%E5%9C%B3&needAddtionalResult=false'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36',
        'Host': 'www.lagou.com',
        'Referer': 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90?labelWords=&fromSearch=true&suginput=',
        'Cookie': 'user_trace_token=20180426120038-d73f4812-f98c-45f1-80ad-2fa15885541b; _ga=GA1.2.316967756.1524715241; LGUID=20180426120043-640d2714-4906-11e8-a51d-525400f775ce; JSESSIONID=ABAAABAAAGGABCB949979A4BA453871F38CFD8CD7DD59E2; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1524715240,1526206393; _gat=1; LGSID=20180513181313-3f272a9d-5696-11e8-823a-5254005c3644; PRE_UTM=; PRE_HOST=www.baidu.com; PRE_SITE=https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3Db_WifzKOMEAsBbEykMZM4WuB5QXKYS9D3oDGOxeVcdS%26wd%3D%26eqid%3Df5739de300024fc6000000065af80fb3; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; _gid=GA1.2.1991740482.1526206393; index_location_city=%E6%B7%B1%E5%9C%B3; TG-TRACK-CODE=index_search; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1526206657; LGRID=20180513181738-dc96e102-5696-11e8-987d-525400f775ce; SEARCH_ID=73b9a480e63f4300b8c20eec45eb915f',
        'X-Anit-Forge-Code': '0',
        'X-Anit-Forge-Token': 'None',
        'X-Requested-With': 'XMLHttpRequest'
    }
    info_list = []
    for x in range(1, 26):
        datas = {
            'first': 'true',  # whether this is the first page
            'pn': x,          # page number
            'kd': '数据分析'   # search keyword
        }
        time.sleep(3)
        content = requests.post(url=url, headers=headers, data=datas)
        results = content.json()
        jobs = results['content']['positionResult']['result']
        for job in jobs:
            job_dict = {
                'positionname': job['positionName'],
                'workyear': job['workYear'],
                'salary': job['salary'],
                'education': job['education'],
                'industryField': job['industryField'],
                'companyshortname': job['companyShortName'],
                'companyLabelList': job['companyLabelList'],
                'district': job['district'],
                'companyFullName': job['companyFullName'],
            }
            position_id = job['positionId']
            job_dict['position_detail'] = position_detail(position_id)
            print(job_dict['position_detail'])
            info_list.append(job_dict)
    # print(json.dumps(info_list, ensure_ascii=False, indent=2))
    df = pandas.DataFrame(info_list)
    df.to_excel('E:\\数据分析-深圳.xls')
    return df


if __name__ == '__main__':
    main()
I. Find the ajax request that loads the job data
Right-click → Inspect and watch the Network panel to locate the ajax request that returns the job listings. From that request, take the request method, URL, headers, and form parameters. The URL, headers, and the corresponding requests call look like this:
url = 'https://www.lagou.com/jobs/positionAjax.json?city=%E6%B7%B1%E5%9C%B3&needAddtionalResult=false'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36',
    'Host': 'www.lagou.com',
    'Referer': 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90?labelWords=&fromSearch=true&suginput=',
    'Cookie': 'user_trace_token=20180426120038-d73f4812-f98c-45f1-80ad-2fa15885541b; _ga=GA1.2.316967756.1524715241; LGUID=20180426120043-640d2714-4906-11e8-a51d-525400f775ce; JSESSIONID=ABAAABAAAGGABCB949979A4BA453871F38CFD8CD7DD59E2; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1524715240,1526206393; _gat=1; LGSID=20180513181313-3f272a9d-5696-11e8-823a-5254005c3644; PRE_UTM=; PRE_HOST=www.baidu.com; PRE_SITE=https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3Db_WifzKOMEAsBbEykMZM4WuB5QXKYS9D3oDGOxeVcdS%26wd%3D%26eqid%3Df5739de300024fc6000000065af80fb3; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; _gid=GA1.2.1991740482.1526206393; index_location_city=%E6%B7%B1%E5%9C%B3; TG-TRACK-CODE=index_search; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1526206657; LGRID=20180513181738-dc96e102-5696-11e8-987d-525400f775ce; SEARCH_ID=73b9a480e63f4300b8c20eec45eb915f',
    'X-Anit-Forge-Code': '0',
    'X-Anit-Forge-Token': 'None',
    'X-Requested-With': 'XMLHttpRequest'
}
content = requests.post(url=url, headers=headers, data=datas)
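If the cookie has expired or the headers are off, this endpoint answers with the "您操作太频繁,请稍后再访问" message instead of JSON. A minimal sanity check before parsing (a sketch; the matched string is just the error text the site returned at the time):

content = requests.post(url=url, headers=headers, data=datas)
# The anti-crawl check returns an error-message body rather than job JSON.
if '操作太频繁' in content.text:
    raise RuntimeError('Blocked by the anti-crawl check: refresh the Cookie or slow down.')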
II. Inspect the JSON response
print(content.text) shows the raw JSON the endpoint returns.
You can paste the output into https://www.bejson.com/ to validate that the format is correct.
If it validates, https://www.json.cn will render it as a collapsible tree, which makes the structure easy to read.
The job listings we want sit at content.json()['content']['positionResult']['result']:
content = requests.post(url=url, headers=headers, data=datas)
results = content.json()
jobs = results['content']['positionResult']['result']
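You can also pretty-print the JSON locally instead of pasting it into a website; this mirrors the commented-out json.dumps line in the complete code above:

import json
# ensure_ascii=False keeps the Chinese field values readable; indent=2 formats the tree.
print(json.dumps(results, ensure_ascii=False, indent=2))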
III. Fetch every page of results
Next, look at the form parameters of this request:
'first': 'true'  # whether this is the first page
'pn': 1          # which page to request
'kd': 'python'   # the search keyword
So paging the data is just a matter of changing the page number. There are 25 pages here, so range(1, 26) covers them all; and to keep the anti-crawl mechanism at bay, lengthen the sleep between requests:
info_list = []
for x in range(1, 26):
    datas = {
        'first': 'true',
        'pn': x,
        'kd': '数据分析'
    }
    time.sleep(3)
    content = requests.post(url=url, headers=headers, data=datas)
    results = content.json()
    jobs = results['content']['positionResult']['result']
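If you would rather not hardcode 25 pages, the page count can be derived from the response itself. This sketch assumes a totalCount field under positionResult and 15 results per page, both inferred from captured responses rather than any documentation:

import math
# Derive the page count from the first response instead of hardcoding it.
total = results['content']['positionResult']['totalCount']  # assumed field name
pages = math.ceil(total / 15)  # 15 jobs per page (observed, not guaranteed)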
IV. Extract the fields we need into a new dict
From each job entry in jobs, pick the fields worth keeping and collect them into a new dict, job_dict:
for job in jobs:
    job_dict = {
        'positionname': job['positionName'],
        'workyear': job['workYear'],
        'salary': job['salary'],
        'education': job['education'],
        'industryField': job['industryField'],
        'companyshortname': job['companyShortName'],
        'companyLabelList': job['companyLabelList'],
        'district': job['district'],
        'companyFullName': job['companyFullName'],
    }
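A more compact alternative (not what the code above does) is a comprehension over a list of wanted keys; it keeps the API's own casing, and job.get() tolerates a missing key by storing None:

wanted = ['positionName', 'workYear', 'salary', 'education', 'industryField',
          'companyShortName', 'companyLabelList', 'district', 'companyFullName']
# Same fields as above, built in one expression.
job_dict = {key: job.get(key) for key in wanted}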
V. Scrape each position's detailed description
Next we scrape the detailed job description of each position, using its positionId to build the link to the detail page.
The description text lives in <dd class="job_bt">.
The same approach applies: substitute positionId into the URL via %s, define a function that fetches the page, extracts the text with BeautifulSoup, and returns the job requirements, skipping postings that have no job_bt block:
def position_detail(position_id):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.168 Safari/537.36',
        'Host': 'www.lagou.com',
        'Referer': 'https://www.lagou.com/jobs/4070834.html',
        'Upgrade-Insecure-Requests': '1',
        'Cookie': '_ga=GA1.2.925224148.1520249382; _gid=GA1.2.1215509072.1520249382; user_trace_token=20180305192941-7f1b8d57-2068-11e8-b126-5254005c3644; LGUID=20180305192941-7f1b9199-2068-11e8-b126-5254005c3644; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=0; index_location_city=%E5%85%A8%E5%9B%BD; JSESSIONID=ABAAABAAAIAACBIDC6D0022285C6CAF595664653AED3310; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1520249382,1520258022,1520300598; LGSID=20180306094319-bf8c1061-20df-11e8-9d87-525400f775ce; PRE_UTM=; PRE_HOST=; PRE_SITE=; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist_%25E6%2595%25B0%25E6%258D%25AE%25E5%2588%2586%25E6%259E%2590%3Fpx%3Ddefault%26city%3D%25E5%25B9%25BF%25E5%25B7%259E; _putrc=81382530AB4F106A; SEARCH_ID=e2307ebf7bf348bd80fc22d967cc5f9c; _gat=1; login=true; unick=%E4%BA%8E%E6%9D%B0; gate_login_token=b4130f20f835c543e0404ca5e937a6c7ad8b2df841a6502c; LGRID=20180306101243-daaf0d82-20e3-11e8-b126-5254005c3644; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1520302363'
    }
    url = 'https://www.lagou.com/jobs/%s.html' % position_id
    result = requests.get(url, headers=headers)
    time.sleep(5)
    soup = BeautifulSoup(result.content, 'html.parser')
    position_content = soup.find('dd', class_='job_bt')
    if position_content is None:
        return
    return position_content.text
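A quick way to try the function on its own before wiring it into the loop (4070834 is the example id from the Referer header above; any live positionId works):

detail = position_detail(4070834)
print(detail if detail else 'no job_bt block on this page')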
VI. Assemble and save
    position_id = job['positionId']
    job_dict['position_detail'] = position_detail(position_id)
    print(job_dict['position_detail'])
    info_list.append(job_dict)
df = pandas.DataFrame(info_list)
df.to_excel('E:\\数据分析-深圳.xls')
For each id, the description returned by position_detail is added to job_dict, and every job_dict is appended to the initially empty list info_list.
pandas then turns info_list into a DataFrame,
which is saved as an Excel file.
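One caveat about saving: writing .xls through to_excel depends on an extra engine (xlwt in pandas of this era), which may not be installed. A CSV fallback needs no extra library (index=False drops the row-index column; utf-8-sig lets Excel detect the encoding for the Chinese text):

# CSV avoids the Excel-writer dependency; utf-8-sig keeps Excel happy with Chinese text.
df.to_csv('E:\\数据分析-深圳.csv', index=False, encoding='utf-8-sig')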