A Python crawler example: scraping Lagou job listings

Straight to the code.

Lagou uses a cookie-based anti-scraping check, so the script first requests the job-list page, converts the returned cookies with requests.utils.dict_from_cookiejar, and copies the dynamic values into the cookies dict that is sent with the Ajax POST.
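Before the full script, here is a minimal sketch of the same cookie hand-off using a requests.Session, which stores the cookies set by the list page and sends them back automatically, so nothing has to be copied by hand. The URLs and headers are the same ones used in the script below; whether Lagou still accepts this flow depends on its current anti-scraping rules.

import requests

LIST_URL = ('https://www.lagou.com/jobs/list_?city=%E4%B8%8A%E6%B5%B7'
            '&cl=false&fromSearch=true&labelWords=&suginput=')
AJAX_URL = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'

session = requests.Session()
session.headers.update({
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
    'Referer': 'https://www.lagou.com/jobs/list_Python?px=default&gx=&isSchoolJob=1&city=%E6%B7%B1%E5%9C%B3&district=%E5%8D%97%E5%B1%B1%E5%8C%BA',
})

# Visiting the list page makes Lagou set JSESSIONID, X_HTTP_TOKEN, etc.;
# the Session keeps them and sends them back on the next request.
session.get(LIST_URL)

resp = session.post(AJAX_URL, data={'first': 'true', 'pn': '1', 'kd': 'python'})
print(resp.json())

The full script, which copies the cookies into a dict explicitly, follows.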
import requests

url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
    'Referer': 'https://www.lagou.com/jobs/list_Python?px=default&gx=&isSchoolJob=1&city=%E6%B7%B1%E5%9C%B3&district=%E5%8D%97%E5%B1%B1%E5%8C%BA',
}
# Fetch the job-list page first; its response carries the cookies Lagou expects on the Ajax request.
response = requests.get(
    'https://www.lagou.com/jobs/list_?city=%E4%B8%8A%E6%B5%B7&cl=false&fromSearch=true&labelWords=&suginput=',
    headers=HEADERS)

r = requests.utils.dict_from_cookiejar(response.cookies)

# The dynamic values (JSESSIONID, user_trace_token, X_HTTP_TOKEN, SEARCH_ID) come from the
# response above; the remaining entries were copied from a browser session and may go stale.
cookies = {
    'JSESSIONID': r['JSESSIONID'],
    'user_trace_token': r['user_trace_token'],
    '_ga': 'GA1.2.1889115744.1563421102',
    '_gid': 'GA1.2.1225486071.1563421102',
    'LGUID': '20190718113925-a3cc07cd-a90d-11e9-80a3-525400f775ce',
    'index_location_city': '%E5%85%A8%E5%9B%BD',
    'sajssdk_2015_cross_new_user': '1',
    'sensorsdata2015jssdkcross': '%7B%22distinct_id%22%3A%2216c07ab26c51b4-065227a62628bc-3a65460c-2073600-16c07ab26c68f4%22%2C%22%24device_id%22%3A%2216c07ab26c51b4-065227a62628bc-3a65460c-2073600-16c07ab26c68f4%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D',
    '_gat': '1',
    'LGSID': '20190719084048-da6ab30b-a9bd-11e9-a4e9-5254005c3644',
    'PRE_HOST': 'www.baidu.com',
    'PRE_SITE': 'https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3Dd91csmVeDbOkAv0gASlle3QOl3qzP5JMj3vNs493fSu%26wd%3D%26eqid%3Dd3586856003916d8000000045d31118e,',
    'PRE_LAND': 'https%3A%2F%2Fwww.lagou.com%2F',
    'X_MIDDLE_TOKEN': '9ab824646ded14a04a618ff87e97943b',
    'X_HTTP_TOKEN': r['X_HTTP_TOKEN'],
    'Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6': '1563421102,1563496786,1563497272',
    'Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6': '1563497272',
    'LGRID': '20190719084855-fc6d00c5-a9be-11e9-a4e9-5254005c3644',
    'TG-TRACK-CODE': 'index_search',
    'SEARCH_ID': r['SEARCH_ID']
}

# pn is the page number, kd is the search keyword
form_data = {'first': 'true',
             'pn': '100',
             'kd': 'python'}

# Helper for pagination (Lagou returns 15 jobs per page); not used in __main__ below.
def get_page():
    res = requests.post(url=url, headers=HEADERS, data=form_data, cookies=cookies)
    result = res.json()
    jobs = result['content']['positionResult']
    print(jobs)
    return jobs['totalCount'] / 15  # number of pages; round up before looping


def get_jobs():
    res = requests.post(url=url, headers=HEADERS, data=form_data, cookies=cookies)
    result = res.json()

    jobs = result['content']['positionResult']['result']
    for job in jobs:
        print('District: ' + job['district'] + ', Company: ' + job['companyFullName']
              + ', Perks: ' + job['positionAdvantage'] + ', Salary: ' + job['salary'])


if __name__ == '__main__':
    get_jobs()
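The get_page() helper above was written for pagination but never wired in. A possible loop, assuming the script above is already loaded, that Lagou keeps serving 15 results per page, and that the endpoint accepts increasing pn values, might look like this (crawl_all_pages is a hypothetical name, not part of the original script):

import math
import time

def crawl_all_pages():
    total_pages = math.ceil(get_page())          # get_page() returns totalCount / 15
    for page in range(1, total_pages + 1):
        form_data['pn'] = str(page)              # pn selects the page
        form_data['first'] = 'true' if page == 1 else 'false'
        get_jobs()
        time.sleep(2)                            # pause between requests; Lagou rate-limits aggressively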

 
