Straight to the code.
Lagou (拉勾网) has a cookie-based anti-scraping check on this Ajax endpoint, so we first request a normal listing page, turn its cookie jar into a dict with requests.utils.dict_from_cookiejar, and plug those fresh values into the cookies we send with the POST.
import math

import requests
url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
    'Referer': 'https://www.lagou.com/jobs/list_Python?px=default&gx=&isSchoolJob=1&city=%E6%B7%B1%E5%9C%B3&district=%E5%8D%97%E5%B1%B1%E5%8C%BA',
}
# Hit the listing page first so its response gives us fresh cookies
response = requests.get(
    'https://www.lagou.com/jobs/list_?city=%E4%B8%8A%E6%B5%B7&cl=false&fromSearch=true&labelWords=&suginput=',
    headers=HEADERS)
r = requests.utils.dict_from_cookiejar(response.cookies)
# Most of these values are hard-coded from a browser session; only the entries taken from r are refreshed on each run
cookies = {
    'JSESSIONID': r['JSESSIONID'],
    'user_trace_token': r['user_trace_token'],
    '_ga': 'GA1.2.1889115744.1563421102',
    '_gid': 'GA1.2.1225486071.1563421102',
    'LGUID': '20190718113925-a3cc07cd-a90d-11e9-80a3-525400f775ce',
    'index_location_city': '%E5%85%A8%E5%9B%BD',
    'sajssdk_2015_cross_new_user': '1',
    'sensorsdata2015jssdkcross': '%7B%22distinct_id%22%3A%2216c07ab26c51b4-065227a62628bc-3a65460c-2073600-16c07ab26c68f4%22%2C%22%24device_id%22%3A%2216c07ab26c51b4-065227a62628bc-3a65460c-2073600-16c07ab26c68f4%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D',
    '_gat': '1',
    'LGSID': '20190719084048-da6ab30b-a9bd-11e9-a4e9-5254005c3644',
    'PRE_HOST': 'www.baidu.com',
    'PRE_SITE': 'https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3Dd91csmVeDbOkAv0gASlle3QOl3qzP5JMj3vNs493fSu%26wd%3D%26eqid%3Dd3586856003916d8000000045d31118e,',
    'PRE_LAND': 'https%3A%2F%2Fwww.lagou.com%2F',
    'X_MIDDLE_TOKEN': '9ab824646ded14a04a618ff87e97943b',
    'X_HTTP_TOKEN': r['X_HTTP_TOKEN'],
    'Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6': '1563421102,1563496786,1563497272',
    'Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6': '1563497272',
    'LGRID': '20190719084855-fc6d00c5-a9be-11e9-a4e9-5254005c3644',
    'TG-TRACK-CODE': 'index_search',
    'SEARCH_ID': r['SEARCH_ID']
}
# pn is the page number, kd is the search keyword
form_data = {'first': 'true',
             'pn': '100',
             'kd': 'python'}

# Pagination helper; not called in the script (see the paging sketch after it)
def get_page():
    res = requests.post(url=url, headers=HEADERS, data=form_data, cookies=cookies)
    # print(res)
    result = res.json()
    # print(result)
    jobs = result['content']['positionResult']
    print(jobs)
    # 15 results per page, so round up to get the number of pages
    return math.ceil(jobs['totalCount'] / 15)
def get_jobs():
    res = requests.post(url=url, headers=HEADERS, data=form_data, cookies=cookies)
    result = res.json()
    jobs = result['content']['positionResult']['result']
    for job in jobs:
        print(f"District: {job['district']}, Company: {job['companyFullName']}, "
              f"Perks: {job['positionAdvantage']}, Salary: {job['salary']}")
if __name__ == '__main__':
    get_jobs()
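get_page() above was written for paging but never called. Purely as a sketch of how it could be wired up: loop over the page count it returns and re-issue the POST with a different pn each time. The 'first': 'false' value, the one-second delay, and the assumption that the hand-copied cookies stay valid across many requests are all guesses here, not something the original script verifies.
import time

def get_all_pages():
    total_pages = get_page()  # get_page() returns the page count (15 results per page)
    for page in range(1, total_pages + 1):
        # Assumed parameters: only 'pn' changes between pages, 'first' drops to 'false'
        paged_data = {'first': 'false', 'pn': str(page), 'kd': 'python'}
        res = requests.post(url=url, headers=HEADERS, data=paged_data, cookies=cookies)
        for job in res.json()['content']['positionResult']['result']:
            print(f"{job['companyFullName']}: {job['salary']}")
        time.sleep(1)  # assumed polite delay between requests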
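Side note: a requests.Session keeps whatever cookies the server sets across requests, so the "visit the listing page first, then POST" trick can also be sketched without dict_from_cookiejar and without hand-copying cookie values. Whether Lagou accepts the session's cookies alone (without the extra browser values pasted above) isn't something this post confirms, so treat it as an alternative to experiment with rather than a drop-in replacement.
def get_jobs_with_session():
    session = requests.Session()
    session.headers.update(HEADERS)
    # The GET stores the anti-scraping cookies on the session; the POST then reuses them
    session.get('https://www.lagou.com/jobs/list_?city=%E4%B8%8A%E6%B5%B7&cl=false&fromSearch=true&labelWords=&suginput=')
    res = session.post(url, data=form_data)
    return res.json()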