# 代码如下:
import requests
import json
from lxml import etree
# HTTP headers shared by every request in this script: a desktop Chrome
# User-Agent string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/72.0.3626.119 Safari/537.36'
    ),
}
def url_index(pages=12, page_size=90):
    """Build the list of search-API URLs to crawl, one per result page.

    Args:
        pages: Number of page URLs to generate (default 12).
        page_size: Value sent as ``pageSize`` and the step between
            consecutive ``start`` values (default 90).

    Returns:
        A list of ``pages`` URL strings, in page order.

    NOTE(review): ``start`` is treated as a record offset, which is what its
    pairing with ``pageSize`` suggests. The previous code used start=0..11,
    which under that reading requests twelve almost identical windows of 90
    records. Confirm against the live API.
    """
    template = 'https://fe-api.zhaopin.com/c/i/sou?start={}&pageSize={}'
    return [
        template.format(page * page_size, page_size)
        for page in range(pages)
    ]
def salary_index(salary):
    """Split a salary string into bounds, expanding "K" into thousands.

    Examples: ``'10K-15K'`` -> ``['10000', '15000']``,
    ``'4.5K-6K'`` -> ``['4500', '6000']``, ``'8K'`` -> ``['8000', '8000']``.

    Args:
        salary: Salary text such as ``'10K-15K'``. Empty or ``None`` is
            accepted.

    Returns:
        A list of strings. When ``salary`` contains ``'-'`` it has one entry
        per dash-separated part (callers read the first two); otherwise the
        single value is repeated so index 0 and 1 are both valid. Text that
        is not a number followed by K is returned with the legacy textual
        ``'K'`` -> ``'000'`` replacement applied.
    """
    if not salary:
        return ['', '']

    def expand(part):
        text = part.strip()
        if text.endswith(('K', 'k')):
            try:
                # Convert numerically so fractional values work:
                # "4.5K" is 4500, not the textual "4.5000".
                return str(int(round(float(text[:-1]) * 1000)))
            except (ValueError, OverflowError):
                pass  # not a plain number before the K; use the fallback
        # Legacy behaviour for anything that is not "<number>K".
        return part.replace('K', '000')

    if '-' in salary:
        return [expand(part) for part in salary.split('-')]
    amount = expand(salary)
    return [amount, amount]
def _nested_get(mapping, *keys):
    """Follow *keys* through nested dicts; return '' once a level is missing or not a dict."""
    current = mapping
    for key in keys:
        if not isinstance(current, dict):
            return ''
        current = current.get(key, '')
    return current


def go_index(url):
    """Fetch one search-result page and flatten each posting into a dict.

    Args:
        url: A search-API URL as produced by ``url_index``.

    Returns:
        A list with one dict per entry of ``data.results`` in the response.
        Each dict has the keys ``city_name``, ``com_name``, ``com_size``,
        ``com_type``, ``job_name``, ``job_tag``, ``timeState``,
        ``low_salary``, ``higt_salary`` and ``positionURL``. Fields missing
        from the response default to ``''``.

    Raises:
        KeyError: If the response JSON has no ``data`` / ``results`` keys.
    """
    # Pass headers by keyword: the second positional parameter of
    # requests.post is the request body (``data``), so a positional dict
    # would be sent as form data and the User-Agent would never be applied.
    response = requests.post(url, headers=headers)
    payload = json.loads(response.content.decode())

    json_list = []
    for item in payload['data']['results']:
        salarys = salary_index(item.get('salary', ''))
        json_list.append({
            # _nested_get tolerates null / missing intermediate objects,
            # which a chained .get() would crash on.
            'city_name': _nested_get(item, 'city', 'display'),
            'com_name': _nested_get(item, 'company', 'name'),
            'com_size': _nested_get(item, 'company', 'size', 'name'),
            'com_type': _nested_get(item, 'company', 'type', 'name'),
            'job_name': item.get('jobName', ''),
            'job_tag': _nested_get(item, 'jobTag', 'searchTag'),
            'timeState': item.get('timeState', ''),
            'low_salary': salarys[0],
            # Key spelling kept as-is: it is part of the emitted data schema.
            'higt_salary': salarys[1],
            'positionURL': item.get('positionURL', ''),
        })
    return json_list
def _summary_entry(url):
    """Return the text nodes of the 4th summary-list <li> at *url*, or '' if unavailable."""
    try:
        # Pass headers by keyword: the second positional parameter of
        # requests.get is ``params``, so a positional dict would be appended
        # to the query string instead of being sent as headers.
        response = requests.get(url, headers=headers)
    except requests.RequestException as exc:
        # Best effort: one unreachable page must not abort the whole crawl.
        print(exc)
        return ''
    html = etree.HTML(response.content.decode())
    if html is None:
        return ''
    entries = html.xpath('//ul[@class="summary-plane__info"]/li')
    if len(entries) < 4:
        # Page layout differs from what is expected; keep the default.
        return ''
    return entries[3].xpath('./text()')


def del_json(json_list):
    """Enrich each record with a ``size`` field scraped from its detail page.

    For every record the ``positionURL`` page is fetched, and the text nodes
    of the fourth ``<li>`` under ``ul.summary-plane__info`` are stored under
    ``'size'`` as a list of strings. When the URL is empty, the request
    fails, or the page has no such element, ``'size'`` stays ``''``.
    The ``positionURL`` key is removed and the record is printed.

    Args:
        json_list: List of record dicts as produced by ``go_index``.
            The dicts are modified in place.

    Returns:
        A new list containing the same (mutated) record dicts, in order.
    """
    end_list = []
    for item in json_list:
        url = item.pop('positionURL', '')
        item['size'] = ''
        if url:
            item['size'] = _summary_entry(url)
        print(item)
        end_list.append(item)
    return end_list
if __name__ == '__main__':
    # Gather the flattened records from every listing page, in page order.
    json_list = []
    for url in url_index():
        json_list.extend(go_index(url))

    # Visit each posting's detail page to fill in the extra field.
    end_list = del_json(json_list)

    # Append the result as one JSON document to zl2.txt; a failure while
    # writing is printed rather than raised.
    try:
        with open('zl2.txt', 'a+', encoding='utf-8') as out_file:
            json.dump(end_list, out_file, ensure_ascii=False)
    except Exception as exc:
        print(exc)