百度百聘 - a Python Web Scraping Exercise

This time the approach is to inspect the page, find the URL of the backend API it calls, and then scrape that endpoint directly. As the study goes further, the scraped results can be written straight into a database (a minimal sqlite3 sketch is included after the main script).
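For orientation, here is the JSON shape the script below relies on. The field names are taken straight from the code; the placeholder values and the exact nesting of the live API are assumptions, so verify them against a real response in the browser's network panel.

sample_response = {
    'data': {
        'main': {
            'data': {
                'dispNum': '40',    # total number of matching jobs
                'listNum': '20',    # a second count the API reports alongside dispNum
                'disp_data': [      # one dict per job listing on the current page
                    {
                        'title': '...',          # job title
                        'source': '...',         # site the listing was aggregated from
                        'officialname': '...',   # company name
                        'ori_education': '...',  # education requirement
                        'experience': '...',     # experience requirement
                        'ori_salary': '...',     # salary range
                    },
                ],
            }
        }
    }
}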

import requests

def getJobDetail(jobinfo):
    # Pull the interesting fields out of one job record and format them
    # as a single pipe-separated line.
    jobname = jobinfo['title']
    source = jobinfo['source']
    compname = jobinfo['officialname']
    education = jobinfo['ori_education']
    experience = jobinfo['experience']
    salary = jobinfo['ori_salary']
    return "{jn} | 学历要求:{edu} | 经验要求:{exp} | 薪资:{salary} | 公司名:{cn} | 来源:{src}".format(
        jn=jobname, edu=education, exp=experience, salary=salary, cn=compname, src=source)

# Backend API endpoint found by watching the XHR requests the search page makes.
baiduAPI = 'http://zhaopin.baidu.com/api/quanzhiasync'
keyword = input('请输入关键字: ')  # search keyword
city = input('请输入城市: ')      # city to search in

# Mimic the browser's AJAX request so the API responds normally.
headerAPI = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
    'Host': 'zhaopin.baidu.com',
    'Accept-Language': 'zh-CN,zh;q=0.8',
}

# Query parameters: rn is the page size, pn is the offset of the first record.
parmsAPI = {
    'query': keyword,
    'sort_type': 1,
    'city': city,
    'detailmode': 'close',
    'rn': 20,
    'pn': 0,
}

s = requests.Session()
s.headers.update(headerAPI)

# First request: gives us the first page of results and, via dispNum,
# the total number of matching jobs.
resp = s.get(baiduAPI, params=parmsAPI)
content = resp.json()
resultset = content['data']['main']['data']
total_job_cnt = int(resultset['dispNum'])
# If everything fits on one page, shrink the page size to the total.
if parmsAPI['rn'] >= total_job_cnt:
    parmsAPI['rn'] = total_job_cnt

print(resultset['dispNum'], resultset['listNum'])
print("total {} jobs to write".format(total_job_cnt))
with open('zhaopin.txt', 'w', encoding='utf-8') as f:
    while total_job_cnt > 0:
        # Write out every job record on the current page.
        joblist = content['data']['main']['data']['disp_data']
        for jobinfo in joblist:
            f.write(getJobDetail(jobinfo) + '\n')
        # Move the offset forward by one page.
        parmsAPI['pn'] += parmsAPI['rn']
        # rn only equals the remaining count on the final (possibly shrunken) page.
        if parmsAPI['rn'] == total_job_cnt:
            break
        total_job_cnt -= parmsAPI['rn']
        # Shrink the last page so we never ask for records past the end.
        if total_job_cnt <= parmsAPI['rn']:
            parmsAPI['rn'] = total_job_cnt
        # Fetch the next page of results.
        content = s.get(baiduAPI, params=parmsAPI).json()

s.close()
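
To take the scraping one step further and write the results into a database instead of a text file, a minimal sketch using Python's built-in sqlite3 module could look like the following. The table name, column names, and database file name are illustrative choices, not part of the original script; any other database driver would work the same way.

import sqlite3

def save_jobs_to_db(joblist, db_path='zhaopin.db'):
    # Create a simple jobs table on first use and insert one row per listing.
    # Schema is illustrative; adjust columns and types to taste.
    conn = sqlite3.connect(db_path)
    conn.execute('''CREATE TABLE IF NOT EXISTS jobs (
                        title TEXT,
                        education TEXT,
                        experience TEXT,
                        salary TEXT,
                        company TEXT,
                        source TEXT)''')
    rows = [(j['title'], j['ori_education'], j['experience'],
             j['ori_salary'], j['officialname'], j['source'])
            for j in joblist]
    conn.executemany('INSERT INTO jobs VALUES (?, ?, ?, ?, ?, ?)', rows)
    conn.commit()
    conn.close()

Inside the paging loop, calling save_jobs_to_db(joblist) right next to (or instead of) the f.write(...) line is enough to persist each page as it arrives.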
