Python爬虫——爬取拉勾网一线与新一线城市职位信息

import requests
import math
import pandas as pd
import time
from lxml import etree


def get_json(url, num, i):
    """Fetch one page of Lagou search results and return the parsed JSON.

    A session first GETs the HTML listing page for the city (presumably so
    the session picks up the cookies the AJAX endpoint expects -- the
    response itself is not used), then POSTs to ``positionAjax.json`` for
    page ``num`` of the "数据分析" keyword search.

    Args:
        url: Listing-page URL. Not used here: every URL is rebuilt from
            ``i``. The parameter is kept so existing callers keep working.
        num: Page number, sent as the ``pn`` form field.
        i: City name, already percent-encoded, substituted into the URLs
            and the Referer headers.

    Returns:
        The decoded JSON body of the POST response.

    Raises:
        ValueError: If the response body cannot be decoded as JSON.
    """
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
    # The listing page doubles as the Referer for both requests.
    listing_url = 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90?px=default&city={}'.format(i)

    headers = {
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': user_agent,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Referer': listing_url,
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7',
    }

    params = (
        ('labelWords', ''),
        ('fromSearch', 'true'),
        ('suginput', ''),
    )

    data = {
        'first': 'true',
        'pn': num,
        'kd': '数据分析',
    }

    # Headers sent only with the AJAX POST; they override the session
    # defaults for that single request.
    ajax_headers = {
        'Origin': 'https://www.lagou.com',
        'X-Anit-Forge-Code': '0',
        'User-Agent': user_agent,
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Referer': listing_url,
        'X-Requested-With': 'XMLHttpRequest',
        'X-Anit-Forge-Token': 'None',
    }

    # The context manager closes the session's connections when done.
    with requests.Session() as s:
        s.headers.update(headers)

        # Visit the listing page first; only the session state matters.
        s.get(listing_url + '#filterBox', params=params)

        r = s.post(
            'https://www.lagou.com/jobs/positionAjax.json?px=default&city={}&needAddtionalResult=false'.format(i),
            data=data,
            headers=ajax_headers,
        )
        # Force UTF-8 before decoding the body.
        r.encoding = 'utf-8'
        # Dictionary containing the position information.
        page = r.json()
        print(r.text)
    return page


def get_page_num(count):
    """Return how many result pages to fetch for ``count`` positions.

    Each page holds 15 positions, so the count is divided by 15 and rounded
    up. The result is capped at 30 because Lagou shows at most 30 pages of
    results.
    """
    per_page = 15
    page_limit = 30
    return min(math.ceil(count / per_page), page_limit)
# Parse the job-description text of a single position.
def get_detail_ifo(position_id):
    """Fetch a position's detail page and extract its description text.

    Args:
        position_id: Identifier substituted into
            ``https://www.lagou.com/jobs/<position_id>.html``.

    Returns:
        The list produced by the XPath query over the ``job_detail``
        element (the text nodes of its paragraphs), or the string ``'NAN'``
        if anything goes wrong while fetching or parsing.
    """
    # Seconds to wait on each request, so that one stalled connection
    # cannot hang the whole scrape; a timeout lands in the except below.
    request_timeout = 30

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'DNT': '1',
        'Host': 'www.lagou.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
    }

    # Deliberately best-effort: any failure (network, parsing, missing
    # element) is reported and replaced by the 'NAN' placeholder so that
    # one bad position does not abort the page being processed.
    try:
        # The context manager closes the session's connections when done.
        with requests.Session() as s:
            # Per the original author's note: without these headers the
            # site answers with the login page.
            s.headers.update(headers)

            # Visit the home page first so the session obtains cookies.
            s.get('http://www.lagou.com', timeout=request_timeout)
            r = s.get('https://www.lagou.com/jobs/%s.html' % str(position_id),
                      timeout=request_timeout)
            html = etree.HTML(r.text)
        a = html.xpath('//*[@id="job_detail"]/dd[2]/div/p/text()')
        return a
    except Exception as e:
        print(e)
        return 'NAN'


def get_page_info(jobs_list):
    """Turn one page of job dicts into a list of row lists.

    For every job the selected fields are copied in a fixed order, then the
    description returned by ``get_detail_ifo`` for the job's ``positionId``
    is appended as the last element. The function pauses 10 seconds after
    each detail lookup.

    Args:
        jobs_list: Iterable of job dicts from the search result JSON.

    Returns:
        A list with one row (list of field values) per job.
    """
    # Keys to copy, in output-column order (the first one is the city).
    wanted_keys = (
        'city',
        'companyFullName',
        'companyShortName',
        'companySize',
        'financeStage',
        'district',
        'positionName',
        'workYear',
        'education',
        'salary',
        'positionAdvantage',
        'jobNature',
        'industryField',
    )

    rows = []
    for job in jobs_list:
        print(job)
        row = [job[key] for key in wanted_keys]
        # Last column: description text scraped from the detail page.
        row.append(get_detail_ifo(job['positionId']))
        # Pause between detail-page requests.
        time.sleep(10)
        rows.append(row)
    return rows


def main():
    """Scrape every listed city and append its rows to one CSV file.

    For each city the first result page is fetched to read the total number
    of positions, the page count is derived with ``get_page_num``, every
    page is converted to rows with ``get_page_info``, and the rows are
    appended to ``lagou_job汇总最终.csv``. Pauses between requests are kept
    from the original script.
    """
    import os

    out_path = 'lagou_job汇总最终.csv'
    columns = ['公司省市', '公司全名', '公司简称', '公司规模', '融资阶段', '区域', '职位名称',
               '工作经验', '学历要求', '工资', '职位福利', '工作类型', '企业领域', '职位要求']

    # First-tier and new-first-tier cities, percent-encoded for the URL.
    # Each city appears exactly once (the original list had Ningbo twice,
    # which scraped and stored that city's positions twice).
    citylist = [
        '%E6%AD%A6%E6%B1%89',  # Wuhan
        '%E5%AE%81%E6%B3%A2',  # Ningbo
        '%E5%8C%97%E4%BA%AC',  # Beijing
        '%E4%B8%8A%E6%B5%B7',  # Shanghai
        '%E5%B9%BF%E5%B7%9E',  # Guangzhou
        '%E6%B7%B1%E5%9C%B3',  # Shenzhen
        '%E6%88%90%E9%83%BD',  # Chengdu
        '%E6%9D%AD%E5%B7%9E',  # Hangzhou
        '%E9%87%8D%E5%BA%86',  # Chongqing
        '%E8%A5%BF%E5%AE%89',  # Xi'an
        '%E8%8B%8F%E5%B7%9E',  # Suzhou
        '%E5%A4%A9%E6%B4%A5',  # Tianjin
        '%E5%8D%97%E4%BA%AC',  # Nanjing
        '%E9%95%BF%E6%B2%99',  # Changsha
        '%E9%83%91%E5%B7%9E',  # Zhengzhou
        '%E4%B8%9C%E8%8E%9E',  # Dongguan
        '%E9%9D%92%E5%B2%9B',  # Qingdao
        '%E6%B2%88%E9%98%B3',  # Shenyang
        '%E6%98%86%E6%98%8E',  # Kunming
    ]

    for i in citylist:
        url = 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90?px=default&city={}#filterBox'.format(i)
        # Fetch page 1 first to learn the total number of positions.
        page_1 = get_json(url, 1, i)
        total_count = page_1['content']['positionResult']['totalCount']
        num = get_page_num(total_count)
        total_info = []
        time.sleep(20)
        print('职位总数:{},页数:{}'.format(total_count, num))

        for n in range(1, num + 1):
            # Page 1 is already in hand; only request the later pages.
            page = page_1 if n == 1 else get_json(url, n, i)
            jobs_list = page['content']['positionResult']['result']
            page_info = get_page_info(jobs_list)
            total_info += page_info
            print('已经抓取第{}页, 职位总数:{}'.format(n, len(total_info)))

            time.sleep(30)

        # Convert this city's rows to a DataFrame and append them. The
        # header is written only when the file is new or empty, so the
        # combined CSV does not get a header row repeated per city.
        df = pd.DataFrame(data=total_info, columns=columns)
        write_header = not os.path.exists(out_path) or os.path.getsize(out_path) == 0
        df.to_csv(out_path, index=False, mode='a', header=write_header)
        print(i, '已完成')


if __name__ == "__main__":
    main()

最终包含‘公司省市’、‘公司全名’、‘公司简称’、‘公司规模’、‘融资阶段’、‘区域’、‘职位名称’、‘工作经验’、‘学历要求’、‘工资’、‘职位福利’、‘工作类型’、‘企业领域’、‘职位要求’等信息,写得比较笨重,欢迎批评指正。

你可能感兴趣的:(爬虫,拉勾网,python)