Crawling Zhaopin job listings with requests

Initial code

# -*- encoding: utf-8 -*-


from crawl.WebRequest import *
from crawl.mysqldb import SQL
import time, json, random, math, requests, logging, hashlib

# Configure logging output format
LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
logging.basicConfig(filename='zhilian.log', level=logging.INFO, format=LOG_FORMAT, datefmt=DATE_FORMAT)
# Get the request header
logging.info('begin to get web request header')
# Job keywords to crawl
positions = ['大数据']
# City IDs to crawl: Beijing, Shanghai, Shenzhen, Guangzhou, Chengdu, Hangzhou, Wuhan
# city_ids = ['530', '538', '765', '763', '801', '653', '736']
city_ids = ['801']
# Work-experience codes: no experience, under 1 year, 1-3 years, 3-5 years, 5-10 years, over 10 years
work_exps = ['0000', '0001', '0103', '0305', '0510', '1099']
# Base request header
header = header()
# Fetch a pool of proxy IPs (not yet passed to the requests below)
proxy_list = get_home_proxy()


def main():
    logging.info('begin to send requests')
    sql = SQL()
    latest_jobNums = sql.get_latest_jobNum('zhilian_update')
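    # latest_jobNums maps each work-experience code to the newest job number saved in the
    # previous run; get_result() uses it as the stop condition for incremental crawling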
    for city_id in city_ids:
        for position in positions:
            for work_exp in work_exps:
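                # Build the search API URL: pageSize=90 results per page; cityId, workExperience
                # and kw are filled in below, while _v and x-zp-page-request-id come from getParam()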
                base_url = 'https://fe-api.zhaopin.com/c/i/sou?pageSize=90&cityId={cityId}&salary=0,0' \
                           '&workExperience={workExp}&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&sortType=publish' \
                           '&kw={position}&kt=3&=0&_v=' + getParam()[0] + "&x-zp-page-request-id=" + getParam()[1]
                base_url = base_url.format(cityId=city_id, workExp=work_exp, position=position)
                header = header_zhilian(city_id, work_exp, position)
                try:
                    response = requests.get(url=base_url, headers=header, )
                    data = json.loads(response.text)
                    if data['code'] == 200:
                        resultCount = data['data']['numFound']  # number of matching positions
                        total_page = int(get_page_num(resultCount))  # total number of pages
                        if total_page != 0:
                            # Start crawling page by page
                            for page in range(0, total_page):
                                logging.info(
                                    "now crawling position:%s, city_id:%s, work_exp:%s, result_count:%s, total_page:%s, page:%s",
                                    position, city_id, work_exp, resultCount, total_page, page + 1)
                                # Get the incremental job-number cut-off for this work-experience bucket
                                latest_jobNum = latest_jobNums.get(work_exp)
                                if resultCount < 90:
                                    # Parse the returned result data
                                    results = get_result(latest_jobNum, data['data']['results'])
                                else:
                                    # Compute the start offset for this page
                                    startIndex = page * 90
                                    if startIndex == 0:
                                        results = get_result(latest_jobNum, data['data']['results'])
                                    else:
                                        # Append the paging offset to the request URL
                                        crawl_url = base_url + "&start=" + str(startIndex)
                                        res = requests.get(url=crawl_url, headers=header, )
                                        res_data = json.loads(res.text)
                                        # Parse the returned result data
                                        results = get_result(latest_jobNum, res_data['data']['results'])
                                job_results = results[0]
                                # Flag indicating whether to stop paging
                                break_flag = results[1]
                                if len(job_results) > 0:
                                    # Save the data to the database
                                    if page == 0:
                                        # Use the first record of the first page as the stop condition for the next incremental crawl
                                        job_num = job_results[0].get('job_num')
                                        # Incremental-update condition
                                        update_condition = (job_num, work_exp)
                                        # Save the data along with the update condition
                                        save_result(job_results, update_condition)
                                        time.sleep(random.randint(1, 5))
                                    else:
                                        save_result(job_results, ())
                                        time.sleep(random.randint(1, 5))
                                if break_flag is True:
                                    break
                except Exception as e:
                    logging.error('crawl error: %s', e)


def save_result(result, update_condition):
    if not result:
        return
    sql = SQL()
    try:
        if len(update_condition) == 0:
            for res in result:
                sql.insert('zhilian_original', **res)
            logging.info("save data success")
        else:
            sql.update_jobNum(update_condition)
            for res in result:
                sql.insert('zhilian_original', **res)
            logging.info("save data success")
    except Exception as e:
        logging.error("save data failed", e)


# Format the raw API results into rows for the database
def get_result(jobNum, results):
    crawl_results = []
    flag = False
    for result in results:
        job_num = str(result['number'])
        if jobNum is not None and jobNum == job_num:
            flag = True
            break
        else:
            crawl_results.append({
                'job_num': job_num,  # job number
                'job_name': result['jobName'],  # job title
                'emp_type': result['emplType'],  # employment type
                'job_type': result['jobType']['display'],  # job category name
                'job_city': result['city']['display'],  # city
                # 'business_area': result['businessArea'],  # business district
                'working_exp': result['workingExp']['name'],  # required experience
                'edu_level': result['eduLevel']['name'],  # education level
                'salary': result['salary'],  # salary
                'job_light': str(json.loads(result['positionLabel'])['jobLight']),  # job highlights
                'job_skill': str(json.loads(result['positionLabel'])['skill']),  # required skills
                'company_name': result['company']['name'],  # company name
                'company_size': result['company']['size']['name'],  # company size
                'company_type': result['company']['type']['name'],  # company type
                'create_date': result['createDate'],  # creation date
                'update_date': result['updateDate'],  # last update date
                'end_date': result['endDate'],  # listing end date
                'job_tag': str(result['jobTag']['searchTag']),  # job tags
                'welfare': str(result['welfare'])  # job benefits
            })
    return (crawl_results, flag)


# Compute the total number of pages to crawl
def get_page_num(resultCount):
    pageSize = 90
    res = math.ceil(resultCount / pageSize)  # 90 positions per page, rounded up
    return res


def getParam():
    # 1. Generate a random 32-character hex id
    md5 = hashlib.md5()
    seed = str(random.random())
    md5.update(seed.encode('utf-8'))
    random_id = md5.hexdigest()
    # 2. Current timestamp in milliseconds
    now_time = int(time.time() * 1000)
    # 3. Random number of up to 6 digits
    randomnumb = int(random.random() * 1000000)
    # Combine the parts into the x-zp-page-request-id value
    x_zp_page_request_id = str(random_id) + '-' + str(now_time) + '-' + str(randomnumb)
    # Generate the _v parameter (random float rounded to 8 decimal places)
    url_v = str(round(random.random(), 8))
    return [url_v, x_zp_page_request_id]


if __name__ == '__main__':
    main()
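
The helper functions imported from crawl.WebRequest (header, header_zhilian, get_home_proxy) and the SQL class from crawl.mysqldb are not shown in this post. Below is a minimal sketch of the request-header side of that interface, inferred only from how the script calls it; it is an assumption, not the actual module.

# Hypothetical sketch of crawl.WebRequest, inferred from the calls above;
# the real module may rotate User-Agents, set cookies, etc.


def header():
    # Baseline browser-like request header
    return {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
        'Accept': 'application/json, text/plain, */*',
    }


def header_zhilian(city_id, work_exp, position):
    # Per-request header for a given city / experience / keyword combination
    # (here it only adds a generic Referer; the real helper may do more)
    h = dict(header())
    h['Referer'] = 'https://sou.zhaopin.com/'
    return h


def get_home_proxy():
    # Returns a list of proxy dicts in the format accepted by requests' proxies argument
    return [{'http': 'http://127.0.0.1:8080', 'https': 'http://127.0.0.1:8080'}]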
