Initial code
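The script below crawls Zhilian (zhaopin.com) job listings for the configured keywords, cities, and work-experience buckets, pages through the search API, and incrementally saves new postings to MySQL.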
# -*- encoding: utf-8 -*-
from crawl.WebRequest import *
from crawl.mysqldb import SQL
import time, json, random, math, requests, logging, hashlib
# Configure log formatting
LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
logging.basicConfig(filename='zhilian.log', level=logging.INFO, format=LOG_FORMAT, datefmt=DATE_FORMAT)
# Get the request headers
logging.info('begin to get web request header')
# Job keywords to crawl ('大数据' = Big Data)
positions = ['大数据']
# City ids to crawl: Beijing, Shanghai, Shenzhen, Guangzhou, Chengdu, Hangzhou, Wuhan
# city_ids = ['530', '538', '765', '763', '801', '653', '736']
city_ids = ['801']
# Work experience buckets: none, under 1 year, 1-3 years, 3-5 years, 5-10 years, over 10 years
work_exps = ['0000', '0001', '0103', '0305', '0510', '1099']
# Request headers
header = header()
# Proxy IP pool
proxy_list = get_home_proxy()
def main():
    logging.info('begin to send request')
    sql = SQL()
    latest_jobNums = sql.get_latest_jobNum('zhilian_update')
    for city_id in city_ids:
        for position in positions:
            for work_exp in work_exps:
                base_url = 'https://fe-api.zhaopin.com/c/i/sou?pageSize=90&cityId={cityId}&salary=0,0' \
                           '&workExperience={workExp}&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&sortType=publish' \
                           '&kw={position}&kt=3&=0&_v=' + getParam()[0] + "&x-zp-page-request-id=" + getParam()[1]
                base_url = base_url.format(cityId=city_id, workExp=work_exp, position=position)
                header = header_zhilian(city_id, work_exp, position)
                try:
                    response = requests.get(url=base_url, headers=header)
                    data = json.loads(response.text)
                    if data['code'] == 200:
                        resultCount = data['data']['numFound']  # number of matching positions
                        total_page = int(get_page_num(resultCount))  # total number of pages
                        if total_page != 0:
                            # crawl page by page
                            for page in range(0, total_page):
                                logging.info(
                                    "now it's crawling position:" + position + ",city_id:" + city_id
                                    + ",work_exp:" + work_exp + ",result_count:" + str(resultCount)
                                    + ",total_page:" + str(total_page) + ",crawling page:" + str(page + 1))
                                # incremental crawling: the latest saved job_num for this work_exp
                                latest_jobNum = latest_jobNums.get(work_exp)
                                if resultCount < 90:
                                    # parse the results already returned by the first request
                                    results = get_result(latest_jobNum, data['data']['results'])
                                else:
                                    # compute the start offset of the current page
                                    startIndex = page * 90
                                    if startIndex == 0:
                                        results = get_result(latest_jobNum, data['data']['results'])
                                    else:
                                        # build the paginated request url
                                        crawl_url = base_url + "&start=" + str(startIndex)
                                        res = requests.get(url=crawl_url, headers=header)
                                        res_data = json.loads(res.text)
                                        # parse the returned results
                                        results = get_result(latest_jobNum, res_data['data']['results'])
                                job_results = results[0]
                                # flag that tells whether to stop paging
                                break_flag = results[1]
                                if len(job_results) > 0:
                                    # save the data to the database
                                    if page == 0:
                                        # the first record of the first page becomes the stop
                                        # condition for the next incremental crawl
                                        job_num = job_results[0].get('job_num')
                                        # incremental-update condition
                                        update_condition = (job_num, work_exp)
                                        # save the data together with the update condition
                                        save_result(job_results, update_condition)
                                        time.sleep(random.randint(1, 5))
                                    else:
                                        save_result(job_results, ())
                                        time.sleep(random.randint(1, 5))
                                if break_flag is True:
                                    break
                except Exception as e:
                    logging.error('crawl error: %s', e)

def save_result(result, update_condition):
    if not result:
        return
    sql = SQL()
    try:
        if len(update_condition) == 0:
            for res in result:
                sql.insert('zhilian_original', **res)
            logging.info("save data success")
        else:
            sql.update_jobNum(update_condition)
            for res in result:
                sql.insert('zhilian_original', **res)
            logging.info("save data success")
    except Exception as e:
        logging.error("save data failed: %s", e)

# Normalize the raw API results into rows ready for the database;
# stop as soon as the previously saved job_num is reached (incremental crawl)
def get_result(jobNum, results):
    crawl_results = []
    flag = False
    for result in results:
        job_num = str(result['number'])
        if jobNum is not None and jobNum == job_num:
            flag = True
            break
        else:
            crawl_results.append({
                'job_num': job_num,  # job number
                'job_name': result['jobName'],  # job title
                'emp_type': result['emplType'],  # employment type
                'job_type': result['jobType']['display'],  # job category name
                'job_city': result['city']['display'],  # city
                # 'business_area': result['businessArea'],  # business area
                'working_exp': result['workingExp']['name'],  # required working experience
                'edu_level': result['eduLevel']['name'],  # education level
                'salary': result['salary'],  # salary
                'job_light': str(json.loads(result['positionLabel'])['jobLight']),  # job highlights
                'job_skill': str(json.loads(result['positionLabel'])['skill']),  # required skills
                'company_name': result['company']['name'],  # company name
                'company_size': result['company']['size']['name'],  # company size
                'company_type': result['company']['type']['name'],  # company type
                'create_date': result['createDate'],  # creation date
                'update_date': result['updateDate'],  # update date
                'end_date': result['endDate'],  # expiry date
                'job_tag': str(result['jobTag']['searchTag']),  # job tags
                'welfare': str(result['welfare'])  # welfare / benefits
            })
    return (crawl_results, flag)

# Compute the total number of pages to crawl
def get_page_num(resultCount):
    pageSize = 90
    res = math.ceil(resultCount / pageSize)  # 90 positions per page, rounded up
    return res


def getParam():
    # 1. generate a random 32-character id (md5 of a random number)
    md5 = hashlib.md5()
    rand = str(random.random())
    md5.update(rand.encode('utf-8'))
    random_id = md5.hexdigest()
    # 2. current timestamp in milliseconds
    now_time = int(time.time() * 1000)
    # 3. random integer below 1,000,000
    randomnumb = int(random.random() * 1000000)
    # combine the pieces into x-zp-page-request-id
    x_zp_page_request_id = str(random_id) + '-' + str(now_time) + '-' + str(randomnumb)
    # generate _v
    url_v = str(round(random.random(), 8))
    return [url_v, x_zp_page_request_id]

if __name__ == '__main__':
    main()
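For context, the helpers imported from crawl.WebRequest and crawl.mysqldb (header, header_zhilian, get_home_proxy, SQL) are not shown in this post. A minimal sketch of the interfaces the script assumes, inferred only from how they are called above, might look like this:

# Hypothetical sketch only: signatures and return shapes are inferred from the calls
# in the script above, not taken from the real crawl.WebRequest / crawl.mysqldb modules.
def header():
    # generic request headers used for the bootstrap request
    return {'User-Agent': 'Mozilla/5.0'}

def header_zhilian(city_id, work_exp, position):
    # headers tailored to one Zhilian search request (the Referer value is an assumption)
    return {'User-Agent': 'Mozilla/5.0', 'Referer': 'https://sou.zhaopin.com/'}

def get_home_proxy():
    # a pool of proxies in the dict format accepted by requests
    return [{'http': 'http://127.0.0.1:8080'}]

class SQL:
    def get_latest_jobNum(self, table):
        # returns a {work_exp: latest saved job_num} mapping used as the incremental stop condition
        return {}

    def insert(self, table, **row):
        # inserts one normalized job row into the given table
        pass

    def update_jobNum(self, condition):
        # records (job_num, work_exp) as the new stop condition for the next run
        pass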