1、 遍历首页所有职位分类
2、 点击进入职位分类详情页,按照地区抓取以下字段:职位名称、月薪、经验年限要求、学历要求、招聘公司、所属行业、轮次、人数(规模)、发布时间
3、 点击进入职位详情页,抓取该职位的技能标签。
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-08-06 10:40:07
# Project: boss_recruit
from pyspider.libs.base_handler import *
import re
import datetime
from pymongo import MongoClient
# Connect to the internal (offline) MongoDB instance.
DB_IP = '10.15.4.126'
DB_PORT = 28018
# NOTE(review): credentials are hard-coded in source control; move them to
# deployment configuration (environment variables / secrets store).
DB_USER = "xyzhang"
DB_PASSWORD = "niub-food*2018"
# The account is defined in the `admin` database, so authenticate against it.
DB_AUTH_SOURCE = 'admin'
# Credentials are passed to the client itself instead of calling
# Database.authenticate(), which is deprecated in PyMongo 3.x and removed in
# PyMongo 4. Authentication now happens lazily on the first operation.
client = MongoClient(
    host=DB_IP,
    port=DB_PORT,
    username=DB_USER,
    password=DB_PASSWORD,
    authSource=DB_AUTH_SOURCE,
)
# Kept so that any code referring to `db_auth` keeps working.
db_auth = client[DB_AUTH_SOURCE]
DB_NAME = 'research'
DB_COL = 'boss_recruit'
db = client[DB_NAME]
# Target collection for the scraped job postings.
col = db[DB_COL]
class Handler(BaseHandler):
    """pyspider handler that crawls job postings from zhipin.com.

    Crawl flow: home page -> job category -> city -> district -> area ->
    paginated job list -> job detail page. Context collected on the way down
    travels in each task's ``save`` dict; the final record is upserted into
    the module-level MongoDB collection ``col`` by :meth:`on_result`.

    NOTE(review): every request calls ``get_proxy()``, which is neither
    defined nor imported in the visible part of this file; confirm it is
    provided elsewhere before running.
    """

    crawl_config = {
        "headers": {
            "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36",
        },
        # "proxy": "http://10.15.100.94:6666"
    }
    # Entry URL fetched by on_start.
    url = 'https://www.zhipin.com/?ka=header-home'
    # Site root used to turn relative hrefs into absolute URLs.
    BASE_URL = 'https://www.zhipin.com'
    # Highest list page number followed for a single area.
    MAX_PAGES = 10

    def format_date(self, date):
        """Parse a ``YYYYMMDD`` string into a ``datetime.datetime``."""
        return datetime.datetime.strptime(date, '%Y%m%d')

    @every(minutes=24 * 60)
    def on_start(self):
        """Scheduled once a day: fetch the home page."""
        # Fetch the proxy once so the printed value is the one actually used.
        proxy = get_proxy()
        print(proxy)
        self.crawl(self.url, callback=self.index_page, proxy=proxy)

    @config(age=60)
    def index_page(self, response):
        """Queue every job sub-category linked from the home page menu.

        The sub-category title is stored as ``belonging`` in ``save``.
        """
        page = response.etree
        # One <li> per top-level category in the job menu.
        groups = page.xpath("//div[@class='job-menu']//div[@class='menu-sub']/ul/li")
        for group in groups:
            headings = group.xpath("./h4/text()")
            # The heading is only logged; a missing one must not abort the crawl.
            print(headings[0] if headings else '')
            for link in group.xpath("./div[@class='text']/a"):
                titles = link.xpath("./text()")
                hrefs = link.xpath("./@href")
                if not titles or not hrefs:
                    continue
                detail_title = titles[0]
                detail_url = self.BASE_URL + hrefs[0]
                print(detail_title, detail_url)
                self.crawl(
                    detail_url,
                    callback=self.detail_page,
                    save={"belonging": detail_title},
                    proxy=get_proxy(),
                )

    def _crawl_filter_links(self, response, links, save_key, callback, paginate=False):
        """Queue one crawl per filter link, extending the inherited context.

        :param response: response whose ``save`` dict is copied into each task.
        :param links: ``<a>`` elements to follow.
        :param save_key: key under which the link text is added to ``save``.
        :param callback: handler method for the queued pages.
        :param paginate: when true, also store the link URL as ``base_url``
            and start ``page_num`` at 1 so parse_content can follow pages.
        """
        for link in links:
            texts = link.xpath("./text()")
            hrefs = link.xpath("./@href")
            if not texts or not hrefs:
                # Skip anchors without a label or target instead of crashing.
                continue
            target = self.BASE_URL + hrefs[0]
            save = dict(response.save)
            save[save_key] = texts[0]
            if paginate:
                save["base_url"] = target
                save["page_num"] = 1
            kas = link.xpath("./@ka")
            params = {"ka": kas[0]} if kas else {}
            print(texts[0])
            self.crawl(target, callback=callback, params=params, save=save, proxy=get_proxy())

    @config(age=60)
    def detail_page(self, response):
        """Queue the category page once per city."""
        # The first entry is "nationwide" and is skipped.
        city_links = response.etree.xpath(
            "//div[@class='condition-box']/dl[@class='condition-city show-condition-district']/dd/a[@ka]"
        )[1:]
        self._crawl_filter_links(response, city_links, "city", self.parse_city)

    @config(age=60)
    def parse_city(self, response):
        """Queue the city page once per district (first filter entry skipped)."""
        district_links = response.etree.xpath(
            "//div[@class='condition-box']/dl[@class='condition-district show-condition-district']/dd/a[position()>1]"
        )
        self._crawl_filter_links(response, district_links, "district", self.parse_district)

    @config(age=60)
    def parse_district(self, response):
        """Queue the district page once per area and start pagination at page 1."""
        area_links = response.etree.xpath(
            "//div[@class='condition-box']/dl[@class='condition-area show-condition-area']/dd/a[position()>1]"
        )
        self._crawl_filter_links(response, area_links, "area", self.parse_content, paginate=True)

    @config(age=60)
    def parse_page(self, response):
        """Queue list pages 1..MAX_PAGES for ``save['base_url']`` in one go.

        No callback in this class schedules this method (parse_content follows
        pages itself); it is kept so the handler's interface is unchanged.
        """
        page_url = response.save.pop("base_url")
        # Without base_url parse_content cannot follow pages itself, so drop
        # page_num too and let it skip its own pagination step.
        response.save.pop("page_num", None)
        print(page_url)
        for number in range(1, self.MAX_PAGES + 1):
            params = {"page": number, "ka": 'page-{}'.format(number)}
            self.crawl(
                page_url,
                callback=self.parse_content,
                params=params,
                save=response.save,
                proxy=get_proxy(),
            )

    def _normalize_public_time(self, text, now=None):
        """Convert the site's publish label into a ``YYYYMMDD`` string.

        Recognised labels, checked in this order: one containing "昨天"
        (yesterday), one containing an ``H:M`` time (today), and one
        containing "M月D日" (month and day without a year).

        :param text: raw label text from the job list.
        :param now: reference ``datetime``; defaults to the current time.
        :raises ValueError: if the label matches none of the known forms.
        """
        now = now or datetime.datetime.now()
        if u'昨天' in text:
            return (now - datetime.timedelta(days=1)).strftime('%Y%m%d')
        if re.search(r'\d+:\d+', text):
            return now.strftime('%Y%m%d')
        found = re.search(u'(\\d+)月(\\d+)日', text)
        if found is None:
            raise ValueError('unrecognised publish time: {!r}'.format(text))
        month, day = int(found.group(1)), int(found.group(2))
        year = now.year
        # The label carries no year: a month/day later than today can only
        # refer to the previous year.
        if (month, day) > (now.month, now.day):
            year -= 1
        # Zero-pad so that e.g. 1月12日 cannot be misread as 11月2日.
        return '{:04d}{:02d}{:02d}'.format(year, month, day)

    def _parse_job(self, job):
        """Extract the list-page fields of one job ``<li>`` element.

        :returns: ``(position_url, fields)`` where ``fields`` is a dict of
            the values scraped from the list row.
        :raises IndexError: if an expected node is missing from the row.
        :raises ValueError: if the publish label cannot be interpreted.
        """
        primary = "./div[@class='job-primary']/div[@class='info-primary']"
        company_box = "./div[@class='job-primary']/div[@class='info-company']/div[@class='company-text']"

        # Text nodes of the summary line; index 1 is experience, 2 is education.
        summary = job.xpath(primary + "/p//text()")
        # Company line: three text nodes when a funding round is listed,
        # otherwise the company size sits at index 1.
        company_info = job.xpath(company_box + "/p//text()")
        if len(company_info) == 3:
            rounds, scale = company_info[1], company_info[2]
        else:
            rounds, scale = '', company_info[1]

        raw_time = job.xpath("./div[@class='job-primary']/div[@class='info-publis']/p/text()")[0]
        fields = {
            "position_name": job.xpath(primary + "/h3[@class='name']/a/div[@class='job-title']/text()")[0],
            "salary": job.xpath(primary + "/h3[@class='name']/a/span/text()")[0],
            "experience": summary[1],
            "education": summary[2],
            "company": job.xpath(company_box + "/h3[@class='name']/a/text()")[0],
            "rounds": rounds,
            "scale": scale,
            "public_time": self._normalize_public_time(raw_time),
        }
        position_url = self.BASE_URL + job.xpath(primary + "/h3[@class='name']/a/@href")[0]
        return position_url, fields

    @config(age=60)
    def parse_content(self, response):
        """Queue the detail page of every job on a list page, then the next page.

        Pagination stops at an empty page or after ``MAX_PAGES``.
        """
        jobs = response.etree.xpath("//div[@class='job-list']/ul/li")
        if not jobs:
            # Empty result page: nothing to scrape and nothing further to follow.
            return
        context = response.save
        for job in jobs:
            try:
                position_url, fields = self._parse_job(job)
            except (IndexError, ValueError) as err:
                # One malformed row must not drop the rest of the page or
                # stop pagination.
                print('skipping unparsable job row: {!r}'.format(err))
                continue
            print(position_url)
            save = dict(
                fields,
                area=context["area"],
                district=context["district"],
                city=context["city"],
                belonging=context["belonging"],
            )
            # Fetch the job detail page for the skill description.
            self.crawl(position_url, callback=self.parse_body, save=save, proxy=get_proxy())

        # Follow the next list page, if this task carries pagination state.
        page_url = context.get("base_url")
        page_num = context.get("page_num")
        if page_url is None or page_num is None:
            return
        next_page = page_num + 1
        if next_page <= self.MAX_PAGES:
            params = {"page": next_page, "ka": 'page-{}'.format(next_page)}
            self.crawl(
                page_url,
                callback=self.parse_content,
                params=params,
                save=dict(context, page_num=next_page),
                proxy=get_proxy(),
            )

    def parse_body(self, response):
        """Build the final record from a job detail page and the saved context.

        Yields one dict; ``public_time`` is converted to a ``datetime`` and
        ``update_time`` is set to the current time.
        """
        saved = response.save
        # Text of the first job section on the detail page.
        skill = ''.join(
            response.etree.xpath("//div[@class='detail-content']/div[@class='job-sec'][1]//text()")
        ).strip()
        print(skill)
        copied_keys = (
            "area", "district", "city", "belonging", "position_name",
            "salary", "experience", "education", "company", "rounds", "scale",
        )
        result = {key: saved[key] for key in copied_keys}
        result["skill"] = skill
        result["public_time"] = self.format_date(saved["public_time"])
        result["update_time"] = datetime.datetime.now()
        yield result

    def on_result(self, result):
        """Upsert a record into MongoDB.

        A record is identified by position name, publish date, city,
        district, area and company. The base-class implementation is not
        invoked.
        """
        if not result:
            return
        key_fields = ('position_name', 'public_time', 'city', 'district', 'area', 'company')
        update_key = {field: result[field] for field in key_fields}
        # update_one replaces Collection.update(), which is deprecated in
        # PyMongo 3 and removed in PyMongo 4.
        col.update_one(update_key, {'$set': result}, upsert=True)