pyspider爬虫框架之boss直聘职位信息爬取

需求

1、遍历首页所有职位分类。
2、点击进入职位分类详情页,按地区抓取以下字段:职位名称、月薪、经验年限要求、学历要求、招聘公司、所属行业、融资轮次、人数(规模)、发布时间。
3、点击进入职位详情页,抓取该职位的技能标签。

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-08-06 10:40:07
# Project: boss_recruit

from pyspider.libs.base_handler import *
import re
import datetime
from pymongo import MongoClient

# Connect to the offline (internal) MongoDB instance.
# NOTE(review): host, port and credentials are hard-coded in source — consider
# loading them from configuration or the environment instead.
DB_IP = '10.15.4.126'
DB_PORT = 28018

client = MongoClient(host=DB_IP, port=DB_PORT)

# The account is defined on the admin database, so the sequence is:
# connect -> authenticate against admin -> switch to the working database.
# NOTE(review): Database.authenticate() was removed in PyMongo 4.0 — confirm
# the pinned driver version before upgrading.
db_auth = client.admin
db_auth.authenticate("xyzhang", "niub-food*2018")

# Working database and the collection that Handler.on_result() upserts into.
DB_NAME = 'research'
DB_COL = 'boss_recruit'
db = client[DB_NAME]
col = db[DB_COL]



class Handler(BaseHandler):
    # Handler-wide crawl options; only a desktop Chrome user-agent header is
    # set here (presumably applied by pyspider to every self.crawl() call —
    # confirm against the pyspider docs). The fixed proxy entry is disabled;
    # the callbacks pass proxy=get_proxy() per request instead.
    crawl_config = {
        "headers":{"user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36"
                  },
        #"proxy": "http://10.15.100.94:6666"
    }

    # Entry URL crawled by on_start(): the zhipin.com home page.
    url = 'https://www.zhipin.com/?ka=header-home'


    def format_date(self, date):
        """Parse a compact ``YYYYMMDD`` string into a ``datetime.datetime``.

        Raises ``ValueError`` when *date* does not match that layout.
        """
        compact_layout = '%Y%m%d'
        parsed = datetime.datetime.strptime(date, compact_layout)
        return parsed


    @every(minutes=24 * 60)
    def on_start(self):
        """Schedule the crawl of the home page; the result goes to index_page.

        The proxy is resolved once so that the value printed for debugging is
        the same one handed to ``self.crawl`` (two separate ``get_proxy()``
        calls could yield different proxies).
        """
        # get_proxy is not defined in the visible part of this file; it is
        # expected to be provided elsewhere in the project.
        proxy = get_proxy()
        print(proxy)
        self.crawl(self.url, callback=self.index_page, proxy=proxy)

    @config(age=60)
    def index_page(self, response):
        """Walk the home-page job menu and schedule one crawl per category link.

        Each scheduled task carries ``{"belonging": <link text>}`` in ``save``
        and is handled by ``detail_page``.
        """
        site_root = 'https://www.zhipin.com'
        tree = response.etree

        # Every <li> of the job menu's sub-menu: a heading plus its links.
        for group in tree.xpath("//div[@class='job-menu']//div[@class='menu-sub']/ul/li"):
            group_name = group.xpath("./h4/text()")[0]
            links = group.xpath("./div[@class='text']/a")
            print(group_name)

            for link in links:
                title = link.xpath("./text()")[0]
                target = site_root + link.xpath("./@href")[0]
                print(title, target)

                self.crawl(
                    target,
                    callback=self.detail_page,
                    save={"belonging": title},
                    proxy=get_proxy()
                )

    @config(age=60)
    def detail_page(self, response):
        """Fan out from a category listing page to one crawl per city.

        Each scheduled task is handled by ``parse_city`` and carries the city
        name plus the ``belonging`` value received from the previous step.
        """
        site_root = 'https://www.zhipin.com'
        anchors = response.etree.xpath(
            "//div[@class='condition-box']/dl[@class='condition-city show-condition-district']/dd/a[@ka]"
        )

        # Drop the first anchor: it is the nationwide entry, not a city.
        for anchor in anchors[1:]:
            name = anchor.xpath("./text()")[0]
            target = site_root + anchor.xpath("./@href")[0]
            ka_value = anchor.xpath("./@ka")[0]

            self.crawl(
                target,
                callback=self.parse_city,
                params={"ka": ka_value},
                save={"city": name, "belonging": response.save["belonging"]},
                proxy=get_proxy()
            )

    @config(age=60)        
    def parse_city(self, response):
        """Fan out from a city listing page to one crawl per district.

        The XPath skips the first anchor of the district filter. Each
        scheduled task is handled by ``parse_district`` and carries the
        district name together with the inherited ``city`` and ``belonging``.
        """
        site_root = 'https://www.zhipin.com'
        anchors = response.etree.xpath(
            "//div[@class='condition-box']/dl[@class='condition-district show-condition-district']/dd/a[position()>1]"
        )

        for anchor in anchors:
            name = anchor.xpath("./text()")[0]
            print(name)

            target = site_root + anchor.xpath("./@href")[0]
            ka_value = anchor.xpath("./@ka")[0]
            carried = {
                "district": name,
                "city": response.save["city"],
                "belonging": response.save["belonging"],
            }

            self.crawl(
                target,
                callback=self.parse_district,
                params={"ka": ka_value},
                save=carried,
                proxy=get_proxy()
            )


    @config(age=60)        
    def parse_district(self, response):
        """Fan out from a district listing page to one crawl per area.

        The XPath skips the first anchor of the area filter. Each scheduled
        task is handled by ``parse_content``; its ``save`` also records the
        area URL as ``base_url`` and starts ``page_num`` at 1 so that
        ``parse_content`` can paginate.
        """
        site_root = 'https://www.zhipin.com'
        anchors = response.etree.xpath(
            "//div[@class='condition-box']/dl[@class='condition-area show-condition-area']/dd/a[position()>1]"
        )

        for anchor in anchors:
            name = anchor.xpath("./text()")[0]
            print(name)

            target = site_root + anchor.xpath("./@href")[0]
            ka_value = anchor.xpath("./@ka")[0]
            carried = {
                "area": name,
                "district": response.save["district"],
                "city": response.save["city"],
                "belonging": response.save["belonging"],
                "base_url": target,
                "page_num": 1,
            }

            self.crawl(
                target,
                callback=self.parse_content,
                params={"ka": ka_value},
                save=carried,
                proxy=get_proxy()
            )



    @config(age=60)
    def parse_page(self, response):
        """Schedule result pages 1..10 of a listing URL for ``parse_content``.

        ``response.save`` must contain ``base_url`` (``KeyError`` otherwise).

        NOTE(review): no callback in the visible file schedules this method;
        ``parse_content`` paginates on its own. It is kept working in case it
        is wired in elsewhere.
        """
        last_page = 10
        # Read, do not pop: parse_content needs base_url in the save dict.
        page_url = response.save["base_url"]
        print(page_url)

        for number in range(1, last_page + 1):
            params = {"page": number,
                      "ka": 'page-{}'.format(number)
                     }
            # Each task gets its own copy of the save dict. page_num is set to
            # the last page so that parse_content (which only follows
            # page_num + 1 while it is <= 10) does not chain further pages on
            # top of the ones scheduled here.
            save = dict(response.save, page_num=last_page)
            self.crawl(page_url, callback=self.parse_content, params=params, save=save, proxy=get_proxy())



    @config(age=60)        
    def parse_content(self, response):
        """Parse one listing page, schedule every job's detail page, then the next page.

        ``response.save`` must carry ``area``, ``district``, ``city`` and
        ``belonging``; ``base_url`` and ``page_num`` drive pagination. Nothing
        is scheduled when the page lists no jobs, and no further page is
        requested past page 10 or when ``base_url`` is missing.
        """
        base_url = 'https://www.zhipin.com'
        max_page = 10
        page_url = response.save.get("base_url")

        # XPath prefixes shared by several fields of one job row.
        primary = "./div[@class='job-primary']/div[@class='info-primary']"
        title_link = primary + "/h3[@class='name']/a"
        company_text = "./div[@class='job-primary']/div[@class='info-company']/div[@class='company-text']"

        content_list = response.etree.xpath("//div[@class='job-list']/ul/li")

        # An empty page also ends pagination.
        if not content_list:
            return

        for each in content_list:
            position_name = each.xpath(title_link + "/div[@class='job-title']/text()")[0]
            salary = each.xpath(title_link + "/span/text()")[0]

            # Text nodes of the requirement line: index 1 is experience, 2 is education.
            requirement_texts = each.xpath(primary + "/p//text()")
            experience = requirement_texts[1]
            education = requirement_texts[2]

            company = each.xpath(company_text + "/h3[@class='name']/a/text()")[0]

            # The company line has three text nodes when a funding round is
            # shown; otherwise the round is absent and index 1 is the scale.
            company_facts = each.xpath(company_text + "/p//text()")
            if len(company_facts) == 3:
                rounds, scale = company_facts[1], company_facts[2]
            else:
                rounds, scale = '', company_facts[1]

            raw_time = each.xpath("./div[@class='job-primary']/div[@class='info-publis']/p/text()")[0]
            public_time = self._normalize_public_time(raw_time)
            print(public_time)

            position_url = base_url + each.xpath(title_link + "/@href")[0]
            print(position_url)

            save = {"area": response.save["area"],
                    "district": response.save["district"],
                    "city": response.save["city"],
                    "belonging": response.save["belonging"],
                    "position_name": position_name,
                    "salary": salary,
                    "experience": experience,
                    "education": education,
                    "company": company,
                    "rounds": rounds,
                    "scale": scale,
                    "public_time": public_time
                   }

            # Fetch the job's detail page for its skill description.
            self.crawl(position_url, callback=self.parse_body, save=save, proxy=get_proxy())

        # Pagination: a missing page_num is treated as page 1.
        page_num = response.save.get('page_num') or 1
        print(page_num)
        page_num += 1

        # Without base_url there is no listing URL to request the next page from.
        if page_num <= max_page and page_url:
            params = {"page": page_num,
                      "ka": 'page-{}'.format(page_num)
                     }
            save = dict(response.save, page_num=page_num)

            self.crawl(page_url, callback=self.parse_content, params=params, save=save, proxy=get_proxy())

    def _normalize_public_time(self, raw, now=None):
        """Turn a listing's publish label into a zero-padded ``YYYYMMDD`` string.

        Recognised labels: one containing "昨天" (yesterday), one containing a
        clock time such as ``10:30`` (today), or a month/day pair like
        ``8月6日``. *now* defaults to the current local time.

        Raises ``ValueError`` when the label matches none of these forms.
        """
        if now is None:
            now = datetime.datetime.now()

        if u'昨天' in raw:
            return (now - datetime.timedelta(days=1)).strftime('%Y%m%d')

        if re.search(r'\d+:\d+', raw):
            return now.strftime('%Y%m%d')

        match = re.search(u'(\\d+)月(\\d+)日', raw)
        if match is None:
            raise ValueError('unrecognised publish time: {!r}'.format(raw))

        month, day = int(match.group(1)), int(match.group(2))
        # The label carries no year. A month/day later than today cannot be
        # from this year, so it is attributed to the previous one.
        year = now.year
        if (month, day) > (now.month, now.day):
            year -= 1
        return '{:04d}{:02d}{:02d}'.format(year, month, day)







    def parse_body(self, response):
        """Build the final record for one job from its detail page.

        The skill text is the concatenated, stripped text of the first
        ``job-sec`` block; every other field is copied from ``response.save``.
        ``public_time`` is converted with ``format_date`` and ``update_time``
        is set to the current time. Yields a single result dict.
        """
        tree = response.etree
        carried = response.save
        print(carried["public_time"])

        fragments = tree.xpath("//div[@class='detail-content']/div[@class='job-sec'][1]//text()")
        skill = ''.join(fragments).strip()

        print(skill)

        copied_fields = (
            "area", "district", "city", "belonging", "position_name",
            "salary", "experience", "education", "company", "rounds", "scale",
        )

        result = {"skill": skill}
        for field in copied_fields:
            result[field] = carried[field]
        result["public_time"] = self.format_date(carried["public_time"])
        result["update_time"] = datetime.datetime.now()

        yield result


    def on_result(self, result):
        """Upsert one scraped job record into the module-level ``col`` collection.

        Empty results (``None`` or an empty dict) are ignored. A record is
        identified by position name, publish time, city, district, area and
        company; a matching document is updated, otherwise one is inserted.

        Raises ``KeyError`` if *result* lacks one of the identifying fields.
        """
        if not result:
            return

        identity_fields = (
            'position_name', 'public_time', 'city',
            'district', 'area', 'company',
        )
        update_key = {field: result[field] for field in identity_fields}

        # update_one replaces Collection.update, which is deprecated since
        # PyMongo 3.0 and removed in 4.0; it likewise touches a single document.
        col.update_one(update_key, {'$set': result}, upsert=True)

你可能感兴趣的:(爬虫)