Python Crawlers, Part 6: Selenium and BOSS直聘

Main logic

  1. Open the home page
  2. Search for the keyword and land on the first results page
    2.1 Collect the detail-page URLs
    2.2 Open each detail page and scrape its data
  3. Go to the next page and repeat step 2
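
One compatibility note before the full code: the script uses the Selenium 3 style find_element_by_* helpers, which no longer exist in current Selenium 4 releases. Under Selenium 4 the same lookups are written with By locators; the sketch below shows only the equivalent of the search() method (the class name and link text are copied from the original script and may no longer match the live site):

from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

def search_selenium4(driver, kw):
    # Same lookups as ZhiPin.search() below, spelled with Selenium 4 locators
    element = driver.find_element(By.CLASS_NAME, 'ipt-search')
    element.send_keys(kw)
    element.send_keys(Keys.RETURN)
    driver.find_element(By.LINK_TEXT, '全国').click()  # restrict results to 全国 (nationwide)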

Full code

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup as bs
import time


class ZhiPin:
    def __init__(self):
        self.start_url = 'https://www.zhipin.com/'
        self.driver = webdriver.Chrome()
    
    def search(self, kw):
        element = self.driver.find_element_by_class_name('ipt-search')
        element.send_keys(kw)
        element.send_keys(Keys.RETURN)
        self.driver.find_element_by_link_text('全国').click()
    
    def next_page(self):
        next_page = self.driver.find_elements_by_class_name('next')
        next_page = next_page[0] if len(next_page)>0 else None
        return next_page
    
    def parse_page(self):
        page = self.driver.page_source
        soup = bs(page, 'lxml')
        contents = soup.select('#main > div > div.job-list > ul > li')
        for content in contents:
            result = {
                    'title': content.select_one('.job-title').get_text(),
                    'salary': content.select_one('.red').get_text(),
                    'location': content.select('p')[0].get_text(),
                    'company': content.select_one('div.info-company > div > h3 > a').get_text(),
                    'company_condition': content.select('p')[1].get_text(),
                    'hr': content.select_one('div.info-publis > h3').get_text(),
                    'date': content.select('p')[2].get_text()
                    }
            yield result

# Calling this method opens each detail page (kept commented out; a fuller sketch follows after the code)
#    def detail_page(self):
#        handle = self.driver.current_window_handle
#        details = self.driver.find_elements_by_css_selector('div.job-list > ul > li > div > div.info-primary > h3.name > a')
#        for detail in details:
#            detail.click()
#            handles = self.driver.window_handles
#            for newhandle in handles:
#                if newhandle != handle:
#                    self.driver.switch_to.window(newhandle)
#                    time.sleep(2)
#                    self.driver.close()
#                    self.driver.switch_to.window(handles[0])            
#            time.sleep(2)        
    
    def run(self, kw):
        # 1. Open the home page
        self.driver.get(self.start_url)
        # 2. Search for the keyword and land on the first results page
        self.search(kw)
        time.sleep(2)  # see the explicit-wait sketch after the code for a less fragile alternative
        # 2.1 Collect the detail-page URLs
        # 2.2 Open each detail page and scrape its data
        for result in self.parse_page():
            print(result)  # a CSV-saving variant is sketched after the code
#        self.detail_page()
        # 3. Page through the results and repeat step 2
        next_page = self.next_page()
        while next_page:
            next_page.click()
            time.sleep(2)
            for result in self.parse_page():
                print(result)
#            self.detail_page()
            next_page = self.next_page()
        # close the browser once there are no more result pages
        self.driver.quit()


if __name__ == '__main__':
    boss = ZhiPin()
    boss.run('数据分析')
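
The commented-out detail_page method only clicks into each detail page, waits, and closes the tab again; step 2.2 (scraping data from the detail page) is never implemented. A minimal sketch of what that could look like, staying with the same Selenium 3 style API, is below. The div.job-sec selector for the job description is an assumption, not something from the original code, and would need to be checked against the live page:

    def detail_page(self):
        handle = self.driver.current_window_handle
        details = self.driver.find_elements_by_css_selector(
            'div.job-list > ul > li > div > div.info-primary > h3.name > a')
        for detail in details:
            detail.click()  # each job link opens in a new tab
            for newhandle in self.driver.window_handles:
                if newhandle != handle:
                    self.driver.switch_to.window(newhandle)
                    time.sleep(2)
                    soup = bs(self.driver.page_source, 'lxml')
                    # guessed selector for the job-description block
                    desc = soup.select_one('div.job-sec')
                    if desc is not None:
                        print(desc.get_text(strip=True))
                    self.driver.close()                   # close the detail tab
                    self.driver.switch_to.window(handle)  # back to the result list
            time.sleep(2)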
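
The fixed time.sleep(2) pauses after search() and after clicking the next button are fragile on a slow connection. A common alternative is an explicit wait on the result list; a minimal sketch, assuming the job-list selector used by parse_page() still matches the page:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_for_results(driver, timeout=10):
    # Block until at least one job card has rendered instead of sleeping a fixed two seconds
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '#main > div > div.job-list > ul > li')))

run() would then call wait_for_results(self.driver) wherever it currently calls time.sleep(2).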
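
run() only prints each result dict. To keep the data, the same dicts can be fed to csv.DictWriter; a minimal sketch (the results.csv file name is arbitrary):

import csv

FIELDS = ['title', 'salary', 'location', 'company', 'company_condition', 'hr', 'date']

def save_results(results, path='results.csv'):
    # results is any iterable of the dicts yielded by parse_page()
    with open(path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=FIELDS)
        writer.writeheader()
        writer.writerows(results)

Inside run(), collect the dicts into a list instead of printing them and call save_results(rows) once after the paging loop ends.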
