拉勾网翻页爬取数据


```python
import time
import random
from lxml import etree
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
class QiChe(object):
    """Selenium crawler for a Lagou job-search result list.

    Starting from ``self.url`` it walks the result pages, opens every job
    detail link in a second browser tab, prints each paragraph of the job
    description and stores the extracted fields in ``self.list``.
    """

    # NOTE: this runs when the class body is executed, so a Chrome window is
    # opened as soon as the module is loaded, and the same browser is shared
    # by every instance. Nothing in this class quits it; callers that need a
    # clean shutdown must call ``QiChe.driver.quit()`` themselves.
    driver = webdriver.Chrome()

    def __init__(self):
        # Search-result page the crawl starts from.
        self.url = 'https://www.lagou.com/jobs/list_python%E5%B7%A5%E7%A8%8B%E5%B8%88/p-city_215?px=default#filterBox'
        # One dict per visited job detail page (filled by parse_url).
        self.list = []

    def run(self):
        """Open the start page, dismiss the popup and crawl page after page.

        The loop stops on the first exception raised while scraping a page or
        while looking for / clicking the "next page" button (the wait for that
        button times out after 100 seconds).

        Raises:
            Whatever Selenium raises if the start page cannot be loaded or the
            ``body-btn`` popup button does not show up within 100 seconds.
        """
        self.driver.get(self.url)
        wait = WebDriverWait(self.driver, 100)
        # Click the red-packet popup so that it disappears. ``wait.until``
        # returns the located element, so no second lookup is needed.
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'body-btn'))).click()
        time.sleep(2)
        while True:
            try:
                self.details_page()
                # NOTE(review): if the site keeps a (disabled) "next" button on
                # the last page this loop will not end by itself - confirm
                # against the live page.
                next_button = wait.until(
                    EC.presence_of_element_located((By.CLASS_NAME, 'pager_next'))
                )
                next_button.click()
                # Random pause between result pages.
                time.sleep(random.randint(5, 10))
            except Exception as exc:
                # Any failure ends the crawl, as before, but Ctrl-C and
                # SystemExit are no longer swallowed and the real cause is
                # shown next to the "last page" message.
                print('最后一页了')
                print('crawl stopped by: %r' % (exc,))
                break

    def details_page(self):
        """Visit every job detail link found on the currently loaded list page."""
        # Parse the rendered page into an lxml element tree.
        listing = etree.HTML(self.driver.page_source)
        # href of every job title link on the page.
        detail_links = listing.xpath('//div[@class="p_top"]/a/@href')
        for link in detail_links:
            self.parse_url(link)
            # Random pause between detail pages.
            time.sleep(random.randint(3, 6))

    def parse_url(self, style):
        """Open one job detail page in a new tab, scrape it and close the tab.

        Args:
            style: URL of the job detail page.

        Every text node of the description paragraphs is printed followed by a
        separator line, and a dict with the extracted XPath results (each a
        list of text nodes) is appended to ``self.list``.

        Raises:
            IndexError: if no new tab appeared after ``window.open``.
        """
        list_window = self.driver.current_window_handle
        # Pass the URL as a script argument instead of formatting it into the
        # JavaScript source, so quotes in the URL cannot break the script.
        self.driver.execute_script("window.open(arguments[0])", style)
        other_windows = [
            handle for handle in self.driver.window_handles if handle != list_window
        ]
        self.driver.switch_to.window(other_windows[-1])
        try:
            # Give the detail page time to render.
            time.sleep(random.randint(3, 7))
            detail = etree.HTML(self.driver.page_source)
            record = {
                'url': style,
                'company': detail.xpath('//div[@class="job-name"]/h4[@class="company"]/text()'),
                'job': detail.xpath('//div[@class="job-name"]//h1[@class="name"]/text()'),
                'request': detail.xpath('//dd[@class="job_request"]//span/text()'),
                'content': detail.xpath('//div[@class="job-detail"]/p/text()'),
            }
            for job_content in record['content']:
                print(job_content)
                print('-' * 50)
            self.list.append(record)
        finally:
            # Always close the detail tab and return to the list page, even
            # when scraping failed, so the next step acts on the right window.
            self.driver.close()
            self.driver.switch_to.window(list_window)

if __name__ == "__main__":
    # Script entry point: create the crawler, then start crawling immediately.
    qiche = QiChe()
    qiche.run()

```

你可能感兴趣的:(python)