Scraping Lagou.com with selenium + requests

Modules to import:

  1. import re # already installed (standard library)
  2. from selenium import webdriver
  3. from lxml import etree
  4. import csv
  5. import time # already installed (standard library)
  6. import requests

Environment requirements:

  1. chromedriver to drive Chrome and simulate real browser behaviour (a minimal launch sketch follows this list)
  2. a Python 3.x interpreter
  • for setting up chromedriver, search this blog for a detailed tutorial
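
If chromedriver is not on your PATH, you can point Selenium at the executable directly. A minimal launch sketch in the Selenium 3 style (the path below is just a placeholder, adjust it to wherever you put chromedriver):

from selenium import webdriver

# executable_path is only needed when chromedriver is not already on PATH
driver = webdriver.Chrome(executable_path="/path/to/chromedriver")
driver.get("https://www.lagou.com")
print(driver.title)  # quick sanity check that Chrome is actually being driven
driver.quit()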

Code:

import requests
from selenium import webdriver
from lxml import etree
import re
import time
import csv


class LagouSpider(object):
    """爬虫类"""
    def __init__(self):
        """初始化, 新建lagou_detail_position.csv文件(类excel文件)来保存数据"""
        self.driver = webdriver.Chrome()
        self.url = "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput="
        # 新建excel文件,保存内容
        headers = ['name', 'company_name', 'salary', 'city', 'work_years', 'education', 'desc']  # 列表
        with open('lagou_detail_position.csv', 'w', newline='', encoding='utf-8') as fp:
            writer = csv.writer(fp)
            writer.writerow(headers)  # 写入头部

    def run(self):
        try:
            self.driver.get(self.url)
            time.sleep(5)
            # the search input WebElement
            inputTag = self.driver.find_element_by_id("keyword")
            # the box comes pre-filled, so clear it first
            inputTag.clear()
            inputTag.send_keys("python爬虫")  # type the job keyword you want to search for here
            # the submit button WebElement
            submitTag = self.driver.find_element_by_id("submit")
            submitTag.click()
            while True:
                # read the number of the current list page
                now_number = self.driver.find_element_by_xpath("//span[@class='pager_is_current']")
                now_number = now_number.get_attribute("page")
                print("*"*30)
                print("Downloading content of page %s\n" % now_number)
                time.sleep(2)
                # grab the list page's HTML and parse it
                source = self.driver.page_source
                self.parse_list_page(source)
                print("Finished downloading page %s" % now_number)
                # the "next page" button WebElement
                next_btn = self.driver.find_element_by_xpath("//div[@class='pager_container']/span[last()]")
                # if there is no next page, leave the loop
                if "pager_next pager_next_disabled" in next_btn.get_attribute("class"):
                    break
                next_btn.click()
        finally:
            time.sleep(2)
            self.driver.quit()

    def parse_list_page(self, source):
        """获取列表页中各个详情页的内容"""
        html = etree.HTML(source)
        links = html.xpath("//a[@class='position_link']/@href")
        for link in links:  # visit each position's detail page in turn
            print(link)
            self.request_detail_page(link)
            time.sleep(4)

    def request_detail_page(self, url):
        """跳转到详情页页面"""
        # 重新打开个窗口
        window_open = "window.open('"+url+"')"
        self.driver.execute_script(window_open)
        # 跳转到新的窗口
        self.driver.switch_to_window(self.driver.window_handles[1])
        source = self.driver.page_source
        self.parse_detail_page(source)
        # 关闭详情页的窗口
        self.driver.close()
        # 调回至列表页的窗口
        self.driver.switch_to_window(self.driver.window_handles[0])

    def parse_detail_page(self, source):
        """详情页提取具体内容"""
        # 将source转换成可以xpath的element对象
        html = etree.HTML(source)
        # 职位名称
        position_name = html.xpath("//div[@class='job-name']/@title")[0]
        # 公司名称
        company_name = html.xpath("//img[@class='b2']/@alt")[0]
        # 岗位需求
        job_request_spans = html.xpath("//dd[@class='job_request']//span")
        # 工资
        salary = job_request_spans[0].xpath(".//text()")[0].strip()
        # 所在城市
        city = job_request_spans[1].xpath(".//text()")[0].strip()
        city = re.sub(r"[\s/]", "", city)
        # 需要的工作经验
        work_years = job_request_spans[2].xpath(".//text()")[0].strip()
        work_years = re.sub(r"[\s\/]", "", work_years).strip()
        # 需要的教育程度
        education = job_request_spans[3].xpath(".//text()")[0].strip()
        education = re.sub(r"[\s/]", "", education)
        # 岗位描述
        desc = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip()
        desc = re.sub(r"[\n\s]", "", desc)
        position = [position_name, company_name, salary, city, work_years, education, desc]
        self.save_data(position)

    def save_data(self, position):
        # save the scraped data
        with open('lagou_detail_position.csv', 'a', newline='', encoding='utf-8') as fp:
            writer = csv.writer(fp)
            writer.writerow(position)  # append one row per position


if __name__ == "__main__":
    LagouSpider().run()


I won't include a screenshot of the results here. The time.sleep() calls are mandatory (or you can use an implicit wait, covered below); otherwise the requests come too fast and Lagou will flag you as a crawler, so keep at least 5 seconds between pages. To be safe it is also worth using a proxy IP. Kuaidaili (快代理) offers free proxies; being refused a page is one thing, but having your IP banned is much worse.
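
For example, instead of the fixed time.sleep(4) between detail pages, a randomized delay of at least 5 seconds makes the traffic look less mechanical. A small sketch (the 5-8 second range is only an assumption, tune it as needed):

import random
import time

# sleep a random 5-8 seconds between requests so the interval never drops below 5 s
time.sleep(random.uniform(5, 8))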


Setting a proxy:

from selenium import webdriver
options = webdriver.ChromeOptions()
options.add_argument("--proxy-server=http://115.218.216.110:9000")  # sometimes https proxies fail here, http works
driver = webdriver.Chrome(chrome_options=options)
driver.get("http://httpbin.org/ip")
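
If the proxy is working, http://httpbin.org/ip will report the proxy's address instead of your own IP, which is a quick way to verify the setting before pointing the spider at Lagou.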

Implicit wait:

Call driver.implicitly_wait(10). Before giving up on an element that is not yet available, the driver will then wait up to 10 seconds for it to appear.
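
A minimal sketch of the implicit wait; it applies to every element lookup made through this driver:

from selenium import webdriver

driver = webdriver.Chrome()
driver.implicitly_wait(10)  # each find_element call now waits up to 10 s before raising NoSuchElementException
driver.get("https://www.lagou.com")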

Explicit wait:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# driver is the webdriver.Chrome() instance created above
try:
	element = WebDriverWait(driver, 10).until(
		EC.presence_of_element_located((By.ID, "myDyname"))
	)
finally:
	driver.quit()
"""
presence_of_element_located:某个元素已经加载完毕
presence_of_all_element_located:网页中所有满足条件的元素都加载完毕了。
"""
