import pandas as pd
import matplotlib.pyplot as plt
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import time
import csv
url = r'https://login.51job.com/login.php?lang=c'

# Evade basic Selenium detection (window.navigator.webdriver) by disabling
# Blink's automation-controlled flag.
opt = Options()
opt.add_argument('--disable-blink-features=AutomationControlled')
web = Chrome(options=opt)
web.get(url)

# --- Log in ---
# .click()/.send_keys() return None, so their results are not assigned.
# Switch from the default login tab to the username/password form.
web.find_element(By.XPATH, '//*[@id="NormalLoginBtn"]/span[3]/a').click()
time.sleep(1)
# Fill in the credentials (replace the placeholder strings with real values).
web.find_element(By.XPATH, '//*[@id="loginname"]').send_keys('你的用户账号')
web.find_element(By.XPATH, '//*[@id="password"]').send_keys('你的密码')
# Tick the "I have read the agreement" checkbox.
web.find_element(By.XPATH, '//*[@id="isread_em"]').click()
time.sleep(1)
web.find_element(By.XPATH, '//*[@id="login_btn_withPwd"]').click()
# NOTE: 51job may pop up a slider CAPTCHA at this point; the 5-second pause
# leaves time to drag the slider by hand before the script continues.
time.sleep(5)

# --- Search ---
web.maximize_window()
# Type the search keyword ('python' — change to any job title) and submit.
web.find_element(By.XPATH, '//*[@id="kwd"]').send_keys('python')
web.find_element(By.XPATH, '//*[@id="topIndex"]/div/div/i').click()
# Wait for the results page to render.
time.sleep(5)
# --- Scrape result pages ---
# Append mode so repeated runs accumulate into the same CSV; newline='' is
# required by the csv module to avoid blank rows on Windows. The `with`
# block guarantees the file is closed even when scraping stops on an error
# (the original opened the file and could never reach its close() call,
# because the loop below had no exit).
with open('res.csv', 'a', newline='') as f:
    csvwriter = csv.writer(f)
    while True:
        # The result-list container; each job row carries class 'e'.
        job_info = web.find_element(
            By.XPATH, '/html/body/div[2]/div[3]/div/div[2]/div[4]/div[1]'
        ).find_elements(By.CLASS_NAME, 'e')
        for job in job_info:
            try:
                job_name = job.find_element(By.XPATH, './/p[@class="t"]/span[1]').text
                money = job.find_element(By.CLASS_NAME, 'sal').text
                time_update = job.find_element(By.XPATH, './/p[@class="t"]/span[2]').text
                info = job.find_element(By.XPATH, './/p[@class="info"]/span[2]').text
                print(job_name, money, time_update, info)
                csvwriter.writerow([job_name, money, time_update, info])
            except Exception as e:
                # Some rows (ads, section headers) lack the expected fields;
                # log and skip them rather than aborting the page.
                print(e)
        try:
            # Click the "next page" arrow. When it is missing (last page) or
            # the session dies, stop cleanly instead of crashing with the
            # CSV file still open.
            web.find_element(By.CLASS_NAME, 'j_page').find_element(By.CLASS_NAME, 'e_icons').click()
        except Exception as e:
            print(e)
            break
        time.sleep(1)
web.close()
# 还在继续爬,已经爬了16000多条信息,静静等着51job把我ip给屏蔽了。。。。。
# 程序处理的不美观,能用就行,别介意,哈哈哈。