Scraping Lagou

Table of contents

  • Scraping with requests
  • Scraping with selenium

Scraping with requests

import requests
import time
import random

from lxml import etree

User_Agent = [
	'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
	'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36',
	'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36',
	'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
	'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
	'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
	'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)',
	'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; InfoPath.3)',
	'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E) QQBrowser/6.9.11079.201',
	'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)'
	]
headers = {
	'User-Agent': random.choice(User_Agent),
	'Accept': 'application/json, text/javascript, */*; q=0.01',
	'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
}
data = {
	'first': 'true',  # 'true' only on the first page
	'pn': 1,          # page number
	'kd': 'python'    # search keyword
}
# original listing-page URL
url = 'https://www.lagou.com/jobs/list_python/p-city_4?px=default#filterBox'
# URL taken from the positionAjax request
url_post = 'https://www.lagou.com/jobs/positionAjax.json?px=default&city=%E5%A4%A9%E6%B4%A5&needAddtionalResult=false'
s = requests.Session()
s.get(url, headers=headers)
# cookies obtained by visiting the original URL; the Ajax request needs them
cookies = s.cookies
# Tianjin currently has four pages of Python listings
for i in range(1,5):
	data['pn'] = i
	if i > 1:
		# from the second page on (pn > 1), first becomes 'false' and the
		# request gains a sid field with a fixed value
		data['first'] = 'false'
		# sid value captured from the positionAjax request
		data['sid'] = 'f2c32c03327c4ce58ac492bfcfb49600'
	req = requests.post(url_post, headers=headers, cookies=cookies, data=data)
	result = req.json()['content']['positionResult']['result']
	print(result)
	for res in result:
		# build a fresh dict per job so one posting doesn't overwrite the last
		job_information = {}
		positionId = res['positionId']
		positionUrl = 'https://www.lagou.com/jobs/%d.html' % positionId
		response = requests.get(positionUrl, headers=headers, cookies=cookies)
		html = etree.HTML(response.content.decode('utf8'))
		title = html.xpath('//div[@class="position-content "]//div[@class="job-name"]/@title')
		job_information['title'] = title[0] if title else ''
		# the spans under job_request: salary / city / experience / education / job_time
		spans = html.xpath('//div[@class="position-content "]//dd[@class="job_request"]//span/text()')
		salary = spans[0].replace('/','').strip()
		city = spans[1].replace('/','').strip()
		experience = spans[2].replace('/','').strip()
		education = spans[3].replace('/','').strip()
		job_time = spans[4].replace('/','').strip()
		job_information['salary'] = salary
		job_information['city'] = city
		job_information['experience'] = experience
		job_information['education'] = education
		job_information['job_time'] = job_time
		job_type = html.xpath('//div[@class="position-head"]//li[@class="labels"]/text()')[0]
		welfare = html.xpath('//dl[@id="job_detail"]//dd[@class="job-advantage"]//p/text()')[0]
		dutys = html.xpath('//dl[@id="job_detail"]//dd[@class="job_bt"]//p/text()')
		job_duty = '\n'.join(dutys)
		job_information['job_type'] = job_type
		job_information['welfare'] = welfare
		job_information['job_duty'] = job_duty
		addrs = html.xpath('//dl[@id="job_detail"]//dd[@class="job-address clearfix"]/div[@class="work_addr"]/a/text()')
		# the last element of addrs is always the map link, so drop it
		addr = ''.join(addrs[:-1])
		job_information['addr'] = addr
		print(job_information)
		time.sleep(random.randint(5,10))
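If the cookies go stale or requests arrive too quickly, the positionAjax endpoint tends to answer with an anti-crawl JSON body that has no content key, and req.json()['content'] then raises a KeyError. A minimal retry sketch (the helper name fetch_page and the retry policy are my own; it reuses the url, url_post and headers defined above):

def fetch_page(s, data, retries=3):
	# refresh the cookies from the listing page, then retry the Ajax call;
	# a missing 'content' key is treated as a sign we were blocked
	for _ in range(retries):
		s.get(url, headers=headers)
		req = s.post(url_post, headers=headers, data=data)
		body = req.json()
		if 'content' in body:
			return body['content']['positionResult']['result']
		time.sleep(random.randint(5, 10))
	raise RuntimeError('still blocked after %d retries' % retries)

Because the Session keeps its own cookie jar, passing cookies= by hand is unnecessary when both the GET and the POST go through s.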

Scraping with selenium

import time
import random
import json

from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

class LaGou(object):

	def __init__(self):
		# raw string so the backslashes in the Windows path aren't treated as escapes
		self.driver_path = r'D:\downloads\chromedriver_win32\chromedriver.exe'
		self.driver = webdriver.Chrome(executable_path=self.driver_path)
		# self.job_info_list = []

	def page_url(self):
		url = 'https://www.lagou.com/jobs/list_python/p-city_4?px=default#filterBox'
		self.driver.get(url)
		while True:
			page_source = self.driver.page_source
			html = etree.HTML(page_source)
			# self.job_information = {}
			# every page except the last one has 15 job links
			links = html.xpath('//a[@class="position_link"]/@href')
			for link in links:
				self.job_information = {}
				# open the job detail page in a new tab
				self.driver.execute_script('window.open("%s")' % link)
				# switch to the newest tab (the one just opened) so that
				# driver.close() below closes it rather than the listing page
				self.driver.switch_to.window(self.driver.window_handles[1])
				source = self.driver.page_source
				self.job_info_onepage(source)
				self.driver.close()
				# switch back to the listing tab; otherwise opening the next
				# new tab raises an error
				self.driver.switch_to.window(self.driver.window_handles[0])
			# class="pager_next pager_next_disabled" only appears on the last
			# page, so stop the loop once it shows up
			if 'class="pager_next pager_next_disabled"' in page_source:
				break
			# the trailing space in "pager_next " matches the page source exactly
			next_pageBtn = self.driver.find_element(By.XPATH, '//span[@class="pager_next "]')
			# a JS ad popup on the first page can intercept next_pageBtn.click(),
			# so click through execute_script instead; in the JS snippet,
			# arguments[0] refers to the first argument, i.e. next_pageBtn
			self.driver.execute_script('arguments[0].click();', next_pageBtn)
			time.sleep(random.randint(1,3))

	def job_info_onepage(self,source):
		page_html = etree.HTML(source)
		title = page_html.xpath('//div[@class="job-name"]/@title')
		self.job_information['title'] = title[0] if title else ''
		spans = page_html.xpath('//dd[@class="job_request"]//span/text()')
		salary = spans[0].replace('/', '').strip()
		city = spans[1].replace('/', '').strip()
		experience = spans[2].replace('/', '').strip()
		education = spans[3].replace('/', '').strip()
		job_time = spans[4].replace('/', '').strip()
		self.job_information['salary'] = salary
		self.job_information['city'] = city
		self.job_information['experience'] = experience
		self.job_information['education'] = education
		self.job_information['job_time'] = job_time
		welfare = page_html.xpath('//dl[@id="job_detail"]//dd[@class="job-advantage"]//p/text()')[0]
		dutys = page_html.xpath('//dl[@id="job_detail"]//dd[@class="job_bt"]//p/text()')
		job_duty = ''.join(dutys)
		self.job_information['welfare'] = welfare
		self.job_information['job_duty'] = job_duty
		addrs = page_html.xpath('//dl[@id="job_detail"]//dd[@class="job-address clearfix"]/div[@class="work_addr"]/a/text()')
		# the last element of addrs is always the map link, so drop it
		addr = ''.join(addrs[:-1])
		self.job_information['addr'] = addr
		time.sleep(random.randint(1, 4))
		# one JSON object per line, appended so repeated runs accumulate
		json_job_info = json.dumps(self.job_information, ensure_ascii=False)
		with open('lagou_tianjin.json', 'a', encoding='utf8') as f:
			f.write(json_job_info)
			f.write('\n')

if __name__ == '__main__':
	lagou = LaGou()
	try:
		lagou.page_url()
	finally:
		# release the browser even if the crawl fails partway through
		lagou.driver.quit()
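WebDriverWait and expected_conditions are imported above but never used; as a hedged alternative to the fixed time.sleep after clicking the next-page button, they can wait until the next page's job links actually exist (the helper name wait_for_links is my own):

def wait_for_links(driver, timeout=10):
	# block until at least one job link is present on the page,
	# instead of sleeping for a guessed fixed interval
	WebDriverWait(driver, timeout).until(
		EC.presence_of_element_located((By.XPATH, '//a[@class="position_link"]')))

Calling wait_for_links(self.driver) right after the execute_script click makes pagination less flaky than a fixed sleep.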
