Table of Contents
- Scraping with requests
- Scraping with selenium
Scraping with requests

Lagou serves its job list through an Ajax endpoint and rejects POSTs that arrive without fresh cookies, so the script first visits the list page with a requests.Session to pick up those cookies, then POSTs to positionAjax.json for each page of results and fetches every position's detail page:
```python
import time
import random

import requests
from lxml import etree

# Pool of User-Agent strings; one is picked at random for this run
User_Agent = [
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; InfoPath.3)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E) QQBrowser/6.9.11079.201',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)',
]

headers = {
    'User-Agent': random.choice(User_Agent),
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
}

# Form data for the Ajax endpoint: pn is the page number, kd the search keyword
data = {
    'first': 'true',
    'pn': 1,
    'kd': 'python',
}

url = 'https://www.lagou.com/jobs/list_python/p-city_4?px=default#filterBox'
url_post = 'https://www.lagou.com/jobs/positionAjax.json?px=default&city=%E5%A4%A9%E6%B4%A5&needAddtionalResult=false'

# Hit the list page first so the session collects the cookies the Ajax endpoint requires
s = requests.Session()
s.get(url, headers=headers)
cookies = s.cookies

job_information = {}
for page in range(1, 5):
    data['pn'] = page
    if page > 1:
        data['first'] = 'false'
        data['sid'] = 'f2c32c03327c4ce58ac492bfcfb49600'
    req = requests.post(url_post, headers=headers, cookies=cookies, data=data)
    result = req.json()['content']['positionResult']['result']
    print(result)
    for res in result:
        # Each result carries a positionId from which the detail-page URL is built
        positionId = res['positionId']
        positionUrl = 'https://www.lagou.com/jobs/%d.html' % positionId
        response = requests.get(positionUrl, headers=headers, cookies=cookies)
        html = etree.HTML(response.content.decode('utf8'))
        title = html.xpath('//div[@class="position-content "]//div[@class="job-name"]/@title')
        print(title)
        # The job_request spans hold, in order: salary, city, experience, education, job time
        sss = html.xpath('//div[@class="position-content "]//dd[@class="job_request"]//span/text()')
        print(sss)
        job_information['salary'] = sss[0].replace('/', '').strip()
        job_information['city'] = sss[1].replace('/', '').strip()
        job_information['experience'] = sss[2].replace('/', '').strip()
        job_information['education'] = sss[3].replace('/', '').strip()
        job_information['job_time'] = sss[4].replace('/', '').strip()
        job_type = html.xpath('//div[@class="position-head"]//li[@class="labels"]/text()')[0]
        welfare = html.xpath('//dl[@id="job_detail"]//dd[@class="job-advantage"]//p/text()')[0]
        dutys = html.xpath('//dl[@id="job_detail"]//dd[@class="job_bt"]//p/text()')
        job_duty = ''
        for duty in dutys:
            job_duty += duty + '\n'
        job_information['job_type'] = job_type
        job_information['welfare'] = welfare
        job_information['job_duty'] = job_duty
        addrs = html.xpath('//dl[@id="job_detail"]//dd[@class="job-address clearfix"]/div[@class="work_addr"]/a/text()')
        # Join all but the last <a>, which is the "view map" link rather than part of the address
        job_information['addr'] = ''.join(addrs[:-1])
        time.sleep(random.randint(5, 10))  # throttle so the anti-crawler checks are less likely to trip
        print(job_information)
```
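
This script only prints each job_information dict. A minimal way to persist the records, mirroring what the selenium version below does (the file name lagou_tianjin.json is just that script's choice), is to append each one as a line of JSON:

```python
import json

def save_job(job_information, path='lagou_tianjin.json'):
    # One JSON object per line (JSON Lines), appended as jobs are scraped
    with open(path, 'a', encoding='utf8') as f:
        f.write(json.dumps(job_information, ensure_ascii=False))
        f.write('\n')
```

Calling save_job(job_information) right after the final print keeps one record per detail page.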
Scraping with selenium

The selenium version drives a real Chrome instance instead of calling the Ajax endpoint directly: it pages through the search results, opens each posting in a new tab, hands the rendered page source to lxml for parsing, and appends every record to lagou_tianjin.json.
```python
import time
import random
import json

from lxml import etree
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By


class LaGou(object):
    def __init__(self):
        # Raw string so the backslashes in the Windows path are not read as escape sequences
        self.driver_path = r'D:\downloads\chromedriver_win32\chromedriver.exe'
        self.driver = webdriver.Chrome(service=Service(self.driver_path))

    def page_url(self):
        url = 'https://www.lagou.com/jobs/list_python/p-city_4?px=default#filterBox'
        self.driver.get(url)
        while True:
            page_source = self.driver.page_source
            html = etree.HTML(page_source)
            links = html.xpath('//a[@class="position_link"]/@href')
            for link in links:
                self.job_information = {}
                # Open the detail page in a new tab, scrape it, then return to the list tab
                self.driver.execute_script('window.open("%s")' % link)
                self.driver.switch_to.window(self.driver.window_handles[1])
                source = self.driver.page_source
                self.job_info_onepage(source)
                self.driver.close()
                self.driver.switch_to.window(self.driver.window_handles[0])
            # The "next page" button gets a disabled class on the last page
            if 'class="pager_next pager_next_disabled"' in page_source:
                break
            next_pageBtn = self.driver.find_element(By.XPATH, '//span[@class="pager_next "]')
            # Click through JavaScript to dodge "element not interactable" errors
            self.driver.execute_script('arguments[0].click();', next_pageBtn)
            time.sleep(random.randint(1, 3))

    def job_info_onepage(self, source):
        page_html = etree.HTML(source)
        title = page_html.xpath('//div[@class="job-name"]/@title')
        self.job_information['title'] = title[0] if title else ''
        # The job_request spans hold, in order: salary, city, experience, education, job time
        sss = page_html.xpath('//dd[@class="job_request"]//span/text()')
        self.job_information['salary'] = sss[0].replace('/', '').strip()
        self.job_information['city'] = sss[1].replace('/', '').strip()
        self.job_information['experience'] = sss[2].replace('/', '').strip()
        self.job_information['education'] = sss[3].replace('/', '').strip()
        self.job_information['job_time'] = sss[4].replace('/', '').strip()
        welfare = page_html.xpath('//dl[@id="job_detail"]//dd[@class="job-advantage"]//p/text()')[0]
        dutys = page_html.xpath('//dl[@id="job_detail"]//dd[@class="job_bt"]//p/text()')
        self.job_information['welfare'] = welfare
        self.job_information['job_duty'] = ''.join(dutys)
        addrs = page_html.xpath('//dl[@id="job_detail"]//dd[@class="job-address clearfix"]/div[@class="work_addr"]/a/text()')
        # Join all but the last <a>, which is the "view map" link rather than part of the address
        self.job_information['addr'] = ''.join(addrs[:-1])
        time.sleep(random.randint(1, 4))
        # Append the record as one JSON object per line
        json_job_info = json.dumps(self.job_information, ensure_ascii=False)
        with open('lagou_tianjin.json', 'a', encoding='utf8') as f:
            f.write(json_job_info)
            f.write('\n')


lagou = LaGou()
lagou.page_url()
```
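
Because the output file holds one JSON object per line rather than a single JSON document, it should be read back line by line. A quick sanity check on the file produced above (a sketch; the field names follow the dict built in job_info_onepage):

```python
import json

with open('lagou_tianjin.json', encoding='utf8') as f:
    jobs = [json.loads(line) for line in f if line.strip()]

print('scraped %d jobs' % len(jobs))
for job in jobs[:3]:
    print(job['salary'], job['city'], job['education'])
```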