# Crawl nationwide Java job postings (爬取全国java岗位)
from selenium import webdriver
from lxml import etree
import re
import time
class LagouSpider(object):
    """Selenium-driven crawler for nationwide Java job postings on lagou.com.

    Walks every page of the search-result list, opens each job's detail
    page in a new browser tab, scrapes the fields, and prints each record.
    """

    # Path to the local ChromeDriver binary (Windows machine of the author).
    driver_path = r'D:\chromedriver\chromedriver.exe'

    def __init__(self):
        # NOTE(review): `executable_path` is the Selenium 3 API; Selenium 4
        # deprecates it in favour of a Service object -- confirm installed version.
        self.driver = webdriver.Chrome(executable_path=self.driver_path)
        self.url = 'https://www.lagou.com/jobs/list_java?labelWords=&fromSearch=true&suginput='

    def run(self):
        """Open the listing page and iterate result pages until the
        'next' button is disabled, scraping each page as we go."""
        self.driver.get(self.url)
        while True:
            source = self.driver.page_source
            self._parse_list_page(source)
            time.sleep(3)  # throttle between pages to avoid anti-crawl blocks
            next_btn = self.driver.find_element_by_xpath(
                "//*[@id='s_position_list']/div[2]/div/span[last()]")
            # A disabled "next" button marks the last result page.
            if 'pager_next pager_next_disabled' not in next_btn.get_attribute('class'):
                next_btn.click()
                time.sleep(2)
            else:
                break
        time.sleep(10)
        self.driver.close()

    def _parse_list_page(self, source):
        """Extract every detail-page link from one listing page and scrape each."""
        html = etree.HTML(source)
        links = html.xpath("//a[@class='position_link']/@href")
        for link in links:
            result = self._parse_detail_page(link)  # fetch each detail page
            self._show_or_store(result)             # display / persist the data

    def _parse_detail_page(self, link):
        """Scrape one job detail page (opened in a new tab) and return a
        one-element list containing the position dict."""
        self.driver.execute_script("window.open('%s')" % link)
        self.driver.switch_to.window(self.driver.window_handles[1])
        print('当前页面:%s' % self.driver.current_url)
        time.sleep(6)  # wait for the detail page to render
        source = self.driver.page_source
        html = etree.HTML(source)
        position_name = html.xpath("//div[@class='job-name']/@title")[0]
        company = html.xpath("//*[@id='job_company']/dt/a/div/h3/em/text()")[0].strip()
        temp = []
        result = []
        # The five <span>s hold salary / address / work_year / education / desc.
        for i in range(1, 6):
            xpath_url = "/html/body/div[4]/div/div[1]/dd/h3/span[%d]/text()" % i
            item = html.xpath(xpath_url)[0]
            temp.append(re.sub(r'/', '', item))
        # Build the record only after all five fields are collected; building
        # it inside the loop would raise IndexError on temp[1]..temp[4].
        position = {
            'name': position_name, 'company': company, 'salary': temp[0], 'address': temp[1],
            'work_year': temp[2], 'education': temp[3], 'desc': temp[4]
        }
        result.append(position)
        self.driver.close()  # close the detail tab
        self.driver.switch_to.window(self.driver.window_handles[0])
        return result

    def _show_or_store(self, result):
        """Print each scraped record followed by a visual divider."""
        for i in result:
            print(i)
            print('###' * 40)
if __name__ == '__main__':
    # Script entry point: build the crawler and start scraping.
    spider = LagouSpider()
    spider.run()