Explanation of the key code:
1. Use lxml's etree to run XPath queries. XPath is simpler than regular expressions, so there is no need to fall back on BeautifulSoup for locating elements.
from lxml import etree
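A minimal sketch of the etree + XPath pattern used below; the HTML snippet here is made up purely for illustration:
from lxml import etree

source = "<ul><li><a href='/jobs/1.html'>Python 开发工程师</a></li></ul>"
html = etree.HTML(source)            # parse the page source into an element tree
links = html.xpath("//li/a/@href")   # xpath() returns a plain Python list
titles = html.xpath("//li/a/text()")
print(links, titles)                 # ['/jobs/1.html'] ['Python 开发工程师']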
2. Visible browser window or not: running Chrome headless only reduces the resources the crawl consumes.
opt=webdriver.ChromeOptions()
# Put Chrome into headless mode; works the same way on Windows and Linux
opt.add_argument('--headless')  # headless; set_headless() is deprecated in newer Selenium
self.driver=webdriver.Chrome(options=opt)
3. Wait until the element located by the XPath has loaded, then scrape the page.
# Roughly: give the driver at most 20s; as soon as the element located by the XPath is present, stop waiting and start scraping the page content
WebDriverWait(driver=self.driver,timeout=20).until(EC.presence_of_element_located((By.XPATH,'//*[@id="s_position_list"]/div[2]/div/a[6]')))
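Note that the expected_conditions helpers take a single (By, selector) locator tuple, not two separate arguments. A minimal standalone sketch of the wait pattern, using the same URL and a shortened XPath from the spider below:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

driver = webdriver.Chrome()
driver.get("https://www.lagou.com/zhaopin/Python/")
try:
    # the locator is one tuple: (By.XPATH, '...')
    WebDriverWait(driver, timeout=20).until(
        EC.presence_of_element_located((By.XPATH, '//*[@id="s_position_list"]'))
    )
    print("list loaded, safe to read driver.page_source")
except TimeoutException:
    print("the element did not appear within 20 seconds")
finally:
    driver.quit()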
4. For a detailed explanation, with examples, of Python's join(), strip() and split() functions, see my other blog post.
content = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip()
'''
Syntax: 'sep'.join(seq)
Parameters:
sep: the separator; it may be an empty string
seq: the sequence of elements to join - a string, list, tuple or dict
Meaning: join every element of seq into one new string, using sep as the separator
Return value: the string produced by joining the elements with sep
'''
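A quick illustration of the three functions on made-up sample values of the kind scraped below:
fields = ['Python', '开发工程师', '北京']
print('-'.join(fields))               # 'Python-开发工程师-北京'
print('  15k-25k /'.strip())          # '15k-25k /'  (strip only removes leading/trailing whitespace)
print('经验3-5年 /本科'.split('/'))     # ['经验3-5年 ', '本科']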
from selenium import webdriver
import lxml
from lxml import etree
import re
import time
import pymysql
import urllib.request
import requests
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
class LagouSpider(object):
def __init__(self):
opt=webdriver.ChromeOptions()
# Put Chrome into headless mode; works the same way on Windows and Linux
opt.add_argument('--headless')  # headless; set_headless() is deprecated in newer Selenium
self.driver=webdriver.Chrome(options=opt)
self.url="https://www.lagou.com/zhaopin/Python/"
def run(self):
self.driver.get(self.url)
while True:
source = self.driver.page_source
# Roughly: give the driver at most 20s; as soon as the element located by the XPath is present, stop waiting and start scraping the page content
WebDriverWait(driver=self.driver,timeout=20).until(EC.presence_of_element_located((By.XPATH,'//*[@id="s_position_list"]/div[2]/div/a[6]')))
self.parse_list_page(source)
# Click the "next page" button
next_btn=self.driver.find_element(By.XPATH, '//*[@id="s_position_list"]/div[2]/div/a[6]')
# Grab the "next page" button; note that its class value contains spaces, so it cannot be used directly for locating.
if "pager_next_disabled" in next_btn.get_attribute("class"):
break
else:
next_btn.click()
time.sleep(1)
# Collect the list of job detail-page URLs
def parse_list_page(self,source):
# Run the XPath query through etree
html=etree.HTML(source)
links=html.xpath('//*[@id="s_position_list"]/ul/li/div[1]/div[1]/div[1]/a/@href')
for link in links:
self.request_detail_page(link)
# print(link)
#time.sleep(1)
# Visit each extracted URL
def request_detail_page(self,url):
#self.driver.get(url)
# Open the detail page in a new tab
self.driver.execute_script("window.open('%s')"%url)
# Switch the window handle to the newly opened tab
self.driver.switch_to.window((self.driver.window_handles[1]))
# Wait until the job title has loaded, then start scraping
WebDriverWait(driver=self.driver, timeout=20).until(
EC.presence_of_element_located((By.XPATH, "//div[@class='job-name']/span[@class='name']")) )
source=self.driver.page_source
self.parse_detail_page(source)
# Close the current detail page and switch back to the list page
self.driver.close()
self.driver.switch_to.window(self.driver.window_handles[0])
#self.parse_list_page(source)
# Extract the individual fields
def parse_detail_page(self,source):
html=etree.HTML(source)
positionName=html.xpath("//div[@class='position-head']/div/div[1]/div/span/text()")[0]
job_request_spans=html.xpath("//div[@class='position-head']/div/div[1]/dd/p[1]/span")
salary=job_request_spans[0].xpath(".//text()")[0].strip()
city=job_request_spans[1].xpath('.//text()')[0].strip()
#city = re.match(r'/(.*?) /',city)
city = re.sub(r"[\s/]", "", city)  # strip whitespace and "/" from the value here
work_years = job_request_spans[2].xpath('.//text()')[0].strip()
work_years = re.sub(r"[\s/]", "", work_years)
education = job_request_spans[3].xpath('.//text()')[0].strip()
education = re.sub(r"[\s/]", "", education)
content = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip()
#print(positionName)
#mysql=MySQLPipeline()
#mysql.process_item(positionName,salary,city,work_years,education,content)
class MySQLPipeline(object):
def __init__(self):
self.conn = pymysql.connect(host="localhost",user="root",password="root",db="lagou", charset='utf8')
self.cursor = self.conn.cursor()
def process_item(self,positionName,salary,city,work_years,education,content):
insert_sql = '''
insert into lagou_table(positionName,salary,city,work_years,education,content)
values(%s,%s,%s,%s,%s,%s)
'''
self.cursor.execute(insert_sql,(positionName,salary,city,work_years,education,content))
self.conn.commit()
def close_spider(self,spider): # keep the spider argument, otherwise: TypeError: close_spider() takes 1 positional argument but 2 were given
self.cursor.close()
self.conn.close()
if __name__=="__main__":
spider=LagouSpider()
spider.run()
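The pipeline assumes a MySQL database named lagou with a lagou_table table already created. The article does not give the table definition, so the schema below is only a plausible sketch; the column names follow the INSERT statement above, while the types and lengths are assumptions:
import pymysql

conn = pymysql.connect(host="localhost", user="root", password="root", db="lagou", charset="utf8")
cursor = conn.cursor()
# Column names match the INSERT in MySQLPipeline; types/lengths are guesses.
cursor.execute("""
CREATE TABLE IF NOT EXISTS lagou_table (
    id INT AUTO_INCREMENT PRIMARY KEY,
    positionName VARCHAR(128),
    salary VARCHAR(64),
    city VARCHAR(64),
    work_years VARCHAR(64),
    education VARCHAR(64),
    content TEXT
) DEFAULT CHARSET = utf8
""")
conn.commit()
cursor.close()
conn.close()
To actually write the scraped fields into MySQL, uncomment the two mysql lines in parse_detail_page.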