前言:并非所有动态页面的数据都来自Ajax接口,有些内容是由JavaScript在浏览器端计算或渲染生成的,还有些Ajax接口的请求参数很复杂(例如经过加密),难以直接构造请求。
Selenium(可见即可爬):
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as Ec
from selenium.webdriver.support.wait import WebDriverWait

# Search Baidu for "Python", wait for the result container to be present,
# then print the resulting URL, the cookies and the page source.
browser = webdriver.Chrome()
try:
    browser.get('http://www.baidu.com/')
    # find_element(By.ID, ...) is used instead of find_element_by_id(),
    # which newer Selenium releases no longer provide.
    search_box = browser.find_element(By.ID, 'kw')
    search_box.send_keys('Python')
    search_box.send_keys(Keys.ENTER)
    # Explicit wait with a 10 second timeout for the results node.
    wait = WebDriverWait(browser, 10)
    wait.until(Ec.presence_of_element_located((By.ID, 'content_left')))
    print(browser.current_url)
    print(browser.get_cookies())
    print(browser.page_source)
finally:
    # Always close the browser, even if a lookup or the wait fails.
    browser.quit()
声明浏览器对象:
from selenium import webdriver
# Each assignment below rebinds `browser` to a driver for a different
# browser; they are shown side by side as alternatives.
# Google Chrome
browser = webdriver.Chrome()
# Mozilla Firefox
browser = webdriver.Firefox()
# Microsoft Edge
browser = webdriver.Edge()
# PhantomJS, the browser without a visible window
# NOTE(review): PhantomJS support appears to have been dropped from newer
# Selenium releases — confirm against the installed version.
browser = webdriver.PhantomJS()
# Apple Safari
browser = webdriver.Safari()
访问页面:
使用get()方法请求网页,参数传入URL链接
# Import the webdriver module
from selenium import webdriver

# Load the JD home page and print its HTML source.
browser = webdriver.Chrome()
try:
    # Request the page
    browser.get('https://www.jd.com/')
    # Print the page source
    print(browser.page_source)
finally:
    # Close the browser even when loading the page fails, so that no
    # driver process is left running.
    browser.quit()
获取节点:
find_element_by_name() 根据name的值来获取
find_element_by_id() 根据id来获取
另外也可以通过Xpath,CSS选择器来获取:
# Import the webdriver module and the By locator strategies
from selenium import webdriver
from selenium.webdriver.common.by import By

# Locate the same search box on Taobao three ways: by id, by CSS selector
# and by XPath. find_element(By.<strategy>, value) is used because the
# find_element_by_* helpers are not available in newer Selenium releases.
browser = webdriver.Chrome()
try:
    browser.get('https://www.taobao.com/')
    input_first = browser.find_element(By.ID, 'q')
    input_second = browser.find_element(By.CSS_SELECTOR, '#q')
    input_third = browser.find_element(By.XPATH, '//*[@id="q"]')
    print(input_first, input_second, input_third)
finally:
    # Close the browser even if one of the lookups fails.
    browser.quit()
获取单个节点的方法(注意:以下 find_element_by_* 方法在 Selenium 4.3 及之后的版本中已被移除,新版本请使用 find_element(By.XXX, value)):
find_element_by_id
find_element_by_name
find_element_by_xpath
find_element_by_link_text
find_element_by_partial_link_text
find_element_by_tag_name
find_element_by_class_name
find_element_by_css_selector
Selenium提供了通用方法find_element(),使用的时候需要传入两个参数:查找方式By和对应的值:
# Import the webdriver module and the By locator strategies
from selenium import webdriver
from selenium.webdriver.common.by import By

# The generic find_element() call takes the lookup strategy and the value
# to match; both are kept together here as a locator tuple.
search_locator = (By.ID, 'q')

browser = webdriver.Chrome()
browser.get('https://www.taobao.com/')
input_first = browser.find_element(*search_locator)
print(input_first)
browser.quit()
查找多个节点:find_elements()
# Import the webdriver module and the By locator strategies
from selenium import webdriver
from selenium.webdriver.common.by import By

# Collect every <li> under the .service-bd node on the Taobao home page.
browser = webdriver.Chrome()
browser.get('https://www.taobao.com/')
# The plural find_elements() is required here: the singular find_element()
# only yields the first matching node instead of all of them.
lis = browser.find_elements(By.CSS_SELECTOR, '.service-bd li')
print(lis)
browser.quit()
所有获取多个节点的方法(注意:以下 find_elements_by_* 方法在 Selenium 4.3 及之后的版本中已被移除,新版本请使用 find_elements(By.XXX, value)):
find_elements_by_id
find_elements_by_name
find_elements_by_xpath
find_elements_by_link_text
find_elements_by_partial_link_text
find_elements_by_tag_name
find_elements_by_class_name
find_elements_by_css_selector
替代写法:
# Alternative generic form: pass the By strategy and the selector to find_elements().
lis = browser.find_elements(By.CSS_SELECTOR, '.service-bd li')
节点交互:
输入文字,清空等操作
# Import the webdriver module, the By locator strategies and time
import time

from selenium import webdriver
from selenium.webdriver.common.by import By

# Type into Taobao's search box, clear it, type a new query and click the
# search button.
browser = webdriver.Chrome()
browser.get('https://www.taobao.com/')
# Named search_box rather than `input` so the builtin input() is not shadowed.
search_box = browser.find_element(By.ID, 'q')
search_box.send_keys('U盘')
time.sleep(1)  # brief pause before the box is cleared
search_box.clear()
search_box.send_keys('ipad')
button = browser.find_element(By.CLASS_NAME, 'search-button')
button.click()
动作链的操作:
# Import the webdriver module, ActionChains and the By locator strategies
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By

# Drag the #draggable node onto the #droppable node inside the demo iframe.
browser = webdriver.Chrome()
url = 'https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable'
browser.get(url)
# Switch into the iframeResult frame before looking up the nodes
browser.switch_to.frame('iframeResult')
# find_element(By.CSS_SELECTOR, ...) is used instead of the
# find_element_by_css_selector() helper, which newer Selenium releases lack.
source = browser.find_element(By.CSS_SELECTOR, '#draggable')
target = browser.find_element(By.CSS_SELECTOR, '#droppable')
# Create the action chain object
actions = ActionChains(browser)
# Queue the drag-and-drop movement
actions.drag_and_drop(source, target)
# Execute the queued actions
actions.perform()
执行JavaScript:
例子:用JavaScript把页面滚动条下拉到底部
# Import the webdriver module
from selenium import webdriver

# JavaScript snippets to run in the page, in order: scroll to the bottom of
# the document, then pop up an alert announcing it.
SCROLL_TO_BOTTOM = 'window.scrollTo(0, document.body.scrollHeight)'
SHOW_ALERT = 'alert("To Bottom")'

browser = webdriver.Chrome()
browser.get('https://cuiqingcai.com/5630.html')
for script in (SCROLL_TO_BOTTOM, SHOW_ALERT):
    browser.execute_script(script)
获取属性:
# Import the webdriver module, ActionChains and the By locator strategies
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By

# Look up the logo node on Zhihu's explore page and print its class attribute.
browser = webdriver.Chrome()
url = 'https://www.zhihu.com/explore'
browser.get(url)
# Find the logo node by its id; find_element(By.ID, ...) is used instead of
# find_element_by_id(), which newer Selenium releases no longer provide.
logo = browser.find_element(By.ID, 'zh-top-link-logo')
print(logo)
# Print the value of the node's class attribute
print(logo.get_attribute('class'))
browser.quit()
获取文本值的方法:
# Import the webdriver module, ActionChains and the By locator strategies
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By

# Print the text of the first node with class "post-link" on Zhihu's
# explore page.
browser = webdriver.Chrome()
url = 'https://www.zhihu.com/explore'
browser.get(url)
# Look the node up by its class name. It is named post_link rather than
# `input` so the builtin input() is not shadowed.
post_link = browser.find_element(By.CLASS_NAME, 'post-link')
print(post_link.text)
browser.quit()
获取节点id、位置、标签名、大小:
# Import the webdriver module, ActionChains and the By locator strategies
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By

# Print the id, location, tag name and size of the "zu-top-add-question"
# node on Zhihu's explore page.
browser = webdriver.Chrome()
url = 'https://www.zhihu.com/explore'
browser.get(url)
# Named ask_button rather than `input` so the builtin input() is not
# shadowed; find_element(By.CLASS_NAME, ...) replaces the
# find_element_by_class_name() helper missing from newer Selenium releases.
ask_button = browser.find_element(By.CLASS_NAME, 'zu-top-add-question')
print(ask_button.id)
print(ask_button.location)
print(ask_button.tag_name)
print(ask_button.size)
browser.quit()
切换Frame:
在子Frame 和 父Frame进行切换操作,找到自己需要的信息
# Import the webdriver module, the By locator strategies and the exception
# raised when a node cannot be found
import time

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By

# Look for the "logo" node first inside the child frame, then in the parent
# frame after switching back.
browser = webdriver.Chrome()
url = 'https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable'
browser.get(url)
# Enter the child frame; switch_to.frame() is used instead of the
# deprecated switch_to_frame() helper.
browser.switch_to.frame('iframeResult')
try:
    logo = browser.find_element(By.CLASS_NAME, 'logo')
except NoSuchElementException:
    # Only a missing node is expected here; any other error should surface
    # instead of being swallowed by a bare except.
    print('No Logo')
# Switch back to the parent frame
browser.switch_to.parent_frame()
logo = browser.find_element(By.CLASS_NAME, 'logo')
print(logo)
print(logo.text)
browser.quit()
延时等待:
通过隐式等待或显式等待来确保需要的节点已经加载出来
隐式等待:
# Import the webdriver module and the By locator strategies
from selenium import webdriver
from selenium.webdriver.common.by import By

browser = webdriver.Chrome()
# Implicit wait: set a 10 second implicit wait on the driver before any
# lookup is made.
browser.implicitly_wait(10)
url = 'https://www.zhihu.com/explore'
browser.get(url)
# Named question rather than `input` so the builtin input() is not
# shadowed; find_element(By.CLASS_NAME, ...) replaces the
# find_element_by_class_name() helper missing from newer Selenium releases.
question = browser.find_element(By.CLASS_NAME, 'question_link')
print(question)
browser.quit()
显式等待:
指定要查找的节点,并指定一个最长等待时间。如果在规定时间内加载出了这个节点,就返回该节点;如果到了规定时间仍然没有加载出来,则抛出超时异常(TimeoutException)
# Import the webdriver module, the By locator strategies and the explicit
# wait helpers
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

browser = webdriver.Chrome()
try:
    url = 'https://www.taobao.com/'
    browser.get(url)
    # Create a WebDriverWait with a maximum wait of 10 seconds
    wait = WebDriverWait(browser, 10)
    # until() takes the condition to wait for (not a duration): here, the
    # node with id "q" being present in the page.
    search_input = wait.until(EC.presence_of_element_located((By.ID, 'q')))
    # Wait until the search button is clickable
    button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.btn-search')))
    print(search_input, button)
finally:
    # Close the browser whether or not the waits succeed, so that no driver
    # process is left running.
    browser.quit()
浏览器的前进后退:
import time
from selenium import webdriver

# Pages visited in order so that the browser has history to move through.
PAGES = (
    'https://www.baidu.com/',
    'https://www.python.org/',
    'https://www.taobao.com/',
)

browser = webdriver.Chrome()
for page_url in PAGES:
    browser.get(page_url)
# Go back one page
browser.back()
time.sleep(1)
# Go forward again
browser.forward()
browser.quit()
获取cookies:
from selenium import webdriver

browser = webdriver.Chrome()
browser.get('https://www.zhihu.com/explore')
# Once the page has loaded, print its cookies
print(browser.get_cookies())
# Add a cookie described by a dict, then print the cookies again
extra_cookie = {'name': 'name', 'domain': 'www.zhihu.com', 'value': 'germey'}
browser.add_cookie(extra_cookie)
print(browser.get_cookies())
# Delete all cookies and print what is left
browser.delete_all_cookies()
print(browser.get_cookies())
browser.quit()
选项卡管理:
对开启的选项卡进行管理
from selenium import webdriver
import time

browser = webdriver.Chrome()
browser.get('https://www.baidu.com/')
# Run a script that opens a new tab
browser.execute_script('window.open()')
print(browser.window_handles)
# Switch to the second tab; switch_to.window() is used instead of the
# deprecated switch_to_window() helper.
browser.switch_to.window(browser.window_handles[1])
browser.get('https://www.jd.com/')
time.sleep(1)
# Switch back to the first tab
browser.switch_to.window(browser.window_handles[0])
browser.get('https://www.python.org/')
异常处理:
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.common.by import By

# Show how to catch a page-load timeout and a failed node lookup, and how
# to make sure the browser is closed afterwards.
browser = webdriver.Chrome()
try:
    browser.get('https://www.baidu.com/')
except TimeoutException:
    print('Time out')
try:
    # Looks up id "hello" to exercise the NoSuchElementException handler
    # below. find_element(By.ID, ...) replaces find_element_by_id(), which
    # newer Selenium releases no longer provide.
    browser.find_element(By.ID, 'hello')
except NoSuchElementException:
    print('No Element')
finally:
    # Close the browser whether or not the lookup succeeded.
    browser.quit()