Python 3网络爬虫之Selenium用法

selenium的使用

概念:Selenium是一个自动化测试工具,利用它可以驱动浏览器执行特定的动作,同时可以获取浏览器当前呈现的页面的源代码

# End-to-end example: search Baidu for a keyword, wait for the results,
# then print the current URL, the cookies and the page source.
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

browser = webdriver.Chrome()
try:
    browser.get('https://www.baidu.com')
    # Use the By-based locator API, matching the later examples in this file.
    # The element is not called `input` so the builtin is not shadowed.
    search_box = browser.find_element(By.ID, 'kw')
    search_box.send_keys('PYTHON')
    search_box.send_keys(Keys.ENTER)
    # Wait at most 10 seconds for the results container to be present.
    wait = WebDriverWait(browser, 10)
    wait.until(EC.presence_of_element_located((By.ID, 'content_left')))
    print(browser.current_url)
    print(browser.get_cookies())
    print(browser.page_source)
except WebDriverException as exc:
    # Report Selenium failures instead of silently swallowing every error
    # (the original bare `except: pass` also hid misspelled variable names).
    print('Selenium error:', exc)
finally:
    browser.close()

 1.声明浏览器

# Declare the browser: create the driver object that later calls operate on.
from selenium import webdriver
browser=webdriver.Chrome() # alternatives named by the author: Firefox(), Edge(), PhantomJS(), Safari()

 2.访问页面

# Visit a page and print its source.
from selenium import webdriver

browser = webdriver.Chrome()
try:
    # The original URL was 'https:/www.taobao.com' (single slash after the
    # scheme), which is not a valid absolute URL.
    browser.get('https://www.taobao.com')
    print(browser.page_source)
finally:
    # Close the browser window even when loading the page fails.
    browser.close()

 3.查找节点

# Locate the same node three ways: by ID, by CSS selector and by XPath.
from selenium import webdriver
from selenium.webdriver.common.by import By

browser = webdriver.Chrome()
browser.get('https://www.taobao.com')
input_first = browser.find_element(By.ID, 'q')
input_second = browser.find_element(By.CSS_SELECTOR, '#q')
# Double quotes inside the XPath keep the Python string literal valid.
input_third = browser.find_element(By.XPATH, '//*[@id="q"]')
browser.close()

# Single-node lookup: browser.find_element(By.<STRATEGY>, value),
# e.g. browser.find_element(By.ID, 'q').
# Strategies: By.ID, By.NAME, By.XPATH, By.LINK_TEXT, By.PARTIAL_LINK_TEXT,
# By.TAG_NAME, By.CLASS_NAME, By.CSS_SELECTOR.
# The legacy spellings find_element_by_id, find_element_by_name, ... are the
# older equivalents of these lookups.

# Multi-node lookup: browser.find_elements(...) takes the same strategies
# (legacy spellings: find_elements_by_id, find_elements_by_name, ...).
# Fragment: assumes `browser` is an open driver session on the target page.
from selenium.webdriver.common.by import By

lis = browser.find_elements(By.CSS_SELECTOR, '.service-bd li')

4.节点交互

解析:常见动作 send_keys()、clear()、click()

# Fragment: assumes `browser` is an open driver session and that `input`
# already holds the search-box element located earlier.
import time  # no earlier snippet in this file imports `time` before this point
from selenium.webdriver.common.by import By

input.send_keys('iphone')  # type the text "iphone"
time.sleep(1)
input.clear()  # empty the input box
input.send_keys('ipad')
# By-based lookup, matching the later examples in this file.
button = browser.find_element(By.CLASS_NAME, 'btn-search')  # the search button
button.click()  # run the search

5.动作链

解析:鼠标拖曳、键盘按键等,这些动作的执行

# Action chain: drag one element and drop it onto another.
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By

browser = webdriver.Chrome()  # was `web.driver.Chrome()`, a NameError
url = '*'  # placeholder from the original; substitute the demo page URL
browser.get(url)
browser.switch_to.frame('iframeResult')  # enter the frame, addressed by its id
source = browser.find_element(By.CSS_SELECTOR, '#draggable')
target = browser.find_element(By.CSS_SELECTOR, '#droppable')
actions = ActionChains(browser)
actions.drag_and_drop(source, target)  # queue the drag-and-drop
actions.perform()  # execute the queued actions

6.执行JavaScript

解析:执行下拉进度条等动作,直接模拟运行JavaScript

此处用execute_script()方法实现

# Run JavaScript in the page through execute_script().
from selenium import webdriver

browser = webdriver.Chrome()
browser.get('*')  # placeholder from the original; substitute a real URL
# Scroll to the very bottom of the page.
browser.execute_script('window.scroll(0,document.body.scrollHeight)')
# Show an alert box. Double quotes inside the script keep the Python
# string literal valid (the nested single quotes were a SyntaxError).
browser.execute_script('alert("TO Bottom")')

7.获取节点信息

解析:selenium已提供了选择节点的方法,可通过相关的方法和属性来直接提取节点信息

# Read node information: attributes, text, id, location, tag name and size.
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By

browser = webdriver.Chrome()
url = '*'  # placeholder from the original; substitute a real URL
browser.get(url)
logo = browser.find_element(By.ID, 'zh-top-link-logo')
print(logo)
print(logo.get_attribute('class'))  # look an attribute value up by its name
# Named `question_button` rather than `input` so the builtin is not shadowed;
# the original then printed `ipput.text`, a NameError.
question_button = browser.find_element(By.ID, 'zh-top-add-question')
print(question_button.text)  # text content of the node
# id, location, tag name and size of the node
print(question_button.id, question_button.location,
      question_button.tag_name, question_button.size)

8.切换Frame

解析:网页中有一种节点叫iframe,也称子Frame,相当于页面的子页面;Selenium打开页面后默认在父级Frame中操作,此时无法获取子Frame里的节点,需通过switch_to.frame()切换到子Frame

# Switch between a child frame and its parent frame.
import time
from selenium import webdriver  # was `form selenium ...`, a SyntaxError
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

browser = webdriver.Chrome()
url = '*'  # placeholder from the original; substitute the demo page URL
browser.get(url)
browser.switch_to.frame('iframeResult')  # enter the child frame
try:
    # Look for the logo from inside the child frame. The author's note says
    # the logo belongs to the parent page, so this raises and prints NO LOGO.
    logo = browser.find_element(By.CLASS_NAME, 'logo')
except NoSuchElementException:
    print('NO LOGO')
browser.switch_to.parent_frame()  # back to the parent frame
# From the parent frame the lookup is expected to succeed.
logo = browser.find_element(By.CLASS_NAME, 'logo')
print(logo)
print(logo.text)

9.延时等待

解析:get()方法会在网页框架加载结束后结束执行,此时获取的page_source可能并不是浏览器完全加载完成的页面;如果页面还有额外的Ajax请求,获取的源码未必完整,因此需要延时等待一定时间,确保节点已经加载出来

# Implicit wait: a fixed maximum wait applied to element lookups.
from selenium import webdriver
from selenium.webdriver.common.by import By

browser = webdriver.Chrome()
# If a node is not found immediately, keep waiting up to 10 seconds;
# an exception is raised once the time runs out.
browser.implicitly_wait(10)
browser.get('*')  # placeholder from the original; substitute a real URL
# The original stored the element in `ipnput` and then printed the builtin
# `input`; store and print the same name instead.
element = browser.find_element(By.CLASS_NAME, '*')
print(element)
# Explicit wait: wait for a specific condition, so the actual waiting time
# adapts to network conditions.
from selenium import webdriver
from selenium.webdriver.common.by import By  # was `form selenium.webdirver...`
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

browser = webdriver.Chrome()
browser.get('*')  # placeholder from the original; substitute a real URL
wait = WebDriverWait(browser, 10)  # explicit wait with a 10-second maximum
# until() takes the expected condition to wait for.
# presence_of_element_located means the node has appeared; if it does not
# appear within the maximum time an exception is raised.
search_box = wait.until(EC.presence_of_element_located((By.ID, 'q')))
button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.btn-search')))
print(search_box, button)

title_contains

标题包含某内容

presence_of_element_located

节点加载出来,传入定位元组,eg (By.ID,'p')

visibility_of

节点可见,传入节点对象

text_to_be_present_in_element

某个节点文本包含某文字

element_to_be_clickable

节点可点击

alert_is_present

是否出现警告

 10. 前进和后退

# Fragment: relies on `browser` and `time` defined by earlier code.
browser.back() # go back one step in the browsing history
time.sleep(1)
browser.forward() # go forward again
browser.close()

11.Cookies

解析:对Cookies操作,如:获取、添加、删除

# Fragment: assumes `browser` is an open driver session on the target site.
print(browser.get_cookies())  # cookies currently set
# The WebDriver method is add_cookie (singular), as used by the later
# Cookies example in this file. The '*' values are placeholders.
browser.add_cookie({'name':'name','domain':'*','value':'*'})
print(browser.get_cookies())
browser.delete_all_cookies()  # remove every cookie
print(browser.get_cookies())

12. 选项卡管理

# Fragment: assumes `browser` is an open driver session.
import time

browser.get('**')  # placeholder from the original; substitute a real URL
browser.execute_script('window.open()')  # open a new tab
print(browser.window_handles)            # handles of all open tabs
# switch_to.window() is the form used by the later tab example in this file.
browser.switch_to.window(browser.window_handles[1])  # switch to the new tab
browser.get('*')  # was `browser.get(*)`, a SyntaxError; placeholder URL
time.sleep(1)
browser.switch_to.window(browser.window_handles[0])  # back to the first tab
browser.get('*')

13.异常处理

# Exception handling: report timeouts and missing elements, always clean up.
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.common.by import By

browser = webdriver.Chrome()
try:
    try:
        browser.get('*')  # placeholder from the original; substitute a real URL
    except TimeoutException:
        print('Time out')
    try:
        browser.find_element(By.ID, 'hello')
    except NoSuchElementException:
        print('No Element')
finally:
    # The outer try/finally closes the browser whichever stage fails.
    browser.close()

# Basic Selenium example: search Baidu and print the URL, cookies and source.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Spelled `browser` (the original used `brower`) to match every other snippet.
browser = webdriver.Chrome()
try:
    browser.get('http://www.baidu.com')  # request the page
    # Look the search box up by its id. It is not named `input`, which would
    # shadow the builtin.
    search_box = browser.find_element(By.ID, 'kw')
    search_box.send_keys('Python')
    search_box.send_keys(Keys.ENTER)
    wait = WebDriverWait(browser, 10)
    wait.until(EC.presence_of_element_located((By.ID, 'content_left')))
    print(browser.current_url)
    print(browser.get_cookies())
    print(browser.page_source)  # print the page source
finally:
    # Close the window so the driver is not leaked. The original had this
    # call commented out; remove it again to keep the browser open for
    # inspection.
    browser.close()

# Finding nodes.
# A single node can be located by ID, CSS selector or XPath.
from selenium import webdriver
from selenium.webdriver.common.by import By

browser = webdriver.Chrome()
browser.get('https://www.taobao.com')

# Single node: three locators, looked up in order.
search_locators = (
    (By.ID, 'q'),
    (By.CSS_SELECTOR, '#q'),
    (By.XPATH, '//*[@id="q"]'),
)
by_id, by_css, by_xpath = [browser.find_element(*loc) for loc in search_locators]

# Multiple nodes: find_elements returns a list.
service_items = browser.find_elements(By.CSS_SELECTOR, '.service-bd li')
print(by_id, '\n', by_css, '\n', by_xpath, '\n\n', service_items)

# 
#  
#  

# Node interaction with send_keys(), clear() and click().
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

browser = webdriver.Chrome()
browser.get('https://www.taobao.com')

search_box = browser.find_element(By.ID, 'q')
search_box.send_keys('iPone13')  # first query text, exactly as in the original
time.sleep(1)
search_box.clear()               # empty the box again
time.sleep(3)
search_box.send_keys('iPad')

# Locate the search button and click it in one expression.
browser.find_element(By.CLASS_NAME, 'btn-search').click()

# Action chain: drag the #draggable element onto #droppable.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver import ActionChains
import time

browser = webdriver.Chrome()
browser.get('https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
# Both elements are looked up after switching into the 'iframeResult' frame.
browser.switch_to.frame('iframeResult')
drag_from = browser.find_element(By.CSS_SELECTOR, '#draggable')
drop_onto = browser.find_element(By.CSS_SELECTOR, '#droppable')
# Queue the drag-and-drop and execute it in one chained expression.
ActionChains(browser).drag_and_drop(drag_from, drop_onto).perform()

# Execute JavaScript in the page with execute_script().
from selenium import webdriver

browser = webdriver.Chrome()
browser.get('https://www.zhihu.com/explore')

# Scripts run in order: scroll to the bottom, then show an alert.
page_scripts = (
    'window.scrollTo(0,document.body.scrollHeight)',
    'alert("To Bottom")',
)
for script in page_scripts:
    browser.execute_script(script)

# Read node information. get_attribute() returns an attribute value and
# .text returns the text; this snippet prints id, location, tag and size.
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By

browser = webdriver.Chrome()
browser.get('https://www.zhihu.com/explore')
root_node = browser.find_element(By.ID, 'root')
# Not printed here, but also available: root_node.text, the element itself,
# and root_node.get_attribute('class').
# Printed in order: id, location, tag name, size.
for attribute in ('id', 'location', 'tag_name', 'size'):
    print(getattr(root_node, attribute))
browser.close()

# Switch between a child frame and its parent frame.
import time
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

browser = webdriver.Chrome()
url = 'http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable'
browser.get(url)
browser.switch_to.frame('iframeResult')  # enter the child frame
try:
    logo = browser.find_element(By.CLASS_NAME, 'logo')
except NoSuchElementException:
    # A failed find_element raises NoSuchElementException. The original
    # caught NoSuchFrameException, so this handler could never run.
    print('NO LOGO')
browser.switch_to.parent_frame()  # back to the parent frame
logo = browser.find_element(By.CLASS_NAME, 'logo')
print(logo)
print(logo.text)

# Delayed waiting, implicit variant: set one wait time on the driver.
from selenium import webdriver
from selenium.webdriver.common.by import By

IMPLICIT_WAIT_SECONDS = 10

browser = webdriver.Chrome()
browser.implicitly_wait(IMPLICIT_WAIT_SECONDS)  # implicit wait
browser.get('https://www.zhihu.com/explore')
question_button = browser.find_element(By.CLASS_NAME, 'zu-top-add-question')
print(question_button)

# Delayed waiting, explicit variant: wait for specific conditions.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait  # explicit wait
from selenium.webdriver.support import expected_conditions as EC

browser = webdriver.Chrome()
browser.get('https://www.taobao.com/')
wait = WebDriverWait(browser, 10)

# Conditions are built first, then waited on in the same order as before.
search_box_present = EC.presence_of_element_located((By.ID, 'q'))
search_button_clickable = EC.element_to_be_clickable((By.CSS_SELECTOR, '.btn-search'))
search_box = wait.until(search_box_present)
search_button = wait.until(search_button_clickable)
print(search_box, '\n', search_button)


# Navigate back and forward through the browsing history.
import time
from selenium import webdriver

browser = webdriver.Chrome()
# Visit three pages in order to build up a history.
for page_url in (
    'https://www.baidu.com/',
    'https://www.taobao.com/',
    'https://www.python.org/',
):
    browser.get(page_url)
browser.back()     # one step back
time.sleep(1)
browser.forward()  # and forward again
browser.close()

# Cookies: print them, add one, print again, delete all, print once more.
from selenium import webdriver

browser = webdriver.Chrome()
browser.get('https://www.zhihu.com/explore')
print(browser.get_cookies())

extra_cookie = {'name':'name','domain':'www.zhihu.com','value':'germey'}
browser.add_cookie(extra_cookie)
print(browser.get_cookies())

browser.delete_all_cookies()
print(browser.get_cookies())
browser.close()

# [{'domain': '.zhihu.com', 'httpOnly': False, 'name': 'Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49', 'path': '/', 'secure': False, 'value': '1660980908'}, {'domain': 'www.zhihu.com', 'httpOnly': False, 'name': 'JOID', 'path': '/', 'secure': False, 'value': 'WloWBkpXuqcwA0exIFvZv4H41Nw-Gu7sVmUkwXUO2JZlQDTJbPozpV4ATLYi23jHHkOk4SXADwKFCoiclQvPJtE='}, {'domain': '.zhihu.com', 'expiry': 1692516908, 'httpOnly': False, 'name': 'Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49', 'path': '/', 'secure': False, 'value': '1660980908'}, {'domain': 'www.zhihu.com', 'httpOnly': False, 'name': 'KLBRSID', 'path': '/', 'secure': False, 'value': '76ae5fb4fba0f519d97e594f1cef9fab|1660980902|1660980900'}, {'domain': 'www.zhihu.com', 'httpOnly': False, 'name': 'osd', 'path': '/', 'secure': False, 'value': 'UVwQAk5cvKE0B0y3Jl_dtIf-0Ng1HOjoUm4ix3EK05BjRDDCavw3oVUGSrIm0H7BGkev5yPECwmDDIyYng3JItU='}, {'domain': '.zhihu.com', 'expiry': 1695540905, 'httpOnly': False, 'name': '_zap', 'path': '/', 'secure': False, 'value': 'fe594060-fef6-4fca-b59c-4cafe146b257'}, {'domain': '.zhihu.com', 'httpOnly': False, 'name': '_xsrf', 'path': '/', 'secure': False, 'value': '46dc1123-1966-4906-b7fc-c7462df65abe'}, {'domain': 'www.zhihu.com', 'httpOnly': False, 'name': 'SESSIONID', 'path': '/', 'secure': False, 'value': 'GUMboiqSPh0SGzrMLFRjrdl34WhUBf3zpMD5lGohNvW'}, {'domain': '.zhihu.com', 'expiry': 1695540905, 'httpOnly': False, 'name': 'd_c0', 'path': '/', 'secure': False, 'value': 'AMDXsveebhWPTiudfRrzU69qi3k-JbQMQgo=|1660980900'}]
# [{'domain': '.www.zhihu.com', 'httpOnly': False, 'name': 'name', 'path': '/', 'secure': True, 'value': 'germey'}, {'domain': '.zhihu.com', 'httpOnly': False, 'name': 'Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49', 'path': '/', 'secure': False, 'value': '1660980908'}, {'domain': 'www.zhihu.com', 'httpOnly': False, 'name': 'JOID', 'path': '/', 'secure': False, 'value': 'WloWBkpXuqcwA0exIFvZv4H41Nw-Gu7sVmUkwXUO2JZlQDTJbPozpV4ATLYi23jHHkOk4SXADwKFCoiclQvPJtE='}, {'domain': '.zhihu.com', 'expiry': 1692516908, 'httpOnly': False, 'name': 'Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49', 'path': '/', 'secure': False, 'value': '1660980908'}, {'domain': 'www.zhihu.com', 'httpOnly': False, 'name': 'KLBRSID', 'path': '/', 'secure': False, 'value': '76ae5fb4fba0f519d97e594f1cef9fab|1660980902|1660980900'}, {'domain': 'www.zhihu.com', 'httpOnly': False, 'name': 'osd', 'path': '/', 'secure': False, 'value': 'UVwQAk5cvKE0B0y3Jl_dtIf-0Ng1HOjoUm4ix3EK05BjRDDCavw3oVUGSrIm0H7BGkev5yPECwmDDIyYng3JItU='}, {'domain': '.zhihu.com', 'expiry': 1695540905, 'httpOnly': False, 'name': '_zap', 'path': '/', 'secure': False, 'value': 'fe594060-fef6-4fca-b59c-4cafe146b257'}, {'domain': '.zhihu.com', 'httpOnly': False, 'name': '_xsrf', 'path': '/', 'secure': False, 'value': '46dc1123-1966-4906-b7fc-c7462df65abe'}, {'domain': 'www.zhihu.com', 'httpOnly': False, 'name': 'SESSIONID', 'path': '/', 'secure': False, 'value': 'GUMboiqSPh0SGzrMLFRjrdl34WhUBf3zpMD5lGohNvW'}, {'domain': '.zhihu.com', 'expiry': 1695540905, 'httpOnly': False, 'name': 'd_c0', 'path': '/', 'secure': False, 'value': 'AMDXsveebhWPTiudfRrzU69qi3k-JbQMQgo=|1660980900'}]
# []

# Tab management: open a second tab and switch between the two.
import time
from selenium import webdriver

browser = webdriver.Chrome()
browser.get('https://www.baidu.com')
browser.execute_script('window.open()')  # open a new tab via JavaScript

open_handles = browser.window_handles
print(open_handles)

# Load a page in the second tab.
second_tab = browser.window_handles[1]
browser.switch_to.window(second_tab)
browser.get('https://www.taobao.com')
time.sleep(1)

# Return to the first tab and load another page there.
first_tab = browser.window_handles[0]
browser.switch_to.window(first_tab)
browser.get('https://python.org')
time.sleep(3)
browser.close()

代码摘抄自《Python 3网络爬虫开发实战》

你可能感兴趣的:(Python,3网络爬虫,python)