动态渲染说白了就是js渲染,模拟浏览器操作爬取页面,一般可用来作自动化测试
pip3 install selenium
安装ChromeDriver
ChromeDriver与Chrome对照表
ChromeDriver v2.41 (2018-07-27)—————Supports Chrome v67-69
ChromeDriver v2.40 (2018-06-07)----------Supports Chrome v66-68
ChromeDriver v2.39 (2018-05-30)----------Supports Chrome v66-68
ChromeDriver v2.38 (2018-04-17)----------Supports Chrome v65-67
ChromeDriver v2.37 (2018-03-16)----------Supports Chrome v64-66
ChromeDriver v2.36 (2018-03-02)----------Supports Chrome v63-65
网址：http://npm.taobao.org/mirrors/chromedriver
配置环境变量,然后cmd中查看
C:\Users\Administrator>echo %path%
运行
C:\Users\Administrator>chromedriver
Starting ChromeDriver 2.42.591088 (7b2b2dca23cca0862f674758c9a3933e685c27d5) on port 9515
Only local connections are allowed.
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import ActionChains
def f1(browser):
    """Demonstrate basic Selenium operations on jd.com: rendered page info,
    cookies, and the common element-location strategies.

    :param browser: an initialized Selenium WebDriver instance
    """
    # Navigate to the page.
    browser.get('https://www.jd.com')
    # HTML after JavaScript rendering (unlike a raw HTTP response body).
    print(browser.page_source)
    # Current URL (may differ from the requested one after redirects).
    print(browser.current_url)
    # All cookies the site set, as a list of dicts.
    print(browser.get_cookies())
    # Locate a single node by id.
    # NOTE: the find_element_by_* helpers were removed in Selenium 4;
    # find_element(By.X, ...) works in both Selenium 3 and 4.
    input1 = browser.find_element(By.ID, 'key')
    # Read a node attribute.
    print(input1.get_attribute('id'))
    # Locate a single node with a CSS selector.
    input2 = browser.find_element(By.CSS_SELECTOR, '#key')
    # Node coordinates on the page.
    print(input2.location)
    # Node width/height.
    print(input2.size)
    # Locate a single node with XPath.
    input3 = browser.find_element(By.XPATH, '//*[@id="key"]')
    print(input3.id)
    # Locate a single node by its name attribute.
    input4 = browser.find_element(By.NAME, 'file')
    print(input4.tag_name)
    # Locate links by full / partial link text.
    input5 = browser.find_element(By.LINK_TEXT, '京东生鲜')
    input6 = browser.find_element(By.PARTIAL_LINK_TEXT, '生鲜')
    # Text content of the located nodes.
    print(input5.text)
    print(input6.text)
def f2(browser):
    """Demonstrate switching into an iframe and a drag-and-drop action chain.

    :param browser: an initialized Selenium WebDriver instance
    """
    browser.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
    # The draggable demo lives inside an iframe; switch into it first,
    # otherwise the selectors below find nothing.
    browser.switch_to.frame('iframeResult')
    # NOTE: find_element_by_css_selector was removed in Selenium 4;
    # find_element(By.CSS_SELECTOR, ...) works in both Selenium 3 and 4.
    source = browser.find_element(By.CSS_SELECTOR, '#draggable')
    target = browser.find_element(By.CSS_SELECTOR, '#droppable')
    # Build the gesture, then perform() actually executes it.
    actions = ActionChains(browser)
    actions.drag_and_drop(source, target)
    actions.perform()
def main():
    """Launch Chrome, run the drag-and-drop demo, and always clean up."""
    # Chrome browser.
    browser = webdriver.Chrome()
    # Other supported drivers:
    # browser = webdriver.Firefox()
    # browser = webdriver.Edge()
    # browser = webdriver.PhantomJS()  # removed in modern Selenium releases
    # browser = webdriver.Safari()
    try:
        f2(browser)
    finally:
        # quit() (not close()) ends the whole WebDriver session, so the
        # chromedriver process does not linger after the script exits;
        # close() only closes the current window.
        browser.quit()


if __name__ == '__main__':
    main()
三、案例淘宝--模拟浏览
无头浏览器是指在运行的时候不弹出浏览器窗口
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from urllib.parse import quote
from lxml import etree
# Browser options; uncomment the headless flag to crawl without a visible window.
chrome_options = webdriver.ChromeOptions()
# chrome_options.add_argument('--headless')
# NOTE: the 'chrome_options=' keyword was deprecated in favor of 'options='
# (Selenium 3.8+); 'options=' works in both Selenium 3 and 4.
browser = webdriver.Chrome(options=chrome_options)
# browser = webdriver.Chrome()
# Fix the window size explicitly so responsive layouts render consistently.
browser.set_window_size(1400, 700)
# Explicit-wait helper: polls up to 10 seconds for expected conditions.
wait = WebDriverWait(browser, 10)
# Search keyword used for the Taobao crawl below.
KEYWORD = '编程机器人'
# 获取页面
def index_page(page):
    """Load the given Taobao search-result page and return its rendered HTML.

    Page 1 is reached via the search URL; later pages are reached by typing
    the page number into the pager's input box and clicking the confirm
    button, which avoids guessing pagination URL parameters.

    Uses the module-level ``browser``, ``wait`` and ``KEYWORD``.

    :param page: 1-based page number
    :return: fully rendered page source (str)
    """
    if page == 1:
        url = 'https://s.taobao.com/search?q=' + quote(KEYWORD)
        print(url)
        browser.get(url)
    # Scroll to the bottom so lazily-loaded items get rendered.
    browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
    if page > 1:
        # Pager input box. Named page_input because 'input' would shadow
        # the builtin of the same name.
        # 'div.form' means a div whose class list contains 'form'.
        page_input = wait.until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, '#mainsrp-pager div.form > input')))
        # The confirm button; 'span.btn.J_Submit' is a span with both
        # classes 'btn' and 'J_Submit' (source-level spaces become dots).
        submit = wait.until(
            EC.element_to_be_clickable(
                (By.CSS_SELECTOR, '#mainsrp-pager div.form > span.btn.J_Submit')))
        # Clear any previously typed page number before entering a new one.
        page_input.clear()
        page_input.send_keys(page)
        submit.click()
    # Wait until the highlighted pager item shows the requested page number,
    # i.e. the navigation actually happened.
    wait.until(
        EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR, '#mainsrp-pager li.item.active > span'), str(page)))
    # Wait for at least one product card to be present.
    wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, '.m-itemlist .items .item')))
    return browser.page_source
def parse_page(page_source):
    """Parse one Taobao search-result page into product dicts.

    :param page_source: rendered HTML of a search-result page
    :yield: dict with keys price/title/shop/image/deal/location
    """
    etree_html = etree.HTML(page_source)
    # Each product card is a div whose class list contains "item", inside
    # the first "items" container of the result list.
    products = etree_html.xpath(
        '//div[@id="mainsrp-itemlist"]//div[@class="items"][1]'
        '//div[contains(@class, "item")]')
    for product in products:
        item = {}
        item['price'] = product.xpath(
            './/div[contains(@class, "price")]/strong/text()')[0].strip()
        # BUG FIX: 'descendant::*' returned a list of lxml Element objects,
        # not the title string. Join all descendant text nodes of the title
        # link into a single readable string instead.
        item['title'] = ''.join(
            product.xpath('.//div[contains(@class, "title")]/a//text()')).strip()
        item['shop'] = product.xpath(
            './/div[contains(@class, "shop")]/a/span[2]/text()')[0].strip()
        # Images are lazy-loaded, so the real URL sits in data-src.
        item['image'] = product.xpath(
            './/div[@class="pic"]//img[contains(@class, "img")]/@data-src')[0].strip()
        item['deal'] = product.xpath(
            './/div[contains(@class, "deal-cnt")]//text()')[0]
        item['location'] = product.xpath(
            './/div[contains(@class, "location")]//text()')[0]
        # Yield lazily so callers can stream results without building a list.
        yield item
def main(max_pages=100):
    """Crawl and print products from the first *max_pages* result pages.

    :param max_pages: number of result pages to crawl (default 100,
        matching the original hard-coded limit)
    """
    for page in range(1, max_pages + 1):
        page_source = index_page(page)
        for product in parse_page(page_source):
            print(product['title'])
            print(product['price'])
            print(product['shop'])
            print(product['image'])


if __name__ == '__main__':
    main()