【Python爬虫】_04Selenium爬取动态渲染页面

动态渲染说白了就是js渲染,模拟浏览器操作爬取页面,一般可用来做自动化测试

一、安装和配置Selenium、ChromeDriver

pip3 install selenium

安装ChromeDriver
ChromeDriver与Chrome对照表
ChromeDriver v2.41 (2018-07-27)—————Supports Chrome v67-69
ChromeDriver v2.40 (2018-06-07)----------Supports Chrome v66-68
ChromeDriver v2.39 (2018-05-30)----------Supports Chrome v66-68
ChromeDriver v2.38 (2018-04-17)----------Supports Chrome v65-67
ChromeDriver v2.37 (2018-03-16)----------Supports Chrome v64-66
ChromeDriver v2.36 (2018-03-02)----------Supports Chrome v63-65

网址:http://npm.taobao.org/mirrors/chromedriver

【Python爬虫】_04Selenium爬取动态渲染页面_第1张图片

配置环境变量,然后cmd中查看

C:\Users\Administrator>echo %path%

运行

C:\Users\Administrator>chromedriver
Starting ChromeDriver 2.42.591088 (7b2b2dca23cca0862f674758c9a3933e685c27d5) on port 9515
Only local connections are allowed.

【Python爬虫】_04Selenium爬取动态渲染页面_第2张图片

二、案例

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import ActionChains

def f1(browser):
	"""Demonstrate basic Selenium page access and element lookup on jd.com.

	Uses the ``find_element(By.X, value)`` API (already available in
	Selenium 3 and the only form in Selenium 4) instead of the removed
	``find_element_by_*`` helpers.

	:param browser: an active ``selenium.webdriver`` instance.
	"""
	# Load the page.
	browser.get('https://www.jd.com')
	# Page source after JavaScript rendering.
	print(browser.page_source)
	# Current URL (may differ from the requested one after redirects).
	print(browser.current_url)
	# All cookies held by this browser session.
	print(browser.get_cookies())

	# Locate a single node by id.
	input1 = browser.find_element(By.ID, 'key')
	# Read a node attribute.
	print(input1.get_attribute('id'))

	# Locate a single node with a CSS selector.
	input2 = browser.find_element(By.CSS_SELECTOR, '#key')
	# Node coordinates on the page.
	print(input2.location)
	# Node width and height.
	print(input2.size)

	# Locate a single node with an XPath expression.
	input3 = browser.find_element(By.XPATH, '//*[@id="key"]')
	print(input3.id)

	# Locate a single node by its name attribute.
	input4 = browser.find_element(By.NAME, 'file')
	print(input4.tag_name)

	# Locate single nodes by exact / partial link text.
	input5 = browser.find_element(By.LINK_TEXT, '京东生鲜')
	input6 = browser.find_element(By.PARTIAL_LINK_TEXT, '生鲜')
	# Text content of the nodes.
	print(input5.text)
	print(input6.text)

def f2(browser):
	"""Drag the #draggable element onto #droppable in the runoob demo page.

	Uses ``find_element(By.CSS_SELECTOR, ...)`` instead of the removed
	``find_element_by_css_selector`` helper.

	:param browser: an active ``selenium.webdriver`` instance.
	"""
	browser.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
	# The demo lives inside an iframe; switch into it before locating nodes.
	browser.switch_to.frame('iframeResult')
	source = browser.find_element(By.CSS_SELECTOR, '#draggable')
	target = browser.find_element(By.CSS_SELECTOR, '#droppable')
	# Build an action chain that drags the source onto the target, then run it.
	actions = ActionChains(browser)
	actions.drag_and_drop(source, target)
	actions.perform()


def main():
	"""Launch a Chrome session, run the drag-and-drop demo, then clean up."""
	# Use the Chrome browser.
	browser = webdriver.Chrome()
	# Alternative drivers:
	# browser = webdriver.Firefox()
	# browser = webdriver.Edge()
	# browser = webdriver.PhantomJS()  # original had typo "PhatomJS"
	# browser = webdriver.Safari()

	try:
		f2(browser)
	finally:
		# quit() terminates the whole WebDriver session; close() would only
		# close the current window and could leave the driver process running.
		browser.quit()

# Run the demo only when executed as a script, not when imported.
if __name__ == '__main__':
	main()

三、案例淘宝--模拟浏览

无头浏览器是指在运行的时候不弹出浏览器窗口

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from urllib.parse import quote
from lxml import etree


chrome_options = webdriver.ChromeOptions()
# Headless mode: uncomment the next line to run without a visible window.
# chrome_options.add_argument('--headless')
# NOTE(review): the `chrome_options=` keyword is deprecated in newer Selenium
# releases in favor of `options=` — confirm the target Selenium version.
browser = webdriver.Chrome(chrome_options=chrome_options)

# browser = webdriver.Chrome()
# Set the browser window width and height.
browser.set_window_size(1400, 700)
# Explicit wait with a 10-second timeout, used by index_page below.
wait = WebDriverWait(browser, 10)
# Search keyword submitted to Taobao.
KEYWORD = '编程机器人'


# 获取页面
def index_page(page):
    if page == 1:
        url = 'https://s.taobao.com/search?q=' + quote(KEYWORD)
        print(url)
        browser.get(url)
    # 执行js语句滚动条操作,左右的不动,上下的操作上下的
    browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
    if page > 1:
        input = wait.until(
            # 获取输入框的位置
            # div.form代表含有class为form的div
            EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager div.form > input')))

        submit = wait.until(
            # 获取'确定'按钮的点击位置
            # span.btn.J_Submit代表span标签中的class为btn J_submot,源码中的空格这里用点表示
            EC.element_to_be_clickable((By.CSS_SELECTOR, '#mainsrp-pager div.form > span.btn.J_Submit')))
        # 每一次点击确定后,清空一下输入框,后面就可以重新输入
        input.clear()
        input.send_keys(page)
        # 点击事件
        submit.click()

        wait.until(
            # 等待元素加载(配对)
            EC.text_to_be_present_in_element((By.CSS_SELECTOR, '#mainsrp-pager li.item.active > span'), str(page)))

        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.m-itemlist .items .item')))

    page_source = browser.page_source

    return page_source


def parse_page(page_source):
    """Parse a Taobao search-result page and yield one dict per product.

    Each yielded dict has keys: price, title, shop, image, deal, location.
    Entries missing any of these fields (e.g. ad slots) are skipped instead
    of raising IndexError.

    :param page_source: rendered HTML of a search-result page.
    """
    etree_html = etree.HTML(page_source)
    # Each result item is a div whose class attribute contains "item".
    products = etree_html.xpath(
        '//div[@id="mainsrp-itemlist"]//div[@class="items"][1]//div[contains(@class, "item")]')

    for product in products:
        try:
            item = {
                'price': product.xpath('.//div[contains(@class, "price")]/strong/text()')[0].strip(),
                # Join the title's text nodes. The original used
                # descendant::*, which returned raw lxml element objects
                # and printed as object reprs rather than the title text.
                'title': ''.join(
                    product.xpath('.//div[contains(@class, "title")]/a//text()')).strip(),
                'shop': product.xpath('.//div[contains(@class, "shop")]/a/span[2]/text()')[0].strip(),
                'image': product.xpath('.//div[@class="pic"]//img[contains(@class, "img")]/@data-src')[0].strip(),
                'deal': product.xpath('.//div[contains(@class, "deal-cnt")]//text()')[0],
                'location': product.xpath('.//div[contains(@class, "location")]//text()')[0],
            }
        except IndexError:
            # Sparse/ad entries lack some fields; skip them.
            continue
        # yield keeps memory usage low compared with building a full list.
        yield item


def main(max_pages=100):
    """Crawl and print Taobao search results page by page.

    :param max_pages: number of result pages to fetch; defaults to 100,
        matching the original hard-coded loop bound.
    """
    for page in range(1, max_pages + 1):
        page_source = index_page(page)

        for product in parse_page(page_source):
            print(product['title'])
            print(product['price'])
            print(product['shop'])
            print(product['image'])

# Entry point: start the crawl only when run directly, not on import.
if __name__ == '__main__':
    main()

你可能感兴趣的:(Python)