selenium phantomjs 翻页

翻页

  • 对于这个需求我们有两种方法,一种是解析页面源码(page source)
import time

from selenium import webdriver
from selenium.webdriver.support.select import Select
import random
from lxml import etree

def extract_content(item):
    """Placeholder hook for processing the nodes scraped from one page.

    Args:
        item: Whatever the caller scraped for the current page; it is
            ignored by this stub.

    Returns:
        None. Replace the body with real extraction logic.
    """
    return None
# Approach 1: page through the listing with Selenium and parse each page's
# HTML source with lxml.
driver = webdriver.PhantomJS()
# driver = webdriver.Chrome()  # alternative driver
url = ''  # placeholder: set the listing URL before running
try:
    driver.get(url)
    doc = etree.HTML(driver.page_source)
    # NOTE(review): assumes PageTotalSpan holds the total record count and the
    # site shows 10 records per page -- confirm against the target page.
    total = int(doc.xpath('//*[@id="PageTotalSpan"]/text()')[0])
    # Ceiling division: an exact multiple of 10 must not produce an extra,
    # non-existent page. Always visit at least the first page.
    page = max(1, (total + 10 - 1) // 10)
    for i in range(1, page + 1):
        if i > 1:
            # Page 1 is already loaded by driver.get(); only jump for later pages.
            page_box = driver.find_element_by_xpath('//*[@id="_PageBar_Index_list1"]')
            page_box.clear()  # clear the previous page number
            page_box.send_keys(str(i))  # type the target page number
            driver.find_element_by_xpath(
                '//*[@id="PageBarDiv"]/table/tbody/tr/td/table/tbody/tr/td[7]/a/img').click()  # go to page
            # Randomised pause so the new page can load before its source is read.
            time.sleep(random.uniform(1, 2))
        # Parse the page that is currently displayed.
        response = etree.HTML(driver.page_source)
        contents = response.xpath('//td[@valign="top"]/table/tbody/tr/td')
        extract_content(contents)
finally:
    # Always shut the browser down so the PhantomJS process is not left running.
    driver.quit()
  • 另一种是翻页后等待页面加载完成,再直接从浏览器中获取动态渲染后的内容,适用于 Elements 与 page source 不一致的情况
import random
import time

from selenium import webdriver


def extract_content(item):
    """Placeholder hook for processing the elements scraped from one page.

    Args:
        item: Whatever the caller scraped for the current page; it is
            ignored by this stub.

    Returns:
        None. Replace the body with real extraction logic.
    """
    return None


# Approach 2: after each page jump, wait for the page to load and then read
# the elements through the live driver instead of parsing page_source.
url = 'xxxx'  # placeholder: set the listing URL before running
driver = webdriver.PhantomJS()
try:
    driver.get(url)
    for i in range(1, 10):
        print(i)
        # Look the page-number input up once and reuse it for clear + typing.
        page_box = driver.find_element_by_xpath('//*[@id="_PageBar_Index_list1"]')
        page_box.clear()
        page_box.send_keys(str(i))
        driver.find_element_by_xpath(
            '//*[@id="PageBarDiv"]/table/tbody/tr/td/table/tbody/tr/td[7]/a/img').click()
        a = random.uniform(8, 10)  # load time: give the page time to render
        time.sleep(a)
        # Capture the table body as currently rendered in the browser.
        contents = driver.find_elements_by_xpath('//table[@id="illExampleDataTable"]/tbody')
        extract_content(contents)
finally:
    # Always shut the browser down so the PhantomJS process is not left running.
    driver.quit()

你可能感兴趣的:(selenium phantomjs 翻页)