selenium+chromedriver获取动态网页数据以及模拟鼠标操作后才能获得的数据

1.下载chromedriver。注意:chromedriver的版本必须与本机安装的Chrome浏览器版本相对应。

2.加载动态页面后,模拟鼠标操作(如悬停、点击),获取只有在这些特定操作之后才会动态加载出来的数据

3.源码:

import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains



# Location of the downloaded chromedriver executable (a Windows build here).
# Written as a raw string so that no backslash is read as an escape sequence;
# the resulting path is identical to the previous literal.
CHROME_DRIVER_PATH = r'D:\Code\imgageRecognition\site_scrapy\chromedriver.exe'



# Load a dynamic page in Chrome and return its rendered HTML parsed with BeautifulSoup.
def get_dynamic_html(site_url, hover_wait=5):
    """Load a page in Chrome, hover over its navigation items, and parse it.

    The page is opened with chromedriver, the mouse is moved over every
    ``.navigation-bar-item`` inside the active second-level navigation bar
    so that content loaded on hover is present in the DOM, and the final
    page source is parsed with BeautifulSoup.

    Args:
        site_url: URL of the page to load.
        hover_wait: Seconds to sleep after hovering over each navigation
            item. Defaults to 5, the delay that was previously hard-coded.

    Returns:
        A ``BeautifulSoup`` object built from the page source with the
        ``html.parser`` backend.

    Raises:
        Exceptions raised by Selenium while locating or hovering over the
        navigation elements propagate to the caller. The browser is shut
        down before they do.
    """
    print('开始加载',site_url,'动态页面')
    chrome_options = webdriver.ChromeOptions()
    # Run Chrome without its sandbox.
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    # Headless mode is intentionally off; add '--headless' here to enable it.
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--ignore-ssl-errors')
    # NOTE(review): executable_path/chrome_options and find_element(s)_by_*
    # are the Selenium 3 spellings -- confirm the pinned Selenium version.
    driver = webdriver.Chrome(executable_path=CHROME_DRIVER_PATH,chrome_options=chrome_options)
    try:
        driver.set_page_load_timeout(100)
        try:
            driver.get(site_url)
        except Exception as e:
            # Best effort: stop loading and continue with whatever rendered.
            driver.execute_script('window.stop()')
            print(e, 'dynamic web load timeout')

        women_nav_tag = driver.find_element_by_css_selector('.navigation-bar.second-level.clearfix.p_15.active')
        nav_tag_list = women_nav_tag.find_elements_by_css_selector('.navigation-bar-item')
        for tag in nav_tag_list:
            print(tag.text)
            # Hover over the item to trigger its dynamically loaded content.
            # A fresh chain per hover keeps each perform() to this one move.
            ActionChains(driver).move_to_element(tag).perform()
            time.sleep(hover_wait)

        return BeautifulSoup(driver.page_source, 'html.parser')
    finally:
        # Always release the browser, including when a lookup above raised.
        try:
            driver.quit()
        except Exception as e:
            # Shutdown stays best effort, but the failure is reported.
            print(e, 'failed to quit chrome driver')

 

你可能感兴趣的:(scrapy)