需要注意的问题:
- 使用 driver.switch_to.window() 切换窗口后,虽然窗口切换成功,但此时获取到的网页源代码不正确。
- 解决办法:先用 driver.current_url 获取当前窗口的网址,再用 driver.get() 重新加载该网址,并等待适当的时间(例如 time.sleep(4)),之后即可正确获取源代码。示例:
# Illustrative workaround snippet: after switching windows, reload the
# window's own URL and pause before reading the page source.
# Switch to the window handle at index 1 of driver.window_handles.
driver.switch_to.window(driver.window_handles[1])
print(driver.title)
# Re-request the URL the driver currently reports for this window.
driver.get(driver.current_url)
# Fixed 4-second pause before page_source is read.
time.sleep(4)
html = driver.page_source
soup = BeautifulSoup(html, 'lxml')
# NOTE: '爬取内容路径' (literally "path of the content to scrape") appears to
# be a placeholder for the real CSS selector — replace before use.
data = [i.text for i in soup.select('爬取内容路径')]
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# Search the ptpress.com.cn shop page for "python编程", follow the result
# window that the search opens, and print the text of every matched result
# element.
url = 'https://www.ptpress.com.cn/shopping/index'
result_selector = '#search > div.book-floor > ul > li > p'

driver = webdriver.Chrome()
try:
    driver.get(url)
    wait = WebDriverWait(driver, 10)
    # Remember the original window so the newly opened one can be identified.
    before = driver.current_window_handle
    print(driver.title)

    # The find_element_by_* helpers were removed in Selenium 4.3+; use the
    # By-locator form, which works on both Selenium 3 and 4.
    search_input = driver.find_element(
        By.CSS_SELECTOR,
        'body > div.classifySearch-p > div > div.classifySearchBar > '
        'div.allSearch > input')
    search_input.send_keys("python编程")
    confirm_btn = wait.until(EC.element_to_be_clickable((
        By.CSS_SELECTOR,
        'body > div.classifySearch-p > div > div.classifySearchBar > '
        'div.allSearch > a > i')))
    confirm_btn.click()

    # The result window opens asynchronously: wait until it exists instead of
    # indexing window_handles immediately, and pick it by comparing against
    # the original handle because the order of handles is not guaranteed.
    wait.until(EC.number_of_windows_to_be(2))
    result_window = next(
        handle for handle in driver.window_handles if handle != before)
    driver.switch_to.window(result_window)
    print(driver.title)

    # Workaround kept from the original: right after switching windows the
    # page source is not the expected one, so reload the window's own URL.
    driver.get(driver.current_url)
    try:
        # Wait for the result elements themselves rather than sleeping for a
        # fixed number of seconds.
        wait.until(EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, result_selector)))
    except TimeoutException:
        # Nothing matched within the timeout; continue so that an empty
        # list is printed, as the fixed-delay version would have done.
        pass

    html = driver.page_source
    soup = BeautifulSoup(html, 'lxml')
    data = [i.text for i in soup.select(result_selector)]
    print(data)
finally:
    # quit() closes every window and stops the driver process; close() would
    # only close the current window and leave the browser session running.
    driver.quit()