Selenium 可以动态爬取网页数据:它像真实用户一样操作浏览器,从终端用户的角度测试应用程序。WebDriver 通过浏览器原生支持或浏览器扩展直接控制浏览器。
webdriver下载
因为 Selenium 对浏览器版本存在兼容性问题,故需要针对所使用的浏览器下载与其版本对应的 WebDriver 驱动。
1、添加依赖
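<dependency>
    <groupId>org.seleniumhq.selenium</groupId>
    <artifactId>selenium-java</artifactId>
    <version>4.11.0</version>
</dependency>
<dependency>
    <groupId>com.google.guava</groupId>
    <artifactId>guava</artifactId>
    <version>32.1.2-jre</version>
</dependency>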
2、工具类
import cn.hutool.core.collection.CollectionUtil;
import com.google.common.collect.Lists;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.edge.EdgeDriver;
import org.openqa.selenium.edge.EdgeOptions;
import org.openqa.selenium.firefox.FirefoxDriver;
import org.openqa.selenium.firefox.FirefoxOptions;
import org.springframework.stereotype.Component;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
/**
* Selenium 工具类
*
* @author kou
*/
@Slf4j
@RequiredArgsConstructor
@Component
public class SeleniumUtil {
private final ReptileProperties reptileProperties;
/**
* 获取chromeDriver
*
* @return chromeDriver
*/
public WebDriver chromeDriver() {
// 加载驱动路径
System.setProperty("webdriver.chrome.driver", "D:/chromedriver.exe");
// Chrome默认不允许跨机器调试,需要给启动命令加上白名单
System.setProperty("webdriver.chrome.whitelistedIps", "");
ChromeOptions options = new ChromeOptions();
// 开启一个实验性参数excludeSwitches,用来隐藏window.navigator.webdriver返回true,这个参数必须是List
options.setExperimentalOption("useAutomationExtension", false);
// 开启开发者模式
options.setExperimentalOption("excludeSwitches", Lists.newArrayList("enable-automation"));
// 发现主要是这句是关键
options.addArguments("--disable-blink-features=AutomationControlled");
// options.addArguments("--incognito");
// options.addArguments("--disable-infobars");
//options.addArguments("user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36");
options.addArguments("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36");
// 禁用沙箱
options.addArguments("--no-sandbox");
// 无头浏览器,这样不会打开浏览器窗口
// options.addArguments("--headless");
// options.addArguments("--disable-gpu");
options.addArguments("--remote-allow-origins=*");
// 初始化一个谷歌浏览器实例,实例名称叫driver
WebDriver driver = new ChromeDriver(options);
return driver;
}
/**
* 获取edgeDriver
*
* @return edgeDriver
*/
public WebDriver edgeDriver() {
// 加载驱动路径
System.setProperty("webdriver.edge.driver", "D:/msedgedriver.exe");
EdgeOptions options = new EdgeOptions();
// 开启一个实验性参数excludeSwitches,用来隐藏window.navigator.webdriver返回true,这个参数必须是List
options.setExperimentalOption("useAutomationExtension", false);
//开启开发者模式
options.setExperimentalOption("excludeSwitches", Lists.newArrayList("enable-automation"));
// 发现主要是这句是关键
options.addArguments("--disable-blink-features=AutomationControlled");
options.addArguments("--incognito", "--disable-infobars");
// options.addArguments("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36");
options.addArguments("user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36");
// 禁用沙箱
options.addArguments("--no-sandbox");
// 无头浏览器,这样不会打开浏览器窗口
// options.addArguments("--headless");
options.addArguments("--disable-gpu");
options.addArguments("--remote-allow-origins=*");
// 初始化一个谷歌浏览器实例,实例名称叫driver
WebDriver driver = new EdgeDriver(options);
return driver;
}
/**
* 获取firefoxDriver
*
* @return firefoxDriver
*/
public WebDriver firefoxDriver() {
// 加载驱动路径
System.setProperty("webdriver.gecko.driver", "D:/geckodriver.exe");
System.setProperty("webdriver.chrome.whitelistedIps", "");
FirefoxOptions options = new FirefoxOptions();
options.addArguments("user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36");
// 无头浏览器,这样不会打开浏览器窗口
options.addArguments("--headless");
// 初始化一个谷歌浏览器实例,实例名称叫driver
WebDriver driver = new FirefoxDriver(options);
return driver;
}
/**
* 获取表头
*
* @param table 表格
* @return 表头
*/
public List getTableHead(WebElement table) {
log.info("开始解析表头...");
// 获取表头
WebElement head = table.findElement(By.tagName("thead"));
if (null == head) {
return Collections.emptyList();
}
List headths = head.findElements(By.tagName("th"));
List headList = new ArrayList<>(headths.size());
headths.forEach(t -> {
headList.add(t.getText());
});
log.info("表头解析完成!!!");
return headList;
}
/**
* 获取表数据
*
* @param table 表格
* @return 表头
*/
public List> getTableBody(WebElement table) {
log.info("开始解析表数据...");
// 获取表头
WebElement tbody = table.findElement(By.tagName("tbody"));
if (null == tbody) {
return Collections.emptyList();
}
// 获取body数据行
List bodyTrs = tbody.findElements(By.tagName("tr"));
if (CollectionUtil.isEmpty(bodyTrs)) {
return Collections.emptyList();
}
List> bodyDatas = new ArrayList<>(bodyTrs.size());
bodyTrs.stream().forEach(r -> {
List tds = r.findElements(By.tagName("td"));
List rows = new ArrayList<>(tds.size());
tds.forEach(d -> {
rows.add(d.getText());
});
bodyDatas.add(rows);
});
log.info("表数据解析完成!!!");
return bodyDatas;
}
/**
* 将参数转化为路径参数
*
* @param params 参数
* @return 路径参数
*/
public String convertPathParams(Map params) {
if (CollectionUtil.isEmpty(params)) {
return "";
}
StringBuffer path = new StringBuffer();
for (Map.Entry p : params.entrySet()) {
path.append(p.getKey()).append("=").append(p.getValue().toString()).append("&");
}
return path.substring(0, path.length() - 1);
}
}
3、爬取数据
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.support.ui.ExpectedCondition;
import org.openqa.selenium.support.ui.WebDriverWait;
import org.springframework.stereotype.Service;
import org.springframework.util.StringUtils;
import java.time.Duration;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
/**
 * Data service implementation: loads a page with Selenium and returns the content of its table.
 *
 * @author kou
 */
@Slf4j
@RequiredArgsConstructor
@Service
public class DataServiceImpl {

    private final SeleniumUtil seleniumUtil;

    /**
     * Opens the target page, waits until the body of the element with id {@code table} is
     * displayed, and returns the table content.
     *
     * <p>No {@code @Override} here: the class declares no supertype, so the annotation would
     * not compile. The method still implements an interface method if one is added.
     *
     * @return a map with {@code "header"} (list of header texts) and {@code "body"}
     *         (list of rows, each a list of cell texts)
     * @throws RuntimeException wrapping any failure raised while loading or parsing the page
     */
    public Map<String, Object> getHtmlData() {
        WebDriver driver = null;
        try {
            String url = "url";
            Map<String, Object> params = new HashMap<>();
            params.put("pageNum", 1);
            params.put("pageSize", 1000);
            String fullUrl = appendQuery(url, seleniumUtil.convertPathParams(params));

            driver = seleniumUtil.firefoxDriver();
            // Open the page (once).
            log.info("开始访问:{}", fullUrl);
            driver.get(fullUrl);
            String title = driver.getTitle();
            log.info("网页:{}", title);

            // Explicit wait on the table body: 100s timeout, polled every 2s. WebDriverWait
            // ignores NotFoundException by default, so a table that is not yet in the DOM is
            // simply re-checked on the next poll.
            WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(100), Duration.ofSeconds(2));
            wait.until(new ExpectedCondition<WebElement>() {
                @Override
                public WebElement apply(WebDriver webDriver) {
                    log.info("开始检查tbody数据是否已加载");
                    WebElement tbody = webDriver.findElement(By.id("table")).findElement(By.tagName("tbody"));
                    if (!tbody.isDisplayed()) {
                        log.info("检查结果:tbody数据未加载完,等待加载...");
                        return null;
                    }
                    log.info("检查结果:tbody数据加载完成!!!");
                    return tbody;
                }
            });

            // Look the table up only after the wait, so the lookup cannot fail (or go stale)
            // while the page is still rendering.
            WebElement table = driver.findElement(By.id("table"));
            List<String> headList = seleniumUtil.getTableHead(table);
            List<List<String>> bodyList = seleniumUtil.getTableBody(table);

            Map<String, Object> data = new HashMap<>();
            data.put("header", headList);
            data.put("body", bodyList);
            return data;
        } catch (Exception e) {
            throw new RuntimeException(e);
        } finally {
            // quit() ends the whole browser session, whereas close() only closes the current
            // window; doing it in finally releases the browser on failure paths too.
            if (driver != null) {
                driver.quit();
            }
        }
    }

    /**
     * Appends a query string to a URL, inserting {@code ?} or {@code &} only when needed.
     */
    private static String appendQuery(String baseUrl, String query) {
        if (query == null || query.isEmpty()) {
            return baseUrl;
        }
        if (baseUrl.endsWith("?") || baseUrl.endsWith("&")) {
            return baseUrl + query;
        }
        return baseUrl + (baseUrl.contains("?") ? "&" : "?") + query;
    }
}