1 selenium-java+httpclient实现爬取页面,并且通过jdbc批量插入mysql
2 可解决开启请求监控,自动获取token,ajax数据加密返回,无法直接拿数据等问题
3 chromedriver的使用请自行百度(如果步骤全对仍然报错,请用管理员权限运行你的开发工具)
4 注意:以下代码为demo,需自己根据实际业务修改
示例:selenium-java
maven依赖:
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
<version>4.5.3</version>
</dependency>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.11.0</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.13</version>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.22</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>2.11.1</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
<version>2.11.1</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-annotations</artifactId>
<version>2.11.1</version>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>8.0.23</version>
</dependency>
代码如下(示例):
package test;
import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.commons.io.FileUtils;
import org.openqa.selenium.By;
import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.devtools.DevTools;
import org.openqa.selenium.devtools.v106.network.Network;
import org.openqa.selenium.devtools.v106.network.model.Headers;
import org.openqa.selenium.devtools.v106.network.model.ResourceType;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import entity.Fa;
import util.MyHttpUtil;
import util.MySqlStrategy;
import util.SerializableUtil;
import util.Utils;
/**
 * Demo crawler: logs in through a Selenium-driven Chrome, captures the
 * {@code Authorization} request header through a DevTools listener, pages
 * through a search-result table, fetches each row's detail JSON over HTTP and
 * batch-inserts the collected rows into MySQL. Processed search names and row
 * ids are serialized to disk so a later run can skip them.
 *
 * <p>This is sample code; URLs, credentials and XPaths are placeholders that
 * must be adapted to the target site.
 *
 * @author admin
 */
public class CrawlerTest {
    /**
     * Value sent as the authorization header for detail requests. Written by the
     * DevTools listener thread and read by the main thread, hence volatile.
     */
    private static volatile String token = "xxxx";
    final static String driverAddr = "C:\\Users\\admin\\AppData\\Local\\Google\\Chrome\\Application\\chromedriver.exe";
    // Login page.
    final static String url1 = "https://xxxxx/system/login?";
    // Detail endpoint; "#{id}" is replaced with the row id.
    final static String querySaasUrlTemplate = "https://xxxxxx?id=#{id}";
    final static String url2 = "https://xxxxxx?";
    final static String userName = "uername";
    final static String passWord = "password";
    // Persisted set of row ids that were already stored.
    final static File idCacheFile = new File("id.bat");
    // Persisted set of search names that were already processed.
    final static File searchNameFile = new File("searchName.bat");
    final static Set<String> idSet = getCacheSet(idCacheFile);
    final static Set<String> searchNameSet = getCacheSet(searchNameFile);

    /** Shared JSON mapper; building one per row is unnecessary work. */
    private static final ObjectMapper JSON = new ObjectMapper();
    /** Upper bound on pages visited, to guard against an endless pagination loop. */
    private static final int MAX_PAGES = 1000;
    private static final String NEXT_PAGE_XPATH =
            "//*[@id=\"root\"]/section/section/div[2]/div/div[3]/div/div/div/div/div/ul/li[5]/button";
    private static final String TABLE_BODY_XPATH =
            "//*[@id=\"root\"]/section/section/div[2]/div/div[3]/div/div/div/div/div/div/div/div/table/tbody";

    /**
     * Runs one crawl for a single hard-coded search name and then blocks forever
     * so the browser window stays open.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        System.setProperty("webdriver.chrome.driver", driverAddr);
        ChromeOptions options = new ChromeOptions();
        // Hide the "Chrome is being controlled by automated test software" banner.
        options.setExperimentalOption("excludeSwitches", new String[] { "enable-automation" });
        ChromeDriver driver = new ChromeDriver(options);
        // Make navigator.webdriver read as undefined on every new document.
        Map<String, Object> command = new HashMap<>();
        command.put("source", "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})");
        driver.executeCdpCommand("Page.addScriptToEvaluateOnNewDocument", command);

        // Log in first.
        driver.get(url1);
        driver.manage().window().maximize();
        Utils.sleep(5000);
        driver.findElement(By.xpath("//*[@id=\"phone_number\"]")).sendKeys(userName);
        Utils.sleep(1000);
        driver.findElement(By.xpath("//*[@id=\"password\"]")).sendKeys(passWord);
        Utils.sleep(1000);
        // Tick the agreement checkbox.
        driver.findElement(By.xpath("//*[@id=\"agreement\"]")).click();
        Utils.sleep(1000);
        driver.findElement(
                By.xpath("//*[@id=\"root\"]/div/div[2]/div[1]/div[2]/div/div/form/div[4]/div/div/div/button")).click();
        String loginWindow = driver.getWindowHandle();
        Utils.sleep(1000);
        System.out.println("登录成功");
        Utils.sleep(3000);
        driver.get(url1);
        Utils.sleep(3000);

        // Open the search page in a new window.
        String js = "window.open(\"" + url2 + "\");";
        ((JavascriptExecutor) driver).executeScript(js);
        Utils.sleep(2000);
        // Pick the new window by handle instead of by position in the handle set.
        String searchWindow = null;
        for (String handle : driver.getWindowHandles()) {
            if (!handle.equals(loginWindow)) {
                searchWindow = handle;
                break;
            }
        }
        if (searchWindow == null) {
            driver.quit();
            throw new IllegalStateException("window.open did not produce a second window for " + url2);
        }
        // NOTE(review): the listener is registered before switching windows, as in the
        // original; confirm which browser target the DevTools session attaches to.
        createRequestListener(1, driver);
        driver.switchTo().window(searchWindow);
        Utils.sleep(1000);

        String searchName = "搜索名称";
        // Skip search names that were handled by a previous run.
        if (searchNameSet.contains(searchName)) {
            System.out.println(searchName + ":已经处理过");
            // Close the browser; nothing else will use it.
            driver.quit();
            return;
        }
        driver.findElement(By.xpath("//*[@id=\"name\"]")).sendKeys(searchName);
        driver.findElement(By.xpath("//*[@id=\"root\"]/section/section/div[2]/div/div[1]/div/form/div[6]/button"))
                .click();
        Utils.sleep(2000);

        // The "next page" button is used as the indicator that results exist.
        WebElement nextPageButton = null;
        try {
            nextPageButton = driver.findElement(By.xpath(NEXT_PAGE_XPATH));
        } catch (Exception exception) {
            System.out.println("没有数据");
        }

        // Rows of every page are collected and inserted in one batch.
        List<Fa> faList = new ArrayList<>();
        for (int page = 0; page < MAX_PAGES; page++) {
            // The first page is already displayed; later pages need a click.
            if (page != 0) {
                // No button, or a disabled one, means there is no further page.
                if (nextPageButton == null || !nextPageButton.isEnabled()) {
                    break;
                }
                nextPageButton.click();
                Utils.sleep(2000);
            }
            List<WebElement> tableBodies = driver.findElements(By.xpath(TABLE_BODY_XPATH));
            if (tableBodies.isEmpty()) {
                // No result table rendered at all.
                break;
            }
            List<WebElement> rows = tableBodies.get(0).findElements(By.tagName("tr"));
            System.out.println("");
            System.out.println("当前数据页数:" + (page + 1));
            for (WebElement row : rows) {
                Utils.sleep(500);
                String detailId = row.getAttribute("data-row-key");
                // Rows without a key (e.g. placeholder rows) and rows stored earlier are skipped.
                if (detailId == null || idSet.contains(detailId)) {
                    continue;
                }
                Fa fa = fetchDetail(row, detailId);
                if (fa != null) {
                    faList.add(fa);
                }
            }
        }

        if (!faList.isEmpty()) {
            MySqlStrategy.insertValue(faList);
        }
        // Remember what this run processed.
        searchNameSet.add(searchName);
        for (Fa stored : faList) {
            idSet.add(stored.getRowId());
        }
        SerializableUtil.serialization(searchNameFile, searchNameSet);
        SerializableUtil.serialization(idCacheFile, idSet);

        // Joining the current thread never completes; this keeps the process
        // (and the browser) alive until it is interrupted or killed.
        try {
            Thread.currentThread().join();
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Builds one {@link Fa} from a table row and the detail endpoint's response.
     * Terminates the JVM with status 1 when the response cannot be parsed or its
     * {@code code} is not 200, matching the original fail-fast behavior.
     *
     * @param row      the {@code <tr>} element of the result table
     * @param detailId value of the row's {@code data-row-key} attribute
     * @return the populated entity
     */
    private static Fa fetchDetail(WebElement row, String detailId) {
        // "./" keeps the lookup inside this row; a leading "//" would search the
        // whole document and always return the first row's cell.
        String unit = row.findElement(By.xpath("./td[5]")).getText().replace(" ", "");
        String countriesName = row.findElement(By.xpath("./td[7]")).getText().replace(" ", "");
        String querySaasUrl = querySaasUrlTemplate.replace("#{id}", detailId);
        String result = MyHttpUtil.getRequest(token, querySaasUrl);
        try {
            JsonNode root = JSON.readTree(result);
            // asText() yields 200 for both a numeric and a string-typed code.
            if ("200".equals(root.path("code").asText())) {
                JsonNode dataNode = root.get("data");
                System.out.println(dataNode);
                Fa fa = JSON.readValue(dataNode.toString(), Fa.class);
                fa.setUnit(unit);
                fa.setCountriesName(countriesName);
                // Keep the complete data node as returned by the endpoint.
                fa.setRowData(dataNode.toString());
                return fa;
            } else {
                System.out.println("获取json数据失败");
                System.out.println(root.toPrettyString());
                System.exit(1);
            }
        } catch (Exception e) {
            System.out.print("数据解析异常:");
            e.printStackTrace();
            System.exit(1);
        }
        return null;
    }

    /**
     * Registers a DevTools network listener that prints every outgoing request
     * and stores the first {@code Authorization} header it sees in {@link #token},
     * then closes the DevTools session.
     *
     * @param i      window index, informational only (not used by the listener)
     * @param driver the driver whose DevTools session is used
     */
    private static void createRequestListener(int i, ChromeDriver driver) {
        DevTools devTools = driver.getDevTools();
        devTools.createSession();
        devTools.send(
                Network.enable(java.util.Optional.empty(), java.util.Optional.empty(), java.util.Optional.empty()));
        devTools.addListener(Network.requestWillBeSent(), res -> {
            Utils.sleep(10);
            System.out.println("RequestHeaders:" + res.getRequest().getHeaders());
            System.out.println("RequestHeaders:" + res.getRequest().getUrl());
            Headers header = res.getRequest().getHeaders();
            synchronized (CrawlerTest.class) {
                if (header.containsKey("Authorization")) {
                    token = (String) header.get("Authorization");
                    // The token is all we need; stop listening.
                    devTools.close();
                    System.out.println("获取到了token:" + token);
                }
            }
        });
    }

    /**
     * Registers a listener that appends a line to {@code log/re.txt} for every
     * XHR response whose URL contains {@code /xxxxxx} and then requests the
     * response body through DevTools (the returned body is not used here).
     *
     * @param i        window index, only written into the log line
     * @param devTools an already created DevTools session
     */
    public static void interceptResponseXHRByUrl(int i, DevTools devTools) {
        devTools.addListener(Network.responseReceived(), responseReceived -> {
            try {
                if (ResourceType.XHR != responseReceived.getType()
                        || !responseReceived.getResponse().getUrl().contains("/xxxxxx")) {
                    return;
                }
                String data = "监控数据" + i + ":" + responseReceived.getType() + ":"
                        + responseReceived.getResponse().getUrl();
                Utils.sleep(2);
                File logFile = new File("log/re.txt");
                FileUtils.write(logFile, data, "UTF-8", true);
                FileUtils.write(logFile, "\r\n", "UTF-8", true);
                devTools.send(Network.getResponseBody(responseReceived.getRequestId()));
            } catch (Exception e) {
                e.printStackTrace();
            }
        });
    }

    /**
     * Loads a previously serialized set from {@code file}.
     *
     * @param file cache file to read
     * @return the stored set, or a new empty {@link LinkedHashSet} when nothing could be read
     */
    private static Set<String> getCacheSet(File file) {
        Set<String> empty = new LinkedHashSet<>();
        Set<String> cached = SerializableUtil.deserialization(file, empty);
        return cached != null ? cached : empty;
    }
}
代码如下(示例):
package entity;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
import com.fasterxml.jackson.annotation.JsonProperty;
import lombok.Data;
/**
 * One crawled record (parent row). Getters, setters, equals/hashCode and
 * toString are generated by Lombok's {@code @Data}; JSON properties without a
 * matching field are ignored during deserialization.
 */
@Data
@JsonIgnoreProperties(ignoreUnknown = true)
public class Fa{
// Raw JSON text of the record, stored as-is alongside the parsed fields.
private String rowData;
// Bound to the JSON property "id".
@JsonProperty("id")
private String rowId;
// Unit text; not bound by an explicit JSON annotation — presumably filled by the caller.
private String unit;
// Country name text; presumably filled by the caller as well.
private String countriesName;
// Child records belonging to this row.
private List<FaDetail> detailData;
}
代码如下(示例):
package entity;
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
import lombok.Data;
/**
 * Child record of a crawled row. Accessors are generated by Lombok's
 * {@code @Data}; unknown JSON properties are ignored during deserialization.
 */
@Data
@JsonIgnoreProperties(ignoreUnknown = true)
public class FaDetail {
// Id of the parent row this detail belongs to.
private Long faId;
// Detail type value as delivered in the JSON.
private String type;
}
代码如下(示例):
package util;
/**
 * Database connection settings.
 *
 * <p>URL, user and password can be overridden with the environment variables
 * {@code CRAWLER_DB_URL}, {@code CRAWLER_DB_USER} and {@code CRAWLER_DB_PASSWORD}
 * so that real credentials do not have to live in source code. When a variable
 * is unset or empty, the original demo value is used.
 */
public class Config {
    // JDBC driver class name for MySQL Connector/J 8.x.
    public static final String JDBC_DRIVER = "com.mysql.cj.jdbc.Driver";
    // Database address; change the database name as needed.
    public static final String DB_URL = fromEnv("CRAWLER_DB_URL",
            "jdbc:mysql://192.168.111.102:3306/crawler?useSSL=false&allowPublicKeyRetrieval=true&serverTimezone=Asia/Shanghai");
    // Database user.
    public static final String USER = fromEnv("CRAWLER_DB_USER", "root");
    // Database password. NOTE(review): the fallback is a literal credential; prefer the environment variable.
    public static final String PASSWORD = fromEnv("CRAWLER_DB_PASSWORD", "Sailing123`");

    /** Returns the environment variable {@code name}, or {@code fallback} when it is unset or empty. */
    private static String fromEnv(String name, String fallback) {
        String value = System.getenv(name);
        return (value == null || value.isEmpty()) ? fallback : value;
    }
}
代码如下(示例):
package util;
import org.apache.http.ParseException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
/**
 * Minimal HTTP GET helper built on a single shared Apache {@link CloseableHttpClient}.
 */
public class MyHttpUtil {
    // Sample values used only by main().
    // NOTE(review): this looks like a real token; confirm it is a placeholder before publishing.
    private static final String token="a7ee88f8-21d6-4b1d-bfa8-ff478a473304+1000001239406480";
    private static final String url="https://xxxxxx?id=xxxxxx";
    private static final CloseableHttpClient closeableHttpClient = HttpClients.createDefault();

    public static void main(String[] args) {
        getRequest(token,url);
    }

    /**
     * Sends a GET request with the given value in the {@code authorization} header
     * and returns the response body as text. The HTTP status code is not inspected.
     *
     * <p>On an I/O or parse failure the stack trace and a message are printed and
     * the JVM is terminated with status 1.
     *
     * @param token value for the {@code authorization} header
     * @param url   absolute URL to request
     * @return the response body, or {@code null} when the response carries no entity
     */
    public static String getRequest(String token,String url){
        HttpGet httpGet = new HttpGet(url);
        httpGet.setHeader("authorization", token);
        // try-with-resources closes the response even when reading the body fails.
        try (CloseableHttpResponse response = closeableHttpClient.execute(httpGet)) {
            // EntityUtils.toString rejects a null entity, so handle a body-less response here.
            if (response.getEntity() == null) {
                return null;
            }
            return EntityUtils.toString(response.getEntity());
        } catch (ParseException | IOException e) {
            e.printStackTrace();
            System.out.println("请求数据出错,请排查问题");
            System.exit(1);
        } finally {
            // Hand the connection back to the pool so it can be reused.
            httpGet.releaseConnection();
        }
        return null;
    }
}
代码如下(示例):
package util;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.List;
import entity.Fa;
import entity.FaDetail;
/**
 * JDBC batch writer for crawled rows: inserts {@link Fa} records into table
 * {@code fa} and their {@link FaDetail} children into {@code fa_detail} inside
 * one transaction, using a single connection opened when the class is loaded.
 */
public class MySqlStrategy {
    private final static String url = Config.DB_URL;
    private final static String user = Config.USER;
    private final static String password = Config.PASSWORD;
    // Null when the initial connection attempt failed.
    private static Connection conn = getConnection();

    // ALTER TABLE factor AUTO_INCREMENT=1;
    public static void main(String[] args) {
        insertValue(null);
    }

    /** Opens the connection; prints the error and returns {@code null} when that fails. */
    private static Connection getConnection() {
        try {
            return DriverManager.getConnection(url, user, password);
        } catch (SQLException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Inserts all parent rows in one batch, reads back their generated keys,
     * assigns them to the child records and inserts those in a second batch,
     * then commits.
     *
     * <p>On failure the transaction is rolled back. A failure whose message
     * contains {@code "Duplicate entry"} is treated as "already stored" and the
     * method returns quietly; any other failure prints the stack trace and
     * terminates the JVM with status 1.
     *
     * @param datalist rows to store; {@code null} or empty is a no-op
     * @throws IllegalStateException if no database connection could be established
     */
    public static void insertValue(List<Fa> datalist) {
        if (datalist == null || datalist.isEmpty()) {
            return;
        }
        if (conn == null) {
            throw new IllegalStateException("database connection was not established");
        }
        String sql = "insert into fa values(?,?,?,?,?)";
        String gasSql = "insert into fa_detail values(?,?,?)";
        try (PreparedStatement statement = conn.prepareStatement(sql, Statement.RETURN_GENERATED_KEYS);
                PreparedStatement detailStatement = conn.prepareStatement(gasSql)) {
            conn.setAutoCommit(false);
            for (Fa fa : datalist) {
                creatFaParam(fa, statement);
                statement.addBatch();
            }
            statement.executeBatch();

            List<Long> idList = new ArrayList<>(datalist.size());
            try (ResultSet generatedKeys = statement.getGeneratedKeys()) {
                while (generatedKeys.next()) {
                    idList.add(generatedKeys.getLong(1));
                }
            }
            // Keys are matched to rows by position, so the counts must agree.
            if (idList.size() != datalist.size()) {
                throw new SQLException("expected " + datalist.size() + " generated keys but got " + idList.size());
            }

            // Give every child record the generated id of its parent row.
            for (int i = 0; i < datalist.size(); i++) {
                List<FaDetail> detailList = datalist.get(i).getDetailData();
                if (detailList == null) {
                    continue;
                }
                for (FaDetail detail : detailList) {
                    detail.setFaId(idList.get(i));
                    creatFaDetailParam(detailStatement, detail);
                    detailStatement.addBatch();
                }
            }
            detailStatement.executeBatch();
            conn.commit();
        } catch (Exception e1) {
            try {
                conn.rollback();
            } catch (SQLException rollbackFailure) {
                // Keep the rollback failure visible without hiding the original cause.
                e1.addSuppressed(rollbackFailure);
            }
            // getMessage() may be null (e.g. NullPointerException), so check before matching.
            String message = e1.getMessage();
            if (message != null && message.contains("Duplicate entry")) {
                return;
            }
            // Anything else needs investigation: stop the program.
            e1.printStackTrace();
            System.exit(1);
        }
    }

    /** Binds one child row; column 1 is bound to NULL (presumably an auto-generated id — verify against the schema). */
    private static void creatFaDetailParam(PreparedStatement statement, FaDetail detail) throws SQLException {
        statement.setString(1, null);
        statement.setLong(2, detail.getFaId());
        statement.setString(3, detail.getType());
    }

    /** Binds one parent row; column 1 is bound to NULL (presumably an auto-generated id — verify against the schema). */
    private static void creatFaParam(Fa fa, PreparedStatement statement) throws SQLException {
        statement.setString(1, null);
        statement.setString(2, fa.getRowData());
        statement.setLong(3, Long.parseLong(fa.getRowId()));
        statement.setString(4, fa.getUnit());
        statement.setString(5, fa.getCountriesName());
    }

    /**
     * Closes the given JDBC resources, innermost first (result set, statement,
     * connection). Each argument may be {@code null}; a failure to close one
     * resource is printed and does not prevent closing the others.
     */
    public static void close(Connection connection, Statement statement, ResultSet resultSet) {
        try {
            if (resultSet != null) {
                resultSet.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        try {
            if (statement != null) {
                statement.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        try {
            if (connection != null) {
                connection.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
代码如下(示例):
package util;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.LinkedHashSet;
import java.util.Set;
/**
 * Helpers that persist an object to a file with Java serialization and read it back.
 *
 * <p>NOTE(review): Java-native deserialization must only be used on files this
 * program wrote itself; never point it at untrusted data.
 */
public class SerializableUtil{
    public static void main(String[] args) {
        File file=new File("test.dat");
        Set<String> set=new LinkedHashSet<>();
        set.add("hello");
        SerializableUtil.serialization(file, set);
        Set<String> set1=SerializableUtil.deserialization(file,new LinkedHashSet<String>());
        System.out.println(set1);
    }

    /**
     * Writes {@code t} to {@code file}, replacing any previous content. An I/O
     * failure is printed and otherwise ignored.
     *
     * @param file target file
     * @param t    object to write; must be serializable
     */
    public static <T> void serialization(File file, T t) {
        // try-with-resources closes both streams even when writing fails.
        try (FileOutputStream out = new FileOutputStream(file);
                ObjectOutputStream oos = new ObjectOutputStream(out)) {
            oos.writeObject(t);
            oos.flush();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads one object from {@code file}.
     *
     * @param file source file
     * @param t    only fixes the expected type at the call site; its value is not used
     * @return the stored object, or {@code null} when the file does not exist or
     *         cannot be read (the failure is printed)
     */
    @SuppressWarnings("unchecked") // The caller asserts the stored type through the type argument.
    public static <T> T deserialization(File file, T t) {
        if (!file.exists()) {
            return null;
        }
        // The file stream is a separate resource so it is closed even when the
        // ObjectInputStream constructor itself throws (e.g. on an empty file).
        try (FileInputStream in = new FileInputStream(file);
                ObjectInputStream ois = new ObjectInputStream(in)) {
            return (T) ois.readObject();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }
}
代码如下(示例):
package util;
/** Small helpers shared by the crawler. */
public class Utils {
    /**
     * Pauses the current thread for the given number of milliseconds.
     *
     * <p>If the thread is interrupted the pause ends early and the interrupt
     * flag is set again, so callers further up can still notice the interruption.
     *
     * @param time pause length in milliseconds; must not be {@code null}
     */
    public static void sleep(Integer time){
        try {
            Thread.sleep(time);
        } catch (InterruptedException e) {
            // Thread.sleep clears the flag when it throws; restore it instead of losing it.
            Thread.currentThread().interrupt();
        }
    }
}
selenium-java结合httpclient可以满足大部分网站的爬虫需求,示例代码就到这儿了。