本文利用selenium和百度OCR在线文字识别,完成对某一网站的模拟登录操作:通过OCR识别验证码完成登录后返回cookie,常用于Java爬虫。
* https://ai.baidu.com/file/658A35ABAB2D404FBF903F64D47C1F72
* https://ai.baidu.com/file/C8D81F3301E24D2892968F09AE1AD6E2
* https://ai.baidu.com/file/544D677F5D4E4F17B4122FBD60DB82B3
* https://ai.baidu.com/file/470B3ACCA3FE43788B5A963BF0B625F3
* 第五个类(AuthService)需要手动添加,代码如下:
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Map;
import org.json.JSONObject;
/**
 * Helper that obtains an access token from the Baidu AI open platform
 * (OAuth 2.0 client-credentials request).
 */
public class AuthService {

    /**
     * Requests an access token with the API Key / Secret Key configured inside this method.
     * Replace the two placeholder values with the credentials of your own application.
     *
     * @return the {@code access_token} value, for example
     *         "24.460da4889caad24cccdb1fea17221975.2592000.1491995545.282335-1234567",
     *         or {@code null} when the token could not be obtained
     */
    public static String getAuth() {
        // API Key obtained from the Baidu console -- replace with your own
        String clientId = "百度云应用的AK";
        // Secret Key obtained from the Baidu console -- replace with your own
        String clientSecret = "百度云应用的SK";
        return getAuth(clientId, clientSecret);
    }

    /**
     * Requests an API access token.
     * The token is only valid for a limited time; callers have to manage it themselves and
     * request a new one once it has expired.
     *
     * @param ak API Key obtained from the Baidu console
     * @param sk Secret Key obtained from the Baidu console
     * @return the {@code access_token} value, for example
     *         "24.460da4889caad24cccdb1fea17221975.2592000.1491995545.282335-1234567",
     *         or {@code null} when the request or the parsing of the response fails
     *         (the failure is reported on {@code System.err})
     */
    public static String getAuth(String ak, String sk) {
        // Token endpoint
        String authHost = "https://aip.baidubce.com/oauth/2.0/token?";
        HttpURLConnection connection = null;
        try {
            // Credentials are URL-encoded so that reserved characters cannot corrupt the query.
            String getAccessTokenUrl = authHost
                    // 1. grant_type is a fixed parameter
                    + "grant_type=client_credentials"
                    // 2. API Key
                    + "&client_id=" + URLEncoder.encode(ak, "UTF-8")
                    // 3. Secret Key
                    + "&client_secret=" + URLEncoder.encode(sk, "UTF-8");
            URL realUrl = new URL(getAccessTokenUrl);
            connection = (HttpURLConnection) realUrl.openConnection();
            connection.setRequestMethod("GET");
            connection.connect();

            // Dump all response headers for diagnosis.
            for (Map.Entry<String, List<String>> header : connection.getHeaderFields().entrySet()) {
                System.err.println(header.getKey() + "--->" + header.getValue());
            }

            // Read the whole body; the reader is closed even when reading fails.
            StringBuilder result = new StringBuilder();
            try (BufferedReader in = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) {
                String line;
                while ((line = in.readLine()) != null) {
                    result.append(line);
                }
            }

            // NOTE: this diagnostic line contains the access token itself.
            System.err.println("result:" + result);
            JSONObject jsonObject = new JSONObject(result.toString());
            return jsonObject.getString("access_token");
        } catch (Exception e) {
            System.err.println("获取token失败!");
            e.printStackTrace(System.err);
            return null;
        } finally {
            if (connection != null) {
                connection.disconnect();
            }
        }
    }
}
chromedriver版本 | 支持的chrome版本 |
---|---|
v2.46 | v71-73 |
v2.45 | v70-72 |
v2.44 | v69-71 |
v2.43 | v69-71 |
v2.42 | v68-70 |
v2.41 | v67-69 |
v2.40 | v66-68 |
v2.39 | v66-68 |
v2.38 | v65-67 |
v2.37 | v64-66 |
v2.36 | v63-65 |
v2.35 | v62-64 |
v2.34 | v61-63 |
v2.33 | v60-62 |
v2.32 | v59-61 |
v2.31 | v58-60 |
v2.30 | v58-60 |
v2.29 | v56-58 |
v2.28 | v55-57 |
v2.27 | v54-56 |
v2.26 | v53-55 |
v2.25 | v53-55 |
v2.24 | v52-54 |
v2.23 | v51-53 |
v2.22 | v49-52 |
v2.21 | v46-50 |
v2.20 | v43-48 |
v2.19 | v43-47 |
v2.18 | v43-46 |
v2.17 | v42-43 |
v2.16 | v42-45 |
v2.15 | v40-43 |
v2.14 | v39-42 |
v2.13 | v38-41 |
v2.12 | v36-40 |
v2.11 | v36-40 |
v2.10 | v33-36 |
v2.9 | v31-34 |
ps:下载完毕后把chromedriver.exe放置在运行工程的根目录下
<dependencies>
<!-- Selenium dependency (browser automation) -->
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
<version>3.141.59</version>
</dependency>
<!-- Logging dependency -->
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<version>1.7.25</version>
</dependency>
<!-- Baidu OCR package (Baidu AIP Java SDK) -->
<dependency>
<groupId>com.baidu.aip</groupId>
<artifactId>java-sdk</artifactId>
<version>4.4.1</version>
</dependency>
</dependencies>
/**
* @Author: zf
* @Date: 2020/08/26 09:08:35
* @Desc: 调用百度OCR接口来识别验证码, 直接返回识别后得到的字符串内容
*/
public class BaiDuOcr {
// 传入参数为图片的本地地址
public static String webImage(String imageUrl) {
// 请求url
String url = "https://aip.baidubce.com/rest/2.0/ocr/v1/webimage";
// 识别得到的字串
String result = null;
try {
// 本地文件路径
String filePath = imageUrl;
byte[] imgData = FileUtil.readFileByBytes(filePath);
String imgStr = Base64Util.encode(imgData);
String imgParam = URLEncoder.encode(imgStr, "UTF-8");
String param = "image=" + imgParam;
// 获取百度Ocrtoken
String accessToken = AuthService.getAuth(API Key, Secret Key);
result = HttpUtil.post(url, accessToken, param);
result = result.substring(result.lastIndexOf(":") + 3, result.lastIndexOf("\""));
} catch (Exception e) {
e.printStackTrace();
}
return result;
}
}
import org.apache.commons.io.FileUtils;
import org.openqa.selenium.*;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * @Author: zf
 * @Date: 2020/08/26 11:38:06
 * @Desc: Uses Selenium + Baidu OCR: crops the captcha image out of a page screenshot, sends it
 *        to Baidu OCR, completes the login and returns the resulting cookies.
 */
public class CookieLogin {

    /** Viewport width (CSS pixels) assumed when the real width cannot be queried. */
    private static final int FALLBACK_VIEWPORT_WIDTH = 1280;

    /**
     * Logs in to a site protected by an image captcha and returns the session cookies.
     *
     * @param url        address of the login page
     * @param nameId     id of the user-name input element
     * @param pwdId      id of the password input element
     * @param imageId    id of the captcha image element
     * @param codeId     id of the captcha input element
     * @param loginClass class name of the login button element
     * @param name       user name
     * @param pwd        password
     * @return the string form of the driver's cookie set after the login attempt, or
     *         {@code null} when the captcha could not be captured or recognised
     */
    public static String doLogin(String url, String nameId, String pwdId, String imageId, String codeId,
                                 String loginClass, String name, String pwd) {
        // Point Selenium at chromedriver.exe in the working directory, but only when the caller
        // has not configured a driver and the file is really there; otherwise Selenium keeps
        // resolving the driver the way it did before (e.g. through PATH).
        if (System.getProperty("webdriver.chrome.driver") == null
                && new File("chromedriver.exe").isFile()) {
            System.setProperty("webdriver.chrome.driver", "chromedriver.exe");
        }

        // Start the headless browser
        ChromeOptions chromeOptions = new ChromeOptions();
        chromeOptions.setHeadless(true);
        WebDriver driver = new ChromeDriver(chromeOptions);

        // Time-stamped name so concurrent runs do not overwrite each other's captcha file
        File captchaFile = new File(System.currentTimeMillis() + ".png");
        // Local on purpose: a failed attempt must not return cookies of an earlier call.
        String cookies = null;
        try {
            driver.get(url);
            driver.manage().window().maximize();

            // User name
            WebElement nameInput = driver.findElement(By.id(nameId));
            nameInput.clear();
            nameInput.sendKeys(name);
            // Password
            WebElement pwdInput = driver.findElement(By.id(pwdId));
            pwdInput.clear();
            pwdInput.sendKeys(pwd);

            // Screenshot the page and cut the captcha element out of it
            WebElement captcha = driver.findElement(By.id(imageId));
            File screenshot = ((TakesScreenshot) driver).getScreenshotAs(OutputType.FILE);
            BufferedImage fullImg = ImageIO.read(screenshot);
            if (fullImg == null) {
                throw new IOException("Screenshot could not be decoded: " + screenshot);
            }
            ImageIO.write(cropElement(driver, captcha, fullImg), "png", captchaFile);

            // Recognise the captcha through Baidu OCR
            String code = BaiDuOcr.webImage(captchaFile.getPath());
            if (code == null || code.isEmpty()) {
                System.err.println("Captcha recognition failed, login aborted");
                return null;
            }

            // Captcha
            WebElement codeInput = driver.findElement(By.id(codeId));
            codeInput.clear();
            codeInput.sendKeys(code);

            // Click the login button and give the site time to finish the login
            driver.findElement(By.className(loginClass)).click();
            Thread.sleep(5000);

            cookies = driver.manage().getCookies().toString();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            // Keep the interrupt visible to the caller.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } finally {
            // Always remove the temporary captcha file and close the browser.
            captchaFile.delete();
            driver.quit();
        }
        return cookies;
    }

    /**
     * Cuts the area covered by {@code element} out of the page screenshot.
     * Element coordinates are scaled by screenshot width / viewport width, and the rectangle is
     * clamped to the screenshot so that cropping cannot fail on out-of-range bounds.
     * NOTE(review): assumes the page has not been scrolled, as the original code did -- confirm.
     */
    private static BufferedImage cropElement(WebDriver driver, WebElement element, BufferedImage fullImg) {
        float rate = (float) fullImg.getWidth() / viewportWidth(driver);

        org.openqa.selenium.Point point = element.getLocation();
        org.openqa.selenium.Dimension size = element.getSize();

        int x = Math.min(Math.max((int) (point.getX() * rate), 0), fullImg.getWidth() - 1);
        int y = Math.min(Math.max((int) (point.getY() * rate), 0), fullImg.getHeight() - 1);
        int width = Math.min(Math.max((int) (size.getWidth() * rate), 1), fullImg.getWidth() - x);
        int height = Math.min(Math.max((int) (size.getHeight() * rate), 1), fullImg.getHeight() - y);

        return fullImg.getSubimage(x, y, width, height);
    }

    /**
     * Returns the browser's viewport width in CSS pixels, falling back to
     * {@link #FALLBACK_VIEWPORT_WIDTH} when it cannot be determined.
     */
    private static long viewportWidth(WebDriver driver) {
        if (driver instanceof JavascriptExecutor) {
            try {
                Object width = ((JavascriptExecutor) driver).executeScript("return window.innerWidth;");
                if (width instanceof Number && ((Number) width).longValue() > 0) {
                    return ((Number) width).longValue();
                }
            } catch (WebDriverException ignored) {
                // Script execution unavailable: use the fallback width below.
            }
        }
        return FALLBACK_VIEWPORT_WIDTH;
    }
}
/**
* @Author: zf
* @Date: 2020/08/27 22:42:19
*/
public class getCookie {
public static void main(String[] args) {
String Cookie = CookieLogin.doLogin(请求地址url,
用户名输入框所在标签的id,
密码输入框所在标签的id,
验证码图片所在标签的id,
"验证码输入框所在标签的id,
登录按钮所在标签的class,
用户名,
密码);
System.out.println(Cookie);
}
}
# Root logger: level INFO, writing to the "stdout" appender only.
log4j.rootLogger=INFO, stdout
# Console appender with a "date level [logger] - message" line pattern.
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d %p [%c] - %m%n
# File appender writing to target/spring.log with the same pattern.
# It is defined here but not listed on log4j.rootLogger above, so nothing in these lines
# routes output to it; add "logfile" to the root logger to enable it.
log4j.appender.logfile=org.apache.log4j.FileAppender
log4j.appender.logfile.File=target/spring.log
log4j.appender.logfile.layout=org.apache.log4j.PatternLayout
log4j.appender.logfile.layout.ConversionPattern=%d %p [%c] - %m%n
# WebDriver selection and driver-specific settings.
# NOTE(review): presumably the config file read by webmagic-selenium -- confirm which
# component loads it.
# What WebDriver to use for the tests
# (exactly one "driver" line is active; the commented-out lines are alternatives)
#driver=phantomjs
#driver=firefox
driver=chrome
#driver=http://localhost:8910
#driver=http://localhost:4444/wd/hub
# PhantomJS specific config (change according to your installation)
#phantomjs_exec_path=/Users/Bingo/bin/phantomjs-qt5
#phantomjs_exec_path=d:/phantomjs.exe
# Example chromedriver executable path (disabled; adjust to your installation before enabling)
#chrome_exec_path=E:\\demo\\crawler\\chromedriver.exe
#phantomjs_driver_path=/Users/Bingo/Documents/workspace/webmagic/webmagic-selenium/src/main.js
#phantomjs_driver_loglevel=DEBUG
# Log level setting for the Chrome driver
chrome_driver_loglevel=DEBUG
最近在学习Java爬虫时发现,网上关于使用selenium模拟登录(识别验证码)后返回cookie的教程非常少,因此写下这篇教程。此外,在爬虫程序的实际开发中,需要带验证码登录的情况非常多见:对于比较清晰的验证码,可以使用本教程里的百度OCR或tesseract-ocr工具进行识别;但是对于比较复杂的验证码,以及滑块、文字类的复杂验证码,一般需要接入打码平台来完成。