1.下载 ChromeDriver(Chrome 浏览器驱动):http://npm.taobao.org/mirrors/chromedriver/
2.ChromeDriver 的版本要与你使用的 Chrome 版本对应,Chrome 版本的查看位置:设置->帮助->关于 Google Chrome
3.尖叫数据第三方验证码识别(这个效率比较高,如需账号可以与本人联系):http://www.jianjiaoshuju.com/
1.当登录按钮不是链接、而是绑定了单击事件时,需要使用 Selenium 浏览器自动化框架来模拟点击操作。
2.自己编写识别程序,或者使用免费的 Tesseract-OCR 4.0 之类的工具来识别验证码,都需要投入较多的精力,时间成本太高,因此这里使用第三方识别服务。
1.Selenium模拟点击登录按钮,并获取验证码
2.第三方工具识别验证码
3.手输短信验证码,本地读取文本(前提手机号码是你的)
4.通过manage().getCookies();方法获取Cookies
5.指定body的xpath,获取页面内容
1.工具类:根据图片地址转换为base64编码字符串
/**
 * Reads the file at the given path and returns its contents as a Base64 string.
 *
 * <p>The result is a single-line Base64 string (standard alphabet, with padding)
 * and contains no line separators.
 *
 * @param imgFile path of the image file to encode
 * @return the Base64-encoded contents of the file
 * @throws java.io.UncheckedIOException if the file cannot be opened or read
 */
public static String getImageBase(String imgFile) {
    byte[] data;
    try {
        // readAllBytes reads until end-of-file and closes the file on every path.
        // A single InputStream.read(byte[]) into a buffer sized by available()
        // is allowed to return fewer bytes than the file holds.
        data = java.nio.file.Files.readAllBytes(java.nio.file.Paths.get(imgFile));
    } catch (IOException e) {
        // Fail with the real cause instead of continuing with a null buffer.
        throw new java.io.UncheckedIOException("Cannot read image file: " + imgFile, e);
    }
    // java.util.Base64 is the supported JDK encoder; sun.misc.BASE64Encoder is an
    // internal class that no longer exists in current JDKs.
    return java.util.Base64.getEncoder().encodeToString(data);
}
2.工具类:识别验证码的第三方api
/**
 * Posts a Base64-encoded captcha image to the jianjiaoshuju recognition API and
 * returns the recognised captcha text.
 *
 * <p>Sample of a successful response body, as given by the original author:
 * <pre>
 * {
 *   "msg": "查询成功!",
 *   "v_type": "ne4",
 *   "v_code": "37PW",
 *   "errCode": 0
 * }
 * </pre>
 *
 * @param imgBase   Base64-encoded captcha image
 * @param appcode   jianjiaoshuju AppCode credential
 * @param appKey    jianjiaoshuju AppKey credential
 * @param appSecret jianjiaoshuju AppSecret credential
 * @return the {@code v_code} field of the response JSON; if the request or the
 *         response parsing throws, the literal string "第三方接口调用失败" is
 *         returned instead, so callers must not treat every result as a valid code
 */
public static String getImgCode(String imgBase, String appcode, String appKey, String appSecret) {
    String host = "http://apigateway.jianjiaoshuju.com";
    String path = "/api/v_1/yzm.html";
    String method = "POST";

    // Credentials and content type are sent as request headers.
    Map<String, String> headers = new HashMap<>();
    headers.put("appcode", appcode);
    headers.put("appKey", appKey);
    headers.put("appSecret", appSecret);
    headers.put("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8");

    // No query-string parameters are used; an empty map is still passed to doPost.
    Map<String, String> querys = new HashMap<>();

    // Form body: the image plus the captcha type code.
    // NOTE(review): "ne4" presumably selects a 4-character alphanumeric captcha — confirm with the provider docs.
    Map<String, String> bodys = new HashMap<>();
    bodys.put("v_pic", imgBase);
    bodys.put("v_type", "ne4");

    try {
        HttpResponse response = HttpUtils.doPost(host, path, method, headers, querys, bodys);
        System.out.println(response.toString());
        JSONObject jsonObject = GetHttpEntity(response);
        return jsonObject.getString("v_code");
    } catch (Exception e) {
        e.printStackTrace();
        return "第三方接口调用失败";
    }
}
3.工具类:本地读取文本
/**
 * Returns the first line of a local text file, which is expected to hold the
 * SMS verification code.
 *
 * @param filePath path of the local txt file containing the SMS verification code
 * @return the first line of the file, or {@code null} if the file is empty;
 *         the literal "文件不存在!" if the path is not an existing regular file;
 *         the literal "文件读取错误!" if anything throws while opening or reading
 */
public static String readTxt(String filePath) {
    try {
        File file = new File(filePath);
        // isFile() is already false for a path that does not exist.
        if (!file.isFile()) {
            return "文件不存在!";
        }
        // try-with-resources closes the reader (and the underlying stream) on
        // every path, including the normal return and read failures.
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(file), "utf-8"))) {
            // Only the first line is wanted; readLine() yields null for an empty file.
            return br.readLine();
        }
    } catch (Exception e) {
        return "文件读取错误!";
    }
}
4.Selenium模拟登录并获取数据
import net.sf.json.JSONObject;
import org.openqa.selenium.*;
import org.openqa.selenium.chrome.ChromeDriver;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.sql.*;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.Date;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Logs in to a web site through a Selenium-driven Chrome browser, then captures
 * the session cookies and the text of the page body.
 *
 * <p>NOTE(review): the class declares {@code PageProcessor}, but no implementation
 * of that interface's methods is visible in this excerpt. The helpers
 * {@code getImageBase}, {@code getImgCode} and {@code readTxt} are called
 * unqualified and must be in scope (same class or static import).
 */
public class BlogPageProcessor implements PageProcessor {
    // Local path of the ChromeDriver executable
    static String chromeDriverLocalPath = "C:\\Program Files (x86)\\Google\\chromedriver.exe";
    // URL of the site to log in to
    static String loginPath = "http://www.xxxx.com/";
    // Credentials for the jianjiaoshuju third-party captcha API
    static String appcode = "xxxx";
    static String appKey = "xxxx";
    static String appSecret = "xxxx";
    // Account used for the login
    static String loginName = "xxxx";
    static String password = "xxxx";
    // Local text file into which the SMS verification code is typed by hand
    static String validateCodePath = "E:\\validateCode.txt";
    // Cookies captured by the most recent SeleniumLogin() call
    static Set<Cookie> cookies;

    /**
     * Performs the login in a Chrome browser, stores the resulting cookies in
     * {@link #cookies}, and returns the text of the page body.
     *
     * <p>The browser is always closed before this method returns or throws.
     *
     * @return the text of the {@code /html/body} element
     * @throws Exception if any browser, image, or file step fails, or if the
     *                   thread is interrupted while waiting for the SMS code
     */
    public static String SeleniumLogin() throws Exception {
        // Tell Selenium where the ChromeDriver executable lives
        System.setProperty("webdriver.chrome.driver", chromeDriverLocalPath);
        ChromeDriver driver = new ChromeDriver();
        try {
            driver.manage().window().maximize();
            driver.manage().timeouts().implicitlyWait(5, TimeUnit.SECONDS);

            // Open the site and bring up the login form
            driver.get(loginPath);
            driver.findElement(By.className("login-btn")).click();

            // Take a full screenshot and crop it to the captcha image element
            WebElement element = driver.findElement(By.xpath("//*[@id=\"pwd_captcha_code_img\"]"));
            File screenshot = driver.getScreenshotAs(OutputType.FILE);
            BufferedImage image = ImageIO.read(screenshot);
            Point point = element.getLocation();
            int width = element.getSize().getWidth();
            int height = element.getSize().getHeight();
            BufferedImage subImage = image.getSubimage(point.getX(), point.getY(), width, height);
            // The cropped captcha overwrites the screenshot file
            ImageIO.write(subImage, "png", screenshot);

            // Encode the captcha image as Base64
            String imageBase = getImageBase(screenshot.getPath());
            // Third-party recognition; the result may be wrong and is not re-checked here
            String imgString = getImgCode(imageBase, appcode, appKey, appSecret);

            // Fill in and submit the login form
            driver.findElementById("loginName").sendKeys(loginName);
            driver.findElementById("pwd_img_code").sendKeys(imgString);
            driver.findElementById("password").sendKeys(password);
            driver.findElementById("loginAjaxBtn").click();

            // Request the SMS code, then wait 50 seconds so it can be written
            // into the local text file by hand
            driver.findElementById("mobileCaptchaBtn").click();
            Thread.sleep(50000);
            String validateCode = readTxt(validateCodePath);
            driver.findElementById("captcha").sendKeys(validateCode);
            driver.findElementById("validatePhoneBtn").click();

            WebElement body = driver.findElementByXPath("/html/body");
            // Cookies of the logged-in session
            cookies = driver.manage().getCookies();
            // Text of the whole body; read before the browser is closed
            return body.getText();
        } finally {
            // Without this, every call (including failed ones) leaves a Chrome
            // window and a chromedriver process running.
            driver.quit();
        }
    }
}