Java Selenium自动化识别图形验证码及短信验证模拟登录,并获取Cookies和页面内容

一、资源

1.下载谷歌浏览器驱动(chromedriver):http://npm.taobao.org/mirrors/chromedriver/

2.chromedriver的版本要与你使用的Chrome版本对应,Chrome版本的查看位置:设置->帮助->关于 Google Chrome

Java Selenium自动化识别图形验证码及短信验证模拟登录,并获取Cookies和页面内容_第1张图片

3.尖叫数据第三方验证码识别(这个效率比较高,如需账号可以与本人联系):http://www.jianjiaoshuju.com/

二、抛出的问题

1.当登录按钮不是链接、而是绑定了单击事件时,需要借助Selenium浏览器自动化框架来模拟点击操作。

2.自己编写识别程序,或者使用免费的Tesseract-OCR 4.0之类的验证码识别工具,都需要投入较多的精力,时间成本太高。

三、解决的思路

1.Selenium模拟点击登录按钮,并获取验证码

2.第三方工具识别验证码

3.手动把收到的短信验证码写入本地文本文件,再由程序读取该文本(前提是手机号码是你自己的)

4.通过manage().getCookies();方法获取Cookies

5.指定body的xpath,获取页面内容

四、代码示例

1.工具类:根据图片地址转换为base64编码字符串

    /**
     * Reads the file at the given path and returns its content as a Base64 string.
     *
     * <p>The result is a single-line Base64 string without line separators, so it
     * can be placed directly into a request field.
     *
     * @param imgFile path of the image file to encode
     * @return Base64 encoding of the file's bytes
     * @throws java.io.UncheckedIOException if the file cannot be read
     */
    public static String getImageBase(String imgFile) {
        byte[] data;
        try {
            // readAllBytes reads the complete file and always releases the handle;
            // InputStream.available() + a single read() does not guarantee either.
            data = java.nio.file.Files.readAllBytes(java.nio.file.Paths.get(imgFile));
        } catch (java.io.IOException e) {
            // Fail with the real cause instead of continuing with a null buffer.
            throw new java.io.UncheckedIOException("Failed to read image file: " + imgFile, e);
        }
        // java.util.Base64 is the supported encoder; sun.misc.BASE64Encoder is a
        // JDK-internal class that is not available on current JDKs.
        return java.util.Base64.getEncoder().encodeToString(data);
    }

 2.工具类:识别验证码的第三方api

    /**
     * Posts a Base64-encoded captcha image to the jianjiaoshuju recognition API
     * and returns the recognized text.
     *
     * <p>Example of a successful reply:
     * <pre>
     * {
     *   "msg": "查询成功!",
     *   "v_type": "ne4",
     *   "v_code": "37PW",
     *   "errCode": 0
     * }
     * </pre>
     *
     * @param imgBase   Base64 string of the captcha image
     * @param appcode   jianjiaoshuju AppCode credential
     * @param appKey    jianjiaoshuju AppKey credential
     * @param appSecret jianjiaoshuju AppSecret credential
     * @return the {@code v_code} field of the JSON reply, or a fixed failure
     *         message when the request or the parsing throws
     */
    public static String getImgCode(String imgBase, String appcode, String appKey, String appSecret) {

        final String gatewayHost = "http://apigateway.jianjiaoshuju.com";
        final String endpoint = "/api/v_1/yzm.html";
        final String httpMethod = "POST";

        // Form fields: the image itself and the captcha type code.
        Map formFields = new HashMap();
        formFields.put("v_pic", imgBase);
        formFields.put("v_type", "ne4");

        // Credentials are passed as request headers.
        Map requestHeaders = new HashMap();
        requestHeaders.put("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8");
        requestHeaders.put("appcode", appcode);
        requestHeaders.put("appKey", appKey);
        requestHeaders.put("appSecret", appSecret);

        // No query-string parameters are sent; the map stays empty.
        Map queryParams = new HashMap();

        try {
            HttpResponse reply =
                    HttpUtils.doPost(gatewayHost, endpoint, httpMethod, requestHeaders, queryParams, formFields);
            System.out.println(reply.toString());
            return GetHttpEntity(reply).getString("v_code");
        } catch (Exception e) {
            e.printStackTrace();
            return "第三方接口调用失败";
        }
    }

3.工具类:本地读取文本

    /**
     * Returns the first line of a UTF-8 text file, used here to pick up an SMS
     * verification code that was saved to disk.
     *
     * @param filePath path of the local txt file holding the SMS verification code
     * @return the first line of the file; {@code null} when the file is empty;
     *         {@code "文件不存在!"} when the path is not an existing regular file;
     *         {@code "文件读取错误!"} when reading throws
     */
    public static String readTxt(String filePath) {
        try {
            File file = new File(filePath);
            // isFile() is only true for an existing regular file, so no extra exists() check.
            if (!file.isFile()) {
                return "文件不存在!";
            }
            // try-with-resources closes the reader on every path, including the
            // early return of the first line.
            try (BufferedReader br = new BufferedReader(
                    new InputStreamReader(new FileInputStream(file), "utf-8"))) {
                return br.readLine();
            }
        } catch (Exception e) {
            // Broad catch kept on purpose: any failure (including a null path)
            // maps to the same sentinel message callers already receive.
            return "文件读取错误!";
        }
    }

 4.Selenium模拟登录并获取数据

import net.sf.json.JSONObject;
import org.openqa.selenium.*;
import org.openqa.selenium.chrome.ChromeDriver;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.sql.*;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.Date;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Logs in to a site through a Selenium-driven Chrome browser, solving the image
 * captcha via a third-party API and the SMS code via a local text file, then
 * exposes the session cookies and the page text.
 *
 * <p>NOTE(review): the class declares {@code implements PageProcessor}, but no
 * {@code PageProcessor} methods are visible in this excerpt; they are presumably
 * defined in the omitted part of the class — confirm.
 */
public class BlogPageProcessor implements PageProcessor {


    // Local path of the chromedriver executable
    static String chromeDriverLocalPath = "C:\\Program Files (x86)\\Google\\chromedriver.exe";
    // URL of the site to log in to
    static String loginPath = "http://www.xxxx.com/";
    // Credentials for the jianjiaoshuju third-party captcha API
    static String appcode = "xxxx";
    static String appKey = "xxxx";
    static String appSecret = "xxxx";
    // Account used for the login
    static String loginName = "xxxx";
    static String password = "xxxx";
    // Path of the text file the SMS verification code is read from
    static String validateCodePath = "E:\\validateCode.txt";
    // Cookies captured after the login; assigned by SeleniumLogin()
    static Set<Cookie> cookies;



    /**
     * Performs the login in a fresh Chrome session, stores the session cookies in
     * {@link #cookies}, and returns the text of the page body.
     *
     * <p>The browser session is always closed before this method returns, whether
     * it completes normally or throws.
     *
     * @return the text of the {@code /html/body} element after the login steps
     * @throws Exception if any browser, image, or file step fails, or if the
     *                   thread is interrupted while waiting for the SMS code
     */
    public static String SeleniumLogin() throws Exception {

        // Point Selenium at the local chromedriver and start the browser
        System.setProperty("webdriver.chrome.driver", chromeDriverLocalPath);
        ChromeDriver driver = new ChromeDriver();
        try {
            driver.manage().window().maximize();
            driver.manage().timeouts().implicitlyWait(5, TimeUnit.SECONDS);

            // Open the site and bring up the login form
            driver.get(loginPath);
            driver.findElement(By.className("login-btn")).click();
            WebElement element = driver.findElement(By.xpath("//*[@id=\"pwd_captcha_code_img\"]"));
            File screenshot = driver.getScreenshotAs(OutputType.FILE);
            BufferedImage image = ImageIO.read(screenshot);
            Point point = element.getLocation();
            int width = element.getSize().getWidth();
            int height = element.getSize().getHeight();
            // Crop the screenshot down to the captcha image.
            // NOTE(review): assumes screenshot pixels map 1:1 to the element's page
            // coordinates (no display scaling, element inside the captured area) — confirm.
            BufferedImage subImage = image.getSubimage(point.getX(), point.getY(), width, height);
            // The cropped captcha overwrites the screenshot file
            ImageIO.write(subImage, "png", screenshot);
            // Encode the captcha image as Base64
            String imageBase = getImageBase(screenshot.getPath());
            // Third-party captcha recognition (the result may be wrong; not handled here)
            String imgString = getImgCode(imageBase, appcode, appKey, appSecret);
            // Fill in and submit the login form
            driver.findElementById("loginName").sendKeys(loginName);
            driver.findElementById("pwd_img_code").sendKeys(imgString);
            driver.findElementById("password").sendKeys(password);
            driver.findElementById("loginAjaxBtn").click();
            // Request the SMS verification code
            driver.findElementById("mobileCaptchaBtn").click();
            // Fixed 50 s pause before the code file is read, giving time for the
            // received SMS code to be saved into that file
            Thread.sleep(50000);
            String validateCode = readTxt(validateCodePath);
            driver.findElementById("captcha").sendKeys(validateCode);
            driver.findElementById("validatePhoneBtn").click();
            WebElement body = driver.findElementByXPath("/html/body");

            // Cookies of the logged-in session
            cookies = driver.manage().getCookies();

            // Return the text of the whole body (evaluated before the driver is quit)
            return body.getText();
        } finally {
            // The driver is local to this method; without quit() every call leaves a
            // browser and a chromedriver process running.
            driver.quit();
        }

    }
}

 

你可能感兴趣的:(数据获取)