Tesseract-OCR的下载安装:
http://www.51testing.com/html/14/87714-3693118.html
selenium如何识别验证码:
http://www.bubuko.com/infodetail-973335.html
1、保存动态页面中的图片文件到本地;
download(URL地址,"code.png");
/**
 * Downloads the resource at {@code strUrl} over HTTP and saves it to {@code strPath}.
 *
 * <p>The whole response body is read into memory first (via {@code readInputStream})
 * and only then written to disk, so the target file is not created or truncated when
 * the download itself fails. Any failure is reported with a stack trace and is not
 * propagated to the caller.
 *
 * @param strUrl  address of the image to fetch; must be an HTTP(S) URL because the
 *                connection is cast to {@link HttpURLConnection}
 * @param strPath path of the file to write; a relative path is resolved against the
 *                process working directory
 */
public void download(String strUrl,String strPath){
    try {
        URL url = new URL(strUrl);
        // Open the connection and bound the connect phase to 5 seconds.
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        conn.setConnectTimeout(5 * 1000);

        // Fetch the raw image bytes; the stream is closed even if reading fails.
        byte[] data;
        try (InputStream inStream = conn.getInputStream()) {
            data = readInputStream(inStream);
        }

        // Write the bytes out; try-with-resources closes the file exactly once.
        File imageFile = new File(strPath);
        try (FileOutputStream outStream = new FileOutputStream(imageFile)) {
            outStream.write(data);
        }
    } catch (Exception e) {
        // Best-effort helper: report the problem instead of throwing.
        e.printStackTrace();
    }
}
2、用 Java 运行调用 Tesseract 的批处理(bat)文件,生成 TXT 文件并保存到本地;
// Launch the batch file through the Windows shell; "start" opens it in its own window.
// The space between "start" and the path is required, otherwise cmd looks for a
// command literally named "startG:\...".
String cmd = "cmd /c start G:\\uxin1\\selenium_xin\\code.bat";
// NOTE(review): this pause runs BEFORE the batch is launched and nothing below waits
// for it to finish; confirm the output file is complete before it is read.
Thread.sleep(2000);
try {
    Runtime.getRuntime().exec(cmd);
} catch (IOException e) {
    // Launch failures are only reported, not rethrown.
    e.printStackTrace();
}
// Text file that the batch job is expected to produce (presumably the OCR result).
String filepath = "G:/uxin1/selenium_xin/code.txt";
3、读取TXT文件;
//读取图片转换的文本的内容
public static void readTextFile(StringfilePath) {
try {
String encoding = "GBK";
File file = new File(filePath);
if (file.isFile() && file.exists()){ // 判断文件是否存在
InputStreamReader read = new InputStreamReader(
new FileInputStream(file), encoding);//考虑到编码格式 BufferedReader bufferedReader = new BufferedReader(read);
String lineTxt = null;
while ((lineTxt = bufferedReader.readLine()) != null){ System.out.println(lineTxt);
}
read.close();
} else {
System.out.println("----找不到指定的文件");
} } catch (Exception e) {
System.out.println("读取文件内容出错");
e.printStackTrace();
}
}
以上!成功的图片如下,一个字,爽!