Java整合OCR识别验证码

首先需要下载OCR软件,这里使用tesseract-ocr。
下载地址:点击下载
下载好了,就安装。
我们可以测试一下,在命令行输入tesseract 会出现以下情况
Java整合OCR识别验证码_第1张图片
如果没有出现以上情况,需要手动配置环境变量,方法如下:将tesseract的安装目录添加到系统的PATH环境变量中。
Java整合OCR识别验证码_第2张图片

由于有的验证码有干扰点和颜色差,这里提供源码,将图片进行处理。

ClearImageHelper.java

import java.awt.Color;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;

import javax.imageio.ImageIO;

/**
 * Pre-processes a captcha image so that an OCR tool has an easier time reading it.
 *
 * <p>The pipeline is: brighten and convert to gray, binarize with a threshold chosen by
 * {@link #ostu(int[][], int, int)}, then erase isolated black pixels.
 */
public class ClearImageHelper {

	/** Number of values an 8-bit gray level can take; this is the size of the histogram. */
	private static final int GRAY_LEVELS = 256;

	/** A pixel whose R + G + B sum is at or below this value is treated as black. */
	private static final int BLACK_LIMIT = 300;

	private static final int WHITE_RGB = 0xFFFFFF;
	private static final int BLACK_RGB = 0x000000;

	public static void main(String[] args) throws IOException {
		cleanImage(new File("1.jpg"), "2.jpg");
	}

	/**
	 * Reads an image, cleans it up and writes the result as a JPEG file.
	 *
	 * @param sfile   the image to process
	 * @param destDir path of the output file (despite the name, this is a file path, not a directory)
	 * @throws IOException if the source cannot be decoded or the result cannot be written
	 */
	public static void cleanImage(File sfile, String destDir) throws IOException {
		File destF = new File(destDir);
		// destDir names the output file itself, so only its parent directory has to exist.
		File parentDir = destF.getAbsoluteFile().getParentFile();
		if (parentDir != null && !parentDir.exists()) {
			parentDir.mkdirs();
		}

		BufferedImage bufferedImage = ImageIO.read(sfile);
		if (bufferedImage == null) {
			// ImageIO.read returns null when no registered reader understands the file.
			throw new IOException("Unsupported or unreadable image: " + sfile);
		}
		int h = bufferedImage.getHeight();
		int w = bufferedImage.getWidth();

		int[][] gray = toBrightenedGray(bufferedImage, w, h);
		int threshold = ostu(gray, w, h);
		BufferedImage binaryBufferedImage = binarize(gray, w, h, threshold);
		removeIsolatedPixels(binaryBufferedImage, w, h);

		if (!ImageIO.write(binaryBufferedImage, "jpg", destF)) {
			throw new IOException("No image writer available for format jpg: " + destF);
		}
	}

	/** Brightens every pixel and converts it to a gamma-weighted gray level in the 0..255 range. */
	private static int[][] toBrightenedGray(BufferedImage image, int w, int h) {
		int[][] gray = new int[w][h];
		for (int x = 0; x < w; x++) {
			for (int y = 0; y < h; y++) {
				int argb = image.getRGB(x, y);
				int r = brighten((argb >> 16) & 0xFF);
				int g = brighten((argb >> 8) & 0xFF);
				int b = brighten(argb & 0xFF);
				gray[x][y] = (int) Math.pow(
						(Math.pow(r, 2.2) * 0.2973 + Math.pow(g, 2.2) * 0.6274 + Math.pow(b, 2.2) * 0.0753), 1 / 2.2);
			}
		}
		return gray;
	}

	/** Scales one color channel by 1.1, adds 30 and clamps the result to 255. */
	private static int brighten(int channel) {
		return Math.min(255, (int) (channel * 1.1 + 30));
	}

	/** Builds a two-color image: gray levels above the threshold become white, the rest black. */
	private static BufferedImage binarize(int[][] gray, int w, int h, int threshold) {
		BufferedImage binary = new BufferedImage(w, h, BufferedImage.TYPE_BYTE_BINARY);
		for (int x = 0; x < w; x++) {
			for (int y = 0; y < h; y++) {
				binary.setRGB(x, y, gray[x][y] > threshold ? WHITE_RGB : BLACK_RGB);
			}
		}
		return binary;
	}

	/**
	 * Whitens black pixels that have no black neighbor. The image is modified in place while
	 * scanning row by row, so pixels erased earlier no longer count as neighbors of later ones.
	 */
	private static void removeIsolatedPixels(BufferedImage binary, int w, int h) {
		for (int y = 0; y < h; y++) {
			for (int x = 0; x < w; x++) {
				int sum = 0;
				if (isBlack(binary.getRGB(x, y))) {
					sum = totalBlack(binary, x, y);
				}
				// totalBlack counts the pixel itself, so a sum of 1 means it has no black neighbor.
				// White pixels keep sum == 0 and are simply rewritten as white.
				if (sum <= 1) {
					binary.setRGB(x, y, WHITE_RGB);
				}
			}
		}
	}

	/**
	 * Counts the black pixels in the 3x3 window centered on (x, y), including the center pixel.
	 * Window cells that fall outside the image are skipped.
	 *
	 * @return the number of black pixels found, between 0 and 9
	 */
	public static int totalBlack(BufferedImage binaryBufferedImage, int x, int y) {
		int width = binaryBufferedImage.getWidth();
		int height = binaryBufferedImage.getHeight();
		int sum = 0;
		for (int i = x - 1; i <= x + 1; i++) {
			if (i < 0 || i >= width) {
				continue;
			}
			for (int j = y - 1; j <= y + 1; j++) {
				if (j < 0 || j >= height) {
					continue;
				}
				if (isBlack(binaryBufferedImage.getRGB(i, j))) {
					sum++;
				}
			}
		}
		return sum;
	}

	/** Returns true when the R + G + B sum of the color is at most 300. */
	public static boolean isBlack(int colorInt) {
		return getColorBright(colorInt) <= BLACK_LIMIT;
	}

	/** Returns true when the R + G + B sum of the color is greater than 300. */
	public static boolean isWhite(int colorInt) {
		return getColorBright(colorInt) > BLACK_LIMIT;
	}

	/** Returns 1 when the color is nearly pure black (sum below 30) or nearly pure white (sum above 730), else 0. */
	public static int isBlackOrWhite(int colorInt) {
		int bright = getColorBright(colorInt);
		if (bright < 30 || bright > 730) {
			return 1;
		}
		return 0;
	}

	/** Returns the sum of the red, green and blue components (0..765); the alpha bits are ignored. */
	public static int getColorBright(int colorInt) {
		Color color = new Color(colorInt);
		return color.getRed() + color.getGreen() + color.getBlue();
	}

	/**
	 * Picks a binarization threshold by maximizing the between-class variance of the gray-level
	 * histogram (Otsu's method).
	 *
	 * @param gray gray levels indexed as {@code gray[x][y]}; only the low 8 bits of each entry are used
	 * @param w    number of columns to read from {@code gray}
	 * @param h    number of rows to read from {@code gray}
	 * @return the selected threshold in 0..255; 0 when all pixels share a single gray level
	 */
	public static int ostu(int[][] gray, int w, int h) {
		// The histogram is indexed by gray level, so its size is the number of levels,
		// not the number of pixels. Sizing it by w * h overflowed on images under 256 pixels.
		int[] histData = new int[GRAY_LEVELS];
		for (int x = 0; x < w; x++) {
			for (int y = 0; y < h; y++) {
				int level = 0xFF & gray[x][y];
				histData[level]++;
			}
		}

		// Total number of pixels
		int total = w * h;

		float sum = 0;
		for (int t = 0; t < GRAY_LEVELS; t++) {
			sum += t * histData[t];
		}

		float sumB = 0;
		int wB = 0;
		int wF = 0;

		float varMax = 0;
		int threshold = 0;

		for (int t = 0; t < GRAY_LEVELS; t++) {
			wB += histData[t]; // weight of the background class
			if (wB == 0) {
				continue;
			}

			wF = total - wB; // weight of the foreground class
			if (wF == 0) {
				break;
			}

			sumB += (float) (t * histData[t]);

			float mB = sumB / wB; // mean of the background class
			float mF = (sum - sumB) / wF; // mean of the foreground class

			// Between-class variance for this candidate threshold
			float varBetween = (float) wB * (float) wF * (mB - mF) * (mB - mF);

			// Keep the first threshold that reaches the maximum
			if (varBetween > varMax) {
				varMax = varBetween;
				threshold = t;
			}
		}

		return threshold;
	}
}

该算法有待改进,只能处理一部分验证码。运行以上代码后,将会生成2.jpg即修改过后的验证码图片。
我们接下来对2.jpg进行识别。

			// Run "tesseract 2.jpg 1" through cmd; tesseract writes the recognized text to 1.txt.
			// inheritIO() forwards the child's console output so it cannot stall on a full pipe buffer.
			Process process = new ProcessBuilder("cmd", "/c", "tesseract", "2.jpg", "1").inheritIO().start();
			// Block until tesseract has actually finished instead of sleeping for a fixed time,
			// which could read 1.txt before it is written or waste time when recognition is fast.
			int exitCode = process.waitFor();
			if (exitCode != 0) {
				throw new IllegalStateException("tesseract exited with code " + exitCode);
			}
			String validate;
			// try-with-resources closes the reader even when readLine() throws.
			try (BufferedReader bufferedReader = new BufferedReader(new FileReader("1.txt"))) {
				validate = bufferedReader.readLine();
			}

这样字符串validate就是验证码的文字了。
原理就是用Java代码调用cmd命令,执行了tesseract 2.jpg 1 命令。
tesseract识别2.jpg,并将图片内容输出到1.txt文件中。再将1.txt文件读取到代码中。

你可能感兴趣的:(java,tesseract,识别,验证码,整合)