之前没有想过要通过爬取页面的方式来获取数据,想想毕竟api得到的数据有限,国内使用强智系统的不少,也算是做了点贡献吧。
识别的过程倒是蛮顺利。肉眼验证似乎没有不正确的
话不多说,我们看到的登录界面的验证码是这样的:
如果你的目标和这个一毛一样,那么你便可以拿来用咯
思路很清晰:首先我们知道图像实际上是个二维数组,我们将图片二值化,即处理成只有黑白两色的图像,再与训练好的二维数组逐一比较,最接近哪个便是哪个字符。
如字母a的二维数组:
//a
{{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
{0,0,0,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0},
{0,0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0},
{0,0,1,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0},
{0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0},
{0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0},
{0,0,0,0,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0},
{0,0,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0},
{0,1,1,1,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0},
{0,1,1,1,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0},
{0,1,1,1,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0},
{0,1,1,1,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0},
{0,0,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0},
{0,0,0,1,1,1,1,0,0,1,1,1,0,0,0,0,0,0,0,0},
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}}
验证码形如,经过爬取大量验证码可以发现,验证码图片为80*40px,周围有1px黑边,实际上黑边我忽略了,因为在后面的切割中被剪掉了。接着说图片,图片只含有小写字母和数字,没有字母o和数字0,即一共34个字符,后期要至少准备34个二维数组用于比较。
打开ps
找一个图片有重合的例子,发现每个字符占水平20px,垂直24px,可以初步处理下,裁剪掉上下的空白:
// Load the captcha image from f.
// NOTE(review): ImageIO.read is documented to return null when no registered
// reader can decode the input - confirm the caller handles that case.
BufferedImage image = ImageIO.read(f);
// Keep the full 80px width but only the 24 rows starting at y = 9, which
// trims the blank bands above and below the characters.
image = image.getSubimage(0, 9, 80, 24);
/**
 * Produces a grayscale copy of the given image.
 *
 * <p>Each pixel's gray level is the weighted sum
 * {@code 0.3 * R + 0.59 * G + 0.11 * B}, truncated to an int, and is written
 * back as a fully opaque pixel whose three colour channels all equal that level.
 *
 * <p>The result normally has the same image type as the source. A source of
 * {@link BufferedImage#TYPE_CUSTOM} cannot be passed to the
 * {@code BufferedImage(int, int, int)} constructor (it throws
 * {@link IllegalArgumentException}), so in that case the result is created as
 * {@link BufferedImage#TYPE_INT_ARGB} instead.
 *
 * @param bufferedImage the source image; it is only read, never modified
 * @return a new image of the same size holding the gray levels
 * @throws Exception kept for compatibility with existing callers
 */
public static BufferedImage grayImage(BufferedImage bufferedImage) throws Exception {
    final int width = bufferedImage.getWidth();
    final int height = bufferedImage.getHeight();

    // TYPE_CUSTOM (0) is rejected by the int-type constructor, so fall back to ARGB.
    final int sourceType = bufferedImage.getType();
    final int targetType =
            sourceType == BufferedImage.TYPE_CUSTOM ? BufferedImage.TYPE_INT_ARGB : sourceType;

    BufferedImage grayBufferedImage = new BufferedImage(width, height, targetType);
    for (int i = 0; i < width; i++) {
        for (int j = 0; j < height; j++) {
            final int color = bufferedImage.getRGB(i, j);
            final int r = (color >> 16) & 0xff;
            final int g = (color >> 8) & 0xff;
            final int b = color & 0xff;
            // Luminance-style weighting; the weights sum to 1 so gray stays within 0..255.
            final int gray = (int) (0.3 * r + 0.59 * g + 0.11 * b);
            // Opaque pixel with the gray level replicated into R, G and B.
            final int newPixel = (0xff << 24) | (gray << 16) | (gray << 8) | gray;
            grayBufferedImage.setRGB(i, j, newPixel);
        }
    }
    return grayBufferedImage;
}
/**
 * Packs four colour components into one 32-bit pixel value.
 *
 * <p>The components are placed, from the most significant byte down, in the
 * order alpha, red, green, blue. Values are added rather than masked, so a
 * component outside 0..255 carries into the neighbouring byte.
 *
 * @param alpha alpha component, placed in bits 24..31
 * @param red   red component, placed in bits 16..23
 * @param green green component, placed in bits 8..15
 * @param blue  blue component, placed in bits 0..7
 * @return the packed pixel value
 */
private static int colorToRGB(int alpha, int red, int green, int blue) {
    return (alpha << 24) + (red << 16) + (green << 8) + blue;
}
/**
 * Converts an image to pure black and white.
 *
 * <p>For every pixel the plain average of its red, green and blue components
 * is compared with a fixed threshold of 192: averages below the threshold
 * become black, everything else becomes white. The alpha channel is ignored.
 * The article's examples assume dark characters on a light background.
 *
 * @param image the source image; it is only read, never modified
 * @return a new {@link BufferedImage#TYPE_BYTE_BINARY} image of the same size
 * @throws Exception declared by the signature; nothing in the body throws a checked exception
 */
public static BufferedImage binaryImage(BufferedImage image) throws Exception {
    final int cols = image.getWidth();
    final int rows = image.getHeight();
    final int blackRgb = Color.BLACK.getRGB();
    final int whiteRgb = Color.WHITE.getRGB();
    // Brightness cut-off separating ink (below) from background (at or above).
    final double threshold = 192;

    final BufferedImage result = new BufferedImage(cols, rows, BufferedImage.TYPE_BYTE_BINARY);
    for (int x = 0; x < cols; x++) {
        for (int y = 0; y < rows; y++) {
            final int argb = image.getRGB(x, y);
            final float red = (argb >> 16) & 0xff;
            final float green = (argb >> 8) & 0xff;
            final float blue = argb & 0xff;
            final float mean = (red + green + blue) / 3;
            result.setRGB(x, y, mean < threshold ? blackRgb : whiteRgb);
        }
    }
    return result;
}
按照前面的分析将图片分割成4部分,分别处理之:
// Dimensions of the image being split; only height is used by the statements below.
int width = image.getWidth();
int height = image.getHeight();
//System.out.println("width\t" + width + "\theight\t" + height);
//int subWidth = width / 4;
// Cut four full-height, 20px-wide windows starting at x = 4, 22, 40 and 58 and
// binarise each one. The starts are 18px apart, so neighbouring windows
// overlap by 2px.
newim[0] = binaryImage(image.getSubimage(4, 0, 20, height));
newim[1] = binaryImage(image.getSubimage(22, 0, 20, height));
newim[2] = binaryImage(image.getSubimage(40, 0, 20, height));
newim[3] = binaryImage(image.getSubimage(58, 0, 20, height));
实际上现在每个分割的图片便可以用0,1表示,如果用██填充的话。。。
████████████
██████████████████
██ ████████
██████
██████
██████
██████
██████
████████████
████████████████
████████
████████
██████
██████
██████
████ ██████
██████████████████
██████████████
{{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1},
{0,0,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0},
{0,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0},
{0,1,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0},
{0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0},
{0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0},
{0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0},
{0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0},
{0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0},
{0,0,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0},
{0,0,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0},
{0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0},
{0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0},
{0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0},
{0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0},
{0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0},
{0,1,1,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0},
{0,1,0,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0},
{0,0,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0},
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}},
之后我们用训练好的数组比较选出结果即可
// Recognise the four character bitmaps in g by comparing each one against
// every template in MyCharacter.g and keeping the best-scoring template.
StringBuilder sb = new StringBuilder();
for (int glyphIdx = 0; glyphIdx < 4; glyphIdx++) {
    int bestTemplate = 0;      // index of the closest template found so far
    double bestScore = -1.0;   // below any achievable score, so the first template always wins

    for (int templateIdx = 0; templateIdx < MyCharacter.g.length; templateIdx++) {
        // Counts the template cells equal to 1; it starts at 1, so the
        // denominator of the ratio below is never zero.
        double templateCells = 1;
        int matchedCells = 0;

        for (int row = 0; row < g[glyphIdx].length; row++) {
            for (int col = 0; col < g[glyphIdx][row].length; col++) {
                // Only cells that are set in the template take part in the score.
                if (MyCharacter.g[templateIdx][row][col] != 1) {
                    continue;
                }
                templateCells++;
                if (g[glyphIdx][row][col] == MyCharacter.g[templateIdx][row][col]) {
                    matchedCells++;
                }
            }
        }

        // Share of the template's set cells that are also set in the glyph;
        // on a tie the earlier template is kept.
        double score = matchedCells / templateCells;
        if (score > bestScore) {
            bestTemplate = templateIdx;
            bestScore = score;
        }
    }

    // Emit the character belonging to the template with the highest ratio.
    sb.append(MyCharacter.ch[bestTemplate]);
}
return sb.toString();
参考: java图像处理