一、验证码来源
本篇的验证码来源于全国企业信用信息公示系统(天津市)的查询验证码,验证码样式如下图所示。下载后把图片名称修改为验证码的实际内容,以便作为训练样本。
二、验证码识别大致思路
仔细观察上面的验证码,可以发现,除了背景噪音外,要识别的内容在图片中的位置是固定的,每张验证码中都有6个字符,实际上后三个“等于?”不需要识别,因此识别的重点就是两个数字加上一个运算符号,通过观察发现,每张图片中第一个数字在图片中的位置是固定的,且字符无扭曲、旋转等,字符与字符间分界线很明显。因此,我们可以通过指定位置大小对原始图片进行裁剪,这样就可以对每一个字符进行准确识别。
三、具体操作过程:
1、去背景噪音;
2、裁剪图片,将每个字符裁剪出来,并设置同样大小的像素,如50*50大小
3、图片二值化,空白的地方用0填充,非空白地方用1填充
4、将50*50的二维数组转变成1*2500的数组保存
5、将该小截图的正确标签值加入到队首作为训练集标签
6、将所有样本(比如100个)重复上述步骤,最后得到100*2501的训练集
7、将测试集样本重复1-5作为测试集
8、通过某种机器学习分类算法(比如随机森林)对训练集数据进行训练
9、利用训练完成的分类器识别测试集数据,注意,训练集的样本标签参与训练运算,测试集的样本标签不参与上述1-9步骤中的任何计算。
10、将分类器分类结果与测试集的样本标签进行逐一比对,计算正确度(测试集的样本标签只参与此处运算)
四、代码如下
library(jpeg)
library(reshape)
library(randomForest)
library(dplyr)

# Captcha preprocessing ----

#' Turn one glyph image into a 0/1 bitmap.
#'
#' A pixel counts as background (0) when all three colour channels exceed
#' 200 on the 0-255 scale; every other pixel is ink (1).
#'
#' @param pixels Numeric array (rows x cols x 3) with values in [0, 1],
#'   as returned by `readJPEG()` for a colour image.
#' @return Numeric matrix (rows x cols) containing only 0 and 1.
.binarize_glyph <- function(pixels) {
  is_blank <- pixels[, , 1] * 255 > 200 &
    pixels[, , 2] * 255 > 200 &
    pixels[, , 3] * 255 > 200
  bitmap <- array(1, dim(pixels)[1:2])
  bitmap[is_blank] <- 0
  bitmap
}

#' Map the operator character of a captcha to its numeric class label.
#'
#' @param op Single character taken from the file name.
#' @param fallback Label used when `op` is neither "乘" nor "加".
#' @return -2 for multiplication, -1 for addition, otherwise `fallback`.
.encode_operator <- function(op, fallback) {
  if (op == "乘") {
    -2
  } else if (op == "加") {
    -1
  } else {
    fallback
  }
}

#' Shared pipeline behind `qubeijing()` and `qubeijing1()`.
#'
#' Steps: remove background noise, cut out the three glyphs (digit,
#' operator, digit), binarise each glyph and append one labelled row per
#' glyph to `out_file`. The first three characters of the file name are
#' used as the ground-truth labels.
#'
#' @param picname File name (not path) of the captcha inside
#'   `<base_dir>原始/`.
#' @param base_dir Directory prefix, ending in "/", that holds the
#'   "原始", "去背景", "前图", "中图" and "后图" sub-directories.
#' @param out_file Text file that receives the comma-separated rows.
#' @param unknown_operator Label written when the operator character is
#'   not recognised.
#' @return `NULL`, invisibly; called for its side effects on disk.
.process_captcha <- function(picname, base_dir, out_file, unknown_operator) {
  name <- strsplit(x = picname, split = "[.]")[[1]][1]
  # Base strsplit() replaces stringr::str_split(), which was called without
  # stringr ever being attached.
  label <- strsplit(name, "")[[1]][1:3]
  tag <- paste0(label[1], label[2], label[3])

  # Background removal ----
  # Inside the top-left 30 x 200 region, any pixel whose green channel is
  # above 100 (0-255 scale) is painted white.
  orgpic <- readJPEG(paste0(base_dir, "原始/", picname))
  rows <- 1:30
  cols <- 1:200
  is_noise <- orgpic[rows, cols, 2] * 255 > 100
  for (channel in 1:3) {
    plane <- orgpic[rows, cols, channel]
    plane[is_noise] <- 1
    orgpic[rows, cols, channel] <- plane
  }
  clean_path <- paste0(base_dir, "去背景/", tag, "去背景.jpg")
  writeJPEG(orgpic, target = clean_path, quality = 0.95)
  # The cleaned image is read back from disk on purpose: later steps work on
  # the JPEG-compressed pixels, exactly as the original script did.
  orgpic <- readJPEG(clean_path)

  # Cropping + binarisation ----
  # Every glyph occupies rows 3-22 and a fixed 25-column window. Training and
  # test data must use identical windows, otherwise the features of the two
  # sets are not comparable.
  glyph_rows <- 3:22
  glyph_cols <- list(10:34, 40:64, 70:94)
  glyph_dirs <- c("前图", "中图", "后图")
  targets <- c(
    as.numeric(label[1]),
    .encode_operator(label[2], unknown_operator),
    as.numeric(label[3])
  )

  for (k in seq_along(glyph_dirs)) {
    glyph_path <- paste0(
      base_dir, glyph_dirs[k], "/", label[k], "-", tag, glyph_dirs[k], ".jpg"
    )
    writeJPEG(
      orgpic[glyph_rows, glyph_cols[[k]], ],
      target = glyph_path,
      quality = 0.95
    )
    bitmap <- .binarize_glyph(readJPEG(glyph_path))
    # Row layout: label first, then the 20 x 25 bitmap flattened column-major.
    sample_row <- c(targets[k], as.vector(bitmap))
    write(
      sample_row,
      file = out_file,
      append = TRUE,
      ncolumns = length(sample_row),
      sep = ","
    )
  }
  invisible(NULL)
}

#' Preprocess one training captcha and append its three samples to
#' "train.txt".
#'
#' Any operator other than "乘" is labelled -1 (addition), as before.
#'
#' @param picname File name of a captcha located in "./天津/原始/".
#' @return `NULL`, invisibly.
qubeijing <- function(picname) {
  .process_captcha(
    picname,
    base_dir = "./天津/",
    out_file = "train.txt",
    unknown_operator = -1
  )
}

#' Preprocess one test captcha and append its three samples to "test.txt".
#'
#' Uses the same crop windows as `qubeijing()`; an unrecognised operator is
#' labelled -3.
#'
#' @param picname File name of a captcha located in "./天津/测试/原始/".
#' @return `NULL`, invisibly.
qubeijing1 <- function(picname) {
  .process_captcha(
    picname,
    base_dir = "./天津/测试/",
    out_file = "test.txt",
    unknown_operator = -3
  )
}
# Main script ----
# Builds train.txt / test.txt from the captcha folders, trains a random
# forest on the training samples, predicts the test samples, prints the
# accuracy and finally evaluates each recognised arithmetic expression.

# Both files are only ever appended to by the preprocessing functions, so
# stale copies are removed first; otherwise every re-run would duplicate all
# samples.
unlink(c("train.txt", "test.txt"))

# Iterating over the file names directly is a no-op for an empty directory
# (1:length(x) would iterate over c(1, 0)).
train_pics <- list.files("天津/原始")
for (picname in train_pics) {
  qubeijing(picname)
}

test_pics <- list.files("天津/测试/原始")
for (picname in test_pics) {
  qubeijing1(picname)
}

# Train model ----
library(randomForest)
library(readr)

# Column 1 holds the class label, the remaining 500 columns the bitmap.
train_set <- read.csv("train.txt", header = FALSE)
numTrees <- 25
labels <- as.factor(train_set[, 1])
train <- train_set[, -1]
rf <- randomForest(train, labels, ntree = numTrees)

# Predict ----
test <- read.csv("test.txt", header = FALSE)
testlabels <- test[, 1]
# The test labels are dropped before prediction; they are used only for
# scoring below. All rows are predicted in a single call.
predicted <- predict(rf, newdata = test[, -1], type = "response")
words <- as.numeric(as.character(predicted))
print(words)

# Accuracy ----
print(sum(testlabels == words) / length(testlabels))

# Evaluate expressions ----
# Samples come in triples (digit, operator, digit) per captcha; only
# complete triples are evaluated.
for (t in seq_len(length(words) %/% 3) * 3 - 2) {
  first <- words[t]
  operator <- words[t + 1]
  second <- words[t + 2]
  if (operator == -1) {
    print(paste0(first, " + ", second, " 的计算结果是: ", first + second))
  } else if (operator == -2) {
    print(paste0(first, " * ", second, " 的计算结果是: ", first * second))
  } else {
    # The classifier returned a digit class for the operator slot; report it
    # instead of silently multiplying.
    print(paste0(first, " ? ", second, " 的运算符识别失败"))
  }
}
代码方面可以看得出来对待训练集和测试集是完全一致的,唯一区别是一个作为训练集train.txt,一个作为测试集test.txt,文件夹的关系是主目录"天津"下有"原始"、“去背景”、“前图”、“中图”、“后图”、“测试”6个文件夹,分别用来保存初始状态的验证码,去掉背景后的验证码,第一个字符小截图,第二个字符小截图,第三个字符小截图,以及测试集所有图片,其中“测试”文件夹也对应有“原始”、“去背景”、“前图”、“中图”和“后图”。
计算结果实际上完全可以达到100%,下面粘贴几个图,下面这个是去背景后的,由于我们是指定范围去背景的,所以图片中偏下、偏右的地方仍有噪音,这个没有任何影响。最下面三个小图就分别是前中后三个字符截图。