上一篇博客已经介绍了强智教务系统的验证码识别,发现大家对这个挺感兴趣的,我就继续写一篇验证码相关的博客,因为是初学,所以如果我哪个地方有问题望大佬可以指出
python3.6
PIL
sklearn
numpy
下载验证码,然后人工标记。因为方正教务系统会封禁频繁下载验证码的 IP,所以我们每次只下载 200 张,隔一段时间再下载下一批
import os

import requests

# Captcha endpoint (host redacted); every GET returns a fresh captcha image.
url = "http://****/CheckCode.aspx"

# Make sure the output directory exists before the first write.
os.makedirs("./code", exist_ok=True)

# The site bans IPs that fetch captchas too frequently, so only a small
# batch is downloaded per run.
for i in range(200):
    filename = "./code/" + str(i) + ".png"
    # A timeout keeps one stalled connection from hanging the whole batch.
    response = requests.get(url, timeout=10)
    # Stop on an HTTP error instead of saving an error page as a ".png".
    response.raise_for_status()
    with open(filename, "wb") as f:
        f.write(response.content)
方正的验证码和强智验证码对比,方正有个明显的特征:所有的字符颜色全部是蓝色。所以在二值化的过程中我们可以直接通过点的 RGB 值来进行,例如判断出这一个点是蓝色,我们就让这个像素点的 (r, g, b) 变成 (0, 0, 0),其余的点变成 (255, 255, 255),这样既可以达到二值化的效果,又可以达到降噪的效果。当然基于这个特征还有其他方法能进行二值化,比如先把图片转成灰度图,这时所有的蓝色都会是同一个灰度值,然后我们再按这个值过滤一下;下面的代码用的是前一种直接判断 RGB 值的方法
示例图片
1.二值化
2.降噪
3.字符分割(原理:这一次我们使用平均分割,虽然有一部分粘连在一起的字符没办法彻底分割开,但字符特征还是存在的)
因为这次的验证码难度比上一篇的大一些,我尝试使用上一篇所采用的转换方式,结果准确率只有50%,于是我想试一下用线性回归看准确率怎么样,结果更低,最后我发现如果我们不把每个字符转换成一个特征,而是转换成一个矩阵,这样的准确率可以达到96%以上,所以这一次转换方式是将每一个字符对应的矩形区域转换成一个一维矩阵
转换后的数据类似于
图片数据:[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
结果:['0', 'f', 'k', '2']
from PIL import Image , ImageDraw
import cv2
import hashlib
import sklearn as sk
class ImageHandler():
    """Preprocess one captcha image and convert it into training samples.

    The training script calls the steps in this order:
    ``toBinary_data`` -> ``dot_noise`` -> ``cut_img`` -> ``img_2_train_data``.
    After the last step ``data`` holds one flat 0/1 vector per character and
    ``labels`` holds the matching characters taken from the file name.
    """

    # Class-level defaults; __init__ rebinds every one of them per instance,
    # so the lists below are never actually shared between instances.
    threshold = None  # binarisation threshold (not read by any method here)
    im = None  # image currently being processed
    spliter = []
    data = []
    labels = []

    # Offsets of the eight neighbours examined by the de-noising pass.
    _NEIGHBOURS = ((-1, 0), (1, 0), (0, 1), (0, -1),
                   (-1, -1), (-1, 1), (1, -1), (1, 1))

    def __init__(self, filename):
        """Store the image path; the file itself is opened lazily.

        :param filename: path of the captcha image. The last four characters
            before ``.png`` are used as the labels of the four characters.
        """
        self.filename = filename
        self.data = []
        self.labels = []
        self.spliter = []
        self.im = None
        self.threshold = 120

    def readFile(self):
        """Open ``self.filename`` and keep the image in ``self.im``."""
        self.im = Image.open(self.filename)

    def toBinary_data(self):
        """Binarise the image by colour and blank out its one-pixel border.

        Pixels that are (almost) pure blue -- the colour of the captcha
        characters -- become black; every other pixel, and the whole outer
        border, becomes white. The result is stored in ``self.im`` as an
        8-bit greyscale ('L') image containing only 0 and 255.
        """
        if self.im is None:
            self.readFile()
        im = self.im.convert('RGB')
        pixdata = im.load()
        width, height = im.size
        for j in range(height):
            for i in range(width):
                on_border = (i == 0 or i == width - 1
                             or j == 0 or j == height - 1)
                r, g, b = pixdata[i, j]
                if not on_border and r < 10 and g < 10 and b > 80:
                    pixdata[i, j] = (0, 0, 0)
                else:
                    pixdata[i, j] = (255, 255, 255)
        self.im = im.convert('L')

    def _clear_if_isolated(self, pixdata, x, y, w, h):
        """Whiten the black pixel at (x, y) if at least 7 of its 8 neighbours are white."""
        if pixdata[x, y] != 0:
            return
        white = 0
        for dx, dy in self._NEIGHBOURS:
            nx, ny = x + dx, y + dy
            # Neighbours outside the image count as white. Indexing them
            # directly would raise on the right/bottom edge and silently wrap
            # to the opposite side on the left/top edge.
            if not (0 <= nx < w and 0 <= ny < h) or pixdata[nx, ny] == 255:
                white += 1
        if white >= 7:
            pixdata[x, y] = 255

    def dot_noise(self):
        """Remove isolated black dots and thin line fragments, in place.

        Two passes are made over the image: top-left to bottom-right, then
        bottom-right to top-left. The pixels are edited in place, so a pixel
        cleared early in a pass affects the neighbour counts of later ones;
        the second pass picks up pixels that only became isolated that way.
        """
        if self.im is None:
            self.readFile()
        im = self.im
        w, h = im.size
        pixdata = im.load()
        # Forward pass.
        for y in range(h):
            for x in range(w):
                self._clear_if_isolated(pixdata, x, y, w, h)
        # Backward pass.
        for y in range(h - 1, -1, -1):
            for x in range(w - 1, -1, -1):
                self._clear_if_isolated(pixdata, x, y, w, h)
        self.im = im

    def cut_img(self):
        """Set the four fixed crop boxes (left, upper, right, lower).

        The captcha is split evenly into four 12x20 regions, one per character.
        """
        self.spliter = [[5, 1, 17, 21], [17, 1, 29, 21], [29, 1, 41, 21], [41, 1, 53, 21]]

    def test(self):
        """Draw a grey frame around every crop box on ``self.im``.

        Debugging aid for checking the split visually; it modifies ``self.im``.
        """
        for box in self.spliter:
            box = tuple(box)
            region = self.im.crop(box)
            w, h = region.size
            right = w - 1
            bottom = h - 1
            draw = ImageDraw.Draw(region)
            draw.line((0, 0, right, 0), fill=160)
            draw.line((0, 0, 0, bottom), fill=160)
            draw.line((right, 0, right, bottom), fill=160)
            draw.line((0, bottom, right, bottom), fill=160)
            self.im.paste(region, box)

    def img_2_train_data(self):
        """Convert every crop box into a flat 0/1 feature vector.

        Each region is scanned row by row; a black pixel (value 0) becomes 1
        and anything else becomes 0. One vector per box is appended to
        ``self.data`` and the corresponding character of the file name is
        appended to ``self.labels``.
        """
        # The label text is the last four characters of the path once
        # ".png" has been removed.
        label_text = self.filename.replace(".png", "")[-4:]
        for index, box in enumerate(self.spliter):
            region = self.im.crop(tuple(box))
            w, h = region.size
            pixdata = region.load()
            sample = [1 if pixdata[x, y] == 0 else 0
                      for y in range(h) for x in range(w)]
            self.data.append(sample)
            self.labels.append(label_text[index])
from numpy import *
import numpy as np
from sklearn import neighbors
import os
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.externals import joblib
from ImageHandler import ImageHandler
from PIL import Image
img_dir = "./code"

# Only labelled PNGs are usable: the last four characters before ".png" are
# the labels, so shorter names (e.g. freshly downloaded "12.png") would
# silently yield garbage labels. Sorting keeps the sample order reproducible.
files = [name for name in sorted(os.listdir(img_dir))
         if name.endswith(".png") and len(name) >= len("abcd.png")]

# Collect samples in plain lists and convert once at the end; concatenating
# arrays inside the loop is quadratic and broke when only one image existed.
data = []
result = []
for name in files:
    path = img_dir + "/" + name
    image = ImageHandler(path)
    image.toBinary_data()
    image.dot_noise()
    image.cut_img()
    image.img_2_train_data()
    data.extend(image.data)
    result.extend(image.labels)
print("数据准备完毕")

if not data:
    raise SystemExit("no labelled captcha images found in " + img_dir)

# x: one row per character, one column per pixel; y: 1-D array of labels.
x = np.array(data)
y = np.array(result)
print(x)

# Split into training and test data.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1)

# Train the KNN classifier on the training split only, so the test split
# stays unseen and the accuracy reported below is meaningful.
clf = neighbors.KNeighborsClassifier()
clf.fit(x_train, y_train)
print("训练完成")

# Save the classifier model.
# NOTE: `sklearn.externals.joblib` was removed in scikit-learn 0.23; on newer
# versions replace the file-level import with `import joblib`.
os.makedirs('./knn', exist_ok=True)
joblib.dump(clf, './knn/knn.pkl')
print("结束")

print("检验准确率")
pre_y_train = clf.predict(x_train)
pre_y_test = clf.predict(x_test)
# The labels are already the characters themselves, so no target_names are
# passed: a fixed name list makes classification_report raise as soon as a
# split happens to lack one of the listed classes.
print(classification_report(y_train, pre_y_train))
print(classification_report(y_test, pre_y_test))
以上就是全部代码,这次的代码是基于上一篇的,只是进行了部分修改,就是拿到新数据后进行识别的代码