本文以我之前生成的验证码图片数据为例,相关源码在《爬虫进阶:验证码突破–6、机器学习识别简单图片字母验证码》一文中已有介绍,这里不再赘述。
本篇文章主要使用sklearn中的几个模型,对之前处理好的带标签的验证码字符数据进行学习,然后再用测试样本数据检验各模型的识别效果。
需要用到的Python包:PIL(Pillow)、sklearn(scikit-learn),以及标准库os
# coding: utf-8
import os

from PIL import Image
from sklearn.model_selection import train_test_split

# ``sklearn.externals.joblib`` was deprecated in scikit-learn 0.21 and removed
# in 0.23. Prefer the standalone ``joblib`` package (installed together with
# scikit-learn) and fall back to the bundled copy only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Directory holding the labelled single-character training images.
train_data_path = "D:/CapCha/outdir"
读取训练集文件夹下的单字母/数字图像文件
def read_train_data(train_data_path):
    """
    Load every single-character image file found in a directory.

    The label of an image is the part of its file name that precedes the
    first underscore (for example ``"a_0012.png"`` is labelled ``"a"``).

    :param train_data_path: directory containing the character image files
    :return: image_array, image_label -- the list of loaded images and the
        list of their labels; both lists have the same length and order
    """
    image_array = []
    image_label = []
    for capt_per_char_file in os.listdir(train_data_path):
        path = os.path.join(train_data_path, capt_per_char_file)
        # Image.open() is lazy and keeps the underlying file open. Copying the
        # pixel data inside the context manager lets the handle be released
        # immediately instead of accumulating one open file per image.
        with Image.open(path) as image:
            image_array.append(image.copy())
        # Append exactly one label per image. Extending the list with
        # list(prefix) would add one entry per character and shift every
        # following label whenever a prefix is longer than one character.
        image_label.append(capt_per_char_file.split("_")[0])
    return image_array, image_label
生成特征矩阵
def feature_transfer(image, image_width=30, image_height=30):
    """
    Turn one character image into a projection-histogram feature vector.

    The image is resized to ``image_width`` x ``image_height``. The first
    ``image_width`` features are the number of zero-valued pixels in each
    column (fixed x), the remaining ``image_height`` features are the number
    of zero-valued pixels in each row (fixed y).

    :param image: a single image object providing ``resize`` and ``getpixel``
        (not a list of images)
    :param image_width: width the image is normalised to; a model must be
        trained and queried with the same size
    :param image_height: height the image is normalised to
    :return: list of ``image_width + image_height`` integer counts
    """
    image = image.resize((image_width, image_height))  # normalise the size
    column_counts = [0] * image_width
    row_counts = [0] * image_height
    # Visit each pixel once and update both histograms, rather than scanning
    # the whole image a second time for the row counts.
    for x in range(image_width):
        for y in range(image_height):
            # Only pixels whose value equals 0 are counted; this presumably
            # expects a binarised or grayscale image -- confirm with the
            # preprocessing step that produced the files.
            if image.getpixel((x, y)) == 0:
                column_counts[x] += 1
                row_counts[y] += 1
    return column_counts + row_counts
生成数据集
def TestDataGen():
    """
    Build the data set from the images under ``train_data_path``.

    :return: image_feature, image_label -- one feature vector per image (as
        produced by ``feature_transfer``) and the labels returned by
        ``read_train_data``
    """
    pictures, image_label = read_train_data(train_data_path)
    image_feature = [feature_transfer(picture) for picture in pictures]
    return image_feature, image_label
模型预测
def model_predict(model_name, capt_path=r"D:/test/outdir"):
    """
    Load a saved model and print its predictions for a directory of images.

    Two lines are printed: the first character of every file name in
    ``capt_path`` joined into one string, then the labels predicted by the
    model joined into one string.

    :param model_name: path of a model file written with ``joblib.dump``
    :param capt_path: directory with the single-character images to classify;
        defaults to the previously hard-coded validation directory
    :return: None
    """
    # NOTE: joblib.load unpickles the file, so only load model files that
    # come from a trusted source.
    clf = joblib.load(model_name)

    capt_per_char_list = os.listdir(capt_path)
    # The labels parsed by read_train_data are not needed here; the reference
    # string below is taken from the file names directly.
    image_array, _ = read_train_data(capt_path)
    image_feature = [feature_transfer(image) for image in image_array]

    char_list = ''.join([i.split('.')[0][0] for i in capt_per_char_list])
    print(char_list)
    print(''.join(clf.predict(image_feature)))
随机森林分类
def RandomForestClassifier_test():
    """
    Fit a random forest on the captcha features, print its score on the
    held-out part of the data and save the fitted model as ``rf.model``.
    """
    from sklearn.ensemble import RandomForestClassifier

    features, labels = TestDataGen()
    # Keep 20% of the samples aside for scoring; the fixed seed makes the
    # split reproducible.
    x_train, x_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=0
    )

    forest = RandomForestClassifier(
        n_estimators=100,
        max_depth=None,
        min_samples_split=2,
        random_state=0,
    )
    forest.fit(x_train, y_train)
    score = forest.score(x_test, y_test)
    joblib.dump(forest, 'rf.model')
    print("随机森林:score:",score)
决策树分类
def DecisionTreeClassfier_test():
    """
    Fit a decision tree with default settings, print its score on the
    held-out part of the data and save it as ``DecisionTree.model``.
    """
    from sklearn.tree import DecisionTreeClassifier

    features, labels = TestDataGen()
    x_train, x_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=0
    )

    decision_tree = DecisionTreeClassifier()
    decision_tree.fit(x_train, y_train)
    score = decision_tree.score(x_test, y_test)
    joblib.dump(decision_tree, 'DecisionTree.model')
    print("决策树:score:", score)
KNN分类
def KNN_test():
    """
    Fit a k-nearest-neighbours classifier with default settings, print its
    score on the held-out part of the data and save it as ``KNN.model``.
    """
    from sklearn.neighbors import KNeighborsClassifier

    features, labels = TestDataGen()
    x_train, x_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=0
    )

    neighbours = KNeighborsClassifier()
    neighbours.fit(x_train, y_train)
    score = neighbours.score(x_test, y_test)
    joblib.dump(neighbours, 'KNN.model')
    print("K-最近邻:score:", score)
逻辑回归
def logisticRegression_test():
    """
    Fit a logistic regression with default settings, print its score on the
    held-out part of the data and save it as ``LR.model``.
    """
    from sklearn.linear_model import LogisticRegression

    features, labels = TestDataGen()
    x_train, x_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=0
    )

    regression = LogisticRegression()
    regression.fit(x_train, y_train)
    score = regression.score(x_test, y_test)
    joblib.dump(regression, 'LR.model')
    print("逻辑回归:score:", score)
神经网络
def CNN_test():
    """
    Fit a neural-network classifier, print its score on the held-out part of
    the data and save it as ``cnn.model``.

    Despite the function and file names, the estimator is scikit-learn's
    ``MLPClassifier`` (a multi-layer perceptron), not a convolutional network.
    """
    from sklearn.neural_network import MLPClassifier

    features, labels = TestDataGen()
    x_train, x_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=0
    )

    perceptron = MLPClassifier(max_iter=1000, learning_rate_init=0.002)
    perceptron.fit(x_train, y_train)
    score = perceptron.score(x_test, y_test)
    joblib.dump(perceptron, 'cnn.model')
    print("神经网络:score:", score)
支持向量机
def SVM_test():
    """
    Fit an ``SVC`` support vector machine with default settings, print its
    score on the held-out part of the data and save it as ``SVM_SVC.model``.
    """
    from sklearn.svm import SVC

    features, labels = TestDataGen()
    x_train, x_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=0
    )

    machine = SVC()
    machine.fit(x_train, y_train)
    score = machine.score(x_test, y_test)
    joblib.dump(machine, 'SVM_SVC.model')
    print("支持向量机:score:", score)
def SVM_test2():
    """
    Fit a ``LinearSVC`` support vector machine with default settings, print
    its score on the held-out part of the data and save it as
    ``SVM_LinearSVC.model``.
    """
    from sklearn.svm import LinearSVC

    features, labels = TestDataGen()
    x_train, x_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=0
    )

    linear_machine = LinearSVC()
    linear_machine.fit(x_train, y_train)
    score = linear_machine.score(x_test, y_test)
    joblib.dump(linear_machine, 'SVM_LinearSVC.model')
    print("支持向量机(linear):score:", score)
朴素贝叶斯
def naive_bayes_test():
    """
    Fit a Gaussian and then a multinomial naive Bayes classifier on the same
    train/test split, printing each score on the held-out data and saving the
    models as ``GaussianNB.model`` and ``MultinomialNB.model``.
    """
    from sklearn.naive_bayes import GaussianNB
    from sklearn.naive_bayes import MultinomialNB

    features, labels = TestDataGen()
    x_train, x_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=0
    )

    # (estimator class, output file, message printed in front of the score)
    variants = (
        (GaussianNB, 'GaussianNB.model', "高斯朴素贝叶斯:score:"),
        (MultinomialNB, 'MultinomialNB.model', "多项式朴素贝叶斯score:"),
    )
    for estimator_class, model_file, banner in variants:
        bayes = estimator_class()
        bayes.fit(x_train, y_train)
        score = bayes.score(x_test, y_test)
        joblib.dump(bayes, model_file)
        print(banner, score)
接下来看看运行结果
if __name__ == '__main__':
    # Train, score and save every model in turn.
    experiments = (
        RandomForestClassifier_test,
        DecisionTreeClassfier_test,
        KNN_test,
        logisticRegression_test,
        CNN_test,
        SVM_test2,
        SVM_test,
        naive_bayes_test,
    )
    for run_experiment in experiments:
        run_experiment()
    # To try a saved model on the validation images afterwards, call for
    # example model_predict("KNN.model") -- KNN_test writes that file name.
运行结果:
随机森林:score: 0.99
决策树:score: 0.938
K-最近邻:score: 0.986666666667
逻辑回归:score: 0.968666666667
神经网络:score: 0.979333333333
支持向量机(linear):score: 0.941333333333
支持向量机:score: 0.934
高斯朴素贝叶斯:score: 0.792
多项式朴素贝叶斯score: 0.613333333333
从上面的结果来看,随机森林、KNN和神经网络(代码中使用的是MLPClassifier多层感知机,并非卷积神经网络CNN)的识别率比较好。