Learning scikit-learn classification models in Python

The data used here is the labeled CAPTCHA character images generated in an earlier article, 《爬虫进阶:验证码突破–6、机器学习识别简单图片字母验证码》 (Advanced crawling: breaking CAPTCHAs – 6, recognizing simple letter CAPTCHAs with machine learning), which covers the source code for producing them, so that part is not repeated here.
This post trains several sklearn classifiers on that labeled CAPTCHA character data and then evaluates them on held-out test samples.
Required Python packages: PIL (pillow), sklearn, os

# coding:utf-8
from PIL import Image
import os
import joblib  # in scikit-learn < 0.23 this was available as sklearn.externals.joblib
from sklearn.model_selection import train_test_split

train_data_path = "D:/CapCha/outdir"  # folder of labeled single-character training images

Read the single-letter/digit image files from the training-set folder

def read_train_data(train_data_path):
    """
    Read the single-letter/digit image files from the training-set folder.
    :return: image_array, image_label: list of images, list of image labels
    """
    files = os.listdir(train_data_path)
    image_array = []
    image_label = []
    for capt_per_char_file in files:
        # the character before the first "_" in the file name is the label
        image_label += list(capt_per_char_file.split("_")[0])
        path = os.path.join(train_data_path, capt_per_char_file)
        image = Image.open(path)
        image_array.append(image)
        # print(capt_per_char_file + " loaded")
    return image_array, image_label

Build the feature matrix

def feature_transfer(image):
    """
    Build the feature vector for one image.
    Count the black pixels in every column and every row,
    giving image_width + image_height features per image.
    :param image: PIL image
    :return: feature list
    """
    image_width, image_height = (30, 30)
    image = image.resize((image_width, image_height))  # normalize the image size
    feature = []
    for x in range(image_width):  # per-column black-pixel counts
        feature_width = 0
        for y in range(image_height):
            if image.getpixel((x, y)) == 0:
                feature_width += 1
        feature.append(feature_width)
    for y in range(image_height):  # per-row black-pixel counts
        feature_height = 0
        for x in range(image_width):
            if image.getpixel((x, y)) == 0:
                feature_height += 1
        feature.append(feature_height)
    # print('feature length:', len(feature))
    return feature
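
A quick sanity check of the feature extraction (a minimal sketch; "A_0.png" is a hypothetical file name, use any image from your own training folder, and the images are assumed to be binarized so that black pixels have value 0):

from PIL import Image

sample = Image.open("D:/CapCha/outdir/A_0.png")   # hypothetical file name
print(len(feature_transfer(sample)))              # 30 column counts + 30 row counts = 60 features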

Build the dataset

def TestDataGen():
    """Read the training images and turn them into feature vectors plus labels."""
    image_array, image_label = read_train_data(train_data_path)
    image_feature = []
    for image in image_array:
        image_feature.append(feature_transfer(image))
    return image_feature, image_label

Model prediction

def model_predict(model_name):
    """Load a saved model and predict the characters in the validation folder."""
    clf = joblib.load(model_name)
    # read the validation folder and convert each image into a feature vector
    CAPT_PATH = r"D:/test/outdir"
    capt_per_char_list = os.listdir(CAPT_PATH)
    image_array, image_label = read_train_data(CAPT_PATH)
    image_feature = []
    for image in image_array:
        image_feature.append(feature_transfer(image))
    # the true characters, taken from the first character of each file name
    char_list = ''.join([i.split('.')[0][0] for i in capt_per_char_list])
    print(char_list)
    print(''.join(clf.predict(image_feature)))
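
Once one of the training functions below has saved a model file, the helper can be called on the validation folder, for example:

model_predict('rf.model')   # prints the true characters, then the predicted characters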

Random forest classification

def RandomForestClassifier_test():
    train_table, train_labels = TestDataGen()
    # split the data into a training set and a test set
    x_train, x_test, y_train, y_test = train_test_split(train_table, train_labels, test_size=0.2, random_state=0)
    from sklearn.ensemble import RandomForestClassifier
    clf = RandomForestClassifier(n_estimators=100, max_depth=None, min_samples_split=2, random_state=0)
    clf.fit(x_train, y_train)
    score = clf.score(x_test, y_test)
    joblib.dump(clf, 'rf.model')
    print("Random forest score:", score)

Decision tree classification

def DecisionTreeClassfier_test():
    train_table, train_labels = TestDataGen()
    x_train, x_test, y_train, y_test = train_test_split(train_table, train_labels, test_size=0.2, random_state=0)
    from sklearn import tree
    clf = tree.DecisionTreeClassifier()
    clf.fit(x_train, y_train)
    score = clf.score(x_test, y_test)
    joblib.dump(clf, 'DecisionTree.model')
    print("Decision tree score:", score)

KNN classification

def KNN_test():
    train_table, train_labels = TestDataGen()
    x_train, x_test, y_train, y_test = train_test_split(train_table, train_labels, test_size=0.2, random_state=0)
    from sklearn.neighbors import KNeighborsClassifier
    clf = KNeighborsClassifier()
    clf.fit(x_train, y_train)
    score = clf.score(x_test, y_test)
    joblib.dump(clf, 'KNN.model')
    print("K-nearest neighbors score:", score)

Logistic regression

def logisticRegression_test():
    train_table, train_labels = TestDataGen()
    x_train, x_test, y_train, y_test = train_test_split(train_table, train_labels, test_size=0.2, random_state=0)
    from sklearn.linear_model import LogisticRegression
    clf = LogisticRegression()
    clf.fit(x_train, y_train)
    score = clf.score(x_test, y_test)
    joblib.dump(clf, 'LR.model')
    print("Logistic regression score:", score)

Neural network (MLP)

def CNN_test():
    # note: despite the function name, MLPClassifier is a fully connected
    # multilayer perceptron, not a convolutional network
    train_table, train_labels = TestDataGen()
    x_train, x_test, y_train, y_test = train_test_split(train_table, train_labels, test_size=0.2, random_state=0)
    from sklearn.neural_network import MLPClassifier
    clf = MLPClassifier(max_iter=1000, learning_rate_init=0.002)
    clf.fit(x_train, y_train)
    score = clf.score(x_test, y_test)
    joblib.dump(clf, 'cnn.model')
    print("Neural network (MLP) score:", score)

Support vector machines

def SVM_test():
    train_table, train_labels = TestDataGen()
    x_train, x_test, y_train, y_test = train_test_split(train_table, train_labels, test_size=0.2, random_state=0)
    from sklearn.svm import SVC
    clf = SVC()
    clf.fit(x_train, y_train)
    score = clf.score(x_test, y_test)
    joblib.dump(clf, 'SVM_SVC.model')
    print("SVM (SVC) score:", score)

def SVM_test2():
    train_table, train_labels = TestDataGen()
    x_train, x_test, y_train, y_test = train_test_split(train_table, train_labels, test_size=0.2, random_state=0)
    from sklearn.svm import LinearSVC
    clf = LinearSVC()
    clf.fit(x_train, y_train)
    score = clf.score(x_test, y_test)
    joblib.dump(clf, 'SVM_LinearSVC.model')
    print("SVM (LinearSVC) score:", score)

Naive Bayes

def naive_bayes_test():
    train_table, train_labels = TestDataGen()
    x_train, x_test, y_train, y_test = train_test_split(train_table, train_labels, test_size=0.2, random_state=0)
    from sklearn.naive_bayes import GaussianNB
    from sklearn.naive_bayes import MultinomialNB
    clf = GaussianNB()
    clf.fit(x_train, y_train)
    score = clf.score(x_test, y_test)
    joblib.dump(clf, 'GaussianNB.model')
    print("Gaussian naive Bayes score:", score)
    clf = MultinomialNB()
    clf.fit(x_train, y_train)
    score = clf.score(x_test, y_test)
    joblib.dump(clf, 'MultinomialNB.model')
    print("Multinomial naive Bayes score:", score)

Now let's run everything and look at the results

if __name__ == '__main__':
    RandomForestClassifier_test()
    DecisionTreeClassfier_test()
    KNN_test()
    logisticRegression_test()
    CNN_test()
    SVM_test2()
    SVM_test()
    naive_bayes_test()
    # model_predict("KNN.model")

Output:

Random forest score: 0.99
Decision tree score: 0.938
K-nearest neighbors score: 0.986666666667
Logistic regression score: 0.968666666667
Neural network (MLP) score: 0.979333333333
SVM (LinearSVC) score: 0.941333333333
SVM (SVC) score: 0.934
Gaussian naive Bayes score: 0.792
Multinomial naive Bayes score: 0.613333333333

Judging from these results, the random forest, KNN, and the MLP neural network give the best recognition accuracy.
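
A single 80/20 split can flatter or punish a model by chance; a fairer comparison is cross-validation over the same features (a sketch, not part of the original script):

from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

X, y = TestDataGen()
for name, model in [('random forest', RandomForestClassifier(n_estimators=100, random_state=0)),
                    ('knn', KNeighborsClassifier()),
                    ('mlp', MLPClassifier(max_iter=1000, learning_rate_init=0.002))]:
    scores = cross_val_score(model, X, y, cv=5)   # mean accuracy over 5 folds
    print(name, round(scores.mean(), 4))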
