4个分类器分类cifar_10_batches性能比较

4个分类器的分类性能比较

1. 题目要求
给定的图像数据集,利用KNN、朴素贝叶斯、线性回归、逻辑回归比较分类性能。
主要的数据集cifar_10_batches(60000张图,(32,32),10类,5个训练集,一个测试集)
要求:输出各个数据集的分类精度。
2. 说明
本次代码主要加上如何对上次代码KNN、朴素贝叶斯、线性回归、逻辑回归的分类性能比较进行优化完善,(pandas输出,消除了警告)整体框架不变,主要加入如何读取cifar_10_batches的读取,该方法来自chaixl_Hello_World,有一些问题已经修正。
3. 代码实现

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
import pandas as pd
###
# 聚类精度模板
import numpy as np
from scipy.optimize import linear_sum_assignment
def cluster_acc(y_true, y_pred):
    y_true = np.array(y_true).astype(np.int64)
    assert y_pred.size == y_true.size
    D = max(y_pred.max(), y_true.max()) + 1
    w = np.zeros((D, D), dtype=np.int64)
    for i in range(y_pred.size):
        w[y_pred[i], y_true[i]] += 1
    ind = linear_sum_assignment(w.max() - w)
    ind = np.asarray(ind)
    ind = np.transpose(ind)
    return sum([w[i, j] for i, j in ind]) * 1.0 / y_pred.size
#
## 朴素贝叶斯与KNN分类
def knn_gnb_lr_lsr(X, labels, title_knn="XXX.KNN",\
                title_gnb="XXX.GNB",title_lr="XXX.LR",\
                title_lsr="XXX=LSR",n = 3):
    # 划分训练集和测试集
    x_train, x_test, labels_train, labels_test =\
        train_test_split(X, labels, test_size=0.2, random_state=22)

    # 使用KNN进行分类
    knn = KNeighborsClassifier()
    knn.fit(x_train, labels_train)
    label_sample = knn.predict(x_test)
    knn_acc=cluster_acc(labels_test, label_sample)
    print(title_knn,"=",knn_acc)

    # 使用高斯朴素贝叶斯进行分类
    gnb = GaussianNB()  # 使用默认配置初始化朴素贝叶斯
    gnb.fit(x_train, labels_train)  # 训练模型
    label_sample = gnb.predict(x_test)
    gnb_acc = cluster_acc(labels_test, label_sample)
    print(title_gnb,"=", gnb_acc)

    # 线性回归
    lr = LinearRegression()
    lr.fit(x_train, labels_train)
    label_sample = lr.predict(x_test)
    label_sample = np.round(label_sample)
    label_sample=label_sample.astype(np.int64)
    lr_acc = cluster_acc(labels_test, label_sample)
    print(title_lr, "=", lr_acc)

    #Logistic regression 需要事先进行标准化
    #创建一对多的逻辑回归对象
    # 标准化特征
    scaler = StandardScaler()
    X_ = scaler.fit_transform(X,labels)
    # 划分训练集和测试集
    x_train, x_test, labels_train, labels_test = \
        train_test_split(X_, labels, test_size=0.2)
    log_reg = LogisticRegression(max_iter=3000)#multinomial
    #训练模型
    log_reg.fit(x_train, labels_train)
    label_sample = log_reg.predict(x_test)
    lsr_acc = cluster_acc(labels_test, label_sample)
    print(title_lsr, "=", lsr_acc)

    return round(knn_acc,n),round(gnb_acc,n),round(lr_acc,n),round(lsr_acc,n)

def get_imgdata(file,sfile,re_size=16,n=5):
    import pickle
    import numpy as np
    from skimage.transform import resize

    def unpickle(file):
        with open(file, 'rb') as f:
            cifar_dict = pickle.load(f, encoding='latin1')
        return cifar_dict
    # 定义用来存放图像数据 图像标签 图像名称list  最后返回的cifar_image cifar_label即是图像cifar-10 对应的数据和标签
    tem_cifar_image = []
    tem_cifar_label = []
    tem_cifar_image_name = []
    for i in range(1, n+1):
        # 存放是你的文件对应的目录
        cifar_file = sfile + str(i)
        cifar = unpickle(cifar_file)
        cifar_label = cifar['labels']
        cifar_image = cifar['data']
        cifar_image_name = cifar['filenames']
        # 使用transpose()函数是因为cifar存放的是图像标准是 通道数 高 宽 所以要修改成  高 宽 通道数
        cifar_image = cifar_image.reshape(10000, 3, 32, 32).transpose(0, 2, 3, 1).astype("float")
        cifar_image = np.asarray([resize(x_img, [re_size, re_size]) for x_img in cifar_image])
        cifar_label = np.array(cifar_label)
        cifar_image_name = np.array(cifar_image_name)
        tem_cifar_image.append(cifar_image)
        tem_cifar_label.append(cifar_label)
        tem_cifar_image_name.append(cifar_image_name)
    cifar_image = np.concatenate(tem_cifar_image)
    cifar_label = np.concatenate(tem_cifar_label)
    cifar_image_name = np.concatenate(tem_cifar_image_name)
    return cifar_image, cifar_label, cifar_image_name

file = "D:\\作品\\python\\cifar-10-python\\cifar-10-batches-py\\batches.meta"
sfile = "D:\\作品\\python\\cifar-10-python\\cifar-10-batches-py\\data_batch_"
n=5              #n表示要获取几个数据集
re_size=8
X = []
Y = []
z = []
X,Y,Z = get_imgdata(file,sfile,re_size,n)
X = X.reshape(n*10000,-1)
X_ = []
Y_ = []
for i in range(n):
    X_.append(X[i*10000:(i+1)*10000])
    Y_.append(Y[i*10000:(i+1)*10000])
X_ = np.array(X_)
Y_ = np.array(Y_)
knn_acc = []
gnb_acc = []
lr_acc = []
lsr_acc = []
for i in range(n):
    t1,t2,t3,t4= \
        knn_gnb_lr_lsr(X_[i], Y_[i])
    knn_acc.append(t1)
    gnb_acc.append(t2)
    lr_acc.append(t3)
    lsr_acc.append(t4)
# 使用pandas输出
title1 = []
for i in range(n):
    t = 'data_bath'+str(i+1)
    title1.append(t)
title2 = ["KNN     ", "Naive Bayes","linear regression","Logistic regression"]
data = pd.DataFrame([knn_acc,gnb_acc,lr_acc,lsr_acc],index=title2,columns=title1)
print(data)

4. 输出
4个分类器分类cifar_10_batches性能比较_第1张图片

5. 数据集地址
CIFAR-10 python version

你可能感兴趣的:(机器学习)