Naive Bayes MNIST Handwritten Digit Classification in Python

Training a naive Bayes classifier amounts to estimating the class priors and the per-feature likelihoods from the data; prediction then picks the class with the maximum posterior probability. The mnist-original.mat file used in the code must be downloaded separately and placed in the datasets\mldata folder under the current working directory.
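For reference, the decision rule the code implements is the maximum a posteriori (MAP) rule over binarized pixels (notation mine, not from the original post):

    y_hat = argmax_c  p(y = c) * prod over {j : x_j = 1} of p(x_j = 1 | y = c)

In log space the product becomes a sum, which is what predict computes below to avoid floating-point underflow. Note that only the pixels that are on contribute a factor; a full Bernoulli naive Bayes would also multiply in p(x_j = 0 | y = c) for the pixels that are off.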

from sklearn.datasets import fetch_mldata  # note: removed in scikit-learn 0.22; see the fetch_openml sketch at the end
from collections import Counter
import numpy as np

# Load the MNIST dataset (expects mnist-original.mat under ./datasets/mldata)
def load_mnist():
    mnist = fetch_mldata('MNIST original', data_home='./datasets')
    x, y = mnist["data"], mnist["target"]
    x = np.where(x > 0, 1, x)  # binarize: map every pixel value greater than 0 to 1
    X_train, X_test, y_train, y_test = x[:60000], x[60000:], y[:60000], y[60000:]

    return X_train, y_train, X_test, y_test

class NaiveBayes():
    def __init__(self):
        self.classes = None      # class labels, in the order used by prob_c and prob_cj
        self.prob_c = None       # prior probability of each class
        self.prob_cj = None      # likelihood p(x_j = 1 | y = c) per class and feature
        self.y_pred = None       # predicted labels
        self.y_pred_prob = None  # per-class log-posterior scores for each sample
    
    # Train the model: estimate the class priors and the per-feature likelihoods
    def fit(self, X_train, y_train):
        # Prior probability of each class, p(y = c)
        y_count = Counter(y_train)
        self.classes = sorted(y_count.keys())  # fix the class order so argmax indices map back to labels
        prob_c = [y_count[c] / len(y_train) for c in self.classes]

        # Conditional probability of each feature, p(x_j = 1 | y = c)
        prob_cj = []  # prob_cj[i][j]: likelihood that feature j is 1 in the i-th class
        for c in self.classes:
            c_train = X_train[y_train == c]  # training samples of class c (hoisted out of the inner loop)
            prob_temp = []
            for j in range(X_train.shape[1]):
                c_train_j = c_train[:, j]  # column j of the class-c samples
                n_ones = np.sum(c_train_j == 1)  # class-c samples whose j-th pixel is on
                # add-one (Laplace) smoothing over the two possible pixel values {0, 1}
                prob_1_cj = (n_ones + 1) / (c_train_j.shape[0] + 2)
                prob_temp.append(prob_1_cj)
            prob_cj.append(prob_temp)

        self.prob_c = prob_c
        self.prob_cj = prob_cj

        return prob_c, prob_cj
    
    # Predict: pick the class with the maximum posterior probability
    def predict(self, X_test):
        # Work in log space: multiplying hundreds of small likelihoods underflows float64
        log_prob_c = np.log(self.prob_c)
        log_prob_cj = np.log(self.prob_cj)
        y_pred_prob = []
        # For each test sample
        for x in X_test:
            temp_list = []
            active = np.where(x == 1)[0]  # indices of the pixels that are on
            # For each class, accumulate the log-posterior
            for i in range(len(self.prob_c)):
                log_post = log_prob_c[i] + np.sum(log_prob_cj[i][active])
                temp_list.append(log_post)
            y_pred_prob.append(temp_list)
        # The class with the largest posterior is the prediction for that sample
        y_pred = np.array(self.classes)[np.argmax(y_pred_prob, axis=1)]

        self.y_pred_prob = y_pred_prob
        self.y_pred = y_pred

        return y_pred
    
    # Accuracy: fraction of predictions that match the true labels
    def accuracy(self, y_pred, y_test):
        return np.mean(y_pred == y_test)

if __name__ == '__main__':
    X_train, y_train, X_test, y_test = load_mnist()
    clf = NaiveBayes()
    prob_c, prob_cj = clf.fit(X_train, y_train)
    
    y_pred = clf.predict(X_test)
    
    acc = clf.accuracy(y_pred, y_test)
    print('Predicted class counts:', Counter(y_pred))
    print('Accuracy:', acc)
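Since fetch_mldata was deprecated in scikit-learn 0.20 and removed in 0.22, on a recent scikit-learn a minimal replacement loader would use fetch_openml, as in the sketch below ('mnist_784' is the OpenML dataset name; the helper name load_mnist_openml is mine):

from sklearn.datasets import fetch_openml
import numpy as np

def load_mnist_openml():
    # Downloads MNIST from OpenML on first use, then serves it from the local cache
    mnist = fetch_openml('mnist_784', version=1, as_frame=False)
    x, y = mnist["data"], mnist["target"].astype(float)  # targets arrive as strings
    x = np.where(x > 0, 1, x)  # same binarization as above
    return x[:60000], y[:60000], x[60000:], y[60000:]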

Output:
[screenshot of the run results omitted]
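As a quick sanity check (not part of the original post), scikit-learn's BernoulliNB implements the same model family with the same add-one smoothing (alpha=1.0 is the default):

from sklearn.naive_bayes import BernoulliNB

clf_sk = BernoulliNB(alpha=1.0)  # add-one smoothing, matching the hand-rolled version
clf_sk.fit(X_train, y_train)
print('sklearn BernoulliNB accuracy:', clf_sk.score(X_test, y_test))

BernoulliNB also multiplies in 1 - p(x_j = 1 | y) for the pixels that are off, so its scores and accuracy will generally differ somewhat from the implementation above, which only uses the active pixels.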
