# Python implementation of Gaussian naive Bayes

import numpy as np
import matplotlib.pyplot as plt
from scipy import io
from scipy import stats
import os
from sklearn.metrics import precision_recall_curve
from sklearn.naive_bayes import GaussianNB
def prior(yTrain, num_classes=5):
    '''
    Compute the empirical prior probability of each class.

    :param yTrain: training labels, numpy.ndarray of shape (num_train, 1),
                   with integer class labels in {1, ..., num_classes}
    :param num_classes: number of distinct classes (default 5, matching the
                        ecoli dataset this script was written for)
    :return: numpy.ndarray of shape (num_classes, 1); row i holds the
             fraction of training samples carrying label i+1
    '''
    num_total = yTrain.shape[0]
    p = np.zeros((num_classes, 1))
    for i in range(num_classes):
        # labels are 1-based, hence the comparison against i+1
        p[i, 0] = np.sum(yTrain == i + 1) / num_total
    return p

def likelihood(xTrain, yTrain, num_classes=5):
    '''
    Estimate the per-class Gaussian parameters of every feature.

    :param xTrain: training features, numpy.ndarray [num_train, num_feat]
    :param yTrain: training labels, numpy.ndarray [num_train, 1],
                   integer labels in {1, ..., num_classes}
    :param num_classes: number of classes (default 5)
    :return: (M, V)
             M numpy.ndarray [num_feat, num_classes]; M[m, c] is the mean
               of feature m over the samples of class c+1
             V numpy.ndarray [num_feat, num_classes]; V[m, c] is the
               (biased, np.var default) variance of feature m over the
               samples of class c+1
    '''
    num_feat = xTrain.shape[1]

    M = np.zeros((num_feat, num_classes))
    # initialized to ones so a class with no samples keeps a usable variance
    V = np.ones((num_feat, num_classes))

    labels = yTrain.ravel()
    for c in range(num_classes):
        # all training rows belonging to class c+1
        class_examples = xTrain[labels == c + 1]
        M[:, c] = np.mean(class_examples, axis=0)
        V[:, c] = np.var(class_examples, axis=0)

    return M, V

def naiveBayesClassify(xTest, M, V, p):
    '''
    Predict class labels with a Gaussian naive Bayes model.

    :param xTest: test features, numpy.ndarray [num_test, num_feat]
    :param M: conditional means, numpy.ndarray [num_feat, num_classes]
    :param V: conditional variances, numpy.ndarray [num_feat, num_classes]
    :param p: class priors, numpy.ndarray [num_classes, 1]
    :return: (nb, prob)
             nb   numpy.ndarray [num_test, 1], predicted labels in
                  {1, ..., num_classes}
             prob numpy.ndarray [num_test, 1], the unnormalized joint
                  density prior * likelihood of the winning class
    '''
    num_test = xTest.shape[0]
    # derive the class count from the priors instead of hard-coding 5
    num_classes = p.shape[0]
    nb = np.zeros((num_test, 1))
    prob = np.zeros((num_test, 1))
    sample_prob = np.ones((num_test, num_classes))
    for i in range(num_test):
        for c in range(num_classes):
            # Gaussian pdf of every feature at this sample, vectorized over
            # the feature axis.  Note: a pdf value may legitimately exceed 1
            # (only the integral of the density has to be 1), e.g. a uniform
            # density on [0, 0.2] is 5 inside the interval.
            pdf = (np.exp(-(xTest[i, :] - M[:, c]) ** 2 / (2 * V[:, c]))
                   / np.sqrt(2 * np.pi * V[:, c]))
            # naive (conditional-independence) assumption: multiply the
            # per-feature densities, then weight by the class prior
            sample_prob[i, c] = np.prod(pdf) * p[c, 0]

        # class labels are 1-based, argmax is 0-based, hence the +1
        nb[i, 0] = np.argmax(sample_prob[i, :]) + 1
        prob[i, 0] = np.max(sample_prob[i, :])
    return nb, prob

def PR_curve():
    """Placeholder for a precision/recall-curve helper; does nothing yet."""
    return None

if __name__ == '__main__':
    # Load the ecoli dataset (hard-coded path from the original experiment).
    mat = io.loadmat(os.path.join('F:\\machine_learning\\yaling\\hw_3\\naiveBayes\\naiveBayes', 'ecoli.mat'))
    print(type(mat))

    xTrain = mat['xTrain']  # training features, (218, 5)
    yTrain = mat['yTrain']  # training labels,   (218, 1), values in {1..5}
    xTest = mat['xTest']    # test features,     (109, 5)
    yTest = mat['yTest']    # test labels,       (109, 1), values in {1..5}

    print(type(xTest), xTest.shape)

    print(type(yTest), yTest.shape, list(set(list(np.squeeze(yTest, 1)))))

    # Naive Bayes, step 1: prior probability of each class.
    p = prior(yTrain)

    # Step 2: M[m, c] / V[m, c] are the conditional mean / variance of
    # feature m given class c+1, modeling each conditional as a Gaussian.
    M, V = likelihood(xTrain, yTrain)

    # Reference implementation to sanity-check our estimates/predictions.
    clf = GaussianNB()

    # ravel() hands sklearn a 1-D label array, avoiding its
    # DataConversionWarning for column-vector y; predictions are unchanged.
    clf.fit(xTrain, yTrain.ravel())

    # Step 3: for every test sample pick the class maximizing
    # prior * likelihood (the unnormalized posterior).
    nb, prob = naiveBayesClassify(xTest, M, V, p)

    package_pred = clf.predict(xTest)

    # Step 4: evaluate on the test set and write the metrics to evaluation.txt.

    # accuracy: correctly predicted samples / total test samples
    accuracy = np.sum(nb == yTest) / yTest.shape[0]
    print(accuracy)

    # precision P = TP / (TP + FP): of all samples predicted positive,
    # the fraction that really is positive.
    # recall    R = TP / (TP + FN): of all ground-truth positives,
    # the fraction the model recovered.
    prec_1 = np.sum(nb[np.where(np.squeeze(yTest, axis=1) == 1)] == 1) / np.sum(nb == 1)

    rec_1 = np.sum(nb[np.where(np.squeeze(yTest, axis=1) == 1)] == 1) / np.sum(yTest == 1)

    prec_5 = np.sum(nb[np.where(np.squeeze(yTest, axis=1) == 5)] == 5) / np.sum(nb == 5)

    rec_5 = np.sum(nb[np.where(np.squeeze(yTest, axis=1) == 5)] == 5) / np.sum(yTest == 5)

    # 'eval_file' instead of 'eval' so the builtin eval() is not shadowed.
    with open(os.path.join('F:\\machine_learning\\yaling\\hw_3\\naiveBayes\\naiveBayes', 'evaluation.txt'), 'w') as eval_file:
        eval_file.write('accuracy:%.4f' % (accuracy))
        eval_file.write('\n')
        eval_file.write('precision_1:%.4f' % (prec_1))
        eval_file.write('\n')
        eval_file.write('recall_1:%.4f' % (rec_1))
        eval_file.write('\n')
        eval_file.write('precision_5:%.4f' % (prec_5))
        eval_file.write('\n')
        eval_file.write('recall_5:%.4f' % (rec_5))
        eval_file.write('\n')

    # Plot a one-vs-rest PR curve for each of the 5 classes.
    num_classes = 5
    for i in range(num_classes):

        plt.title('num_classes=' + str(i + 1) + ' Precision/Recall Curve')
        plt.xlabel('Recall')
        plt.ylabel('Precision')

        # Binary ground truth for class i+1 ...
        temp_gt = np.squeeze(yTest, axis=1)
        temp_gt = np.where(temp_gt == i + 1, 1, 0)

        # ... and scores: the predicted joint density where the model chose
        # class i+1, zero everywhere else.
        temp_pred = np.where(np.squeeze(nb, axis=1) == i + 1, np.squeeze(prob, axis=1), 0)

        # Sort by descending score.  precision_recall_curve handles unsorted
        # input too, but this mirrors the manual PR construction: rank the
        # positive scores from high to low and read off one (P, R) point per
        # test sample -- no explicit threshold sweep is needed.
        index = np.argsort(-temp_pred)
        temp_gt = temp_gt[index]
        temp_pred = temp_pred[index]

        # y_true / y_score: binary ground truth and prediction scores
        precision, recall, thresholds = precision_recall_curve(temp_gt, temp_pred)

        plt.figure(1)
        plt.plot(recall, precision)  # PR curve: recall on x, precision on y
        plt.show()
        # NOTE(review): the original plotted the identical curve a second
        # time at the end of each iteration; that duplicate was removed.

# (blog footer) You may also be interested in: numpy, python