统计学习方法 李航 贝叶斯模型 python sklearn 实现 及课后习题

  • 李航:
    朴素贝叶斯(naive bayes)法是基于贝叶斯定理与特征条件独立假设的分类方法。
    对于给定的训练数据集,首先基于特征条件独立假设学习输入输出的联合概率分布;然后基于此模型,对给定的输入X,利用贝叶斯定理求出后验概率最大的输出y。
    朴素贝叶斯法实现简单,学习效率高。

python代码书中4.2例题

import numpy as np

class bayes(object):
    """Naive Bayes classifier for discrete features with additive smoothing.

    The model is estimated eagerly in ``__init__``.

    Args:
        data: 2-D array-like of shape (samples_nums, features_nums) holding
            discrete feature values.
        label: 1-D array-like of class labels, one per sample. Labels are
            expected to be the integers ``0 .. num_class - 1``.
        num_class: number of classes.
        L: additive (Laplace) smoothing parameter; ``0`` gives plain
            maximum-likelihood estimates.

    Attributes:
        p_label: array of smoothed prior probabilities, one per class.
        fea_condition: list with, for every feature, the sorted unique values
            observed in ``data``.
        p_prams: ``p_prams[k][j][v]`` is the smoothed probability that
            feature ``j`` takes the value ``fea_condition[j][v]`` given
            class ``k``.
        model: always ``None``; kept for backward compatibility.
    """

    def __init__(self, data, label, num_class, L):
        # Accept plain nested lists as well as ndarrays: the estimation code
        # relies on ndarray indexing (``data[:, j]``, boolean masks).
        self.data = np.asarray(data)
        self.label = np.asarray(label)
        self.num_class = num_class
        self.L = L
        self.p_prams = []
        self.p_label = np.zeros(self.num_class)
        self.fea_condition = []
        self.model = self.__model()

    def __model(self):
        """Estimate all probabilities; returns ``None``."""
        self.__get_p_gram()

    def __get_p_gram(self):
        """Fill ``p_label`` (priors) and ``p_prams`` (conditionals)."""
        self.__generation_features_conditional()
        total = len(self.label)
        for i in range(self.num_class):
            # Select the samples of class i with a mask instead of slicing a
            # label-sorted index array, so the result does not depend on
            # running offsets staying aligned with the class order.
            members = self.data[self.label == i]
            Ik = len(members)
            self.p_label[i] = (Ik + self.L) / (total + self.num_class * self.L)
            # condition_k[j] holds P(feature j = value | class i) for every
            # value in fea_condition[j].
            condition_k = []
            for index, condition in enumerate(self.fea_condition):
                column = members[:, index]
                denominator = Ik + len(condition) * self.L
                condition_k.append([
                    (np.count_nonzero(column == c) + self.L) / denominator
                    for c in condition
                ])
            self.p_prams.append(condition_k)

    def __generation_features_conditional(self):
        """Collect the unique observed values of every feature column."""
        features_nums = self.data.shape[1]
        for j in range(features_nums):
            self.fea_condition.append(np.unique(self.data[:, j]))

    def classify(self, target):
        """Score ``target`` against every class and pick the best one.

        Args:
            target: sequence with one value per feature.

        Returns:
            Tuple ``(p, c)``: ``p`` is a list with the unnormalised posterior
            (prior times the conditional probabilities) of every class, and
            ``c`` is the index of the class with the largest score.

        Raises:
            ValueError: if a feature value was never seen in the training
                data for that feature.
        """
        # Position of each feature value inside ITS OWN feature's value list.
        # The lookup must be keyed by the feature index; keying it by the
        # class index only works when all features share the same value set.
        positions = []
        for fea_index, fea in enumerate(target):
            values = list(self.fea_condition[fea_index])
            try:
                positions.append(values.index(fea))
            except ValueError:
                raise ValueError(
                    "value %r of feature %d was not seen in the training data"
                    % (fea, fea_index)
                ) from None

        p = list(self.p_label)
        for index in range(len(p)):
            for fea_index, fea_local in enumerate(positions):
                p[index] *= self.p_prams[index][fea_index][fea_local]
        c = np.asarray(p).argsort()[-1]
        return p, c


# Training samples: one row per sample, two discrete features per row.
data = np.array([
    [1, 1], [1, 2], [1, 2], [1, 1], [1, 1],
    [2, 1], [2, 2], [2, 2], [2, 3], [2, 3],
    [3, 3], [3, 2], [3, 2], [3, 3], [3, 3],
])
# Class index (0 or 1) of each training sample, in the same order as `data`.
label = np.array([
    0, 0, 1, 1, 0,
    0, 0, 1, 1, 1,
    1, 1, 1, 1, 0,
])
# Sample to classify.
target = [2, 1]
num_class = 2
# Laplace smoothing parameter.
L = 1

model = bayes(data, label, num_class, L)
p, c = model.classify(target)
# Report the winning class index and its (unnormalised) score.
message = 'Target belong %s, \nP is %s.\n' % (c, p[c])
print(message)

sklearn代码所用数据为kaggle中mnist数据,将特征PCA至六维

# -*- coding: utf-8 -*-

"""
使用sklearn实现的贝叶斯算法进行分类的一个实例,
使用数据集是Kaggle数字手写体数据库
"""



import pandas as pd

import numpy as np

from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA

import sklearn



# 加载数据集

def load_data(filename, n, mode, pca=None):
    """Read a CSV file and project its feature columns with PCA.

    Args:
        filename: path of the CSV file to read.
        n: number of principal components; only used when ``pca`` is None.
        mode: ``'test'`` means every column is a feature; any other value
            means the first column is the label and the rest are features.
        pca: optional ``PCA`` instance shared between calls. In non-test mode
            it is fitted on the features and used to transform them; in test
            mode it must already be fitted and is only used to transform, so
            that test samples are projected onto the same components as the
            training samples. When omitted, a fresh ``PCA(n_components=n)``
            is fitted on the file's own features.

    Returns:
        Tuple ``(dataset, labels)``: ``dataset`` is the projected feature
        matrix; ``labels`` is the first CSV column in non-test mode and the
        placeholder ``1`` in test mode.
    """
    data = np.asarray(pd.read_csv(filename))
    is_test = mode == 'test'
    features = data if is_test else data[:, 1:]
    labels = 1 if is_test else data[:, 0]

    if pca is None:
        # No shared projection supplied: fit a new one on this file alone.
        dateset = PCA(n_components=n).fit_transform(features)
    elif is_test:
        # Reuse the projection learned elsewhere; do not refit on test data.
        dateset = pca.transform(features)
    else:
        dateset = pca.fit_transform(features)
    return dateset, labels


def main(train_data_path, test_data_path, n_dim):
    """Train GaussianNB on PCA-reduced data and write predictions to bys.csv.

    Args:
        train_data_path: CSV with the label in the first column.
        test_data_path: CSV containing feature columns only.
        n_dim: number of principal components to keep.
    """
    # One PCA for both splits: it is fitted on the training features and then
    # applied unchanged to the test features. Fitting a second PCA on the
    # test set would put the test samples in a different basis from the one
    # the classifier was trained on.
    pca = PCA(n_components=n_dim)

    train_data, train_label = load_data(train_data_path, n_dim, 'train', pca=pca)
    print("Train set :" + repr(len(train_data)))

    test_data, _ = load_data(test_data_path, n_dim, 'test', pca=pca)
    print("Test set :" + repr(len(test_data)))

    bys = GaussianNB()
    # Fit on the training set.
    bys.fit(train_data, train_label)

    # Accuracy on the training set itself.
    score = bys.score(train_data, train_label)
    print(">Training accuracy = " + repr(score))

    # Predict the whole test set in one call instead of one row at a time.
    results = bys.predict(test_data)
    predictions = []
    for image_id, result in enumerate(results, start=1):
        predictions.append([image_id, result])
        print(">Index : %s, predicted = %s" % (image_id, result))

    columns = ['ImageId', 'Label']
    save_file = pd.DataFrame(columns=columns, data=predictions)
    save_file.to_csv('bys.csv', index=False, encoding="utf-8")



if __name__ == "__main__":
    # Script entry point: train on train.csv, predict test.csv, using six
    # principal components.
    main(
        train_data_path='train.csv',
        test_data_path='test.csv',
        n_dim=6,
    )

课后习题

喜欢的关注点赞哈

你可能感兴趣的:(统计学习方法 李航 贝叶斯模型 python sklearn 实现 及课后习题)