- Li Hang:
The naive Bayes method is a classification method based on Bayes' theorem and the assumption of conditional independence among the features.
For a given training data set, it first learns the joint probability distribution of the input and output under the feature conditional independence assumption; then, based on this model, for a given input x it uses Bayes' theorem to find the output y with the largest posterior probability.
The naive Bayes method is simple to implement and learns efficiently.
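In symbols: the method learns the prior $P(Y=c_k)$ and the class-conditional probabilities $P(X^{(j)}=x^{(j)} \mid Y=c_k)$ from the training data, and classifies a new input $x$ by

$$y = \arg\max_{c_k} P(Y=c_k) \prod_{j} P(X^{(j)}=x^{(j)} \mid Y=c_k).$$

With the Bayesian (Laplace-smoothed) estimates that the code below computes, where $\lambda$ is the smoothing parameter,

$$P_\lambda(Y=c_k)=\frac{\sum_{i=1}^{N} I(y_i=c_k)+\lambda}{N+K\lambda}, \qquad P_\lambda(X^{(j)}=a_{jl}\mid Y=c_k)=\frac{\sum_{i=1}^{N} I(x_i^{(j)}=a_{jl},\,y_i=c_k)+\lambda}{\sum_{i=1}^{N} I(y_i=c_k)+S_j\lambda},$$

for $N$ training samples, $K$ classes, and $S_j$ distinct values of the $j$-th feature.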
Python code for Example 4.2 in the book:
import numpy as np

class bayes(object):
    def __init__(self, data, label, num_class, L):
        # data : (ndarray) samples_nums x features_nums
        self.data = data
        self.label = label
        self.num_class = num_class
        self.L = L                               # Laplace smoothing parameter
        self.p_params = []                       # per-class conditional probabilities
        self.p_label = np.zeros(self.num_class)  # smoothed class priors
        self.fea_condition = []                  # value set of each feature
        self.__get_p_gram()
    def __get_p_gram(self):
        self.__generation_features_conditional()
        index_sort = np.argsort(self.label)  # sample indices grouped by class
        start = 0
        for i in range(self.num_class):
            # i is the class index; Ik is the number of samples with label i
            Ik = list(self.label).count(i)
            # smoothed prior P(Y = i)
            self.p_label[i] = (Ik + self.L) / (len(self.label) + self.num_class * self.L)
            end = start + Ik
            condition_k = []  # probabilities of each feature value within class i
            for fea_index, condition in enumerate(self.fea_condition):
                temp = self.data[index_sort[start:end], fea_index]
                condition_kj = []  # smoothed P(X_j = c | Y = i) for every value c
                for c in condition:
                    condition_kj.append((list(temp).count(c) + self.L) / (Ik + len(condition) * self.L))
                condition_k.append(condition_kj)
            start = end
            self.p_params.append(condition_k)
    def __generation_features_conditional(self):
        # collect every value each feature takes in the training data
        features_nums = self.data.shape[1]
        for j in range(features_nums):
            # j is the feature index
            self.fea_condition.append(np.unique(self.data[:, j]))
    def classify(self, target):
        p = list(self.p_label)
        for index, _ in enumerate(p):
            for fea_index, fea in enumerate(list(target)):
                # position of this feature value within its value set
                # (indexed by fea_index, the feature, not by the class index)
                fea_local = list(self.fea_condition[fea_index]).index(fea)
                p[index] *= self.p_params[index][fea_index][fea_local]
        c = int(np.argmax(p))
        return p, c
data = np.array([[1, 1], [1, 2], [1, 2], [1, 1], [1, 1], [2, 1], [2, 2], [2, 2], [2, 3], [2, 3], [3, 3], [3, 2], [3, 2], [3, 3], [3, 3]])
label = np.array([0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0])
target = [2, 1]
num_class = 2
L = 1  # Laplace smoothing parameter
model = bayes(data, label, num_class, L)
p, c = model.classify(target)
print('Target belongs to class %s,\nP is %s.\n' % (c, p[c]))
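With this data (the table of the book's Example 4.2, with S/M/L encoded as 1/2/3 and the classes -1/1 encoded as 0/1) and L = 1, the script reproduces Example 4.3: for the target x = (2, S) the smoothed scores are about 0.0610 for class 0 and 0.0327 for class 1, so class 0 is printed.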
The sklearn code below uses the MNIST data from Kaggle's digit-recognizer competition, with the features reduced to six dimensions by PCA:
# -*- coding: utf-8 -*-
"""
An example of classification with sklearn's naive Bayes,
using the Kaggle digit-recognizer (MNIST) data set.
"""
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA
# load a data set and project it with the given PCA
def load_data(filename, pca, mode):
    data_pd = pd.read_csv(filename)
    data = np.asarray(data_pd)
    if not mode == 'test':
        # first column is the label; fit the PCA on the training features
        dataset = pca.fit_transform(data[:, 1:])
        return dataset, data[:, 0]
    else:
        # reuse the PCA fitted on the training set instead of refitting on test data
        dataset = pca.transform(data)
        return dataset, None
def main(train_data_path, test_data_path, n_dim):
    pca = PCA(n_components=n_dim)
    train_data, train_label = load_data(train_data_path, pca, 'train')
    print("Train set :" + repr(len(train_data)))
    test_data, _ = load_data(test_data_path, pca, 'test')
    print("Test set :" + repr(len(test_data)))
    bys = GaussianNB()
    # fit on the training set
    bys.fit(train_data, train_label)
    # training accuracy
    score = bys.score(train_data, train_label)
    print(">Training accuracy = " + repr(score))
    predictions = []
    for index in range(len(test_data)):
        # predict the label of one test sample
        result = bys.predict([test_data[index]])
        predictions.append([index + 1, result[0]])
        print(">Index : %s, predicted = %s" % (index + 1, result[0]))
    columns = ['ImageId', 'Label']
    save_file = pd.DataFrame(columns=columns, data=predictions)
    save_file.to_csv('bys.csv', index=False, encoding="utf-8")

if __name__ == "__main__":
    train_data_path = 'train.csv'
    test_data_path = 'test.csv'
    n_dim = 6
    main(train_data_path, test_data_path, n_dim)
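For reference, the same train-then-score flow can be written more compactly with sklearn's Pipeline, which also guarantees the PCA is fitted only on training folds during cross-validation. This is a minimal sketch, assuming the same train.csv layout (label in the first column); the file name and fold count are illustrative.

import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score

# assumed layout: first column is the label, the rest are pixel features
data = np.asarray(pd.read_csv('train.csv'))
X, y = data[:, 1:], data[:, 0]

# PCA and GaussianNB chained into a single estimator
model = make_pipeline(PCA(n_components=6), GaussianNB())
scores = cross_val_score(model, X, y, cv=5)  # 5-fold cross-validated accuracy
print("CV accuracy: %.4f +/- %.4f" % (scores.mean(), scores.std()))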
Exercises