"""
朴素贝叶斯模型,通过计算后验概率的分子,并比较大小,预测label
计算时使用了拉普拉斯平滑,参数为_lambda
适合预测特征和标签取值只有有限个的情况,不适合前几次使用的iris数据集,它们的特征数量是一个区间内的任意实数
使用数据集为西瓜数据集3.0,数据比较少,最后效果不太好
编号,色泽,根蒂,敲声,纹理,脐部,触感,密度,含糖率,好瓜
1,青绿,蜷缩,浊响,清晰,凹陷,硬滑,0.697,0.460,是
2,乌黑,蜷缩,沉闷,清晰,凹陷,硬滑,0.774,0.376,是
3,乌黑,蜷缩,浊响,清晰,凹陷,硬滑,0.634,0.264,是
4,青绿,蜷缩,沉闷,清晰,凹陷,硬滑,0.608,0.318,是
5,浅白,蜷缩,浊响,清晰,凹陷,硬滑,0.556,0.215,是
6,青绿,稍蜷,浊响,清晰,稍凹,软粘,0.403,0.237,是
7,乌黑,稍蜷,浊响,稍糊,稍凹,软粘,0.481,0.149,是
8,乌黑,稍蜷,浊响,清晰,稍凹,硬滑,0.437,0.211,是
9,乌黑,稍蜷,沉闷,稍糊,稍凹,硬滑,0.666,0.091,否
10,青绿,硬挺,清脆,清晰,平坦,软粘,0.243,0.267,否
11,浅白,硬挺,清脆,模糊,平坦,硬滑,0.245,0.057,否
12,浅白,蜷缩,浊响,模糊,平坦,软粘,0.343,0.099,否
13,青绿,稍蜷,浊响,稍糊,凹陷,硬滑,0.639,0.161,否
14,浅白,稍蜷,沉闷,稍糊,凹陷,硬滑,0.657,0.198,否
15,乌黑,稍蜷,浊响,清晰,稍凹,软粘,0.360,0.370,否
16,浅白,蜷缩,浊响,模糊,平坦,硬滑,0.593,0.042,否
17,青绿,蜷缩,沉闷,稍糊,稍凹,硬滑,0.719,0.103,否
"""
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
# Load the watermelon 3.0 dataset; keep only the six discrete feature
# columns (indices 1-6: 色泽..触感) and the label column (index 9: 好瓜).
# The two continuous columns (密度, 含糖率) are dropped on purpose — this
# model only handles features with finitely many values.
df = pd.read_table("./data/西瓜数据集3_0.txt", sep=",")
X = df.iloc[:, 1:7].to_numpy()
y = df.iloc[:, 9].to_numpy()
train_X, test_X, train_y, test_y = train_test_split(X, y, train_size=0.7)
class NaiveBayes():
    """Naive Bayes classifier for discrete features with Laplace smoothing.

    Predicts argmax_c P(c) * prod_j P(x_j | c), where every probability is
    additively smoothed with parameter ``_lambda``.  Suitable only for
    features and labels that take finitely many values.
    """

    def __init__(self, _lambda):
        # Additive smoothing strength; 1 gives classic Laplace smoothing.
        self._lambda = _lambda
        # Per feature: list of (feature_value, label, P(value | label)) triples.
        self.ConditionalProbability = []
        # dict label -> smoothed prior P(label); filled by train().
        self.PrioriProbabilities = None
        # np.ndarray of distinct labels; filled by train().
        self.labels = None
        # Per feature: dict label -> smoothed probability for a (value, label)
        # pair never seen during training; filled by train().
        self._unseen_probability = []

    def train(self, train_X, train_y):
        """Estimate smoothed priors and per-feature conditional probabilities.

        Parameters
        ----------
        train_X : array-like of shape (n_samples, n_features), discrete values.
        train_y : array-like of shape (n_samples,), class labels.
        """
        X = np.asarray(train_X)
        y = np.asarray(train_y)
        train_size = X.shape[0]
        # Reset learned state so calling train() a second time does not
        # accumulate stale per-feature tables (the original kept appending).
        self.ConditionalProbability = []
        self._unseen_probability = []
        self.labels, self.labels_count = np.unique(y, return_counts=True)
        label_num = len(self.labels)
        label_count_dict = dict(zip(self.labels, self.labels_count))
        # Smoothed prior: (N_c + lambda) / (N + K * lambda), K = #labels.
        self.PrioriProbabilities = dict(
            zip(self.labels,
                (self._lambda + self.labels_count) / (train_size + label_num * self._lambda)))
        for feature_index in range(X.shape[1]):
            feature = X[:, feature_index]
            # Count every (feature_value, label) co-occurrence.
            pair_count = {}
            for value, label in zip(feature, y):
                pair_count[(value, label)] = pair_count.get((value, label), 0) + 1
            # S_j per label: the number of DISTINCT feature values that
            # actually co-occur with that label in the training data — NOT the
            # number of possible values of the feature.  E.g. for the first
            # feature, '是' may combine with only 2 of the 3 colour values, so
            # its S_j is 2; using 3 would make the smoothed conditional
            # probabilities sum to less than 1 (denominator too large).
            s_j = {}
            for _, label in pair_count:
                s_j[label] = s_j.get(label, 0) + 1
            feature_label_data = []
            for (value, label), count in pair_count.items():
                denominator = label_count_dict[label] + s_j[label] * self._lambda
                feature_label_data.append(
                    (value, label, (count + self._lambda) / denominator))
            self.ConditionalProbability.append(feature_label_data)
            # Smoothed probability for combinations absent from training:
            # lambda / (N_c + S_j * lambda).  The original predict() silently
            # skipped such factors, effectively multiplying by 1 and inflating
            # that label's score — exactly the case smoothing should cover.
            self._unseen_probability.append({
                label: self._lambda / (label_count_dict[label] + s_j[label] * self._lambda)
                for label in self.labels})

    def predict(self, x):
        """Return the label maximizing prior * product of conditionals for x."""
        max_percent = -1
        predict_label = None
        for label in self.PrioriProbabilities.keys():
            temp_percent = self.PrioriProbabilities[label]
            for feature_index, feature_value in enumerate(x):
                found = False
                for value, lbl, probability in self.ConditionalProbability[feature_index]:
                    if value == feature_value and lbl == label:
                        temp_percent *= probability
                        found = True
                        break
                if not found:
                    # Unseen (value, label) pair: apply the Laplace-smoothed
                    # fallback instead of dropping the factor.
                    temp_percent *= self._unseen_probability[feature_index][label]
            if temp_percent >= max_percent:
                max_percent = temp_percent
                predict_label = label
        return predict_label
# Use a distinct instance name: the original `NaiveBayes = NaiveBayes(...)`
# shadowed the class with its own instance, making it impossible to create
# another classifier afterwards.
model = NaiveBayes(_lambda=1)
model.train(train_X, train_y)
for index, x in enumerate(test_X):
    print(x)
    print("预测:{}".format(model.predict(x)))
    print("实际:{}".format(test_y[index]))