Naive Bayes Model Implementation

"""
Naive Bayes model: for each label, compute the numerator of the posterior
probability (the shared denominator can be ignored) and predict the label
with the largest value. Laplace smoothing with parameter _lambda is applied
throughout (see the smoothed estimates sketched after this docstring).
The model suits features and labels that take finitely many discrete values,
so it does not fit the iris dataset used in the previous posts, whose feature
values are arbitrary real numbers within an interval.
The data is watermelon dataset 3.0 (columns: id, color, root, knock sound,
texture, navel, touch, density, sugar rate, good melon). It is quite small,
so the final results are not great.
编号,色泽,根蒂,敲声,纹理,脐部,触感,密度,含糖率,好瓜
1,青绿,蜷缩,浊响,清晰,凹陷,硬滑,0.697,0.460,是
2,乌黑,蜷缩,沉闷,清晰,凹陷,硬滑,0.774,0.376,是
3,乌黑,蜷缩,浊响,清晰,凹陷,硬滑,0.634,0.264,是
4,青绿,蜷缩,沉闷,清晰,凹陷,硬滑,0.608,0.318,是
5,浅白,蜷缩,浊响,清晰,凹陷,硬滑,0.556,0.215,是
6,青绿,稍蜷,浊响,清晰,稍凹,软粘,0.403,0.237,是
7,乌黑,稍蜷,浊响,稍糊,稍凹,软粘,0.481,0.149,是
8,乌黑,稍蜷,浊响,清晰,稍凹,硬滑,0.437,0.211,是
9,乌黑,稍蜷,沉闷,稍糊,稍凹,硬滑,0.666,0.091,否
10,青绿,硬挺,清脆,清晰,平坦,软粘,0.243,0.267,否
11,浅白,硬挺,清脆,模糊,平坦,硬滑,0.245,0.057,否
12,浅白,蜷缩,浊响,模糊,平坦,软粘,0.343,0.099,否
13,青绿,稍蜷,浊响,稍糊,凹陷,硬滑,0.639,0.161,否
14,浅白,稍蜷,沉闷,稍糊,凹陷,硬滑,0.657,0.198,否
15,乌黑,稍蜷,浊响,清晰,稍凹,软粘,0.360,0.370,否
16,浅白,蜷缩,浊响,模糊,平坦,硬滑,0.593,0.042,否
17,青绿,蜷缩,沉闷,稍糊,稍凹,硬滑,0.719,0.103,否
"""

from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

# load watermelon dataset 3.0 (comma-separated text file)
df = pd.read_csv("./data/西瓜数据集3_0.txt")
X = np.asarray(df.iloc[:, 1:7])  # the six discrete feature columns (色泽 .. 触感)
y = np.asarray(df.iloc[:, 9])  # the label column (好瓜)
train_X, test_X, train_y, test_y = train_test_split(X, y, train_size=0.7)
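# Note: with only 17 samples, a purely random split can leave a label nearly or
# entirely absent from the training set. A useful option here is stratify=y,
# which makes train_test_split preserve the label ratio in both splits:
# train_X, test_X, train_y, test_y = train_test_split(X, y, train_size=0.7, stratify=y)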


class NaiveBayes:
	def __init__(self, _lambda):
		self._lambda = _lambda
		self.ConditionalProbability = []  # per-feature lists of (value, label, P(value | label)) tuples
		self.PrioriProbabilities = None  # dict mapping each label to its smoothed prior probability
		self.labels = None  # distinct labels seen during training

	def train(self, train_X, train_y):
		y = np.asarray(train_y)
		X = np.asarray(train_X)
		# compute the smoothed prior distribution
		train_size = X.shape[0]
		self.labels, self.labels_count = np.unique(y, return_counts=True)
		label_num = len(self.labels)
		label_count_dict = dict(zip(self.labels, self.labels_count))
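		# Laplace-smoothed prior: P(label) = (count(label) + _lambda) / (train_size + label_num * _lambda)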
		self.PrioriProbabilities = dict(
			zip(self.labels, (self._lambda + self.labels_count) / (train_size + label_num * self._lambda)))
		# for each feature, count every (feature value, label) pair in a dict: {(value, label): count}
		for feature_index in range(X.shape[1]):
			result = {}
			feature = X[:, feature_index]
			for index in range(len(feature)):
				result[(feature[index], y[index])] = result.get((feature[index], y[index]), 0) + 1
			# compute the smoothed conditional probabilities

			"""
			S_j是能够与label产生实际组合的特征数量,不是所有的特征数量,大坑,此处要注意,例如,对于第一个特征色泽:
			{('浅白', '否'): 3, ('青绿', '是'): 2, ('青绿', '否'): 3, ('乌黑', '是'): 1, ('浅白', '否'): 1, ('乌黑', '否'): 1}
			'是'这个label对应的特征数目(S_j)是2,它没有和'浅白'这个特征取值产生组合,因此S_j是2,如果按照S_j算的话条件概率之和不是1,会小于1的,因为分母大了
			而对于'否'这个label,它和浅白,乌黑,青绿都产生了组合,它的S_j是3
			很容易误认为某个特征的所有可能取值就是S_j的值,并不是这样的,需要看具体数据,产生实际组合才算
			orz,这里的坑要注意呀
			"""

			# count, for each label, how many distinct feature values were actually observed with it (this is S_j)
			label_features_count_dic = {}
			for key in result.keys():
				label = key[1]
				label_features_count_dic[label] = label_features_count_dic.get(label, 0) + 1
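			# e.g. for the sample counts in the docstring above: label_features_count_dic == {'是': 2, '否': 3}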
			feature_label_data = []
			for key in result.keys():
				# smoothed conditional probability:
				# P(value | label) = (count + _lambda) / (count(label) + S_j * _lambda)
				feature_label_data.append(
					(key[0], key[1], (result[key] + self._lambda) / (
							label_count_dict[key[1]] + label_features_count_dic[key[1]] * self._lambda)))
			self.ConditionalProbability.append(feature_label_data)


	def predict(self, x):
		max_percent = -1
		predict_label = None
		for label in self.PrioriProbabilities.keys():
			temp_percent = self.PrioriProbabilities[label]
			for feature_index, feature_value in enumerate(x):
				for data in self.ConditionalProbability[feature_index]:
					"""
					self.ConditionalProbability=[
					[('浅白', '否', 0.4), ('青绿', '否', 0.3), ('青绿', '是', 0.2857142857142857), ('乌黑', '否', 0.3), ('乌黑', '是', 0.42857142857142855), ('浅白', '是', 0.2857142857142857)],
					 [('稍蜷', '否', 0.5), ('蜷缩', '否', 0.3), ('蜷缩', '是', 0.6666666666666666), ('稍蜷', '是', 0.3333333333333333), ('硬挺', '否', 0.2)],
					  [('沉闷', '否', 0.4), ('浊响', '是', 1.0), ('浊响', '否', 0.4), ('清脆', '否', 0.2)], 
					  [('稍糊', '否', 0.5), ('清晰', '是', 1.0), ('清晰', '否', 0.2), ('模糊', '否', 0.3)], 
					  [('凹陷', '否', 0.3), ('稍凹', '否', 0.4), ('凹陷', '是', 0.6666666666666666), ('稍凹', '是', 0.3333333333333333), ('平坦', '否', 0.3)], 
					  [('硬滑', '否', 0.6666666666666666), ('硬滑', '是', 1.0), ('软粘', '否', 0.3333333333333333)]
					]
					"""
					# for each feature, search only this feature's own list of conditional probabilities
					if data[0] == x[feature_index] and data[1] == label:
						temp_percent *= data[2]
						break
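				# note: if this (feature value, label) pair never occurred in training,
				# the inner loop falls through and the feature silently contributes a
				# factor of 1 instead of a small smoothed probability (a limitation of
				# storing only observed combinations)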
			if temp_percent >= max_percent:
				max_percent = temp_percent
				predict_label = label
		return predict_label


nb = NaiveBayes(_lambda=1)
nb.train(train_X, train_y)

for index, x in enumerate(test_X):
	print(x)
	print("predicted: {}".format(nb.predict(x)))
	print("actual: {}".format(test_y[index]))
