请手动实现朴素贝叶斯分类器算法(包括概率平滑方法;不准调用任何现成机器学习工具包中的朴素贝叶斯分类器),并在PPT中列出的D14数据集上进行训练和验证:将D14数据集随机打乱后,取10个样例作为训练集,另外4个作为测试集;输出测试结果。
import random
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from collections import defaultdict
'''
朴素贝叶斯代码的实现步骤
提取数据集数据
分析处理数据集数据
计算概率(先验概率、条件概率、联合概率)
根据贝叶斯公式计算预测概率
其中处理不同数据类型(伯努利,多项式,连续型)和0概率情况,还用到了:
概率密度函数
拉普拉斯平滑
'''
# Value domain of every attribute; the class label (PlayTennis) comes last.
outlook = ["Sunny", "Overcast", "Rain"]
Temperature = ["Hot", "Mild", "Cool"]
Humidity = ["High", "Normal"]
Wind = ["Strong", "Weak"]
PlayTennis = ["Yes", "No"]
# Domains gathered in column order, matching the layout of each data row.
Play = [outlook, Temperature, Humidity, Wind, PlayTennis]

# Data set: each row is Outlook, Temperature, Humidity, Wind, PlayTennis.
data = [
    ["Sunny", "Hot", "High", "Weak", "No"],
    ["Sunny", "Hot", "High", "Strong", "No"],
    ["Overcast", "Hot", "High", "Weak", "Yes"],
    ["Rain", "Mild", "High", "Weak", "Yes"],
    ["Rain", "Cool", "Normal", "Weak", "Yes"],
    ["Rain", "Cool", "Normal", "Strong", "No"],
    ["Overcast", "Cool", "Normal", "Strong", "Yes"],
    ["Sunny", "Mild", "High", "Weak", "No"],
    ["Sunny", "Cool", "Normal", "Weak", "Yes"],
    ["Rain", "Mild", "Normal", "Weak", "Yes"],
    ["Sunny", "Mild", "Normal", "Strong", "Yes"],
    ["Overcast", "Mild", "High", "Strong", "Yes"],
    ["Overcast", "Hot", "Normal", "Weak", "Yes"],
    ["Rain", "Mild", "High", "Strong", "No"],
]
length = len(data)

# Shuffle the rows in place, then echo them in their new order.
random.shuffle(data)
for row in data:
    print(row)

# The first 10 shuffled rows form the training set; the rest are held out.
train, test = data[:10], data[10:]
train_length = len(train)
test_length = len(test)
def count_PlayTennis_total(data):
    """Count how many rows of ``data`` carry each class label.

    Args:
        data: Iterable of samples; the class label is read from index 4 of
            each row.

    Returns:
        A ``defaultdict(int)`` mapping each label to its number of rows.
    """
    count = defaultdict(int)
    # Walk the rows that were actually passed in rather than indexing up to
    # the module-level training-set size, so a data set of any length
    # (including an empty one) is counted correctly.
    for row in data:
        count[row[4]] += 1
    return count
def cal_base_rates(data):
    """Estimate the Laplace-smoothed prior probability of each class label.

    Each prior is ``(count(label) + 1) / (N + K)``, where ``N`` is the number
    of samples counted by ``count_PlayTennis_total`` and ``K`` is the number
    of possible class labels listed in ``PlayTennis``.

    Args:
        data: Samples whose class label sits at index 4.

    Returns:
        dict mapping every label observed in ``data`` to its smoothed prior.
    """
    label_counts = count_PlayTennis_total(data)
    # Derive N from the counts themselves instead of the global training set,
    # so the priors always agree with the data that was passed in.
    sample_total = sum(label_counts.values())
    class_total = len(PlayTennis)
    return {
        label: (occurrences + 1) / (sample_total + class_total)
        for label, occurrences in label_counts.items()
    }
# Show the smoothed class priors estimated from the training split.
print(cal_base_rates(train))
def count_sj(attr, Play):
    """Return the number of possible values of the attribute owning ``attr``.

    The result is the ``S_j`` term that the caller adds to the class count in
    the denominator of the Laplace-smoothed conditional probability.

    Args:
        attr: An attribute value, e.g. ``"Sunny"``.
        Play: List of value domains, one list per attribute.

    Returns:
        Length of the first domain in ``Play`` that contains ``attr``.

    Raises:
        ValueError: If no domain in ``Play`` contains ``attr``.
    """
    for domain in Play:
        if attr in domain:
            return len(domain)
    # Falling off the end used to return None implicitly, which only showed up
    # later as an opaque TypeError inside the smoothing arithmetic.
    raise ValueError("unknown attribute value: {!r}".format(attr))
def likelihold_prob(data):
    """Estimate Laplace-smoothed conditional probabilities P(value | label).

    For every class label found in ``data`` and every value listed in the
    four attribute domains of ``Play``, the estimate is
    ``(count(value, label) + 1) / (count(label) + S_j)``, where ``S_j`` is the
    size of the domain the value belongs to.

    Args:
        data: Samples made of four attribute values followed by the class
            label at index 4.

    Returns:
        dict mapping each label to a ``defaultdict(int)`` that maps every
        attribute value from ``Play`` to its smoothed probability. A value
        that appears in none of the domains still reads as 0.
    """
    label_counts = count_PlayTennis_total(data)
    feature_domains = Play[:4]

    # Tally (label, value) co-occurrences in one pass over the rows that were
    # actually supplied, not over the global training-set length.
    joint_counts = defaultdict(lambda: defaultdict(int))
    for row in data:
        for value in row[:4]:
            joint_counts[row[4]][value] += 1

    likelihold = {}
    for label, label_total in label_counts.items():
        attr_prob = defaultdict(int)
        # Fill in every value of every domain: a value never observed with
        # this label must get 1 / (label_total + S_j). Leaving it out would
        # make the later lookup return 0 and wipe out the whole product.
        for domain in feature_domains:
            denominator = label_total + len(domain)
            for value in domain:
                attr_prob[value] = (joint_counts[label][value] + 1) / denominator
        likelihold[label] = attr_prob
    return likelihold
# Estimate the conditional probabilities from the training split and show them.
LikeHold = likelihold_prob(train)
print(LikeHold)
############################################################
def Test(data, test):
    """Classify one sample with the naive Bayes model estimated from ``data``.

    Prints the sample, a header line and the per-label scores, where a score
    is the product of the four conditional probabilities multiplied by the
    label's prior.

    Args:
        data: Training samples handed to the counting and probability helpers.
        test: The four attribute values of the sample to classify.

    Returns:
        The label with the largest score; among equal scores the label that
        was scored last wins.
    """
    labels = count_PlayTennis_total(data)
    conditional = likelihold_prob(data)
    priors = cal_base_rates(data)
    scores = defaultdict(int)
    print(test)
    for label in labels:
        product = 1
        # Multiply the conditionals first and the prior last.
        for position in range(4):
            product *= conditional[label][test[position]]
        scores[label] = product * priors[label]
    print("预测结果: " )
    print(scores)
    # Ascending stable sort: the best-scoring label ends up in the last slot.
    ranked = sorted(scores, key=scores.get)
    return ranked[-1]
if __name__ == '__main__':
    # Show the fitted model: class priors first, then the conditionals.
    print(cal_base_rates(train))
    print(likelihold_prob(train))
    # Classify the four held-out samples. Only the attribute columns are
    # passed in; the true label in column 4 is withheld from the classifier.
    for index in range(4):
        print(Test(train, test[index][:4]))
我是根据李航《统计学习方法》一书中的公式编写的;由于数据量很小,测试准确率很差。