手动实现朴素贝叶斯分类器算法(平滑方法)根据天气情况预测要不要去打网球

请手动实现(不准调用任何现成的机器学习工具包中的朴素贝叶斯分类器)朴素贝叶斯分类器算法(包括概率平滑方法),并在ppt中列出的D14数据集上进行训练和验证:将D14数据集随机打乱后,取10个样例作为训练集,另外4个作为测试集;输出测试结果。
手动实现朴素贝叶斯分类器算法(平滑方法)根据天气情况预测要不要去打网球_第1张图片

import random

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from collections import defaultdict
# Steps of the naive Bayes implementation:
#   1. Load the samples of the data set.
#   2. Analyse / preprocess the samples.
#   3. Compute the probabilities (prior, conditional, joint).
#   4. Apply Bayes' formula to obtain the predicted probabilities.
# Notes from the original write-up: handling different data types (Bernoulli,
# multinomial, continuous) and zero probabilities additionally involves
#   - probability density functions
#   - Laplace smoothing

# Possible values of every attribute; the class label comes last.
outlook = ["Sunny", "Overcast", "Rain"]
Temperature = ["Hot", "Mild", "Cool"]
Humidity = ["High", "Normal"]
Wind = ["Strong", "Weak"]
PlayTennis = ["Yes", "No"]

# Table of all value lists, in the same column order as the samples.
Play = [outlook, Temperature, Humidity, Wind, PlayTennis]

# Data set: one sample per row.
# Column order: Outlook, Temperature, Humidity, Wind, PlayTennis (class label).
data = [
    ["Sunny", "Hot", "High", "Weak", "No"],
    ["Sunny", "Hot", "High", "Strong", "No"],
    ["Overcast", "Hot", "High", "Weak", "Yes"],
    ["Rain", "Mild", "High", "Weak", "Yes"],
    ["Rain", "Cool", "Normal", "Weak", "Yes"],
    ["Rain", "Cool", "Normal", "Strong", "No"],
    ["Overcast", "Cool", "Normal", "Strong", "Yes"],
    ["Sunny", "Mild", "High", "Weak", "No"],
    ["Sunny", "Cool", "Normal", "Weak", "Yes"],
    ["Rain", "Mild", "Normal", "Weak", "Yes"],
    ["Sunny", "Mild", "Normal", "Strong", "Yes"],
    ["Overcast", "Mild", "High", "Strong", "Yes"],
    ["Overcast", "Hot", "Normal", "Weak", "Yes"],
    ["Rain", "Mild", "High", "Strong", "No"],
]

# Shuffle the samples in place, echo the shuffled order, then split them:
# the first 10 rows form the training set, the remaining rows the test set.
length = len(data)
random.shuffle(data)
for row in data:
    print(row)
train, test = data[:10], data[10:]
train_length = len(train)
test_length = len(test)

def count_PlayTennis_total(data):
    """Count how many samples belong to each PlayTennis class.

    Args:
        data: Iterable of samples; index 4 of every sample is the class label.

    Returns:
        A ``defaultdict(int)`` mapping each label found in ``data`` to the
        number of samples carrying it (empty when ``data`` is empty).
    """
    count = defaultdict(int)
    # Walk the rows that were actually passed in rather than relying on the
    # size of the module-level training split.
    for row in data:
        count[row[4]] += 1
    return count

#先验概率
def cal_base_rates(data, n_classes=2):
    """Estimate the Laplace-smoothed prior probability of every class.

    The prior of a label is ``(count(label) + 1) / (N + n_classes)`` where
    ``N`` is the number of samples in ``data``.

    Args:
        data: Sequence of samples; index 4 of every sample is the class label.
        n_classes: Number of possible class labels used in the smoothing
            denominator. Defaults to 2 (the PlayTennis labels "Yes" / "No").

    Returns:
        A dict mapping each label found in ``data`` to its smoothed prior,
        in order of first appearance. Empty when ``data`` is empty.
    """
    # Count the labels of the samples that were passed in, so the result is
    # consistent with ``data`` instead of the module-level training split.
    label_counts = {}
    for row in data:
        label = row[4]
        label_counts[label] = label_counts.get(label, 0) + 1

    total = len(data)
    priors = {}
    for label, occurrences in label_counts.items():
        priors[label] = (occurrences + 1) / (total + n_classes)
    return priors

# Show the smoothed class priors estimated from the training split.
print(cal_base_rates(train))

def count_sj(attr, Play):
    """Return the size of the first value list in ``Play`` that contains ``attr``.

    Args:
        attr: An attribute value such as "Sunny".
        Play: Sequence of value lists, one per attribute.

    Returns:
        The length of the first list containing ``attr``, or ``None`` when
        no list contains it.
    """
    matching_sizes = (len(values) for values in Play if attr in values)
    return next(matching_sizes, None)

# Likelihood p(x|y), also called the conditional probability.
def likelihold_prob(data, attributes=None):
    """Estimate Laplace-smoothed conditional probabilities P(value | class).

    For a feature with ``S`` possible values, the probability of ``value``
    given ``label`` is ``(count(value, label) + 1) / (count(label) + S)``.
    Every possible value of every feature gets an entry, so a value that
    never occurs together with a class still receives a non-zero smoothed
    probability instead of silently reading back as 0.

    Args:
        data: Iterable of samples; indices 0-3 hold the feature values and
            index 4 the class label.
        attributes: Optional sequence whose first four entries are the lists
            of possible values of the four features. Defaults to the
            module-level ``Play`` table.

    Returns:
        A dict mapping each class label found in ``data`` to a
        ``defaultdict(int)`` that maps a feature value to its smoothed
        probability. Probabilities are keyed by the value alone, so feature
        values are expected to be unique across features. Values in ``data``
        that are not listed in ``attributes`` are ignored.
    """
    if attributes is None:
        attributes = Play

    # Single pass over the samples that were passed in: samples per class
    # and, per class, how often each feature value occurs.
    class_totals = defaultdict(int)
    value_counts = {}
    for row in data:
        label = row[4]
        class_totals[label] += 1
        per_class = value_counts.setdefault(label, defaultdict(int))
        for value in row[:4]:
            per_class[value] += 1

    likelihold = {}
    for label, total in class_totals.items():
        seen = value_counts[label]
        # Temporary table holding the probability of every feature value.
        attr_prob = defaultdict(int)
        for domain in attributes[:4]:
            sj = len(domain)  # number of possible values of this feature
            for value in domain:
                attr_prob[value] = (seen.get(value, 0) + 1) / (total + sj)
        likelihold[label] = attr_prob
    return likelihold

# Estimate the conditional probabilities on the training split and show them.
LikeHold = likelihold_prob(train)
print(LikeHold)
############################################################
def Test(data,test):
    """Classify one sample with the naive Bayes model estimated from ``data``.

    For every class found in ``data`` the score is the product of the
    conditional probabilities of the sample's four feature values and the
    class prior. The sample and the per-class scores are printed.

    Args:
        data: Training samples passed on to the estimation functions.
        test: The sample to classify; indices 0-3 are its feature values.

    Returns:
        The class label with the highest score.
    """
    class_counts = count_PlayTennis_total(data)
    cond_probs = likelihold_prob(data)
    priors = cal_base_rates(data)
    RATE = defaultdict(int)
    print(test)
    for label in class_counts:
        score = 1
        for position in range(4):
            score *= cond_probs[label][test[position]]
        RATE[label] = score * priors[label]
    print("预测结果: " )
    print(RATE)
    # Ascending sort by score; the last entry is the most probable class.
    ranked = sorted(RATE, key=RATE.__getitem__)
    return ranked[-1]

if __name__ == "__main__":
    # Report the model estimated from the training split, then classify the
    # four held-out samples using only their feature columns.
    print(cal_base_rates(train))
    print(likelihold_prob(train))
    for sample_index in range(4):
        print(Test(train, test[sample_index][:4]))

我是根据李航《统计学习方法》一书中的公式编写的;由于数据量很小,所以测试准确率很差。

你可能感兴趣的:(机器学习算法,机器学习,python,算法)