1.1 题目的主要研究内容
1.2 题目研究的工作基础或实验条件
(1)硬件环境:Intel(R) Core(TM) i7-7700HQ CPU
(2)软件环境:Windows操作系统,使用PyCharm作为Python开发工具。
1.3 设计思想
(1)提取数据集数据
(2)分析处理数据集数据
(3)计算概率(先验概率、条件概率、联合概率)
(4)根据贝叶斯公式计算预测概率
1.4 流程图
1.5 主要程序代码(要求必须有注释)
import random
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from collections import defaultdict
# Possible values of every attribute, one list per column of the dataset.
outlook = ["Sunny", "Overcast", "Rain"]
Temperature = ["Hot", "Mild", "Cool"]
Humidity = ["High", "Normal"]
Wind = ["Strong", "Weak"]
PlayTennis = ["Yes", "No"]

# Play collects the value lists in column order: the four features first,
# then the PlayTennis label.
Play = [outlook, Temperature, Humidity, Wind, PlayTennis]
# Dataset: each row is [Outlook, Temperature, Humidity, Wind, PlayTennis].
data = [
    ["Sunny", "Hot", "High", "Weak", "No"],
    ["Sunny", "Hot", "High", "Strong", "No"],
    ["Overcast", "Hot", "High", "Weak", "Yes"],
    ["Rain", "Mild", "High", "Weak", "Yes"],
    ["Rain", "Cool", "Normal", "Weak", "Yes"],
    ["Rain", "Cool", "Normal", "Strong", "No"],
    ["Overcast", "Cool", "Normal", "Strong", "Yes"],
    ["Sunny", "Mild", "High", "Weak", "No"],
    ["Sunny", "Cool", "Normal", "Weak", "Yes"],
    ["Rain", "Mild", "Normal", "Weak", "Yes"],
    ["Sunny", "Mild", "Normal", "Strong", "Yes"],
    ["Overcast", "Mild", "High", "Strong", "Yes"],
    ["Overcast", "Hot", "Normal", "Weak", "Yes"],
    ["Rain", "Mild", "High", "Strong", "No"],
]
length = len(data)

# Shuffle in place so the train/test split differs from run to run,
# then show the shuffled order.
random.shuffle(data)
for row in data:
    print(row)

# The first ten shuffled rows form the training set, the rest the test set.
train, test = data[:10], data[10:]
train_length, test_length = len(train), len(test)
def count_PlayTennis_total(data):
    """Count how many rows of ``data`` carry each PlayTennis label.

    Args:
        data: Iterable of rows; the label is read from index 4 of each row.

    Returns:
        A ``defaultdict(int)`` mapping every label found in ``data`` to the
        number of rows that carry it (empty when ``data`` is empty).
    """
    count = defaultdict(int)
    # Walk the rows that were actually passed in.  Looping over a global
    # training-set length would miscount (or raise IndexError) whenever the
    # argument has a different number of rows.
    for row in data:
        count[row[4]] += 1
    return count
# Prior probabilities
def cal_base_rates(data):
    """Estimate the Laplace-smoothed prior probability of each label.

    Args:
        data: Rows whose label sits at index 4 (see count_PlayTennis_total).

    Returns:
        A dict mapping each label present in ``data`` to
        ``(count + 1) / (len(data) + 2)``.  Labels that never occur in
        ``data`` get no entry.
    """
    label_counts = count_PlayTennis_total(data)
    # Add-one smoothing: the denominator grows by the number of classes,
    # which is two because PlayTennis is either "Yes" or "No".
    num_classes = 2
    # Use the size of the rows passed in, not a global training set, so the
    # priors stay correct for any input.
    total_rows = len(data)
    return {
        label: (seen + 1) / (total_rows + num_classes)
        for label, seen in label_counts.items()
    }
# Show the smoothed class priors estimated from the training split.
print(cal_base_rates(train))
def count_sj(attr, Play):
    """Return the size of the first value list in ``Play`` that holds ``attr``.

    The result is the number of distinct values the matching attribute can
    take, which the caller uses as the smoothing term.  ``None`` is returned
    when no list in ``Play`` contains ``attr``.
    """
    matching_sizes = (len(values) for values in Play if attr in values)
    return next(matching_sizes, None)
def likelihold_prob(data):
    """Compute Laplace-smoothed likelihoods P(feature value | label).

    Args:
        data: Rows of four feature values followed by the label at index 4.

    Returns:
        A dict mapping each label present in ``data`` to a
        ``defaultdict(int)`` that maps every feature value listed in the
        module-level ``Play`` table to
        ``(count + 1) / (class_count + number_of_values_of_that_attribute)``.

    Raises:
        ValueError: if ``data`` contains a feature value that is not listed
            in ``Play``.
    """
    class_counts = count_PlayTennis_total(data)
    # The first four entries of Play are the feature columns; the fifth is
    # the label and must not be treated as a feature.
    feature_values = [value for values in Play[:4] for value in values]
    known_values = set(feature_values)

    likelihold = {}
    for label, class_total in class_counts.items():
        # Raw occurrence counts of every feature value among rows of `label`.
        value_counts = defaultdict(int)
        for row in data:
            if row[4] == label:
                for value in row[:4]:
                    value_counts[value] += 1

        unknown = set(value_counts) - known_values
        if unknown:
            # Fail with a clear message instead of a TypeError from adding
            # None (count_sj's "not found" result) to the class count.
            raise ValueError(
                "feature values missing from Play: {!r}".format(unknown)
            )

        # Smooth over every known value, not only the ones seen with this
        # label.  Otherwise an unseen value falls back to the defaultdict's
        # 0 and wipes out the whole product during prediction.
        attr_prob = defaultdict(int)
        for value in feature_values:
            sj = count_sj(value, Play)
            attr_prob[value] = (value_counts[value] + 1) / (class_total + sj)
        likelihold[label] = attr_prob
    return likelihold
# Conditional probabilities estimated from the training split, kept in a
# module-level name and printed for inspection.
LikeHold = likelihold_prob(train)
print(LikeHold)
############################################################
def Test(data,test):
    """Predict the PlayTennis label of one sample with naive Bayes.

    Args:
        data: Training rows used to estimate the priors and likelihoods.
        test: The sample's four feature values, read by index 0..3.

    Returns:
        The label with the highest score (prior times the product of the
        four per-feature likelihoods).  The echoed sample and the scores are
        printed as a side effect.
    """
    class_counts = count_PlayTennis_total(data)
    conditional = likelihold_prob(data)
    priors = cal_base_rates(data)
    scores = defaultdict(int)
    print(test)
    for label in class_counts:
        score = 1
        # Multiply the likelihoods of the four feature values in column order.
        for position in range(4):
            score *= conditional[label][test[position]]
        scores[label] = score * priors[label]
    print("预测结果: " )
    print(scores)
    # Ascending stable sort; the last entry is the best-scoring label.
    ranked = sorted(scores, key=scores.__getitem__)
    return ranked[-1]
if __name__ == '__main__':
    # Report the model estimated from the training split.
    print(cal_base_rates(train))
    print(likelihold_prob(train))
    # Classify every held-out sample from its four feature values.  Looping
    # over the split avoids hard-coded indices that break if its size changes.
    for sample in test:
        print(Test(train, sample[:4]))
1.6 运行结果及分析
该算法采用带拉普拉斯平滑的朴素贝叶斯方法,将14个样例随机打乱后,以十个样例为训练集,四个为测试集,通过处理,能够大致预测在不同天气的情况下,是否可以去打网球。由于每次运行时数据集都会被随机划分,预测结果可能随运行而有所不同。