《统计学习方法》第四章,测试数据同书本一样
# Training records, one per line: [x1, x2, y].
# The first two columns are the features, the last column is the class label.
trainingData = [
    [1, 'S', -1],
    [1, 'M', -1],
    [1, 'M', 1],
    [1, 'S', 1],
    [1, 'S', -1],
    [2, 'S', -1],
    [2, 'M', -1],
    [2, 'M', 1],
    [2, 'L', 1],
    [2, 'L', 1],
    [3, 'L', 1],
    [3, 'M', 1],
    [3, 'M', 1],
    [3, 'L', 1],
    [3, 'L', -1],
]
# Compute the prior probability of each class.
# Column 2 of every record is the class label; yP counts the records labelled
# 1 and yN counts all remaining records.
yP = sum(1 for record in trainingData if record[2] == 1)
yN = len(trainingData) - yP
yPositive = yP / len(trainingData)
yNegative = 1 - yPositive
# Priors in the fixed order [positive class, negative class].
precede = [yPositive, yNegative]
# Collect the distinct values found in each column of the training data.
# attrSet[i] lists the distinct values of column i (label column included).
# The order inside each list is whatever set iteration yields.
attrSet = [
    list(set(record[column] for record in trainingData))
    for column in range(len(trainingData[0]))
]
# print(attrSet)
# Helper used for counting matching records.
def countFun(indexes, attrs, data=None):
    """Count the records whose selected columns equal the given values.

    Args:
        indexes: Column positions to inspect in each record.
        attrs: Expected values; attrs[k] is compared with column indexes[k].
        data: Records to scan. When omitted, the module-level trainingData
            is used.

    Returns:
        The number of records in which every listed column matches. With an
        empty ``indexes`` every record counts as a match.
    """
    if data is None:
        data = trainingData
    return sum(
        1
        for record in data
        # all() stops at the first mismatching column.
        if all(record[col] == attrs[pos] for pos, col in enumerate(indexes))
    )
# Compute the conditional probabilities.
# conditional[v] is the pair
#   [P(feature == v | y == 1), P(feature == v | y == -1)].
# Only the feature columns are visited. The last entry of attrSet holds the
# class labels, and because the dictionary is keyed by the bare value,
# visiting it would overwrite the entry of any feature value equal to a label
# (the label 1 would replace the statistics of the feature value 1).
# NOTE: for the same reason, two features sharing a value would also collide.
conditional = {}
for i in range(len(attrSet) - 1):
    for j in attrSet[i]:
        conditional[j] = [
            countFun([i, 2], [j, 1]) / yP,
            countFun([i, 2], [j, -1]) / yN,
        ]
# print(conditional)
# Given the instance x = (2, 'S'), compute the score of each class:
# the class prior multiplied by the conditional probability of every
# feature value of x.
x = [2, 'S']
# Class labels in the same order as `precede` and as each `conditional[...]`
# pair: index 0 is the class y == 1, index 1 is the class y == -1. They are
# spelled out explicitly because the order of attrSet[2] comes from set
# iteration and is not guaranteed to match.
labels = [1, -1]
res = {}
for i, label in enumerate(labels):
    temp = precede[i]
    # The last column of attrSet is the label, so only the feature columns
    # contribute a factor.
    for j in range(len(attrSet) - 1):
        temp *= conditional[x[j]][i]
    res[label] = temp
print(res)
运行结果:
{1: 0.02222222222222222, -1: 0.06666666666666667}
从而可认定 x=(2,'S') 归属 y=-1 类
参考论坛:
https://blog.csdn.net/weixin_42363997/article/details/85060134?ops_request_misc=%25257B%252522request%25255Fid%252522%25253A%252522161369666416780274141715%252522%25252C%252522scm%252522%25253A%25252220140713.130102334.pc%25255Fall.%252522%25257D&request_id=161369666416780274141715&biz_id=0&utm_medium=distribute.pc_search_result.none-task-blog-2allfirst_rank_v2~rank_v29-3-85060134.first_rank_v2_pc_rank_v29_10&utm_term=%25E6%259C%25B4%25E7%25B4%25A0%25E8%25B4%259D%25E5%258F%25B6%25E6%2596%25AF%25E6%25B3%2595+%25E4%25BB%25A3%25E7%25A0%2581