统计学习方法——朴素贝叶斯法原理
1. 朴素贝叶斯法的极大似然估计
2. 朴素贝叶斯极大似然学习及分类算法
算法过程:
3. Python实现
def priorProbability(labelList):
    """Maximum-likelihood estimate of the prior probability P(Y = label).

    Args:
        labelList: sequence of class labels, one per training sample.

    Returns:
        dict mapping each distinct label to its relative frequency in
        labelList. An empty labelList yields an empty dict, matching the
        original behavior.
    """
    # Local import: the file has no top-level import section.
    from collections import Counter

    total = len(labelList)
    # Counter replaces the hand-rolled counting loop; the comprehension
    # never divides when labelList is empty (Counter is empty too).
    return {label: count / total for label, count in Counter(labelList).items()}
def conditionProbability(dataSet, labelList):
    """Maximum-likelihood estimate of P(X_dim = val | Y = label).

    Only feature values that actually occur in dataSet are covered —
    values outside the training data get no entry (a noted limitation
    compared to the book's fixed value ranges).

    Args:
        dataSet: list of training samples, each a list of feature values
            (values must be hashable, e.g. ints/strings).
        labelList: class label of each sample, parallel to dataSet.

    Returns:
        list with one dict per feature dimension; each dict maps the key
        "str(val)|str(label)" to the estimated conditional probability.
    """
    dimNum = len(dataSet[0])
    labelSet = set(labelList)
    # Per-class sample counts, computed once. The original rescanned the
    # whole dataset for every (value, label) pair.
    labelCounts = {label: labelList.count(label) for label in labelSet}
    probability = []
    for dim in range(dimNum):
        # Distinct values of this feature, in first-seen order.
        seenVals = []
        for sample in dataSet:
            if sample[dim] not in seenVals:
                seenVals.append(sample[dim])
        # Joint counts in a single pass: (value, label) -> co-occurrences.
        mixCounts = {}
        for sample, label in zip(dataSet, labelList):
            key = (sample[dim], label)
            mixCounts[key] = mixCounts.get(key, 0) + 1
        dimProbs = {}
        for val in seenVals:
            for label in labelSet:
                dimProbs[str(val) + "|" + str(label)] = (
                    mixCounts.get((val, label), 0) / labelCounts[label]
                )
        probability.append(dimProbs)
    return probability
def naiveBayes(x, dataSet, labelList):
    """Classify sample x with a naive Bayes classifier trained on dataSet.

    Args:
        x: feature vector to classify. Every x[dim] must occur in dataSet
            at that dimension, otherwise the lookup raises KeyError (the
            training step only covers values seen in the data).
        dataSet: training samples, each a list of feature values.
        labelList: training labels, parallel to dataSet.

    Returns:
        The label maximizing P(label) * prod_dim P(x[dim] | label).
    """
    priorProbabilityDict = priorProbability(labelList)
    probability = conditionProbability(dataSet, labelList)
    bayesProbability = {}
    for label in set(labelList):
        # Unnormalized posterior: prior times the product of per-dimension
        # conditionals (naive conditional-independence assumption).
        tempProb = priorProbabilityDict[label]
        for dim, val in enumerate(x):
            tempProb *= probability[dim][str(val) + "|" + str(label)]
        bayesProbability[label] = tempProb
    # max() replaces the original full sort; ties still resolve to the
    # first maximal label encountered.
    return max(bayesProbability.items(), key=lambda item: item[1])[0]
# Worked example: train on 15 labelled samples with two features and
# classify the point (2, "s"); the expected output is -1.
dataSet = [
    [1, "s"], [1, "m"], [1, "m"], [1, "s"], [1, "s"],
    [2, "s"], [2, "m"], [2, "m"], [2, "l"], [2, "l"],
    [3, "l"], [3, "m"], [3, "m"], [3, "l"], [3, "l"],
]
labelList = [-1, -1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, -1]
print(naiveBayes([2, "s"], dataSet, labelList))
这个实现过程和书上的不太一样,这里每一个特征的取值范围和类的取值范围是根据数据集中的数来进行确定,即每一个特征的取值范围不考虑那些没有出现在训练数据集中的特征值。而书上的算法,每一个特征的取值范围是事先给出的,在这个取值范围中的特征值,可能会出现在训练数据集中,可能不出现。但在估计先验概率和条件概率的时候,过程是一样的。这是这个实现过程的一个不足。