Naive Bayes classifier: assumes that the features are conditionally independent of each other given the class.
In addition, there are two document representations: the set-of-words model, where a word is recorded at most once per document (so every word in the word vector carries equal weight), which corresponds to the Bernoulli model; and the bag-of-words model, where word counts are kept, which corresponds to the multinomial model.
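Under this independence assumption, the posterior of a class given the words of a document factorizes into a product of per-word conditional probabilities; the code below works with the logarithm of this product to avoid floating-point underflow:

    P(class | w1, ..., wn) ∝ P(class) * P(w1 | class) * P(w2 | class) * ... * P(wn | class)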
code: https://github.com/apachecn/AiLearning/blob/master/src/py2.x/ml/4.NaiveBayes/bayes.py
import numpy as np
def loadDataSet():
    # Small toy corpus: each inner list is one tokenized post
    postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0, 1, 0, 1, 0, 1]  # 1 = abusive post, 0 = normal post
    return postingList, classVec
def createVocabList(dataSet):
    # Build the vocabulary: the union of all words that appear in the corpus
    vocabSet = set([])
    for document in dataSet:
        vocabSet = vocabSet | set(document)
    return list(vocabSet)
def setOfWords2Vec(vocabList, inputSet):
    # Set-of-words (Bernoulli-style) vector: 1 if the word appears in the document, else 0
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
    return returnVec
def bagOfWords2VecMN(vocabList, inputSet):
    # Bag-of-words (multinomial) vector: count how many times each word appears
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec
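A quick illustration of the difference between the two vectorizers (the vocabulary and document below are made up for illustration; the outputs assume the two functions above):

    >>> vocab = ['dog', 'stupid', 'my', 'help']
    >>> doc = ['dog', 'dog', 'stupid']
    >>> setOfWords2Vec(vocab, doc)      # set-of-words: presence only
    [1, 1, 0, 0]
    >>> bagOfWords2VecMN(vocab, doc)    # bag-of-words: keeps the count of 'dog'
    [2, 1, 0, 0]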
def trainNB(trainMatrix, trainCategory):
    '''
    :param trainMatrix: the documents converted into a matrix of word vectors
    :param trainCategory: the class label of each document
    :return: the quantities Bayes' rule needs: the class prior and the
             (log) conditional probability of each word given each class
    '''
    numTrainDocs = len(trainMatrix)
    numTrainWords = len(trainMatrix[0])  # length of each word vector = vocabulary size
    # (1) pA: prior probability of class 1 (abusive)
    pA = sum(trainCategory) / float(numTrainDocs)
    # (2) per-word counts for each class, initialised to 1 (Laplace smoothing,
    #     so an unseen word cannot drive the whole probability product to zero)
    p0Num = np.ones(numTrainWords)
    p1Num = np.ones(numTrainWords)
    p0Sum = 2.0  # total word count in class 0, initialised to 2 to match the smoothing
    p1Sum = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 0:
            p0Num += trainMatrix[i]
            p0Sum += sum(trainMatrix[i])
        else:
            p1Num += trainMatrix[i]
            p1Sum += sum(trainMatrix[i])
    # take logs so that later multiplying many small probabilities becomes a safe sum
    p0Vec = np.log(p0Num / p0Sum)
    p1Vec = np.log(p1Num / p1Sum)
    return p0Vec, p1Vec, pA
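The ones / 2.0 initialisation above is additive (Laplace) smoothing: a word that never occurs in a class still gets a small non-zero probability. The per-word estimate the loop produces is effectively

    P(word | class) = (count of word in class + 1) / (total words in class + 2)

and taking the log turns the later product of many small probabilities into a numerically safe sum.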
def classifyNB(vec2Classify, p0Vec, p1Vec, pA):
    '''
    :param vec2Classify: a word vector such as [1, 0, 0, 1, ...]; it is multiplied
           element-wise with p0Vec / p1Vec, so only the words present in the test
           sample contribute to the posterior (this is the set-of-words /
           Bernoulli-style implementation described in the book)
    :param p0Vec: log conditional probabilities of each word given class 0
    :param p1Vec: log conditional probabilities of each word given class 1
    :param pA: prior probability of class 1
    :return: the predicted class, 0 or 1
    '''
    # log posterior (up to a constant): sum of log-likelihoods plus log prior
    p1 = np.sum(vec2Classify * p1Vec) + np.log(pA)
    p0 = np.sum(vec2Classify * p0Vec) + np.log(1 - pA)
    if p1 > p0:
        return 1
    else:
        return 0
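In log space, the comparison classifyNB makes is therefore

    score(c=1) = sum_i x_i * log P(w_i | c=1) + log P(c=1)
    score(c=0) = sum_i x_i * log P(w_i | c=0) + log P(c=0)

where x is the test word vector; the class with the larger score is returned.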
def testingNB():
    data, label = loadDataSet()
    vocabList = createVocabList(data)
    print("vocabulary: ")
    print(vocabList)
    print("----------------")
    trainMat = []
    for d in data:
        trainMat.append(setOfWords2Vec(vocabList, d))
    p0V, p1V, pA = trainNB(trainMat, label)
    print("trained parameters: ")
    print("p0: ", p0V)
    print("p1: ", p1V)
    print("pA: ", pA)
    print("----------------------------------------------------")
    # test = ['love', 'my', 'dalmation']
    test = ['stupid', 'garbage']
    array_test = np.array(setOfWords2Vec(vocabList, test))
    print(test, "classified as: ", classifyNB(array_test, p0V, p1V, pA))
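testingNB uses the set-of-words vectors. To try the bag-of-words (multinomial) variant instead, the same flow can be run with bagOfWords2VecMN; a minimal sketch (the function name testingNB_bag is made up here):

def testingNB_bag():
    # Hypothetical bag-of-words variant of testingNB (multinomial counts)
    data, label = loadDataSet()
    vocabList = createVocabList(data)
    trainMat = [bagOfWords2VecMN(vocabList, d) for d in data]
    p0V, p1V, pA = trainNB(trainMat, label)
    test = ['stupid', 'garbage']
    array_test = np.array(bagOfWords2VecMN(vocabList, test))
    print(test, "classified as: ", classifyNB(array_test, p0V, p1V, pA))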
if __name__ == "__main__":
    testingNB()