# -*- coding: utf-8 -*-
# Author:LHF Time:2019/9/25
from numpy import *
#词表到向量的转换函数
def loadDataSet():
    """Return the toy training corpus together with its class labels.

    Returns:
        A ``(postingList, classVec)`` pair. ``postingList`` is a list of six
        tokenised posts; ``classVec`` holds one label per post, where 1 marks
        an abusive post and 0 a normal one.
    """
    # Each post is kept next to its label; the posts labelled 1 are the
    # ones that contain the word 'stupid'.
    labelled_posts = [
        (0, ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please']),
        (1, ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid']),
        (0, ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him']),
        (1, ['stop', 'posting', 'stupid', 'worthless', 'garbage']),
        (0, ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him']),
        (1, ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']),
    ]
    postingList = [tokens for _, tokens in labelled_posts]
    classVec = [label for label, _ in labelled_posts]
    return postingList, classVec
"""这个函数将词表中的所有元素合并成一个集合(即没有重复元素),并以列表形式返回"""
def creatVocabList(dataSet):
    """Build the vocabulary: every distinct word found in any document.

    Args:
        dataSet: Iterable of documents, each an iterable of hashable words.
            The words must be mutually orderable (e.g. all ``str``) because
            the result is sorted.

    Returns:
        A list containing each distinct word exactly once, in ascending
        order. An empty ``dataSet`` yields an empty list.
    """
    # Sorting gives a reproducible column order; iterating a set of strings
    # directly can yield a different order on every interpreter run.
    return sorted(set().union(*dataSet))
"""将经过处理的词表转换成向量"""
def setOfWords2Vec(vocabList, inputSet):
    """Encode a document as a 0/1 presence vector over the vocabulary.

    Args:
        vocabList: Sequence of vocabulary words; position ``i`` of the
            result corresponds to ``vocabList[i]``. Words must be hashable.
        inputSet: Iterable of words making up the document to encode.

    Returns:
        A list of ``len(vocabList)`` integers: 1 where the vocabulary word
        occurs in ``inputSet``, 0 elsewhere. Words of ``inputSet`` that are
        not in the vocabulary are reported on stdout and otherwise ignored.
    """
    # Build the word -> index table once instead of scanning the vocabulary
    # list for every input word. setdefault keeps the FIRST position of a
    # repeated word, which is what list.index would have returned.
    position = {}
    for idx, vocab_word in enumerate(vocabList):
        position.setdefault(vocab_word, idx)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        idx = position.get(word)
        if idx is None:
            print("the word: %s is not in my Vocabulary" % word)
        else:
            returnVec[idx] = 1
    return returnVec
"""朴素贝叶斯分类器训练函数"""
def trainNB0(trainMatrix, trainCategory):
    """Estimate the naive Bayes parameters from word-presence vectors.

    Args:
        trainMatrix: Sequence of document vectors, one row per document and
            one column per vocabulary word.
        trainCategory: Sequence of class labels, one per row; a row counts
            as abusive when its label equals 1, and as normal otherwise.

    Returns:
        A ``(p0Vect, p1Vect, pAbusive)`` tuple: the per-word log conditional
        probabilities for the normal class, the same for the abusive class,
        and the fraction ``sum(trainCategory) / number of rows``.
    """
    doc_count = len(trainMatrix)
    vocab_size = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(doc_count)

    # Slot 0 accumulates the normal class, slot 1 the abusive class.
    # Word counts start at 1 and totals at 2.0 so that no word ends up
    # with a zero probability (whose log would be -inf).
    word_counts = [ones(vocab_size), ones(vocab_size)]
    word_totals = [2.0, 2.0]

    for doc_idx in range(doc_count):
        row = trainMatrix[doc_idx]
        slot = 1 if trainCategory[doc_idx] == 1 else 0
        word_counts[slot] += row
        word_totals[slot] += sum(row)

    # Logs are returned so the classifier can add terms instead of
    # multiplying many small probabilities together.
    p1Vect = log(word_counts[1] / word_totals[1])
    p0Vect = log(word_counts[0] / word_totals[0])
    return p0Vect, p1Vect, pAbusive
#朴素贝叶斯分类函数
def classifyNB(vec2Classify, p0Vec, p1vec, pClass1):
    """Pick the more likely class for one document vector.

    Args:
        vec2Classify: Document vector to classify.
        p0Vec: Per-word log probabilities for the normal class.
        p1vec: Per-word log probabilities for the abusive class.
        pClass1: Prior probability of the abusive class.

    Returns:
        The string "侮辱性列表" (abusive) when the abusive score is strictly
        greater than the normal score, otherwise "非侮辱性列表" (not abusive).
    """
    # Everything is in log space, so the product of probabilities in the
    # naive Bayes formula becomes a sum of terms plus the log prior.
    abusive_score = sum(vec2Classify * p1vec) + log(pClass1)
    normal_score = sum(vec2Classify * p0Vec) + log(1 - pClass1)
    return "侮辱性列表" if abusive_score > normal_score else "非侮辱性列表"
def testingNB():
    """Train on the toy corpus and print the predicted class of two posts.

    Loads the corpus from ``loadDataSet``, builds the vocabulary, converts
    every post to a word-presence vector, trains with ``trainNB0`` and then
    prints the ``classifyNB`` result for two hard-coded sample posts.
    """
    listOposts, listClasses = loadDataSet()
    myVocabList = creatVocabList(listOposts)
    # One presence vector per post; stacked, they form the training matrix.
    trainMat = [setOfWords2Vec(myVocabList, postinDoc) for postinDoc in listOposts]
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))

    sample_posts = (
        ['love', 'my', 'dalmation'],
        ['stupid', 'garbage', "my", "dog", "me"],
    )
    for testEntry in sample_posts:
        # Always convert to an ndarray so both samples reach classifyNB in
        # the same form, rather than one as an array and one as a list.
        thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
        print(testEntry, 'classified as : ', classifyNB(thisDoc, p0V, p1V, pAb))
# Run the demo only when this file is executed as a script.
if __name__ == "__main__":
    testingNB()
# ['love', 'my', 'dalmation'] classified as : 非侮辱性列表
# the word: me is not in my Vocabulary
# ['stupid', 'garbage', 'my', 'dog', 'me'] classified as : 侮辱性列表
# 1. p0Num = ones(numWords); p1Num = ones(numWords)
# 和
# 2. p1Vect = log(p1Num/p1Denom)
#    p0Vect = log(p0Num/p0Denom)
# 都是根据朴素贝叶斯的公式得到的。
# 1. 计数初始化为1,因为条件概率不能是0:只要有一个条件概率为0,连乘的结果就是0。
# 2. 用log更精确:多个很小的概率连乘可能下溢而显示为0,取log后连乘变成相加,就可以精确表示;而且log是单调函数,不会影响数值大小的比较。
# 如果还有不懂的地方,请认真体会公式在程序中是如何体现的,具体见 trainNB0() 函数。谢谢