sourceData is some text data and labels contains the corresponding class labels. The samples are comments from a website, and the goal is to use a naive Bayes classifier to filter out abusive comments: 0 marks a normal comment and 1 marks an abusive one. You can already spot the insult "stupid" in the second sample, so the odds are high that it is an abusive comment.

sourceData = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
              ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
              ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
              ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
              ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
              ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
labels = [0,1,0,1,0,1]
def createVocabList(sourceData):
    '''Build the vocabulary: the union of all words that appear in the data set'''
    vocabSet = set([])
    for document in sourceData:
        vocabSet = vocabSet | set(document)  # set union, deduplicating words
    return list(vocabSet)
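A minimal illustration (hypothetical toy input; set order is arbitrary, so the exact ordering may differ between runs):

print createVocabList([['a', 'b'], ['b', 'c']])
''' e.g. ['a', 'c', 'b'] '''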
For every word in a single sample data that also appears in the vocabulary, the corresponding position of the returned word vector is set to 1, i.e. returnVec[i] = 1 when vocabList[i] occurs in data; positions of words that never occur stay 0.
def setOfWords2Vec(vocabList, data):
    '''Arguments: the vocabulary list and a single data sample'''
    returnVec = [0]*len(vocabList)
    for word in data:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else: print "the word: %s is not in my Vocabulary!" % word
    return returnVec
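A quick sanity check with a hypothetical three-word vocabulary:

print setOfWords2Vec(['dog', 'stupid', 'my'], ['my', 'dog', 'has'])
''' prints a warning for 'has', then [1, 0, 1] '''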
wordCountP is a nested dictionary: the outer keys are the class labels, the inner keys are word indices, and the values are word probabilities, so wordCountP[ci][wi] is the probability that word wi appears in class ci. It is created as a defaultdict(dict); with a plain dict, writing a[1][2]=3 directly raises a KeyError, because you would have to declare a={} and a[1]={} first before a[1][2]=3 works.
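A small sketch of the difference (illustrative only, not part of the classifier):

from collections import defaultdict
a = {}
# a[1][2] = 3          # KeyError: the inner dict a[1] does not exist yet
a[1] = {}
a[1][2] = 3            # fine after explicit initialization
b = defaultdict(dict)
b[1][2] = 3            # fine right away: a missing key gets a fresh empty dict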
from collections import defaultdict
def NavieBayes(dataSet):
    labelCountP = {}
    wordCountP = defaultdict(dict)
    for data in dataSet:
        '''count the frequency of each class'''
        labelCountP[data[-1]] = labelCountP.get(data[-1],0)+1
        '''count how often each word appears in each class'''
        for i in range(len(data)-1): # the word keys are index values 0, 1, 2, ...; the values are word counts
            wordCountP[data[-1]][i] = wordCountP[data[-1]].get(i,0) + data[i]
    for label in wordCountP:
        '''turn the class counts into class probabilities'''
        labelCountP[label] = labelCountP[label]/float(len(dataSet))
        '''turn the word counts into per-class word probabilities'''
        wordSum = sum(wordCountP[label].values())
        for word in wordCountP[label]:
            wordCountP[label][word] = (wordCountP[label][word])/float(wordSum)
    return labelCountP,wordCountP
vocabList = createVocabList(sourceData)
print vocabList
dataSet = [setOfWords2Vec(vocabList,data) for data in sourceData]
'''the loop below appends the class label to the end of each sample'''
for data, label in zip(dataSet, labels):
    data.append(label)
labelCountP,wordCountP = NavieBayes(dataSet)
print labelCountP
print wordCountP
['cute','love','help','garbage','quit','I','problems','is','park','stop','flea','dalmation','licks','food','not','him','buying','posting','has','worthless','ate','to','maybe','please','dog','how','stupid','so','take','mr','steak','my']
{0: 0.5, 1: 0.5}
{0: 0.041666666666666664, 1: 0.041666666666666664, 2: 0.041666666666666664, 3: 0.0, 4: 0.0, 5: 0.041666666666666664, 6: 0.041666666666666664, 7: 0.041666666666666664, 8: 0.0, 9: 0.041666666666666664, 10: 0.041666666666666664, 11: 0.041666666666666664, 12: 0.041666666666666664, 13: 0.0, 14: 0.0, 15: 0.08333333333333333, 16: 0.0, 17: 0.0, 18: 0.041666666666666664, 19: 0.0, 20: 0.041666666666666664, 21: 0.041666666666666664, 22: 0.0, 23: 0.041666666666666664, 24: 0.041666666666666664, 25: 0.041666666666666664, 26: 0.0, 27: 0.041666666666666664, 28: 0.0, 29: 0.041666666666666664, 30: 0.041666666666666664, 31: 0.125}
{0: 0.0, 1: 0.0, 2: 0.0, 3: 0.05263157894736842, 4: 0.05263157894736842, 5: 0.0, 6: 0.0, 7: 0.0, 8: 0.05263157894736842, 9: 0.05263157894736842, 10: 0.0, 11: 0.0, 12: 0.0, 13: 0.05263157894736842, 14: 0.05263157894736842, 15: 0.05263157894736842, 16: 0.05263157894736842, 17: 0.05263157894736842, 18: 0.0, 19: 0.10526315789473684, 20: 0.0, 21: 0.05263157894736842, 22: 0.05263157894736842, 23: 0.0, 24: 0.10526315789473684, 25: 0.0, 26: 0.15789473684210525, 27: 0.0, 28: 0.05263157894736842, 29: 0.0, 30: 0.0, 31: 0.0}
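As a quick sanity check (an illustrative verification, not from the original run): 'stupid' sits at index 26 of the vocabulary printed above, and it occurs in all three abusive samples, which under the set-of-words encoding contain 19 word occurrences in total, so its class-1 probability should be 3/19:

print wordCountP[1][vocabList.index('stupid')]   # 3/19, roughly 0.158, matching the output above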
Which word is most probable in each class? For the normal class it is index 31 ('my'); for the abusive class it is index 26 ('stupid'). Do foreigners really like this word for insulting people? o(╯□╰)o Either way, the result matches intuition rather well.

print max(wordCountP[0],key=wordCountP[0].get)
print max(wordCountP[1],key=wordCountP[1].get)
''' 31 26 '''
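Since the keys are vocabulary indices, they can be mapped back to the words themselves (a small convenience, not in the original post; indices refer to the vocabulary order printed above):

print vocabList[max(wordCountP[0], key=wordCountP[0].get)]   # 'my'
print vocabList[max(wordCountP[1], key=wordCountP[1].get)]   # 'stupid'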
The estimates above can assign probability 0 to a word that never appears in a class, and a single zero factor wipes out the whole product during classification. Laplace (add-one) smoothing fixes this; replace the two normalization lines in NavieBayes with:

labelCountP[label] = (labelCountP[label]+1.0)/(len(dataSet)+2.0)  # 2.0 = number of classes
'''len(wordCountP[label]) is simply the length of the vocabulary'''
wordCountP[label][word] = (wordCountP[label][word]+1.0)/(wordSum+len(wordCountP[label]))
Multiplying many probabilities that are all smaller than 1 also underflows to 0 in floating point, so we work in log space instead (this requires import math):

labelCountP[label] = math.log((labelCountP[label]+1.0)/(len(dataSet)+2.0))
wordCountP[label][word] = math.log((wordCountP[label][word]+1.0)/(wordSum+len(wordCountP[label])))
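Putting the two fixes together, the second half of NavieBayes would read as follows (a sketch under the assumptions above; import math must be added at the top of the file):

    for label in wordCountP:
        '''Laplace-smoothed, log-space class prior'''
        labelCountP[label] = math.log((labelCountP[label]+1.0)/(len(dataSet)+2.0))
        wordSum = sum(wordCountP[label].values())
        for word in wordCountP[label]:
            '''add-one smoothing over the vocabulary, then the log'''
            wordCountP[label][word] = math.log((wordCountP[label][word]+1.0)/(wordSum+len(wordCountP[label])))
    return labelCountP,wordCountP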
def classify(labelCountP, wordCountP, testX):
    p = {} # accumulate the (log-)probability of each class
    for label in labelCountP:
        p[label] = labelCountP[label]
        for i in range(len(testX)):
            if(testX[i]!=0):
                p[label] += wordCountP[label][i]
    return max(p,key=p.get)
testX1 = setOfWords2Vec(vocabList,['love','my','dalmation'])
testX2 = setOfWords2Vec(vocabList,['stupid', 'garbage'])
print classify(labelCountP, wordCountP, testX1)
print classify(labelCountP, wordCountP, testX2)
''' 0 1 '''
The set-of-words model above only records whether a word occurs. The bag-of-words variant below counts how many times it occurs, which keeps more information when words repeat:

def setOfWords2Vec2(vocabList, data):
    returnVec = [0]*len(vocabList)
    for word in data:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1 # the only change: += instead of =
        else: print "the word: %s is not in my Vocabulary!" % word
    return returnVec
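A quick hypothetical check of the difference:

vec = setOfWords2Vec2(vocabList, ['stupid', 'stupid', 'garbage'])
print vec[vocabList.index('stupid')]    # 2: counted twice, not clipped to 1
print vec[vocabList.index('garbage')]   # 1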
Naive Bayes reduces the amount of training data required through its conditional independence assumption. The assumption is overly simple, yet the algorithm remains a very effective classifier. The vanilla version suffers from two problems, zero probability factors and probability underflow, which are solved by Laplace smoothing and by taking logarithms respectively. In text classification the choice of vocabulary strongly affects performance: it helps to drop high-frequency words with little discriminative power as well as stop words, and a specific classification task may also require extra domain knowledge to select good feature words. This is why naive Bayes is said to be sensitive to how the data is prepared; a careless setup can hurt performance badly.