第4章 基于概率论的分类方法:朴素贝叶斯
前两章使用 k-近邻算法和决策树解决分类问题,要求分类器对"数据实例属于哪一类"给出明确的答案。但是,分类器有时会产生错误结果,这时可以要求分类器给出一个最优的类别猜测结果,同时给出这个猜测的概率估计值。
本章使用朴素贝叶斯构造分类器,即使用概率论的方法进行分类。贝叶斯概率和贝叶斯准则提供了一个利用已知值来估计未知概率的有效方法。
本章将学到:
1.使用朴素贝叶斯来对文档分类
2.如何解释朴素贝叶斯分类器训练所得到的知识
贝叶斯概率引入先验知识和逻辑推理来处理不确定命题。另一种概率解释称为频数概率,它只从数据本身获得结论,并不考虑逻辑推理及先验知识。
核心思想:选择高概率对应的类别,即选择具有最高概率的决策。
p(c|x) = p(x|c)p(c) / p(x)
利用贝叶斯准则,可以使用已知的3个概率值 p(x|c)、p(c)、p(x) 计算未知的概率值 p(c|x)。
要得到好的概率分布,需要足够的数据样本
朴素贝叶斯分类器做了两个假设:各个特征相互独立(方便计算),并且每个特征同等重要。
'''收集数据'''
'''创建一些实验样本'''
def loadDataSet():
    """Return the six toy posts and their class labels.

    Returns:
        (postingList, classVec): postingList is a list of six tokenised
        posts; classVec holds the matching labels, where 1 marks an abusive
        post and 0 a normal one.
    """
    # Each entry pairs a label with the tokens of one post.
    labelledPosts = [
        (0, ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please']),
        (1, ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid']),
        (0, ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him']),
        (1, ['stop', 'posting', 'stupid', 'worthless', 'garbage']),
        (0, ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him']),
        (1, ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']),
    ]
    postingList = [tokens for _, tokens in labelledPosts]
    classVec = [label for label, _ in labelledPosts]
    return postingList, classVec
'''准备数据'''
'''创建一个包含在所有文档中出现的不重复词的列表'''
def createVocabList(dataSet):
    """Return a list of the distinct words found across all documents.

    Args:
        dataSet: iterable of documents, each an iterable of hashable tokens.

    Returns:
        A list with one entry per distinct token. The list is built from a
        set, so its order is arbitrary.
    """
    return list(set().union(*dataSet))
'''将文本转化为向量并输出,向量的每一元素为1或0'''
# 准备数据:词集模型(每个单词只能出现一次)
def setOfWords2Vec(vocabList, inputSet):
    """Encode a document as a 0/1 presence vector (set-of-words model).

    Args:
        vocabList: list of vocabulary words; position defines the feature index.
        inputSet: iterable of the document's tokens.

    Returns:
        A list as long as vocabList with 1 where the word occurs in the
        document and 0 elsewhere. Words missing from the vocabulary are
        reported on stdout and otherwise ignored.
    """
    returnVec = [0] * len(vocabList)
    for token in inputSet:
        try:
            # list.index returns the first matching position.
            position = vocabList.index(token)
        except ValueError:
            print("the word:%s is not in my Vocabulary!" % token)
        else:
            returnVec[position] = 1
    return returnVec
loadDataSet中的classVec 表示文本类别,这些文本的类别由人工标注,这些标注信息用于训练程序以便自动检测侮辱性留言。
# 准备数据:词袋模型(每个单词可以出现多次)
def bagOfWords2VecMN(vocabList, inputSet):
    """Encode a document as a count vector (bag-of-words model).

    Args:
        vocabList: list of vocabulary words; position defines the feature index.
        inputSet: iterable of the document's tokens.

    Returns:
        A list as long as vocabList holding how many times each vocabulary
        word occurs in the document. Unknown words are silently skipped.
    """
    counts = [0] * len(vocabList)
    for token in inputSet:
        if token not in vocabList:
            continue
        counts[vocabList.index(token)] += 1
    return counts
词集模型:将每个词是否出现作为一个特征
词袋模型:每个单词可以出现多次,而在词集模型中每个词只能出现一次
'''训练算法'''
def trainNB0(trainMatrix, trainCategory):
    """Estimate naive Bayes parameters from word vectors and their labels.

    Args:
        trainMatrix: sequence of per-document word vectors, all of the same
            length as the first one.
        trainCategory: sequence of labels, one per document. A label equal
            to 1 counts towards class 1; anything else counts towards class 0.

    Returns:
        (p0Vect, p1Vect, pAbusive): the natural log of the smoothed per-word
        frequencies for class 0 and class 1, and the fraction
        sum(trainCategory) / number of documents.
    """
    docTotal = len(trainMatrix)
    vocabSize = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(docTotal)
    # Smoothing: every word count starts at 1 and every denominator at 2, so
    # a word unseen in one class never yields a zero probability.
    wordCounts = {0: ones(vocabSize), 1: ones(vocabSize)}
    wordTotals = {0: 2.0, 1: 2.0}
    for docIdx, docVec in enumerate(trainMatrix):
        label = 1 if trainCategory[docIdx] == 1 else 0
        wordCounts[label] += docVec       # per-word occurrences in this class
        wordTotals[label] += sum(docVec)  # total word occurrences in this class
    # Logs are returned so the classifier can add terms instead of
    # multiplying many small probabilities.
    p1Vect = log(wordCounts[1] / wordTotals[1])
    p0Vect = log(wordCounts[0] / wordTotals[0])
    return p0Vect, p1Vect, pAbusive
其中,p0Vect表示非侮辱性文本中每个单词出现概率的自然对数;p1Vect表示侮辱性文本中每个单词出现概率的自然对数;pAbusive表示所有文本中是侮辱性文本的概率。
上面用到了numpy数组,用于快速计算概率值。
在计算概率的过程中会出现以下问题:(1)如果某个词的条件概率为0,那么多个概率的乘积也为0,因此将所有词的出现次数初始化为1,并将分母初始化为2;(2)许多很小的数相乘会造成下溢出,因此对概率取自然对数,把乘法转换为加法。
'''朴素贝叶斯分类函数'''
# vec2Classify为测试文档中的词汇在词汇表中的词汇中出现的次数的特征向量
# p0Vec表示0类文档中词汇表中词出现的概率向量
# p1Vec表示1类文档中词汇表中词出现的概率向量
# pClass1表示所有文档中是1类文档的概率
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Pick the class with the larger log-posterior score.

    Args:
        vec2Classify: word vector of the document to classify.
        p0Vec: per-word log-probabilities for class 0.
        p1Vec: per-word log-probabilities for class 1.
        pClass1: prior probability of class 1.

    Returns:
        1 when the class-1 score is strictly larger than the class-0 score,
        otherwise 0.
    """
    logPrior1 = log(pClass1)
    logPrior0 = log(1.0 - pClass1)
    # Element-wise product, then sum: adds up the log-probabilities of the
    # words present in the document.
    score1 = sum(vec2Classify * p1Vec) + logPrior1
    score0 = sum(vec2Classify * p0Vec) + logPrior0
    return 1 if score1 > score0 else 0
# 该函数为一个便利函数:该函数封装所有操作,以节省输入代码的时间
def testingNB():
    """Train on the toy posts and print the predicted class of two sample documents.

    Convenience wrapper: loads the sample data, builds the vocabulary,
    trains with set-of-words vectors and classifies two fixed test entries.
    """
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = [setOfWords2Vec(myVocabList, post) for post in listOPosts]
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))
    # Two fixed test documents, classified in this order.
    for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
        print(testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb))
过程如下:
# 示例一:利用朴素贝叶斯过滤垃圾邮件
'''
现实生活中:邮件--》字符串列表--》词向量
'''
# Sample sentence used to compare two ways of tokenising text.
mySent = 'this book is the best book on python or M.L. I have ever laid\neyes upon'
# Splitting on whitespace keeps punctuation attached to the words.
mySentList = mySent.split()
# Splitting on runs of non-word characters removes the punctuation. The
# pattern is a raw string so that \W reaches the regex engine instead of
# being parsed as an (invalid) string escape.
regEx = re.compile(r'\W+')
listOfTokens = regEx.split(mySent)
用到了正则表达式来切分文本
'''垃圾邮件测试函数'''
def textParse(bigString):
    """Split a string into lower-cased tokens longer than two characters.

    Args:
        bigString: the text to tokenise.

    Returns:
        A list of lower-cased tokens obtained by splitting on runs of
        non-word characters; tokens of length two or less are dropped.
    """
    tokens = []
    for piece in re.split(r'\W+', bigString):
        if len(piece) <= 2:
            continue
        tokens.append(piece.lower())
    return tokens
def spamTest():
    """Train and evaluate the spam classifier on one random hold-out split.

    Reads email/spam/1.txt..25.txt (label 1) and email/ham/1.txt..25.txt
    (label 0), holds out 10 randomly chosen messages for testing, trains on
    the remaining 40 with set-of-words vectors, and prints the error rate.

    Returns:
        The fraction of held-out messages that were misclassified.
    """
    docList = []
    classList = []
    for i in range(1, 26):
        # Spam and ham must come from different folders; reading the spam
        # folder for both labels gives both classes identical text.
        # NOTE(review): assumes the normal messages live in email/ham/ -
        # confirm against the data directory.
        for folder, label in (('spam', 1), ('ham', 0)):
            with open('email/%s/%d.txt' % (folder, i)) as emailFile:
                docList.append(textParse(emailFile.read()))
            classList.append(label)
    vocabList = createVocabList(docList)
    trainingSet = list(range(len(docList)))
    testSet = []
    for _ in range(10):
        # random.uniform yields a real number; int() truncates it to an index.
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))
    trainMat = [setOfWords2Vec(vocabList, docList[docIndex]) for docIndex in trainingSet]
    trainClasses = [classList[docIndex] for docIndex in trainingSet]
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    errorRate = float(errorCount) / len(testSet)
    print('the error rate is:', errorRate)
    return errorRate
留存交叉验证:随机选择一部分作为训练集,剩余部分作为测试集的过程称为留存交叉验证。
本示例的目的并不是使用该分类器进行分类,而是通过观察单词和条件概率值来发现与特定城市相关的内容。
import feedparser
import operator
from bayes import textParse, createVocabList, bagOfWords2VecMN, trainNB0, classifyNB
import random
from numpy import *
# Parse the New York feed and print the parsed result, its entries and the entry count.
# NOTE(review): these statements sit at module level, so they run whenever the
# module is imported or executed (presumably a network fetch each time) - confirm
# that is intended.
ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
print(ny)
print(ny['entries'])
print(len(ny['entries']))
'''RSS源分类器及高频词去除函数'''
def calcMostFreq(vocabList, fullText, topN=30):
    """Return the most frequent vocabulary words with their counts.

    Args:
        vocabList: iterable of vocabulary words.
        fullText: iterable of every token in the corpus, repeats included.
        topN: how many (word, count) pairs to return; defaults to 30.

    Returns:
        A list of up to topN (word, count) tuples sorted by count, highest
        first. Words with equal counts keep their vocabList order.
    """
    from collections import Counter

    # One pass over the text instead of one fullText.count() per word.
    occurrences = Counter(fullText)
    freqDict = {token: occurrences[token] for token in vocabList}
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True)
    return sortedFreq[:topN]
def localWords(feed1, feed0):
    """Train and evaluate a bag-of-words classifier that tells two feeds apart.

    Args:
        feed1: parsed feed whose entries are labelled class 1.
        feed0: parsed feed whose entries are labelled class 0.
        Both are indexed as feed['entries'][i]['summary'].

    Returns:
        (vocabList, p0V, p1V): the vocabulary left after removing the most
        frequent words, and the per-word log-probability vectors for class 0
        and class 1. The hold-out error rate is printed.

    Raises:
        ValueError: if either feed has no entries.
    """
    docList = []
    classList = []
    fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    if minLen == 0:
        raise ValueError('both feeds must contain at least one entry')
    for i in range(minLen):
        # Alternate class-1 and class-0 documents so both classes are the same size.
        for feed, label in ((feed1, 1), (feed0, 0)):
            wordList = textParse(feed['entries'][i]['summary'])
            docList.append(wordList)
            fullText.extend(wordList)
            classList.append(label)
    vocabList = createVocabList(docList)
    # Drop the most frequent words (the top 30 by count).
    frequentWords = {pairW[0] for pairW in calcMostFreq(vocabList, fullText)}
    vocabList = [word for word in vocabList if word not in frequentWords]
    trainingSet = list(range(2 * minLen))
    # Hold out 20 documents, but always leave at least one to train on so
    # that short feeds do not crash the sampling loop or the trainer.
    numTest = min(20, len(trainingSet) - 1)
    testSet = []
    for _ in range(numTest):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))
    trainMat = [bagOfWords2VecMN(vocabList, docList[docIndex]) for docIndex in trainingSet]
    trainClasses = [classList[docIndex] for docIndex in trainingSet]
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is:', float(errorCount)/len(testSet))
    return vocabList, p0V, p1V
其中,去掉出现频率最高的前30个词是因为,在语言中,大部分都是冗余和结构辅助性内容。另一个常用方法是从某个预定词表中移除结构上的辅助词(这个词表被称为停用词表,stop word list)。
为了得到错误率的精确估计,应该多次进行上述实验,然后取平均值。
'''最具表征性的词汇显示函数'''
def getTopWords(ny, sf):
    """Print the most characteristic words of each feed.

    Trains with localWords(ny, sf), which labels its first argument class 1,
    so p1V belongs to ny and p0V to sf. For each feed, the words whose
    log-probability exceeds -6.0 are printed, most probable first.

    Args:
        ny: parsed feed treated as class 1.
        sf: parsed feed treated as class 0.
    """
    vocabList, p0V, p1V = localWords(ny, sf)
    topSF = []
    topNY = []
    for idx, logProb0 in enumerate(p0V):
        if logProb0 > -6.0:
            topSF.append((vocabList[idx], logProb0))
        logProb1 = p1V[idx]
        if logProb1 > -6.0:
            topNY.append((vocabList[idx], logProb1))
    sections = (
        ('SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**', topSF),
        ('NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**', topNY),
    )
    for banner, ranked in sections:
        ranked.sort(key=lambda pair: pair[1], reverse=True)
        print(banner)
        for word, _ in ranked:
            print(word)
'''bayes.py'''
from numpy import *
import re
'''收集数据'''
'''创建一些实验样本'''
def loadDataSet():
    """Return the six toy posts and their class labels.

    Returns:
        (postingList, classVec): postingList is a list of six tokenised
        posts; classVec holds the matching labels, where 1 marks an abusive
        post and 0 a normal one.
    """
    # Each entry pairs a label with the tokens of one post.
    labelledPosts = [
        (0, ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please']),
        (1, ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid']),
        (0, ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him']),
        (1, ['stop', 'posting', 'stupid', 'worthless', 'garbage']),
        (0, ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him']),
        (1, ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']),
    ]
    postingList = [tokens for _, tokens in labelledPosts]
    classVec = [label for label, _ in labelledPosts]
    return postingList, classVec
'''准备数据'''
'''创建一个包含在所有文档中出现的不重复词的列表'''
def createVocabList(dataSet):
    """Return a list of the distinct words found across all documents.

    Args:
        dataSet: iterable of documents, each an iterable of hashable tokens.

    Returns:
        A list with one entry per distinct token. The list is built from a
        set, so its order is arbitrary.
    """
    return list(set().union(*dataSet))
'''将文本转化为向量并输出,向量的每一元素为1或0'''
# 准备数据:词集模型(每个单词只能出现一次)
def setOfWords2Vec(vocabList, inputSet):
    """Encode a document as a 0/1 presence vector (set-of-words model).

    Args:
        vocabList: list of vocabulary words; position defines the feature index.
        inputSet: iterable of the document's tokens.

    Returns:
        A list as long as vocabList with 1 where the word occurs in the
        document and 0 elsewhere. Words missing from the vocabulary are
        reported on stdout and otherwise ignored.
    """
    returnVec = [0] * len(vocabList)
    for token in inputSet:
        try:
            # list.index returns the first matching position.
            position = vocabList.index(token)
        except ValueError:
            print("the word:%s is not in my Vocabulary!" % token)
        else:
            returnVec[position] = 1
    return returnVec
# 准备数据:词袋模型(每个单词可以出现多次)
def bagOfWords2VecMN(vocabList, inputSet):
    """Encode a document as a count vector (bag-of-words model).

    Args:
        vocabList: list of vocabulary words; position defines the feature index.
        inputSet: iterable of the document's tokens.

    Returns:
        A list as long as vocabList holding how many times each vocabulary
        word occurs in the document. Unknown words are silently skipped.
    """
    counts = [0] * len(vocabList)
    for token in inputSet:
        if token not in vocabList:
            continue
        counts[vocabList.index(token)] += 1
    return counts
'''训练算法'''
def trainNB0(trainMatrix, trainCategory):
    """Estimate naive Bayes parameters from word vectors and their labels.

    Args:
        trainMatrix: sequence of per-document word vectors, all of the same
            length as the first one.
        trainCategory: sequence of labels, one per document. A label equal
            to 1 counts towards class 1; anything else counts towards class 0.

    Returns:
        (p0Vect, p1Vect, pAbusive): the natural log of the smoothed per-word
        frequencies for class 0 and class 1, and the fraction
        sum(trainCategory) / number of documents.
    """
    docTotal = len(trainMatrix)
    vocabSize = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(docTotal)
    # Smoothing: every word count starts at 1 and every denominator at 2, so
    # a word unseen in one class never yields a zero probability.
    wordCounts = {0: ones(vocabSize), 1: ones(vocabSize)}
    wordTotals = {0: 2.0, 1: 2.0}
    for docIdx, docVec in enumerate(trainMatrix):
        label = 1 if trainCategory[docIdx] == 1 else 0
        wordCounts[label] += docVec       # per-word occurrences in this class
        wordTotals[label] += sum(docVec)  # total word occurrences in this class
    # Logs are returned so the classifier can add terms instead of
    # multiplying many small probabilities.
    p1Vect = log(wordCounts[1] / wordTotals[1])
    p0Vect = log(wordCounts[0] / wordTotals[0])
    return p0Vect, p1Vect, pAbusive
'''朴素贝叶斯分类函数'''
# vec2Classify为测试文档中的词汇在词汇表中的词汇中出现的次数的特征向量
# p0Vec表示0类文档中词汇表中词出现的概率向量
# p1Vec表示1类文档中词汇表中词出现的概率向量
# pClass1表示所有文档中是1类文档的概率
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Pick the class with the larger log-posterior score.

    Args:
        vec2Classify: word vector of the document to classify.
        p0Vec: per-word log-probabilities for class 0.
        p1Vec: per-word log-probabilities for class 1.
        pClass1: prior probability of class 1.

    Returns:
        1 when the class-1 score is strictly larger than the class-0 score,
        otherwise 0.
    """
    logPrior1 = log(pClass1)
    logPrior0 = log(1.0 - pClass1)
    # Element-wise product, then sum: adds up the log-probabilities of the
    # words present in the document.
    score1 = sum(vec2Classify * p1Vec) + logPrior1
    score0 = sum(vec2Classify * p0Vec) + logPrior0
    return 1 if score1 > score0 else 0
# 该函数为一个便利函数:该函数封装所有操作,以节省输入代码的时间
def testingNB():
    """Train on the toy posts and print the predicted class of two sample documents.

    Convenience wrapper: loads the sample data, builds the vocabulary,
    trains with set-of-words vectors and classifies two fixed test entries.
    """
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = [setOfWords2Vec(myVocabList, post) for post in listOPosts]
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))
    # Two fixed test documents, classified in this order.
    for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
        print(testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb))
'''垃圾邮件测试函数'''
def textParse(bigString):
    """Split a string into lower-cased tokens longer than two characters.

    Args:
        bigString: the text to tokenise.

    Returns:
        A list of lower-cased tokens obtained by splitting on runs of
        non-word characters; tokens of length two or less are dropped.
    """
    tokens = []
    for piece in re.split(r'\W+', bigString):
        if len(piece) <= 2:
            continue
        tokens.append(piece.lower())
    return tokens
def spamTest():
    """Train and evaluate the spam classifier on one random hold-out split.

    Reads email/spam/1.txt..25.txt (label 1) and email/ham/1.txt..25.txt
    (label 0), holds out 10 randomly chosen messages for testing, trains on
    the remaining 40 with set-of-words vectors, and prints the error rate.

    Returns:
        The fraction of held-out messages that were misclassified.
    """
    docList = []
    classList = []
    for i in range(1, 26):
        # Spam and ham must come from different folders; reading the spam
        # folder for both labels gives both classes identical text.
        # NOTE(review): assumes the normal messages live in email/ham/ -
        # confirm against the data directory.
        for folder, label in (('spam', 1), ('ham', 0)):
            with open('email/%s/%d.txt' % (folder, i)) as emailFile:
                docList.append(textParse(emailFile.read()))
            classList.append(label)
    vocabList = createVocabList(docList)
    trainingSet = list(range(len(docList)))
    testSet = []
    for _ in range(10):
        # random.uniform yields a real number; int() truncates it to an index.
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))
    trainMat = [setOfWords2Vec(vocabList, docList[docIndex]) for docIndex in trainingSet]
    trainClasses = [classList[docIndex] for docIndex in trainingSet]
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    errorRate = float(errorCount) / len(testSet)
    print('the error rate is:', errorRate)
    return errorRate
if __name__ == '__main__':
    # Build the vocabulary from the toy posts and vectorise the first post.
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)  # word order varies between runs
    outVec = setOfWords2Vec(myVocabList, listOPosts[0])

    # Turn every post into a word vector and train on the full toy set.
    # Arrays are passed here, as in every other trainNB0 call in this file.
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))

    # Exercise the classifier on the two built-in test documents.
    mytestingNB = testingNB()

    # Example 1: spam filtering. Pipeline: e-mail -> list of strings -> word vector.
    mySent = 'this book is the best book on python or M.L. I have ever laid\neyes upon'
    mySentList = mySent.split()
    # Raw string so that \W is passed to the regex engine unchanged.
    regEx = re.compile(r'\W+')
    listOfTokens = regEx.split(mySent)

    # Run the spam classifier hold-out test.
    testError = spamTest()
'''testFeedparser.py'''
import feedparser
import operator
from bayes import textParse, createVocabList, bagOfWords2VecMN, trainNB0, classifyNB
import random
from numpy import *
# Parse the New York feed and print the parsed result, its entries and the entry count.
# NOTE(review): these statements sit at module level, so they run whenever the
# module is imported or executed (presumably a network fetch each time) - confirm
# that is intended.
ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
print(ny)
print(ny['entries'])
print(len(ny['entries']))
'''RSS源分类器及高频词去除函数'''
def calcMostFreq(vocabList, fullText, topN=30):
    """Return the most frequent vocabulary words with their counts.

    Args:
        vocabList: iterable of vocabulary words.
        fullText: iterable of every token in the corpus, repeats included.
        topN: how many (word, count) pairs to return; defaults to 30.

    Returns:
        A list of up to topN (word, count) tuples sorted by count, highest
        first. Words with equal counts keep their vocabList order.
    """
    from collections import Counter

    # One pass over the text instead of one fullText.count() per word.
    occurrences = Counter(fullText)
    freqDict = {token: occurrences[token] for token in vocabList}
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True)
    return sortedFreq[:topN]
def localWords(feed1, feed0):
    """Train and evaluate a bag-of-words classifier that tells two feeds apart.

    Args:
        feed1: parsed feed whose entries are labelled class 1.
        feed0: parsed feed whose entries are labelled class 0.
        Both are indexed as feed['entries'][i]['summary'].

    Returns:
        (vocabList, p0V, p1V): the vocabulary left after removing the most
        frequent words, and the per-word log-probability vectors for class 0
        and class 1. The hold-out error rate is printed.

    Raises:
        ValueError: if either feed has no entries.
    """
    docList = []
    classList = []
    fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    if minLen == 0:
        raise ValueError('both feeds must contain at least one entry')
    for i in range(minLen):
        # Alternate class-1 and class-0 documents so both classes are the same size.
        for feed, label in ((feed1, 1), (feed0, 0)):
            wordList = textParse(feed['entries'][i]['summary'])
            docList.append(wordList)
            fullText.extend(wordList)
            classList.append(label)
    vocabList = createVocabList(docList)
    # Drop the most frequent words (the top 30 by count).
    frequentWords = {pairW[0] for pairW in calcMostFreq(vocabList, fullText)}
    vocabList = [word for word in vocabList if word not in frequentWords]
    trainingSet = list(range(2 * minLen))
    # Hold out 20 documents, but always leave at least one to train on so
    # that short feeds do not crash the sampling loop or the trainer.
    numTest = min(20, len(trainingSet) - 1)
    testSet = []
    for _ in range(numTest):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))
    trainMat = [bagOfWords2VecMN(vocabList, docList[docIndex]) for docIndex in trainingSet]
    trainClasses = [classList[docIndex] for docIndex in trainingSet]
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is:', float(errorCount)/len(testSet))
    return vocabList, p0V, p1V
'''最具表征性的词汇显示函数'''
def getTopWords(ny, sf):
    """Print the most characteristic words of each feed.

    Trains with localWords(ny, sf), which labels its first argument class 1,
    so p1V belongs to ny and p0V to sf. For each feed, the words whose
    log-probability exceeds -6.0 are printed, most probable first.

    Args:
        ny: parsed feed treated as class 1.
        sf: parsed feed treated as class 0.
    """
    vocabList, p0V, p1V = localWords(ny, sf)
    topSF = []
    topNY = []
    for idx, logProb0 in enumerate(p0V):
        if logProb0 > -6.0:
            topSF.append((vocabList[idx], logProb0))
        logProb1 = p1V[idx]
        if logProb1 > -6.0:
            topNY.append((vocabList[idx], logProb1))
    sections = (
        ('SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**', topSF),
        ('NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**', topNY),
    )
    for banner, ranked in sections:
        ranked.sort(key=lambda pair: pair[1], reverse=True)
        print(banner)
        for word, _ in ranked:
            print(word)
# Script entry point: compare the New York and San Francisco feeds.
# The guard must compare against '__main__'; with any other string the body
# never runs.
if __name__ == '__main__':
    ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
    sf = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')
    # localWords labels its first feed class 1, so the returned class-0
    # vector belongs to sf and the class-1 vector to ny.
    vocabList, pSF, pNY = localWords(ny, sf)
    # getTopWords trains again internally before printing the top words.
    getTopWords(ny, sf)