朴素贝叶斯---过滤垃圾邮件

在bayes.py中添加

# Naive Bayes bag-of-words model
def bagOfWords2VecMN(vocabList, inputSet):
    """Return a bag-of-words count vector for one document.

    Args:
        vocabList: Sequence of vocabulary words; position i of the result
            corresponds to vocabList[i].
        inputSet: Iterable of words (tokens) from the document. A word that
            occurs several times is counted once per occurrence.

    Returns:
        A list of ints with the same length as vocabList, where each entry
        is the number of times that vocabulary word occurs in inputSet.
        Words that are not in vocabList are ignored.
    """
    # Build the word -> position lookup once instead of scanning the
    # vocabulary twice (``in`` + ``index``) for every input word.
    # setdefault keeps the FIRST position of a repeated vocabulary word,
    # which is what list.index() would have returned.
    wordIndex = {}
    for position, word in enumerate(vocabList):
        wordIndex.setdefault(word, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = wordIndex.get(word)
        if position is not None:
            returnVec[position] += 1
    return returnVec

# Parse raw text into word tokens
def textParse(bigString):
    """Split a text string into lowercase word tokens.

    Args:
        bigString: The text to tokenize.

    Returns:
        A list of the tokens longer than two characters, lowercased, in
        the order they appear. Tokens are runs of word characters
        (letters, digits, underscore); everything else is a separator.
    """
    import re
    # Split on one-or-more non-word characters. The pattern must not be
    # able to match the empty string (as r'\W*' can): since Python 3.7
    # re.split also splits on empty matches, which would cut the text
    # between every character and leave no token longer than 2 chars.
    listOfTokens = re.split(r'\W+', bigString)
    # Drop empty strings and very short tokens; normalize case.
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]

# Spam e-mail test function
def spamTest():
    """Train a naive Bayes classifier on the e-mail corpus and print its error rate.

    Reads email/spam/1.txt..25.txt (labelled 1) and email/ham/1.txt..25.txt
    (labelled 0), holds out 10 randomly chosen messages as a test set,
    trains on the remaining 40, and prints every misclassified test
    document followed by the test error rate.

    Relies on textParse, createVocabList, trainNB0, classifyNB, array and
    random being available at module level (not all of them are defined in
    this part of the file).

    Returns:
        None. Results are printed.
    """
    # Load and tokenize the corpus; spam and ham documents alternate in
    # docList, with classList holding the matching labels.
    docList = []
    classList = []
    for i in range(1, 26):
        for pathPattern, label in (('email/spam/%d.txt', 1),
                                   ('email/ham/%d.txt', 0)):
            # Context manager so every file handle is closed promptly.
            with open(pathPattern % i) as emailFile:
                docList.append(textParse(emailFile.read()))
            classList.append(label)
    vocabList = createVocabList(docList)

    # Indices into docList. list(...) is required because entries are
    # deleted below and a Python 3 range object does not support that.
    trainingSet = list(range(50))
    testSet = []
    # Randomly move 10 of the 50 document indices into the test set.
    for i in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]

    # Build the training matrix: one bag-of-words vector per document.
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))

    # Classify the held-out documents and count the mistakes.
    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("classification error %s" % (docList[docIndex],))
    # Convert BEFORE dividing: float(errorCount / len(testSet)) performs
    # integer division on Python 2 and reports 0.0 for any count below 10.
    print('the error rate is :  %s' % (float(errorCount) / len(testSet)))

测试:

>>> import bayes
>>> bayes.spamTest()
the error rate is :  0.0
>>> 

你可能感兴趣的:(机器学习笔记)