Machine Learning - Classification in Practice - Filtering Spam with a Naive Bayes Classifier

We make full use of Python's text-processing capabilities to split documents into word vectors, which are then used to classify the text. We build another naive Bayes classifier and see how it performs on a real-world spam data set.
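
Before walking through the full script, here is a minimal, self-contained sketch of the word-vector idea (the vocabulary and document below are toy data, purely for illustration): each document is mapped onto the vocabulary as a vector of word counts.

vocab = ['buy', 'cheap', 'hello', 'meeting']   # toy vocabulary
doc = ['buy', 'cheap', 'cheap']                # one tokenized document
vec = [doc.count(word) for word in vocab]      # bag-of-words counts
print(vec)                                     # prints [1, 2, 0, 0]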

[Image 1: console output of a sample run]
The run results are shown in the screenshot above.
Here is the code:

# -*- coding: utf-8 -*-
'''
Split the text into individual words with Python and build word vectors,
then use naive Bayes to classify the text from a probability standpoint.
'''
import numpy as np
import re
from random import shuffle

'''Build the vocabulary list'''


def createVocabList(Dataset):
    vocabSet = set([])
    for document in Dataset:
        vocabSet = vocabSet | set(document)

    return list(vocabSet)
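
# Example: createVocabList([['buy', 'now'], ['now', 'hello']]) returns a list
# containing 'buy', 'now' and 'hello' (the order is arbitrary because it comes
# from a set), i.e. every unique word seen across all documents.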


'''Convert a document into a word vector'''


def setOfWords2Vec(vocabList, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:

            # returnVec[vocabList.index(word)] = 1  # set-of-words model (presence only)
            returnVec[vocabList.index(word)] += 1  # bag-of-words model (word counts)
        else:
            print("the word:%s is not in VocabList" % word)
    return returnVec
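
# Example (toy data, for illustration only): with vocabList = ['buy', 'now', 'hello'],
# setOfWords2Vec(vocabList, ['buy', 'buy', 'hello']) returns [2, 0, 1] --
# the number of times each vocabulary word appears in the document.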


'''Train the naive Bayes model'''


def trainNB(trainMatrix, trainCategory):
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    p = sum(trainCategory) / float(numTrainDocs)  # prior probability of class 1 (spam)
    '''Initialize word counts for class 0 and class 1 with Laplace (add-one) smoothing'''
    p0Num = np.ones(numWords)
    p1Num = np.ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1_vec = np.log(p1Num / p1Denom)  # log P(word | class 1) for every vocabulary word
    p0_vec = np.log(p0Num / p0Denom)  # log P(word | class 0) for every vocabulary word

    return p0_vec, p1_vec, p
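
# The estimates are stored as logarithms so that classifyNB can add per-word
# terms instead of multiplying many small probabilities, which would underflow
# for long documents. Toy illustration (made-up numbers): if "cheap" occurs 3
# times across the spam documents and the spam class contains 40 words in total,
# its smoothed estimate is (3 + 1) / (40 + 2), and p1_vec stores the log of that.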


'''Build the classifier'''


def classifyNB(Input, p0, p1, p):
    p1 = sum(Input * p1) + np.log(p)
    p0 = sum(Input * p0) + np.log(1.0 - p)
    if p1 > p0:
        return 1
    else:
        return 0
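
# classifyNB scores the word-count vector against both classes with
#   log P(c) + sum_i count_i * log P(word_i | c)
# and returns 1 (spam) when the class-1 score is larger, otherwise 0 (ham).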


'''Tokenize the raw text'''


def textParse(bigString):
    # \W+ rather than \W*: a pattern that matches the empty string breaks re.split in Python 3
    listOfTokens = re.split(r"\W+", bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
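
# Example: textParse("Hello, buy CHEAP meds now!!") returns
# ['hello', 'buy', 'cheap', 'meds', 'now'] -- lower-cased tokens longer than two characters.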


"""垃圾邮件分类"""


def spamTest():
    docList = []
    classList = []
    fullText = []

    for i in range(1, 26):
        wordList = textParse(open('email/spam/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open("email/ham/%d.txt" % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)

    vocabList = createVocabList(docList)
    DataSet = list(zip(docList, classList))
    shuffle(DataSet)  # random.shuffle works in place and returns None, so do not print it
    Data, Y = list(zip(*DataSet))
    trainMat = []
    trainClass = []
    testData = Data[40:]  # hold out the last 10 shuffled documents for testing
    test_label = Y[40:]
    for index in range(len(Data[:40])):  # train on the first 40 shuffled documents
        trainMat.append(setOfWords2Vec(vocabList, Data[index]))
        trainClass.append(Y[index])

    p0, p1, p = trainNB(np.array(trainMat), np.array(trainClass))
    errorCount = 0
    for index in range(len(testData)):
        wordVector = setOfWords2Vec(vocabList, testData[index])
        if classifyNB(np.array(wordVector), p0, p1, p) != test_label[index]:
            errorCount += 1
    print("the error rate is : ", float(errorCount) / len(testData))


if __name__ == "__main__":
    spamTest()
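
Once a training run has produced the vocabulary and the probability vectors, any new message can be scored the same way. The sketch below is purely illustrative: the message text is made up, and it assumes spamTest has been adapted to return vocabList, p0, p1 and p (the version above only prints the error rate).

vocabList, p0, p1, p = spamTest()  # assumes spamTest is modified to return these values
newMessage = "win cheap meds now, click here"  # hypothetical input message
wordVector = setOfWords2Vec(vocabList, textParse(newMessage))
print("spam" if classifyNB(np.array(wordVector), p0, p1, p) == 1 else "ham")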

The data set can be found online.
In the email folder, the '?' character in ham/23.txt needs to be deleted before the script can read that file.
