《机器学习实战》基于朴素贝叶斯算法实现垃圾邮件分类

import random
import sys
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import re


def textParse(bigString):
    """Tokenize a raw message into a list of lowercase word tokens.

    BUGFIX: the original pattern r'\W*' can match the empty string, which on
    Python 3.7+ makes re.split() break the text between every character and
    return single-letter "tokens". r'\W+' splits on runs of non-word
    characters, which is what was intended.
    """
    listOfTokens = re.split(r'\W+', bigString)  # split on runs of non-word chars
    # Drop empty fragments produced at the string boundaries, lowercase the rest.
    return [tok.lower() for tok in listOfTokens if len(tok) > 0]


def createVocabList(dataSet):
    """Collect every distinct token that appears in any document.

    dataSet is a list of documents, each a list of tokens; the result is a
    list of the unique tokens (order unspecified, as with any set).
    """
    vocab = set()
    for document in dataSet:
        vocab.update(document)  # add this document's tokens to the vocabulary
    return list(vocab)


# vocablist是词汇表,inputSet为输入的邮件
def bagOfWords2Vec(vocabList, inputSet):
    returnVec = [0] * len(vocabList)  # returnVec的大小与词汇表相同,用来记录输入邮件中有多少单词在词汇表中存在,并记录单词的出现次数
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
        else:
            print("the word: %s is not in my Vocabulary!" % word)
    return returnVec  # 返回词向量


# trainMat是训练样本的总的词向量,是一个矩阵,每一行都是一个邮件的词向量
# trainGategory为与trainMat对应的类别,值为0,1表示正常,垃圾
# trainMat: matrix of per-mail bag-of-words vectors (one row per mail).
# trainGategory: labels aligned with trainMat rows; 0 = ham, 1 = spam.
def train(trainMat, trainGategory):
    """Fit naive Bayes word probabilities with Laplace (add-one) smoothing.

    Returns (p0Vec, p1Vec, pAbusive): per-word log-probabilities given ham,
    per-word log-probabilities given spam, and the spam prior.
    """
    doc_count = len(trainMat)
    word_count = len(trainMat[0])
    # Spam prior: fraction of training mails labelled 1.
    p_spam = sum(trainGategory) / float(doc_count)
    # Start counts at 1 and denominators at 2 (Laplace smoothing) so that an
    # unseen word never forces a zero probability.
    spam_counts = np.ones(word_count)
    ham_counts = np.ones(word_count)
    spam_total = 2.0
    ham_total = 2.0
    for vec, label in zip(trainMat, trainGategory):
        if label == 1:
            spam_counts += vec           # accumulate per-word counts for spam
            spam_total += sum(vec)       # total words seen in spam
        else:
            ham_counts += vec
            ham_total += sum(vec)
    # Work in log space to avoid underflow when probabilities are multiplied.
    p1_log = np.log(spam_counts / spam_total)
    p0_log = np.log(ham_counts / ham_total)
    return p0_log, p1_log, p_spam


def classfy(vec2classfy, p0Vec, p1Vec, pClass1):
    """Classify one bag-of-words vector: 1 = spam, 0 = ham.

    Scores are log-posteriors (up to a shared constant): the class prior plus
    the dot product of the word counts with the per-word log-probabilities.
    """
    spam_score = np.log(pClass1) + sum(vec2classfy * p1Vec)
    ham_score = np.log(1 - pClass1) + sum(vec2classfy * p0Vec)
    return 1 if spam_score > ham_score else 0


def spamTest():
    """Train and evaluate the naive Bayes spam classifier end to end.

    Reads ./email/SMSSpamCollection.txt (tab-separated, no header, column 0
    the "spam"/"ham" label, column 1 the message text), holds out ~10% of the
    documents at random as a test set, prints the error rate, and finally
    classifies one sample file from disk.
    """
    docList = []
    classList = []

    # Load the dataset: tab-delimited, no header row.
    df = pd.read_csv('./email/SMSSpamCollection.txt', delimiter='\t', header=None)
    # Split into labels (y) and message text (X_train).
    y, X_train = df[0], df[1]
    for text in X_train[y == "spam"]:
        docList.append(textParse(text))
        classList.append(1)  # 1 = spam
    for text in X_train[y == "ham"]:
        docList.append(textParse(text))
        classList.append(0)  # 0 = ham

    vocabList = createVocabList(docList)  # build the vocabulary
    # BUGFIX: derive the document count from the data instead of the
    # hard-coded 5572, so the code survives a different-sized dataset.
    numDocs = len(docList)
    trainSet = list(range(numDocs))
    testSet = []
    # Randomly move ~10% of the documents into the held-out test set.
    for _ in range(numDocs // 10):
        # randrange gives a uniform integer index directly; the original
        # int(random.uniform(...)) float round-trip was needless.
        randIndex = random.randrange(len(trainSet))
        testSet.append(trainSet[randIndex])
        del trainSet[randIndex]

    trainMat = []
    trainClass = []
    for docIndex in trainSet:
        trainMat.append(bagOfWords2Vec(vocabList, docList[docIndex]))  # word-count matrix
        trainClass.append(classList[docIndex])  # labels aligned with trainMat
    p0, p1, pSpam = train(np.array(trainMat), np.array(trainClass))
    """
    p0: per-word log-probabilities for ham mail in the training set
    p1: per-word log-probabilities for spam mail in the training set
    pSpam: fraction of spam in the training set (the spam prior)
    """
    errcount = 0
    # Score every held-out document against the trained model.
    for docIndex in testSet:
        wordVec = bagOfWords2Vec(vocabList, docList[docIndex])
        if classfy(np.array(wordVec), p0, p1, pSpam) != classList[docIndex]:
            errcount += 1
            # BUGFIX: was `print('classfication error'), docList[docIndex]` —
            # a Python-2 leftover that built a throwaway tuple and never
            # printed the misclassified document.
            print('classfication error', docList[docIndex])
    print("The error rate is ", float(errcount) / len(testSet))
    print("正确率为:", 1 - float(errcount) / len(testSet))

    # Sanity check: classify one raw mail file from disk.
    test_file = textParse(open('email/spam/2.txt', encoding='utf8').read())
    wordVec = bagOfWords2Vec(vocabList, test_file)
    print(classfy(np.array(wordVec), p0, p1, pSpam))


# Script entry point: run the full train/evaluate pipeline when executed directly.
if __name__ == '__main__':
    spamTest()

代码中用到的SMSSpamCollection.txt可查看我的资源获取

参考文章:

《机器学习实战》——人民邮电出版社
《统计学习方法》——李航

你可能感兴趣的:(机器学习,机器学习,python)