使用朴素贝叶斯进行垃圾邮件识别

朴素贝叶斯在文本分类、垃圾邮件识别、情感分析等方面有很好的应用。今天就来体验一下,数据借用了 BayesSpam-master 项目(点击打开链接),它的文件夹里面有分类好的正常邮件、垃圾邮件和测试邮件。里面也有代码,不过这里的代码是我自己写的。

主要过程:读取邮件 → 分词 → 转化为词袋模型 → 调用朴素贝叶斯进行分类。代码比较乱,直接上代码:

import jieba,re
import os
import numpy as np
from sklearn.naive_bayes import BernoulliNB
filepath=r'C:\Users\lbship\Desktop\脚本\BayesSpam-master\data\\'
# Load the stop-word list, de-duplicate it with a set and convert it back to a
# list. The text is split on whitespace so that every entry is a whole word:
# iterating over the raw text yields single characters, and multi-character
# stop words would then never match a token produced by jieba.
# NOTE(review): assumes the entries are separated by newlines/whitespace and
# that the platform default encoding matches the file -- confirm.
with open(r'C:\Users\lbship\Desktop\脚本\BayesSpam-master\data\中文停用词表.txt','r') as _stopword_file:
    stopword=list(set(_stopword_file.read().split()))
# Extra stop words filtered in addition to the entries read from the file.
stopword+=['很不讨','临走前','装腔作势','宽宏大量','体格检查','考试成绩','普通高校','为情所困','乐不可支']
#获取目录下所有文件的名称
def get_filelist(filetype):
    """Return the entry names found in the ``filetype`` sub-folder of ``filepath``.

    ``os.listdir`` returns names in arbitrary order, so the result is sorted
    to make slices of it (e.g. "the first 100 files") reproducible.

    Args:
        filetype: Folder name appended directly to the module-level
            ``filepath`` string.

    Returns:
        A lexicographically sorted list of entry names.
    """
    return sorted(os.listdir(filepath+filetype))
#过滤掉停用词,返回结巴分词后的列表
def pretreat(content):
    """Segment ``content`` with jieba and drop every token found in ``stopword``.

    Args:
        content: Text to segment.

    Returns:
        The list of tokens from ``jieba.lcut`` that are not in the
        module-level ``stopword`` collection, in their original order.
    """
    tokens = jieba.lcut(content)
    return [token for token in tokens if token not in stopword]
#提取中文字符,返回整个文件夹的词列表
def get_wordlist(filetype,filelist):
    """Read every mail in ``filelist`` and return one token list per mail.

    For each file, every character outside the range U+4E00..U+9FA5 is
    removed, the remaining text is passed to ``pretreat``, and the resulting
    token list is collected.

    Args:
        filetype: Folder prefix (including its trailing separator) that is
            concatenated between ``filepath`` and each file name.
        filelist: Iterable of file names inside that folder.

    Returns:
        A list with one token list per entry of ``filelist``, in the same order.
    """
    # Compile once instead of once per line of every file.
    non_chinese = re.compile(r"[^\u4e00-\u9fa5]")
    wordlist = []
    for email in filelist:
        # The context manager closes each file; the plain open() used before
        # leaked one handle per mail.
        # NOTE(review): relies on the platform default encoding -- confirm it
        # matches the mail files.
        with open(filepath+filetype+email) as mail_file:
            # Line breaks are non-Chinese characters too, so filtering the
            # whole text at once equals filtering line by line and joining.
            chinese_text = non_chinese.sub('', mail_file.read())
        wordlist.append(pretreat(chinese_text))
    return wordlist
#把所有词放到一个列表中,用set去重
def get_wordset(train_x):
    """Collect the distinct words that occur in any document of ``train_x``.

    Args:
        train_x: Iterable of documents, each an iterable of hashable words.

    Returns:
        A list containing every distinct word exactly once. The order is the
        iteration order of a ``set`` and therefore unspecified.
    """
    vocabulary = {word for document in train_x for word in document}
    return list(vocabulary)
#生成词袋向量或者词集
def create_wordVec(sample, wordSet, mode="wordSet"):
    """Turn one tokenised sample into a vector aligned with ``wordSet``.

    Args:
        sample: Iterable of hashable tokens of one document.
        wordSet: Sequence of vocabulary words; position ``i`` of the result
            describes ``wordSet[i]``.
        mode: ``"wordSet"`` produces 0/1 presence flags, ``"wordBag"``
            produces occurrence counts.

    Returns:
        A list of ints with the same length as ``wordSet``.

    Raises:
        ValueError: If ``mode`` is neither ``"wordSet"`` nor ``"wordBag"``.
            ``ValueError`` is a subclass of ``Exception``, so handlers
            written for the previously raised plain ``Exception`` still work.
    """
    if mode == "wordSet":
        # One set per sample makes each membership test O(1) instead of
        # scanning the whole sample for every vocabulary word.
        present = set(sample)
        return [1 if word in present else 0 for word in wordSet]
    if mode == "wordBag":
        # Count every token once up front, then look the counts up.
        counts = {}
        for token in sample:
            counts[token] = counts.get(token, 0) + 1
        return [counts.get(word, 0) for word in wordSet]
    raise ValueError("The mode must be wordSet or wordBag.")
#转化为矩阵
def get_matrix(trainx,wordsets,mode="wordSet"):
    """Vectorise every sample of ``trainx`` against the vocabulary ``wordsets``.

    Args:
        trainx: Iterable of tokenised samples.
        wordsets: Vocabulary sequence that defines the vector columns.
        mode: Forwarded to ``create_wordVec``. It was previously accepted but
            ignored, with ``"wordSet"`` always used; the default keeps that
            behaviour for callers that do not pass it.

    Returns:
        A list with one vector (list of ints) per sample, in input order.
    """
    return [create_wordVec(sample, wordsets, mode) for sample in trainx]
#获取测试集标签
def get_testlabel(testfile, threshold=1000):
    """Derive one 0/1 label per test file from its numeric file name.

    A name whose integer value is greater than ``threshold`` gets label 1,
    every other name gets label 0.
    NOTE(review): 1 presumably means spam, matching the training labels built
    in ``main`` -- confirm against the data set's naming scheme.

    Labels are built directly as a list. Collecting them in a dict keyed by
    name collapsed duplicate names and could return fewer labels than samples.

    Args:
        testfile: Iterable of file names; each must be accepted by ``int()``.
        threshold: Largest number that is still labelled 0. Defaults to 1000.

    Returns:
        A list of 0/1 labels, one per entry of ``testfile``, in the same order.

    Raises:
        ValueError: If a name cannot be converted with ``int()``.
    """
    return [1 if int(name) > threshold else 0 for name in testfile]
def main():
    """Train a Bernoulli naive Bayes spam classifier and print its test accuracy.

    Reads up to 100 normal mails, 100 spam mails and 151 test mails, turns
    them into presence vectors over the training vocabulary, fits
    ``BernoulliNB`` and prints the score on the test mails.
    """
    # Read the file names; the slices set how much data is used.
    norfile = get_filelist('normal')[0:100]
    spamfile = get_filelist('spam')[0:100]
    testfile = get_filelist('test')[0:151]
    # Token lists for the documents of each folder.
    norlist = get_wordlist('normal\\', norfile)
    spamlist = get_wordlist('spam\\', spamfile)
    testlist = get_wordlist('test\\', testfile)
    # Training samples and labels: 0 for normal mail, 1 for spam.
    train_x = norlist + spamlist
    label = [0] * len(norfile) + [1] * len(spamfile)
    # The vocabulary comes from the training data only.
    wordSet = get_wordset(train_x)
    train_matrix = get_matrix(train_x, wordSet, "wordSet")
    # Test samples must be vectorised against the SAME vocabulary as the
    # training samples. A vocabulary built from the test set gives columns
    # (and usually a column count) that do not match the fitted features.
    test_matrix = get_matrix(testlist, wordSet, "wordSet")
    # Convert to arrays.
    xtrain=np.array(train_matrix)
    ytrain=np.array(label)
    testtrain=np.array(test_matrix)
    testlabel=np.array(get_testlabel(testfile))
    # Fit the classifier and report the accuracy on the test mails.
    clf=BernoulliNB()
    clf.fit(xtrain,ytrain)
    print('测试样本的正确率为{0:.2f}%'.format(clf.score(testtrain,testlabel)*100))
# Run the experiment only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()



结果准确率是98%:

使用朴素贝叶斯进行垃圾邮件识别_第1张图片

你可能感兴趣的:(机器学习)