本文基于朴素贝叶斯构建一个分类垃圾邮件的模型,研究对象是英文的垃圾邮件。
邮件内容保存在txt文件中,其中分为训练样本train和测试样本test。
在训练样本中正常邮件命名为:pos;垃圾邮件命名为:neg。
同时,可以将待分类的测试样本放入测试文件夹test中对应的pos或neg目录下进行测试;如果是垃圾邮件则类别为0,反之类别为1。
在朴素贝叶斯法进行垃圾邮件的分类的思想中:有一个方法与一个假设:
贝叶斯定理:求解p(c|x)的问题变成了求解p(x|c)的问题
特征条件独立假设 :X的n个特征在某类确定的条件下都是条件独立的。
具体可以看:[监督学习] 朴素贝叶斯法.
在GitHub上的数据: 数据及代码.
1 建立词汇表,(用来收集训练集中所有邮件的词汇。)
2 每封邮件词汇向量,(通过所有词汇来确定每封邮件中词汇出现的数量。)
3 计算先验概率p(c)
4 计算每个词汇的条件概率
5 用例测试
完整程序:
# -*- coding: utf-8 -*-
# @Time : 2020/4/16 21:09
# @Author : Zudy
'''
1. 基于朴素贝叶斯的垃圾邮件分类
'''
from sklearn import datasets
from time import time
import numpy
import re
import os
import random
def load_data(folder_path):
    """Load every document under ``folder_path`` with ``datasets.load_files``.

    Prints the number of documents, the number of categories and the time
    spent loading, then returns the loaded object unchanged.

    According to the original author's notes the returned Bunch exposes:
      * ``data``         - the raw document contents
      * ``filenames``    - the name of each file
      * ``target``       - class label indices (sub-folders numbered from 0)
      * ``target_names`` - the sub-folder names behind those indices

    No stop-word removal is applied here.
    """
    print("Loading dataset ...")
    started_at = time()
    bunch = datasets.load_files(folder_path)

    # Summary line: total documents and number of categories.
    document_count = len(bunch.data)
    category_count = len(bunch.target_names)
    print("summary: {0} documents in {1} categories.".format(document_count, category_count))

    # Time spent loading the data.
    elapsed = time() - started_at
    print("Load data in {0}seconds".format(elapsed))
    return bunch
def word_create(ori_data):
    """Build the vocabulary of all distinct tokens found in ``ori_data.data``.

    Each document is expected to be a UTF-8 encoded ``bytes`` object. It is
    decoded, punctuation runs are replaced by a space, the text is
    lower-cased and split on whitespace; every resulting token is added to
    the vocabulary.

    Prints the elapsed time and the vocabulary size.

    Returns:
        list: the distinct tokens (order is arbitrary, as it comes from a set).
    """
    print("\nVectorzing dataset ...")
    # Raw string so that \s, \. and friends reach the regex engine verbatim
    # instead of being (invalid) Python string escapes. Compiled once because
    # it is applied to every document.
    separators = re.compile(r"[\s+\.\!\/_,$%^*(+\"\'-]+|[+——!,。?、~@#¥%……&*()<>]+")
    vocabulary = set()
    started_at = time()
    for doc in ori_data.data:
        # Documents are bytes; decode before any text processing.
        text = str(doc, encoding="utf-8")
        text = separators.sub(" ", text)
        # In-place update avoids rebuilding the whole set for every document.
        vocabulary.update(text.lower().split())
    # Report the elapsed time (not the start timestamp) and the vocabulary size.
    print("Vectorzing time:{0}\nThe number of word_dictionary:{1}".format(time() - started_at, len(vocabulary)))
    return list(vocabulary)
def doc_represent(wordDic, ori_data):
    """Encode each document as a 0/1 word-presence vector over ``wordDic``.

    Documents in ``ori_data.data`` are UTF-8 ``bytes``; they are tokenised
    exactly as in ``word_create`` (decode, replace punctuation runs with a
    space, lower-case, split on whitespace). Tokens that are not in
    ``wordDic`` are ignored.

    Prints the elapsed time and the number of documents.

    Returns:
        numpy.ndarray: integer array of shape
        ``(len(ori_data.data), len(wordDic))`` where entry ``[d, w]`` is 1
        when word ``w`` occurs in document ``d`` and 0 otherwise.
    """
    separators = re.compile(r"[\s+\.\!\/_,$%^*(+\"\'-]+|[+——!,。?、~@#¥%……&*()<>]+")
    # Built-in int: the numpy.int alias was removed in NumPy 1.24.
    doc_re = numpy.zeros((len(ori_data.data), len(wordDic)), dtype=int)
    # Word -> column lookup built once; setdefault keeps the first position of
    # a repeated word, matching what list.index would return.
    column_of = {}
    for column, word in enumerate(wordDic):
        column_of.setdefault(word, column)
    started_at = time()
    for row, doc in enumerate(ori_data.data):
        text = separators.sub(" ", str(doc, encoding="utf-8"))
        for word in text.lower().split():
            column = column_of.get(word)
            if column is not None:
                # Mark the word as present in this document.
                doc_re[row, column] = 1
    # Elapsed time is now - start, so the reported duration is non-negative.
    print("Represent doc time:{0}\nThe number of doc:{1}".format(time() - started_at, len(doc_re)))
    return doc_re
def pre_probabilty(ori_data):
    """Return the smoothed prior probabilities ``[P(normal), P(spam)]``.

    Each prior is ``(class_count + 1) / (total_documents + 2)`` where the
    class counts are read from the module-level ``normal`` and ``spam``
    variables and the total is ``len(ori_data.data)``.
    """
    smoothed_total = len(ori_data.data) + 2.0
    # Order matters to callers: normal mail first, spam second.
    return [(class_count + 1.0) / smoothed_total for class_count in (normal, spam)]
def wordNum_email(email_repre, wordDic):
    """Count, per word, how many normal and how many spam mails contain it.

    Relies on the module-level ``normal`` and ``spam`` counts and on the row
    order of ``email_repre``: rows ``0 .. normal-1`` are treated as normal
    mail and rows ``normal .. normal+spam-1`` as spam.

    Args:
        email_repre: document-by-word matrix (one row per mail).
        wordDic: the vocabulary; only its length is used.

    Returns:
        numpy.ndarray: integer array of shape ``(2, len(wordDic))``; row 0
        holds the per-word sums over normal mails, row 1 over spam mails.
    """
    vocab_size = len(wordDic)
    # Built-in int: the numpy.int alias was removed in NumPy 1.24.
    num_word = numpy.zeros((2, vocab_size), dtype=int)
    if vocab_size:
        matrix = numpy.asarray(email_repre)
        # Column sums over each class's block of rows replace the per-cell
        # Python loops.
        num_word[0] = matrix[:normal, :vocab_size].sum(axis=0)
        num_word[1] = matrix[normal:normal + spam, :vocab_size].sum(axis=0)
    return num_word
def con_probabilty(email_repre, wordDic):
    """Compute the smoothed conditional probability of each word per class.

    Uses ``wordNum_email`` to obtain per-class word counts, then fills a
    ``(2, len(wordDic))`` float array with
    ``round((count + 1) / (class_size + 2), 8)``; row 0 uses the module-level
    ``normal`` count, row 1 the module-level ``spam`` count.
    """
    vocab_size = len(wordDic)
    # Per-word counts in normal mail (row 0) and spam mail (row 1).
    word_num = wordNum_email(email_repre, wordDic)
    word_pro = numpy.zeros((2, vocab_size), dtype=numpy.double)
    for col in range(vocab_size):
        for row, class_size in enumerate((normal, spam)):
            word_pro[row][col] = round((word_num[row][col] + 1) / (class_size + 2), 8)
    return word_pro
def class_num(path, class_name):
    """Return how many files live under ``path + "/" + class_name``.

    The directory tree is walked with ``os.walk``, so files in nested
    sub-directories are counted as well.
    """
    category_dir = path + "/" + class_name
    return sum(len(file_names) for _root, _dirs, file_names in os.walk(category_dir))
def test_spam(test_repre, pre_pro, con_pro):
    """Classify the test mails and return the fraction classified correctly.

    For each mail the score of a class is its prior (rounded to 8 decimals)
    combined with the conditional probability of every word that is present
    in the mail. The mail is judged 0 (normal) when the normal score is
    larger, 1 (spam) when the spam score is larger, and picked at random on
    an exact tie.

    Scores are accumulated as sums of logarithms rather than products of
    probabilities: multiplying a few hundred values below 1 underflows to 0.0
    for both classes, which would turn the decision into a coin flip.

    Relies on the module-level ``normal_test`` and on row order: the first
    ``normal_test`` rows of ``test_repre`` are taken to be normal mail, the
    remaining rows spam.

    Args:
        test_repre: document-by-word matrix of the test mails.
        pre_pro: ``[P(normal), P(spam)]``.
        con_pro: 2-row table of per-word conditional probabilities
            (row 0 normal, row 1 spam).

    Returns:
        float: ``(correct normal + correct spam) / number of mails``, or 0.0
        when there are no mails.
    """
    mails = numpy.asarray(test_repre)
    # log(0) -> -inf is acceptable here: it only makes that class lose.
    with numpy.errstate(divide="ignore"):
        log_prior = numpy.log([round(pre_pro[0], 8), round(pre_pro[1], 8)])
        log_con = numpy.log(numpy.asarray(con_pro, dtype=numpy.double))

    email_judge = []
    for mail in mails:
        # Only the words that occur in the mail contribute to the score.
        present = numpy.flatnonzero(mail)
        normal_score = log_prior[0] + log_con[0][present].sum()
        spam_score = log_prior[1] + log_con[1][present].sum()
        if normal_score > spam_score:
            email_judge.append(0)
        elif normal_score < spam_score:
            email_judge.append(1)
        else:
            # Exact tie: choose a label at random.
            email_judge.append(1 if random.random() > 0.5 else 0)

    # Rows before normal_test are normal mail, the rest are spam.
    normal_num = sum(1 for judged in email_judge[:normal_test] if judged == 0)
    spam_num = sum(1 for judged in email_judge[normal_test:] if judged == 1)

    print("email_judge=")
    print(email_judge)
    print("normal_num="+str(normal_num)+"\nspam_num="+str(spam_num))
    if not email_judge:
        # Avoid dividing by zero when there is nothing to classify.
        return 0.0
    return (normal_num + spam_num)/len(email_judge)
if __name__ == "__main__":
    # Paths of the training and test sets; each is expected to contain a
    # "pos" (normal mail) and a "neg" (spam) sub-folder.
    train_path = "D:/Python/Python_learning/Book_code/LH_mechine_learning/bayes/spamDataset/email/train1"
    test_path = "D:/Python/Python_learning/Book_code/LH_mechine_learning/bayes/spamDataset/email/test1"
    train_list = load_data(train_path)
    test_list = load_data(test_path)

    # datasets.load_files shuffles the documents by default and numbers the
    # folders alphabetically ("neg" before "pos"), whereas wordNum_email and
    # test_spam treat the first `normal` / `normal_test` documents as normal
    # mail. Put every "pos" document first (stable sort, so the order inside
    # each class is kept) to make that assumption hold.
    for bunch in (train_list, test_list):
        pos_label = bunch.target_names.index("pos")
        order = sorted(range(len(bunch.data)), key=lambda idx: bunch.target[idx] != pos_label)
        bunch.data = [bunch.data[idx] for idx in order]
        bunch.filenames = numpy.asarray(bunch.filenames)[order]
        bunch.target = numpy.asarray(bunch.target)[order]

    normal = class_num(train_path, "pos")  # number of normal training mails
    spam = class_num(train_path, "neg")  # number of spam training mails

    WordDictionary = word_create(train_list)  # build the vocabulary
    docRepre = doc_represent(WordDictionary, train_list)  # vectorise the training data
    prePro = pre_probabilty(train_list)  # prior probabilities
    conPro = con_probabilty(docRepre, WordDictionary)  # per-word conditional probabilities
    print("\npreProbablity:", prePro)
    print("conProbablity:", conPro)

    testRepre = doc_represent(WordDictionary, test_list)  # vectorise the test data
    normal_test = class_num(test_path, "pos")  # number of normal test mails
    spam_test = class_num(test_path, "neg")  # number of spam test mails
    test_accuracy = test_spam(testRepre, prePro, conPro)  # accuracy on the test data
    print ("test accuracy")
    print(test_accuracy)
Loading dataset ...
summary: 43 documents in 2 categories.
Load data in 0.008994102478027344seconds
Loading dataset ...
summary: 36 documents in 2 categories.
Load data in 0.007995843887329102seconds
Vectorzing dataset ...
Vectorzing time:1587043343.3496442
The number of word_dictionary:2426
Represent doc time:-0.5976784229278564
The number of doc:43
preProbablity: [0.5777777777777777, 0.4222222222222222]
conProbablity: [[0.59259259 0.07407407 0.03703704 ... 0.11111111 0.07407407 0.03703704]
[0.7 0.05 0.15 ... 0.05 0.05 0.1 ]]
Represent doc time:-0.5137045383453369
The number of doc:36
email_judge=
[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1]
normal_num=16
spam_num=13
test accuracy
0.8055555555555556
参考文献:
链接: 基于朴素贝叶斯的垃圾邮件分类.