"""Naive Bayes spam filter: corpus loading, tokenization and training.

Running this file as a script loads the corpus under ``DATA_DIR``, trains a
``SpamDetector_2`` on every email after the first 100 and prints the learned
log class priors.
"""
import math
import os
import re
import string

DATA_DIR = 'enron'
target_names = ['ham', 'spam']


def _read_folder(folder):
    """Return the text of every file in *folder*, in sorted filename order."""
    texts = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # corpus order (and any index-based train/test split) reproducible.
    for filename in sorted(os.listdir(folder)):
        with open(os.path.join(folder, filename), encoding="latin-1") as f:
            texts.append(f.read())
    return texts


def get_data(DATA_DIR):
    """Load all emails below *DATA_DIR*.

    The directory must contain the subfolders ``enron1`` .. ``enron6``, each
    holding a ``spam`` and a ``ham`` folder of text files (read as latin-1).

    Args:
        DATA_DIR: path of the corpus root directory.

    Returns:
        A ``(data, target)`` pair of equally long lists: ``data`` holds the
        raw email texts and ``target`` the matching labels (1 = spam,
        0 = ham). Within each subfolder the spam emails come first.

    Raises:
        OSError: if an expected folder or file cannot be read.
    """
    data = []
    target = []
    for i in range(1, 7):
        subfolder = 'enron%d' % i
        for class_name, label in (('spam', 1), ('ham', 0)):
            texts = _read_folder(os.path.join(DATA_DIR, subfolder, class_name))
            data.extend(texts)
            target.extend([label] * len(texts))
    return data, target


class SpamDetector_1:
    """Text preprocessing helpers shared by the spam detectors."""

    # Built once at class creation instead of on every clean() call.
    _PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)
    _WORD_PATTERN = re.compile(r"\w+")

    def clean(self, s):
        """Return *s* with all ASCII punctuation characters removed."""
        return s.translate(self._PUNCTUATION_TABLE)

    def tokenize(self, text):
        """Split *text* into lowercase words.

        Punctuation is stripped first, then every maximal run of word
        characters becomes one token. Empty tokens are never produced, so
        leading or trailing whitespace does not add a bogus ``''`` word.
        """
        return self._WORD_PATTERN.findall(self.clean(text).lower())

    def get_word_counts(self, words):
        """Return a dict mapping each word in *words* to its count (float)."""
        word_counts = {}
        for word in words:
            word_counts[word] = word_counts.get(word, 0.0) + 1.0
        return word_counts


class SpamDetector_2(SpamDetector_1):
    """Adds training (``fit``) on top of the preprocessing helpers."""

    def fit(self, X, Y):
        """Collect class priors and per-class word frequencies.

        Args:
            X: sequence of email texts.
            Y: sequence of labels aligned with *X* (1 = spam, 0 = ham).

        After the call the instance exposes:
            num_messages: number of training emails per class name.
            log_class_priors: log of each class's share of the training
                emails; ``-inf`` for a class with no training emails.
            word_counts: per class name, a dict word -> total occurrences.
            vocab: set of every word seen in any training email.

        Raises:
            ValueError: if *Y* contains no email labelled 0 or 1.
        """
        # Count the spam and ham emails.
        self.num_messages = {
            'spam': sum(1 for label in Y if label == 1),
            'ham': sum(1 for label in Y if label == 0),
        }
        total = self.num_messages['spam'] + self.num_messages['ham']
        if total == 0:
            raise ValueError("cannot fit on an empty training set")

        # Prior of a class = its share of all training emails. log(0) is
        # undefined in math.log, so an absent class gets -inf explicitly.
        self.log_class_priors = {
            name: math.log(count / total) if count else float('-inf')
            for name, count in self.num_messages.items()
        }

        self.word_counts = {'spam': {}, 'ham': {}}
        # Every word that occurs in any training email.
        self.vocab = set()

        for text, label in zip(X, Y):
            # c is the class name; any label other than 1 is treated as ham.
            c = 'spam' if label == 1 else 'ham'
            class_counts = self.word_counts[c]
            # Word -> occurrences within this single email.
            counts = self.get_word_counts(self.tokenize(text))
            for word, count in counts.items():
                self.vocab.add(word)
                # Accumulate how often the word occurs in emails of class c.
                class_counts[word] = class_counts.get(word, 0.0) + count


if __name__ == "__main__":
    # Kept at module scope (not inside a function) so X, y and MNB remain
    # module-level names when the file is run as a script.
    X, y = get_data(DATA_DIR)  # load the corpus

    MNB = SpamDetector_2()
    # Emails from index 100 onwards form the training set; the first 100
    # are held out as the test set.
    MNB.fit(X[100:], y[100:])

    # The trailing numbers are the values recorded by the original author.
    print("log_class_priors of spam", MNB.log_class_priors['spam'])  # -0.6776
    print("log_class_priors of ham", MNB.log_class_priors['ham'])  # -0.7089