1. 数据集说明
trec06c是一个公开的垃圾邮件语料库,由国际文本检索会议(TREC)提供,分为英文数据集(trec06p)和中文数据集(trec06c)。其中所含的邮件均来源于真实邮件,并保留了邮件的原有格式和内容。下载地址:https://plg.uwaterloo.ca/~gvcormac/treccorpus06/
由于数据集分散在各个文件中,为了方便我将正样本和负样本分别放在了ham_data和spam_data文件夹中(处女座的强迫症)
正样本数:21766
负样本数:42854
中文停用词:chinese_stop_vocab.txt
下面使用的所有数据集都已上传github
2. 实现思路
- 对单个邮件进行数据预处理
- 去除所有非中文字符,如标点符号、英文字符、数字、网站链接等特殊字符
- 对邮件内容进行分词处理
- 过滤停用词
- 创建特征矩阵和样本数据集
- feature_matrix:shape=(samples, feature_word_nums)
- label:shape=(samples, 1)
- 词向量的选择:索引或word2vec,注意二者的区别
拆分数据集:训练数据集、测试数据集和验证数据集
选择模型,这里选择svm
训练、测试、调参
3. 具体实现过程
3.1 所用到的库
import os
import jieba
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import RandomizedSearchCV,train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from scipy.stats import uniform
3.2 将邮件转换为特征词矩阵类
class EmailToWordFeatures:
    '''
    Convert e-mail files into a bag-of-words feature matrix.

    The pipeline is:
    - decode every line of an e-mail as GBK and keep only the lines that
      contain at least one Chinese character
    - segment those lines with jieba
    - drop stop words and tokens that contain no Chinese character
    - count the remaining tokens with CountVectorizer and keep the terms
      that occur often enough across all e-mails
    '''

    def __init__(self, stop_word_file=None, features_vocabulary=None):
        '''
        stop_word_file: optional path of a UTF-8 text file, one stop word
            per line.
        features_vocabulary: stored on the instance as-is; no method of this
            class reads it.
        '''
        self.features_vocabulary = features_vocabulary
        self.stop_vocab_dict = {}  # stop word -> 1; empty when no file is given
        if stop_word_file is not None:
            self.stop_vocab_dict = self._get_stop_words(stop_word_file)

    def text_to_feature_matrix(self, words, vocabulary=None, threshold=10):
        '''
        Build the term-count matrix of `words` and keep the frequent terms.

        words: iterable of strings, one space-separated token string per
            e-mail.
        vocabulary: optional iterable of strings used to fit the vectorizer
            instead of `words`.
        threshold: a term is kept when its total count over all rows of
            `words` is >= threshold.
        return: DataFrame with one row per e-mail and one column per kept
            term (columns are named by the term, in vocabulary order).
        '''
        cv = CountVectorizer()
        cv.fit(words if vocabulary is None else vocabulary)
        counts = cv.transform(words)  # sparse (n_emails, n_terms)
        print(counts.shape)

        # Total occurrences of every term, computed on the sparse matrix so
        # the full dense matrix never has to be materialised.
        totals = np.asarray(counts.sum(axis=0)).ravel()

        selected_features = []
        selected_features_index = []
        for term, column in cv.vocabulary_.items():
            if totals[column] >= threshold:
                selected_features.append(term)
                selected_features_index.append(column)

        # Densify only the selected columns; the explicit integer dtype keeps
        # the indexing valid when nothing passes the threshold.
        columns = np.asarray(selected_features_index, dtype=np.intp)
        return pd.DataFrame(counts[:, columns].toarray(),
                            columns=selected_features)

    def get_email_words(self, email_path, max_email=600):
        '''
        Read e-mails and return one space-joined token string per e-mail.

        email_path: a directory of e-mail files, or the path of one e-mail.
        max_email: maximum number of e-mails to read (600 by default to keep
            memory use low on small machines); None reads everything.
        return: list of strings.
        '''
        self.emails = email_path
        if os.path.isdir(email_path):
            # Sort the listing so the same e-mails are picked on every run,
            # and skip anything that is not a regular file.
            candidates = (os.path.join(email_path, name)
                          for name in sorted(os.listdir(email_path)))
            paths = [path for path in candidates if os.path.isfile(path)]
        else:
            paths = [email_path]

        if max_email is not None:
            paths = paths[:max(max_email, 0)]
        return [' '.join(self._email_to_words(path)) for path in paths]

    def _email_to_words(self, email):
        '''
        Segment one e-mail file and drop stop words and non-Chinese tokens.

        return: list of tokens in reading order.
        '''
        email_words = []
        with open(email, 'rb') as pf:
            for raw_line in pf:
                line = raw_line.strip().decode('gbk', 'ignore')
                if not self._check_contain_chinese(line):
                    continue  # headers, URLs, pure ASCII lines, ...
                for word in jieba.cut(line, cut_all=False):
                    if word in self.stop_vocab_dict:
                        continue
                    if not self._check_contain_chinese(word):
                        continue
                    email_words.append(word)
        return email_words

    def _get_stop_words(self, file):
        '''
        Load the stop-word file into a dict mapping each word to 1.
        '''
        stop_vocab_dict = {}
        with open(file, 'rb') as pf:
            for raw_line in pf:
                # utf-8-sig drops a leading byte-order mark, which would
                # otherwise stay attached to the first stop word.
                word = raw_line.decode('utf-8-sig', 'ignore').strip()
                stop_vocab_dict[word] = 1
        return stop_vocab_dict

    def _check_contain_chinese(self, check_str):
        '''
        Return True when `check_str` has at least one character in the
        range U+4E00..U+9FFF.
        '''
        return any(u'\u4e00' <= ch <= u'\u9fff' for ch in check_str)
3.3 将正负邮件数据集转换为词特征列表,每项为一封邮件
# Dataset locations. os.path.join gives the same strings as the former
# hard-coded '.\\...' literals on Windows and valid paths on other systems.
index_file = os.path.join('.', 'datasets', 'trec06c', 'full', 'index')
stop_word_file = os.path.join('.', 'datasets', 'trec06c', 'chinese_stop_vocab.txt')
ham_file = os.path.join('.', 'datasets', 'trec06c', 'ham_data')
spam_file = os.path.join('.', 'datasets', 'trec06c', 'spam_data')

# One space-joined token string per e-mail, for each class.
email_to_features = EmailToWordFeatures(stop_word_file=stop_word_file)
ham_words = email_to_features.get_email_words(ham_file)
spam_words = email_to_features.get_email_words(spam_file)
print('ham email numbers:', len(ham_words))
print('spam email numbers:', len(spam_words))
ham email numbers: 600
spam email numbers: 600
3.4 将所有邮件转换为特征词矩阵,即模型输入数据
# Ham e-mails first, spam e-mails after them.
all_email = [*ham_words, *spam_words]
print('all test email numbers:', len(all_email))

# Term-count matrix restricted to the frequent terms.
words_to_matrix = email_to_features.text_to_feature_matrix(all_email)
print(words_to_matrix)
all test email numbers: 1200
(1200, 22556)
故事 领导 回到 儿子 感情 有个 大概 民俗 出国 教育 ... 培训网 商友会 网管 埃森哲 驱鼠器 条例 \
0 1 2 1 1 1 1 1 1 1 1 ... 0 0 0 0 0 0
1 0 0 0 0 5 0 0 0 0 0 ... 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0
... .. .. .. .. .. .. .. .. .. .. ... ... ... .. ... ... ..
1195 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0
1196 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0
1197 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0
1198 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0
1199 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0
智囊 教练 含双早 王府井
0 0 0 0 0
1 0 0 0 0
2 0 0 0 0
3 0 0 0 0
4 0 0 0 0
... .. .. ... ...
1195 0 0 0 0
1196 0 0 0 0
1197 0 0 0 0
1198 0 0 0 0
1199 0 0 0 0
[1200 rows x 3099 columns]
3.5 获取标签矩阵
# Column vector of labels: 1 for the leading ham rows, 0 for the spam rows.
ham_count = len(ham_words)
label_matrix = np.zeros((len(all_email), 1))
label_matrix[:ham_count, :] = 1
4. 使用svm模型进行训练
# Split into a training set and a test set.
x_train, x_test, y_train, y_test = train_test_split(
    words_to_matrix, label_matrix, test_size=0.2, random_state=42)

# Tune C of a linear SVM with a randomized search.
svc = LinearSVC(loss='hinge', dual=True)
param_distributions = {'C': uniform(0, 10)}
rscv_clf = RandomizedSearchCV(estimator=svc,
                              param_distributions=param_distributions,
                              cv=3, n_iter=200, verbose=2)
# label_matrix is a column vector of shape (n, 1); flatten it because the
# estimator expects a 1-D target.
rscv_clf.fit(x_train, y_train.ravel())
print('best_params:', rscv_clf.best_params_)
Fitting 3 folds for each of 200 candidates, totalling 600 fits
[CV] C=6.119041659192192 .............................................
[CV] .............................. C=6.119041659192192, total= 0.0s
[CV] C=6.119041659192192 .............................................
[CV] .............................. C=6.119041659192192, total= 0.1s
[CV] C=6.119041659192192 .............................................
[CV] .............................. C=6.119041659192192, total= 0.1s
[CV] C=6.103402593686549 .............................................
...
...
...
[CV] .............................. C=4.395657632563425, total= 0.2s
best_params: {'C': 0.0279898379592336}
# Evaluate on the held-out test set.
y_prab = rscv_clf.predict(x_test)
# accuracy_score takes (y_true, y_pred), in that order.
print('accuracy:', accuracy_score(y_test, y_prab))
accuracy: 0.9791666666666666
5. 分别选择一封正式邮件和垃圾邮件进行预测
正式邮件内容如下:
- 很久以前,我为了考人大,申请了他的ID,而现在却不对外开放了。
申请水木的ID,真的是不知道出于什么缘故。离开校园寻找一份校园的感觉,怀着对清华的向往,还是为了偶尔无聊工作的一些调剂……
我讨厌多愁善感,却时常沉浸于其中,生活中的挫折,不幸,让我知道自己得坚强……
可每天的灰色心情却挥之不去,我可以把自己的心事埋于深处,笑着对待我身边的每一个人,告诉我乐观。身边最亲的人,也许知道或不知道我的脆弱和恐惧。而唯一知道的人,告诉我“希望你坚不可摧”。
回想多年前,为“在靠近的地方住下,能掩耳不听烦世喧嚣,要一份干净的自由自在”而感动,那,是否是对今天的预见,无知是快乐的,而不知道责任也是快乐的。我可以逃避一时,却始终要面对。
垃圾邮件如下:
- 这是一封善意的邮件,如给你造成不便,请随手删除.SOHO建站代理网诚聘兼职网站代理
一、职业要求:
1、有上网条件(在家中、办公室、网吧等地);
2、每天能有1-2小时上网时间;
3、有网络应用的基础(会上论坛发贴子、发电子邮件,
与客户QQ沟通等)。
二、工作任务:
您报名加入我公司后,公司将分配给您一个属
于自己的冠名为SOHO致富联盟的网站,作为站长,您的任
务就是利用各种方法宣传这个网站,让客户从你的网站上
购买更多的商品,并接受你的建议,也同意加盟SOHO建站
代理网网站的兼职代理,这样你便拥有滚滚不断的财源。
三、工资待遇:3000元以上/月。业绩累积,收入直线上升.
def email_to_predict_matrix(words, features):
    '''
    Build a one-row feature matrix for prediction.

    words: list of token strings as returned by
        EmailToWordFeatures.get_email_words; only the FIRST entry is counted.
    features: the training feature terms, in training column order.
    return: DataFrame with a single row and one column per feature, holding
        the count of that term in the first e-mail (0 when the term is absent).
    '''
    cv = CountVectorizer()
    counts = cv.fit_transform(words)
    # Only row 0 is needed, so densify that single row, not the whole matrix.
    first_row = counts[0].toarray().ravel()
    vocabulary = cv.vocabulary_
    words_numbers_list = [
        first_row[vocabulary[feature]] if feature in vocabulary else 0
        for feature in features
    ]
    return pd.DataFrame([words_numbers_list], columns=features)
# Validation e-mails. os.path.join gives the same strings as the former
# hard-coded '.\\...' literals on Windows and valid paths on other systems.
valid_ham_email = os.path.join('.', 'datasets', 'trec06c', 'valid_ham_email')
valid_spam_email = os.path.join('.', 'datasets', 'trec06c', 'valid_spam_email')

email_to_features_valid = EmailToWordFeatures(stop_word_file=stop_word_file)
valid_ham_email_words = email_to_features_valid.get_email_words(valid_ham_email)
valid_spam_email_words = email_to_features_valid.get_email_words(valid_spam_email)

# Project both e-mails onto the columns the model was trained on.
valid_ham_words_maxtrix = email_to_predict_matrix(valid_ham_email_words, words_to_matrix.columns)
valid_spam_words_maxtrix = email_to_predict_matrix(valid_spam_email_words, words_to_matrix.columns)

print('测试正式邮件----------')
print('预测结果:', rscv_clf.predict(valid_ham_words_maxtrix))
测试正式邮件----------
预测结果: [1.]
# Predict the class of the validation spam e-mail.
print('测试垃圾邮件----------')
spam_prediction = rscv_clf.predict(valid_spam_words_maxtrix)
print('预测结果:', spam_prediction)
测试垃圾邮件----------
预测结果: [0.]
附
6.1 改进计划
将特征词矩阵改为word2vec
使用mxnet神经网络模型进行训练
6.2 数据集整理部分的代码
# Sort the corpus paths into ham and spam lists, so the files can be copied
# into the ham_data and spam_data folders.
index_file = '.\\datasets\\trec06c\\full\\index'
data_file = '.\\datasets\\trec06c\\data'
ham_file_list = []
spam_file_list = []
# Each index line is split at the first '..': the text before it is the
# label, the text after it is the path of the e-mail.
with open(index_file, 'r') as pf:
    for line in pf:
        label, separator, path = line.strip().partition('..')
        if not separator:
            continue  # blank or malformed line: nothing to record
        path = path.replace('/', '\\')
        if label.strip() == 'spam':
            spam_file_list.append(path)
        else:
            ham_file_list.append(path)
import os
import shutil
# Prefix that copy_file puts in front of every relative path it copies.
root = '.\\datasets\\trec06c\\'
# Presumably the destination folders for the ham and spam copies; they are
# not referenced by the code shown here -- confirm against the caller.
new_ham_root = '.\\datasets\\trec06c\\ham_data'
new_spam_root = '.\\datasets\\trec06c\\spam_data'
def copy_file(filelist, new_file_path, src_root=None):
    '''
    Copy every file of `filelist` into the flat folder `new_file_path`.

    filelist: paths relative to the source root, with backslash-separated
        components; each needs at least a parent folder and a file name.
    new_file_path: destination folder, created when it does not exist.
    src_root: prefix concatenated in front of every entry of `filelist`;
        defaults to the module-level `root`.

    Each copy is named '<parent folder>_<file name>' so that files with the
    same name in different source folders do not overwrite one another.
    '''
    if src_root is None:
        src_root = root
    # Create the destination once instead of checking on every iteration.
    os.makedirs(new_file_path, exist_ok=True)
    for file in filelist:
        file_name = file.split('\\')
        target = os.path.join(new_file_path, file_name[-2] + '_' + file_name[-1])
        shutil.copyfile(src_root + file, target)