import matplotlib.pyplot as plt
import pandas as pd
import string
import codecs
import os
import jieba
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import naive_bayes as bayes
from sklearn.model_selection import train_test_split
#open file
file_path = "./"
emailframe = pd.read_excel(os.path.join(file_path, "chinesespam.xlsx"), 0)
print("data shape:", emailframe.shape)
print("spams in rows:", emailframe.loc[emailframe['type'] == "spam"].shape[0])
print("ham in rows:", emailframe.loc[emailframe['type'] == "ham"].shape[0])
data shape: (150, 2)
spams in rows: 50
ham in rows: 100
#load stopwords
stopwords = codecs.open(os.path.join(file_path, 'stopwords.txt'), 'r', 'UTF8').read().split('\r\n')
#cut words and process text
processed_texts = []
for text in emailframe["text"]:
    words = []
    seg_list = jieba.cut(text)
    for seg in seg_list:
        if seg.isalpha() and seg not in stopwords:
            words.append(seg)
    sentence = " ".join(words)
    processed_texts.append(sentence)
emailframe["text"] = processed_texts
print(emailframe.head(3))
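To make the tokenisation step concrete, here is a minimal sketch of the same jieba.cut plus stopword filtering applied to one made-up sentence (it reuses the stopwords list loaded above; the actual segmentation depends on jieba's dictionary):
#sketch only: filter a single illustrative sentence the same way as the loop above
sample = "欢迎参加本次机器学习会议"
sample_tokens = [t for t in jieba.cut(sample) if t.isalpha() and t not in stopwords]
print(" ".join(sample_tokens))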
'''
Bag-of-words encoding: build the vocabulary and turn every email into a row
of term counts (shape: samples x vocabulary size, here 5915 terms).
texts: the values of the emailframe "text" column
result: the document-term count matrix as a DataFrame
'''
def transformTextToSparseMatrix(texts):
    # build the bag-of-words vocabulary
    vectorizer = CountVectorizer(binary=False)
    vectorizer.fit(texts)
    # vocabulary_ is a dict mapping token -> column index, e.g. {'ab': 3, 'cd': 2}
    # means token 'ab' has column index 3 and token 'cd' has column index 2
    vocabulary = vectorizer.vocabulary_
    print("There are ", len(vocabulary), " word features")
    # transform() returns a sparse matrix of per-document term counts,
    # e.g. (0, 3) 2 means the token with index 3 appears twice in document 0
    vector = vectorizer.transform(texts)
    result = pd.DataFrame(vector.toarray())  # .toarray() converts the sparse matrix to a dense array
    print(result.head(3))
    keys = []
    values = []
    for key, value in vectorizer.vocabulary_.items():
        keys.append(key)
        values.append(value)
    df = pd.DataFrame(data={"key": keys, "value": values})
    print(df.head(3))
    # sort the tokens by their column index so each token lines up with its column,
    # then use the tokens as column names
    colnames = df.sort_values("value")["key"].values
    result.columns = colnames
    return result
textmatrix = transformTextToSparseMatrix(emailframe["text"])
print(textmatrix.head(3))
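As a sanity check on how CountVectorizer builds this matrix, here is a minimal sketch on a toy English corpus (the corpus is made up purely for illustration):
#sketch only: vocabulary and count matrix for a two-document toy corpus
toy_corpus = ["apple banana apple", "banana cherry"]
toy_vec = CountVectorizer(binary=False)
toy_counts = toy_vec.fit_transform(toy_corpus)
print(toy_vec.vocabulary_)   # {'apple': 0, 'banana': 1, 'cherry': 2}
print(toy_counts.toarray())  # [[2 1 0] [0 1 1]] -- one row per document, one column per token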
#count the total occurrences of each word and keep only words appearing more than 5 times
features = pd.DataFrame(textmatrix.apply(sum, axis=0))
extractedfeatures = [features.index[i] for i in range(features.shape[0]) if features.iloc[i, 0] > 5]
textmatrix = textmatrix[extractedfeatures]
print("There are ", textmatrix.shape[1], " word features")
#split into training and test sets (80% train / 20% test)
train, test, trainlabel, testlabel = train_test_split(textmatrix, emailframe["type"], test_size = 0.2)
#the features are binary or very sparse discrete counts, so use BernoulliNB
clf = bayes.BernoulliNB(alpha=1, binarize=1.0)  # binarize counts at threshold 1
model = clf.fit(train, trainlabel)
#score the model on the test set
model.score(test, testlabel)
0.7333333333333333
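Since the features here are word counts, MultinomialNB is a natural baseline to compare against; the following is only a sketch and not part of the original run:
#sketch only: multinomial Naive Bayes baseline on the same split
alt_clf = bayes.MultinomialNB(alpha=1)
alt_model = alt_clf.fit(train, trainlabel)
print(alt_model.score(test, testlabel))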
model.predict(test)
array(['ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'spam',
       'ham', 'ham', 'ham', 'ham', 'spam', 'ham', 'ham', 'spam', 'ham',
       'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham',
       'ham', 'ham', 'ham'], dtype='<U4')
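Accuracy alone can be misleading on this imbalanced 100 ham / 50 spam dataset, so it is worth printing a fuller evaluation; this sketch reuses the model and test split from above:
#sketch only: confusion matrix and per-class precision/recall on the test set
from sklearn.metrics import classification_report, confusion_matrix
predictions = model.predict(test)
print(confusion_matrix(testlabel, predictions))
print(classification_report(testlabel, predictions))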