02.朴素贝叶斯-垃圾邮件分类

【需求说明】
  1. chinesespam.xlsx为邮件数据集,stopwords.txt为停用词数据集;
  2. 根据现有数据集,采用jieba进行分词切分,并通过sklearn进行特征处理;
  3. 由于样本特征是二元离散值或者很稀疏的多元离散值,采用先验为伯努利分布的朴素贝叶斯进行分类;
  4. 数据集地址:https://download.csdn.net/download/LWY_Xing/13238548
import codecs
import os
import string

import jieba
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import naive_bayes as bayes
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
# Read the first worksheet of the e-mail dataset from the working directory.
file_path = "./"
emailframe = pd.read_excel(os.path.join(file_path, "chinesespam.xlsx"), sheet_name=0)
print("data shape:", emailframe.shape)
# Count the rows carrying each label by summing the boolean match masks.
print("spams in rows:", int((emailframe['type'] == "spam").sum()))
print("ham in rows:", int((emailframe['type'] == "ham").sum()))

data shape: (150, 2)
spams in rows: 50
ham in rows: 100

# Load the stop-word list (one word per line).
# - "utf-8-sig" drops a leading BOM if the file has one.
# - splitlines() accepts both "\n" and "\r\n" endings; splitting on "\r\n"
#   alone would leave an LF-only file as a single giant "word".
# - The `with` block closes the file handle.
with open(os.path.join(file_path, 'stopwords.txt'), 'r', encoding='utf-8-sig') as stopword_file:
    # A set gives constant-time membership tests in the token loop below.
    stopwords = {line.strip() for line in stopword_file.read().splitlines() if line.strip()}

# Segment each e-mail with jieba, keep only alphabetic tokens that are not
# stop words, and store the survivors as one space-separated string per e-mail.
processed_texts = []
for text in emailframe["text"]:
    words = [
        seg
        for seg in jieba.cut(text)
        if seg.isalpha() and seg not in stopwords
    ]
    processed_texts.append(" ".join(words))
emailframe["text"] = processed_texts
print(emailframe.head(3))

在这里插入图片描述

def transformTextToSparseMatrix(texts):
    """Build a bag-of-words count matrix for ``texts``.

    Despite the function name, the value returned is a dense
    ``pandas.DataFrame`` of per-document term counts (the vectorizer is
    created with ``binary=False``), not a sparse or one-hot matrix. The
    original notes report a vocabulary of 5915 terms for this dataset.

    Args:
        texts: The documents to vectorize, e.g. the ``text`` column of the
            e-mail frame.

    Returns:
        DataFrame with one row per document and one column per vocabulary
        term; columns are labelled with the terms, ordered by their
        vocabulary index.
    """
    # Learn the vocabulary from the documents (fit returns the vectorizer).
    vectorizer = CountVectorizer(binary=False).fit(texts)

    # vocabulary_ maps each term to its column index, e.g. {'ab': 3, 'cd': 2}.
    term_to_index = vectorizer.vocabulary_
    print("There are ", len(term_to_index), " word features")

    # transform() yields the per-document term counts; toarray() turns that
    # result into a dense array before it is wrapped in a DataFrame.
    counts = vectorizer.transform(texts)
    result = pd.DataFrame(counts.toarray())
    print(result.head(3))

    # Tabulate the (term, column index) pairs in vocabulary order.
    df = pd.DataFrame(
        data={
            "key": list(term_to_index.keys()),
            "value": list(term_to_index.values()),
        }
    )
    print(df.head(3))

    # Sorting the terms by their column index gives the column labels.
    result.columns = df.sort_values("value")["key"].values
    return result

# Vectorize the tokenized e-mails and preview the first three rows.
textmatrix = transformTextToSparseMatrix(texts=emailframe["text"])
print(textmatrix.head(n=3))

02.朴素贝叶斯-垃圾邮件分类_第1张图片

# Total occurrences of every word across all e-mails, computed with the
# vectorized column sum instead of applying the Python built-in per column.
word_totals = textmatrix.sum(axis=0)
features = pd.DataFrame(word_totals)
# Keep only the words that occur more than 5 times in the whole corpus.
extractedfeatures = word_totals.index[word_totals > 5].tolist()
textmatrix = textmatrix[extractedfeatures]
print("There are ", textmatrix.shape[1], " word features")

# Split into training and test sets (80% train / 20% test).
# A fixed random_state makes the split, and therefore the reported score,
# reproducible; stratifying on the label keeps the spam/ham ratio the same
# in both parts of this imbalanced dataset.
train, test, trainlabel, testlabel = train_test_split(
    textmatrix,
    emailframe["type"],
    test_size=0.2,
    random_state=42,
    stratify=emailframe["type"],
)

# The features are binary or very sparse counts, so use Bernoulli naive Bayes.
# `binarize` is a numeric threshold, not a flag: feature values strictly
# greater than it are mapped to 1. `True` is numerically 1, which would treat
# a word occurring exactly once as absent; 0.0 maps every non-zero count to 1.
clf = bayes.BernoulliNB(alpha=1, binarize=0.0)
model = clf.fit(train, trainlabel)

# Mean accuracy on the held-out test set. Printed explicitly because a bare
# expression is only echoed in an interactive session, not when run as a script.
print(model.score(test, testlabel))

0.7333333333333333

# Predicted labels for the test set; repr() keeps the array(...) form that an
# interactive session would echo, and print makes it visible in a script run.
print(repr(model.predict(test)))

array(['ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'spam',
       'ham', 'ham', 'ham', 'ham', 'spam', 'ham', 'ham', 'spam', 'ham',
       'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham',
       'ham', 'ham', 'ham'], dtype='<U4')

你可能感兴趣的:(自然语言处理,python,机器学习,朴素贝叶斯算法)