python 邮件分类_python_NLP实战之中文垃圾邮件分类

一、机器学习训练的要素

数据、转换数据的模型、衡量模型好坏的损失函数、调整模型权重以便最小化损失函数的算法

二、机器学习的组成部分

1、按照学习结果分类

预测、聚类、分类、降维

2、按照学习方法分类

监督学习、无监督学习、半监督学习、强化学习(又称增强学习)

补充:常用的文本特征提取方法有词袋模型(BOW)、TF-IDF 和 N-gram

三、实战:中文垃圾邮件分类

1、数据提取

def get_data(ham_path="data/ham_data.txt", spam_path="data/spam_data.txt"):
    """Load the ham and spam corpora and build the matching labels.

    Every line of each file is treated as one document. Lines are returned
    exactly as read, trailing newline included.

    :param ham_path: path of the UTF-8 file holding one ham message per line
    :param spam_path: path of the UTF-8 file holding one spam message per line
    :return: (corpus, labels) -- the ham documents followed by the spam
        documents, and a parallel list of float labels (1.0 = ham, 0.0 = spam)
    """
    with open(ham_path, encoding="utf8") as ham_f, \
            open(spam_path, encoding="utf8") as spam_f:
        ham_data = ham_f.readlines()
        spam_data = spam_f.readlines()

    # Plain lists yield the same float labels as np.ones(n).tolist() /
    # np.zeros(n).tolist() without requiring numpy to be in scope.
    ham_label = [1.0] * len(ham_data)
    spam_label = [0.0] * len(spam_data)

    corpus = ham_data + spam_data
    labels = ham_label + spam_label
    return corpus, labels

def prepare_datasets(corpus, labels, test_data_proportion=0.3):
    """Split the documents and their labels into training and test sets.

    :param corpus: text documents
    :param labels: labels, parallel to corpus
    :param test_data_proportion: forwarded to train_test_split as test_size
    :return: train documents, test documents, train labels, test labels
    """
    # Imported here because the visible part of the file never brings
    # train_test_split into scope at module level.
    from sklearn.model_selection import train_test_split

    # A fixed random_state keeps the split reproducible between runs.
    train_X, test_X, train_Y, test_Y = train_test_split(
        corpus,
        labels,
        test_size=test_data_proportion,
        random_state=42,
    )
    return train_X, test_X, train_Y, test_Y

2、对数据进行规整化和预处理

import re

import string

import jieba

# Load the stop-word list (one word per line).
# readlines() would keep each trailing "\n", so a stripped token could never
# equal an entry; strip every line and skip the blank ones instead.
with open("dict/stop_words.utf8", encoding="utf8") as f:
    stopword_list = [line.strip() for line in f if line.strip()]

def tokenize_text(text):
    """Segment text with jieba.cut and return the pieces as a list.

    Each piece has its surrounding whitespace stripped; pieces that become
    empty after stripping are kept in the list.
    """
    return [piece.strip() for piece in jieba.cut(text)]

def remove_special_characters(text):
    """Strip punctuation from text and return the tokens joined by spaces.

    The text is tokenized, every character listed in string.punctuation is
    deleted from each token, and tokens left empty are discarded.
    """
    punctuation = re.compile('[{}]'.format(re.escape(string.punctuation)))
    cleaned = (punctuation.sub('', token) for token in tokenize_text(text))
    return ' '.join(token for token in cleaned if token)

def remove_stopwords(text):
    """Remove stop words from text and return the remaining tokens.

    The text is tokenized, tokens found in stopword_list (and empty tokens)
    are dropped, and the survivors are joined with single spaces.

    :param text: the document to filter
    :return: space-separated string of the tokens that are not stop words
    """
    # Entries loaded with readlines() still end in "\n" and would never equal
    # a stripped token, so normalise them here. The set also makes every
    # membership test O(1) instead of a scan of the whole list.
    stopwords = {word.strip() for word in stopword_list}
    kept = [
        token
        for token in tokenize_text(text)
        if token and token not in stopwords
    ]
    # Join with spaces, not '': gluing the tokens back together would throw
    # away the word boundaries that whitespace-based vectorizers rely on.
    return ' '.join(kept)

def normalize_corpus(corpus, tokenize=False):
    """Clean every document in corpus.

    Each document has its punctuation and then its stop words removed.

    :param corpus: iterable of text documents
    :param tokenize: when True each result is a list of tokens, otherwise
        it is the cleaned string
    :return: list with exactly one entry per input document
    """
    normalized_corpus = []
    for text in corpus:
        text = remove_special_characters(text)
        text = remove_stopwords(text)
        if tokenize:
            # Append only the token list. The previous code appended the
            # string first and the tokens second, so the result held two
            # entries per document and no longer lined up with the labels.
            normalized_corpus.append(
                [token for token in tokenize_text(text) if token]
            )
        else:
            normalized_corpus.append(text)
    return normalized_corpus

3、提取特征

from sklearn.feature_extraction.text import CountVectorizer

def bow_extractor(corpus, ngram_range=(1, 1)):
    """Fit a CountVectorizer on corpus.

    :param corpus: iterable of text documents
    :param ngram_range: (min_n, max_n) forwarded to CountVectorizer
    :return: (fitted vectorizer, feature matrix for corpus)
    """
    bow_vectorizer = CountVectorizer(ngram_range=ngram_range, min_df=1)
    bow_matrix = bow_vectorizer.fit_transform(corpus)
    return bow_vectorizer, bow_matrix

from sklearn.feature_extraction.text import TfidfTransformer

def tfidf_transformer(bow_matrix):
    """Fit a TfidfTransformer on a bag-of-words matrix.

    The transformer uses L2 normalisation with idf weighting and idf
    smoothing switched on.

    :param bow_matrix: term-count matrix to re-weight
    :return: (fitted transformer, tf-idf matrix for bow_matrix)
    """
    options = dict(norm='l2', smooth_idf=True, use_idf=True)
    tfidf = TfidfTransformer(**options)
    weighted = tfidf.fit_transform(bow_matrix)
    return tfidf, weighted

from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_extractor(corpus, ngram_range=(1, 1)):
    """Fit a TfidfVectorizer on corpus.

    The vectorizer uses L2 normalisation with idf weighting and idf
    smoothing switched on.

    :param corpus: iterable of text documents
    :param ngram_range: (min_n, max_n) forwarded to TfidfVectorizer
    :return: (fitted vectorizer, tf-idf feature matrix for corpus)
    """
    options = dict(
        min_df=1,
        norm='l2',
        smooth_idf=True,
        use_idf=True,
        ngram_range=ngram_range,
    )
    tfidf_vectorizer = TfidfVectorizer(**options)
    tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
    return tfidf_vectorizer, tfidf_matrix

4、训练分类器

def train_predict_evaluate_model(classifier,
                                 train_features, train_labels,
                                 test_features, test_labels):
    """Fit classifier, predict on the test features and report metrics.

    :param classifier: object exposing fit(features, labels) and
        predict(features)
    :param train_features: features used for fitting
    :param train_labels: labels used for fitting
    :param test_features: features to predict on
    :param test_labels: true labels handed to get_metrics
    :return: the predictions for test_features
    """
    # Train on the training split.
    classifier.fit(train_features, train_labels)

    # Predict the held-out split.
    predicted = classifier.predict(test_features)

    # get_metrics is defined outside the visible source; presumably it
    # reports the evaluation scores -- confirm against its definition.
    get_metrics(true_labels=test_labels, predicted_labels=predicted)
    return predicted

from sklearn.naive_bayes import MultinomialNB

from sklearn.linear_model import SGDClassifier

from sklearn.linear_model import LogisticRegression

# Classifier instances shared by the experiments below.
mnb = MultinomialNB()

# SGDClassifier no longer accepts n_iter (deprecated in scikit-learn 0.19,
# removed in 0.21). max_iter=100 together with tol=None reproduces the old
# "exactly 100 passes over the data" behaviour; hinge loss gives the linear
# SVM the experiments below refer to.
svm = SGDClassifier(loss='hinge', max_iter=100, tol=None)

lr = LogisticRegression()

def _run_experiment(title, classifier, train_features, test_features):
    """Print title, then train and evaluate classifier on the given features.

    The module-level train_labels and test_labels are used for every run.
    Returns the predictions for test_features.
    """
    print(title)
    return train_predict_evaluate_model(classifier=classifier,
                                        train_features=train_features,
                                        train_labels=train_labels,
                                        test_features=test_features,
                                        test_labels=test_labels)


# Multinomial naive Bayes on bag-of-words features
mnb_bow_predictions = _run_experiment(
    "基于词袋模型特征的贝叶斯分类器", mnb,
    bow_train_features, bow_test_features)

# Logistic regression on bag-of-words features
lr_bow_predictions = _run_experiment(
    "基于词袋模型特征的逻辑回归", lr,
    bow_train_features, bow_test_features)

# Support vector machine on bag-of-words features
svm_bow_predictions = _run_experiment(
    "基于词袋模型的支持向量机", svm,
    bow_train_features, bow_test_features)

# Multinomial naive Bayes on tf-idf features
mnb_tfidf_predictions = _run_experiment(
    "基于tfidf的贝叶斯模型", mnb,
    tfidf_train_features, tfidf_test_features)

# Logistic regression on tf-idf features
lr_tfidf_predictions = _run_experiment(
    "基于tfidf的逻辑回归模型", lr,
    tfidf_train_features, tfidf_test_features)

# Support vector machine on tf-idf features
svm_tfidf_predictions = _run_experiment(
    "基于tfidf的支持向量机模型", svm,
    tfidf_train_features, tfidf_test_features)

5、评价指标

显示部分测试结果

import re

def _print_examples(actual, predicted, limit=4):
    """Print test documents with the given true and predicted labels.

    Walks test_corpus, test_labels and svm_tfidf_predictions in lockstep and
    stops after `limit` matching documents. Returns how many were printed.
    """
    shown = 0
    rows = zip(test_corpus, test_labels, svm_tfidf_predictions)
    for document, label, predicted_label in rows:
        if label != actual or predicted_label != predicted:
            continue
        print('邮件类型:', label_name_map[int(label)])
        print('预测的邮件类型:', label_name_map[int(predicted_label)])
        print('文本:-')
        # Flatten the document onto one line for display.
        print(re.sub('\n', ' ', document))
        shown += 1
        if shown == limit:
            break
    return shown


# Documents labelled 0 that the SVM/tf-idf model also predicted as 0.
num = _print_examples(0, 0)

# Documents labelled 1 that the SVM/tf-idf model predicted as 0.
num = _print_examples(1, 0)

你可能感兴趣的:(python,邮件分类)