传统机器学习的文本分类通常提取TFIDF或者词袋特征,然后给模型进行训练,传统机器学习的分类模型有很多,比如逻辑回归、支持向量机、多层感知机、贝叶斯等等。利用传统机器学习方法进行文本分类的基本思路:获取数据、数据预处理(上一篇博客已经讲过了https://blog.csdn.net/weixin_44766179/article/details/89855100)、特征提取、模型训练、预测。
下面利用传统机器学习方法实现垃圾邮件分类任务。
import numpy as np
import pandas as pd
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
import warnings
warnings.filterwarnings('ignore')
# Load the SMS spam corpus; latin1 tolerates the non-UTF-8 bytes it contains.
data_file = './spam.csv'
df = pd.read_csv(data_file, encoding='latin1')
# Column v1 carries the class label, column v2 the raw message text.
labels = df['v1']
texts = df['v2']
def clear_data(text):
    """Clean raw English messages for feature extraction.

    Pipeline per message: lower-case, expand common contractions,
    replace every non-letter with a space, collapse whitespace, then
    tokenize, drop English stopwords, and Porter-stem the remainder.

    Parameters
    ----------
    text : iterable of str
        Raw message strings.

    Returns
    -------
    list of str
        One cleaned, space-joined token string per input message.
    """
    # Contraction expansions, applied in this order after lower-casing.
    # (Order matters: e.g. "she's" already matches via the earlier "he's"
    # rule; keeping the original order preserves the original output.)
    contractions = [
        ("it's", "it is"), ("i'm", "i am"), ("he's", "he is"),
        ("she's", "she is"), ("we're", "we are"), ("they're", "they are"),
        ("you're", "you are"), ("that's", "that is"), ("this's", "this is"),
        ("can't", "can not"), ("don't", "do not"), ("doesn't", "does not"),
        ("we've", "we have"), ("i've", "i have"), ("isn't", "is not"),
        ("won't", "will not"), ("hasn't", "has not"), ("wasn't", "was not"),
        ("weren't", "were not"), ("let's", "let us"),
    ]
    stem_porter = PorterStemmer()  # Porter stemming for token normalization
    # A set gives O(1) membership tests in the per-token filter below
    # (stopwords.words() returns a list, which would be O(n) per token).
    stop_words = set(stopwords.words("english"))
    # Compile once instead of re-parsing the pattern for every message.
    non_letters = re.compile("[^a-zA-Z]")

    cleaned = []
    for item in text:
        item = item.lower()
        for short, full in contractions:
            item = item.replace(short, full)
        # Strip punctuation/digits and collapse runs of whitespace.
        item = ' '.join(non_letters.sub(" ", item).split())
        # Tokenize, drop stopwords, stem what remains.
        words = [stem_porter.stem(w) for w in word_tokenize(item)
                 if w not in stop_words]
        cleaned.append(' '.join(words))
    return cleaned
# Clean every message (contraction expansion, stemming, stopword removal).
texts = clear_data(texts)
# Encode the string labels as integers; presumably 'ham'/'spam' -> 0/1,
# but verify against spam.csv. `le` is kept so labels can be decoded later.
le = LabelEncoder()
labels = le.fit_transform(labels)
# TFIDF特征提取
def features_extraction(text):
    """Fit a TF-IDF vectorizer on *text* and return a dense feature matrix.

    Parameters
    ----------
    text : iterable of str
        Cleaned document strings.

    Returns
    -------
    numpy.ndarray
        Dense (n_documents, n_terms) TF-IDF matrix.

    Note: `.toarray()` replaces the original `.todense()` — the latter
    yields the deprecated ``numpy.matrix`` type, which newer numpy and
    scikit-learn releases warn about or reject; the numeric contents
    are identical. Beware: densifying a large corpus is memory-heavy.
    """
    vector = TfidfVectorizer()
    return vector.fit_transform(text).toarray()
# Dense TF-IDF feature matrix for the whole corpus.
features = features_extraction(texts)
# 80/20 train/test split; fixed random_state makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=2)
# ---- Logistic regression baseline ----
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(x_train, y_train)
y_pred = lr.predict(x_test)
# Reported accuracy on this split: 0.9524663677130045
print('accuracy_lr : ', accuracy_score(y_test, y_pred))
# ---- Support vector machine with a linear kernel ----
from sklearn.svm import SVC
svc = SVC(kernel='linear')
svc.fit(x_train, y_train)
y_pred = svc.predict(x_test)
# Reported accuracy on this split: 0.9739910313901345
print('accuracy_svm: ', accuracy_score(y_test, y_pred))
# ---- Multi-layer perceptron: two hidden layers of 100 units each ----
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(hidden_layer_sizes=(100, 100,))
mlp.fit(x_train, y_train)
y_pred = mlp.predict(x_test)
# Reported accuracy on this split: 0.9748878923766816
print('accuracy_mlp: ', accuracy_score(y_test, y_pred))
# ---- Multinomial naive Bayes ----
from sklearn.naive_bayes import MultinomialNB
mb = MultinomialNB()
mb.fit(x_train, y_train)
y_pred = mb.predict(x_test)
# Reported accuracy on this split: 0.9623318385650225
print('accuracy_mb: ', accuracy_score(y_test, y_pred))