机器学习的四个核心要素:数据、转换数据的模型、衡量模型好坏的损失函数、调整模型权重以便最小化损失函数的算法。
1、按照学习结果分类
预测、聚类、分类、降维
2、按照学习方法分类
监督学习、无监督学习、半监督学习、强化学习(又称增强学习)
补充:特征提取(BOW 词袋模型、TF-IDF、N-gram)
def get_data(ham_path="data/ham_data.txt", spam_path="data/spam_data.txt"):
    '''
    Load the ham and spam corpora and build the matching label list.

    Every line of each file is one document; lines are returned exactly as
    ``readlines`` yields them, i.e. with their trailing newline.

    :param ham_path: path of the UTF-8 text file holding the ham documents
    :param spam_path: path of the UTF-8 text file holding the spam documents
    :return: (corpus, labels) -- the ham documents followed by the spam
             documents, and a list of the same length holding 1.0 for every
             ham document and 0.0 for every spam document
    '''
    # Only the reads need the files open; release both handles before the
    # labels are built.
    with open(ham_path, encoding="utf8") as ham_f, \
            open(spam_path, encoding="utf8") as spam_f:
        ham_data = ham_f.readlines()
        spam_data = spam_f.readlines()

    # Plain float lists: 1.0 marks ham, 0.0 marks spam.
    ham_label = [1.0] * len(ham_data)
    spam_label = [0.0] * len(spam_data)

    corpus = ham_data + spam_data
    labels = ham_label + spam_label
    return corpus, labels
def prepare_datasets(corpus, labels, test_data_proportion=0.3, random_state=42):
    '''
    Split the documents and their labels into a training and a test set.

    :param corpus: text documents
    :param labels: one label per document, in the same order as corpus
    :param test_data_proportion: share of the data held out for testing,
                                 forwarded as ``test_size``
    :param random_state: seed forwarded to ``train_test_split``; the default
                         of 42 keeps the split identical to earlier runs
    :return: training documents, test documents, training labels, test labels
    '''
    train_X, test_X, train_Y, test_Y = train_test_split(
        corpus, labels,
        test_size=test_data_proportion,
        random_state=random_state)
    return train_X, test_X, train_Y, test_Y
import re
import string
import jieba
# Load the stop words, one entry per line.
# Each entry is stripped because tokens are compared against this list after
# tokenize_text() has stripped them; an entry that still carried its trailing
# "\n" could never match. Blank lines are skipped.
with open("dict/stop_words.utf8", encoding="utf8") as f:
    stopword_list = [line.strip() for line in f if line.strip()]
def tokenize_text(text):
    '''
    Segment text into tokens with jieba.

    :param text: the string to segment
    :return: list of tokens in order, each with surrounding whitespace
             stripped (a whitespace-only token therefore becomes '')
    '''
    return [piece.strip() for piece in jieba.cut(text)]
def remove_special_characters(text):
    '''
    Remove the characters of ``string.punctuation`` from text.

    The text is tokenized first; punctuation is deleted inside every token,
    tokens left empty are dropped, and the rest are joined by single spaces.

    :param text: the string to clean
    :return: the cleaned, space-separated text
    '''
    # A deletion table removes exactly the characters listed in
    # string.punctuation, one pass per token.
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned_tokens = [token.translate(punctuation_table)
                      for token in tokenize_text(text)]
    return ' '.join(token for token in cleaned_tokens if token)
def remove_stopwords(text):
    '''
    Remove stop words from text.

    The text is tokenized and every token found in the module-level
    ``stopword_list`` is discarded.

    :param text: the string to filter
    :return: the remaining tokens concatenated into one string
    '''
    # Normalise the entries before comparing: tokens arrive stripped, so an
    # entry still carrying a trailing newline from the file would never match.
    # A set also makes each membership test O(1).
    stopwords = {word.strip() for word in stopword_list}
    filtered_tokens = [token for token in tokenize_text(text)
                       if token not in stopwords]
    # NOTE(review): the tokens are joined without a separator, so the word
    # boundaries found by the tokenizer are not kept in the returned string --
    # confirm this is what the downstream vectorizers expect.
    return ''.join(filtered_tokens)
def normalize_corpus(corpus, tokenize=False):
    '''
    Normalize every document of corpus.

    Each document has its punctuation removed and then its stop words
    removed. The result holds exactly one entry per input document.

    :param corpus: iterable of text documents
    :param tokenize: when True each entry is the token list of the normalized
                     document; when False (default) it is the normalized string
    :return: list of normalized documents, in input order
    '''
    normalized_corpus = []
    for text in corpus:
        text = remove_special_characters(text)
        text = remove_stopwords(text)
        # Append once per document: either the tokens or the string, never
        # both, so the output stays aligned with the input (and its labels).
        normalized_corpus.append(tokenize_text(text) if tokenize else text)
    return normalized_corpus
from sklearn.feature_extraction.text import CountVectorizer
def bow_extractor(corpus, ngram_range=(1, 1)):
    '''
    Fit a bag-of-words (term count) model on corpus.

    :param corpus: iterable of text documents
    :param ngram_range: (min_n, max_n) forwarded to CountVectorizer
    :return: (vectorizer, features) -- the fitted CountVectorizer and the
             matrix returned by its fit_transform on corpus
    '''
    bow_vectorizer = CountVectorizer(ngram_range=ngram_range, min_df=1)
    bow_features = bow_vectorizer.fit_transform(corpus)
    return bow_vectorizer, bow_features
from sklearn.feature_extraction.text import TfidfTransformer
def tfidf_transformer(bow_matrix):
    '''
    Convert a term-count matrix into TF-IDF weights.

    :param bow_matrix: term-count matrix to fit on and transform
    :return: (transformer, tfidf_matrix) -- the fitted TfidfTransformer
             (L2 norm, smoothed IDF) and the transformed matrix
    '''
    settings = {'norm': 'l2', 'smooth_idf': True, 'use_idf': True}
    weighter = TfidfTransformer(**settings)
    weighted = weighter.fit_transform(bow_matrix)
    return weighter, weighted
from sklearn.feature_extraction.text import TfidfVectorizer
def tfidf_extractor(corpus, ngram_range=(1, 1)):
    '''
    Fit a TF-IDF model directly on corpus.

    :param corpus: iterable of text documents
    :param ngram_range: (min_n, max_n) forwarded to TfidfVectorizer
    :return: (vectorizer, features) -- the fitted TfidfVectorizer (L2 norm,
             smoothed IDF) and the matrix returned by its fit_transform
    '''
    settings = {
        'min_df': 1,
        'norm': 'l2',
        'smooth_idf': True,
        'use_idf': True,
        'ngram_range': ngram_range,
    }
    tfidf_vectorizer = TfidfVectorizer(**settings)
    tfidf_features = tfidf_vectorizer.fit_transform(corpus)
    return tfidf_vectorizer, tfidf_features
def train_predict_evaluate_model(classifier,
                                 train_features, train_labels,
                                 test_features, test_labels):
    '''
    Fit classifier, predict the test set and report the metrics.

    :param classifier: object exposing fit(features, labels) and
                       predict(features)
    :param train_features: features used for fitting
    :param train_labels: labels used for fitting
    :param test_features: features to predict
    :param test_labels: true labels of the test features, passed to
                        get_metrics together with the predictions
    :return: the predictions for test_features
    '''
    # Train on the training split.
    classifier.fit(train_features, train_labels)

    # Predict the held-out split.
    predicted = classifier.predict(test_features)

    # Report how the predictions compare with the true labels.
    get_metrics(true_labels=test_labels, predicted_labels=predicted)
    return predicted
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
# Multinomial naive Bayes with default settings.
mnb = MultinomialNB()
# Linear SVM trained by SGD (hinge loss). SGDClassifier no longer accepts
# ``n_iter``; ``max_iter=100`` with ``tol=None`` disables the tolerance-based
# early stop so training still runs a fixed 100 epochs.
svm = SGDClassifier(loss='hinge', max_iter=100, tol=None)
# Logistic regression with default settings.
lr = LogisticRegression()
# ---- Bag-of-words features ----
# Arguments shared by the three bag-of-words runs.
bow_inputs = dict(train_features=bow_train_features,
                  train_labels=train_labels,
                  test_features=bow_test_features,
                  test_labels=test_labels)

# Multinomial naive Bayes on bag-of-words features
print("基于词袋模型特征的贝叶斯分类器")
mnb_bow_predictions = train_predict_evaluate_model(classifier=mnb, **bow_inputs)

# Logistic regression on bag-of-words features
print("基于词袋模型特征的逻辑回归")
lr_bow_predictions = train_predict_evaluate_model(classifier=lr, **bow_inputs)

# Support vector machine on bag-of-words features
print("基于词袋模型的支持向量机")
svm_bow_predictions = train_predict_evaluate_model(classifier=svm, **bow_inputs)

# ---- TF-IDF features ----
# Arguments shared by the three TF-IDF runs; the same classifier objects are
# fitted again on these features.
tfidf_inputs = dict(train_features=tfidf_train_features,
                    train_labels=train_labels,
                    test_features=tfidf_test_features,
                    test_labels=test_labels)

# Multinomial naive Bayes on TF-IDF features
print("基于tfidf的贝叶斯模型")
mnb_tfidf_predictions = train_predict_evaluate_model(classifier=mnb, **tfidf_inputs)

# Logistic regression on TF-IDF features
print("基于tfidf的逻辑回归模型")
lr_tfidf_predictions = train_predict_evaluate_model(classifier=lr, **tfidf_inputs)

# Support vector machine on TF-IDF features
print("基于tfidf的支持向量机模型")
svm_tfidf_predictions = train_predict_evaluate_model(classifier=svm, **tfidf_inputs)
显示部分测试结果(基于 TF-IDF 特征的支持向量机的预测)
import re
# Show at most four test documents whose true label is 0 and which the
# TF-IDF SVM also predicted as 0.
num = 0
for document, label, predicted_label in zip(test_corpus, test_labels, svm_tfidf_predictions):
    # Skip everything that is not a (true 0, predicted 0) pair.
    if label != 0 or predicted_label != 0:
        continue
    print('邮件类型:', label_name_map[int(label)])
    print('预测的邮件类型:', label_name_map[int(predicted_label)])
    print('文本:-')
    # Flatten the document onto one output line.
    print(re.sub('\n', ' ', document))
    num += 1
    if num == 4:
        break
# Show at most four test documents whose true label is 1 but which the
# TF-IDF SVM predicted as 0, i.e. misclassified documents.
num = 0
for document, label, predicted_label in zip(test_corpus, test_labels, svm_tfidf_predictions):
    # Skip everything that is not a (true 1, predicted 0) pair.
    if label != 1 or predicted_label != 0:
        continue
    print('邮件类型:', label_name_map[int(label)])
    print('预测的邮件类型:', label_name_map[int(predicted_label)])
    print('文本:-')
    # Flatten the document onto one output line.
    print(re.sub('\n', ' ', document))
    num += 1
    if num == 4:
        break