本文使用的是和鲸社区现有数据集,代码也在和鲸社区公开,fork之后可以直接运行。
本文所用的数据集是清华NLP组提供的THUCNews新闻文本分类数据集的一个子集(原始数据集大约有74万篇文档,训练起来需要花较长的时间)。本次训练使用了其中的体育、财经、房产、家居、教育、科技、时尚、时政、游戏、娱乐10个分类,每个分类6500条,总共65000条新闻数据。项目是在和鲸社区的平台上运行的,数据集直接引用了和鲸社区提供的这份数据集。
数据集划分如下: cnews.train.txt: 训练集(50000条) cnews.val.txt: 验证集(5000条) cnews.test.txt: 测试集(10000条)
本文使用了较为传统的tfidf算法实现文本的向量化,并使用sklearn中的经典分类算法对文本数据进行分类。
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from pprint import pprint
from time import time
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from data_loader.cnews_loader import *
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
# Paths used for reading the data and for saving models/results.
base_dir = '/home/kesci/input/new3021'

# The four dataset files all live directly under base_dir.
train_dir, test_dir, val_dir, vocab_dir = (
    os.path.join(base_dir, file_name)
    for file_name in (
        'cnews.train.txt',
        'cnews.test.txt',
        'cnews.val.txt',
        'cnews.vocab.txt',
    )
)

# Output location for saved results.
save_dir = 'checkpoints/textcnn'
save_path = os.path.join(save_dir, 'best_validation')
使用data_loader.cnews_loader中的read_file函数读取数据。
# Read the raw documents and their category labels for the train and test splits.
# NOTE(review): val_dir is defined earlier, but the validation split is not
# loaded anywhere in this section.
train_contents, train_labels = read_file(train_dir)
test_contents, test_labels = read_file(test_dir)
# Despite its name, val_counts tallies the labels of the *training* split.
val_counts = Counter(train_labels)
# Bare expression: shown as the notebook cell's output.
val_counts
Counter({'体育': 5000,
'娱乐': 5000,
'家居': 5000,
'房产': 5000,
'教育': 5000,
'时尚': 5000,
'时政': 5000,
'游戏': 5000,
'科技': 5000,
'财经': 5000})
import re
# Strip emoticon tags and symbols from text (keep only Chinese characters,
# ASCII letters and digits).
_BRACKET_TAG = re.compile(r'\[.*?\]')
# Inside a character class only the leading "^" negates; the original pattern
# repeated "^" before each range, which made "^" itself an allowed character.
_DISALLOWED_CHAR = re.compile('[^\u4e00-\u9fa5a-zA-Z0-9]')


def clear_character(sentence):
    """Return ``sentence`` reduced to Chinese characters, ASCII letters and digits.

    Bracketed tags such as ``[微笑]`` are removed together with their content
    first; every remaining character outside U+4E00-U+9FA5, ``a-z``, ``A-Z``
    and ``0-9`` is then dropped.  Whitespace is outside those ranges, so no
    separate whitespace-stripping step is required.

    Args:
        sentence: The raw text of one document.

    Returns:
        The cleaned text as a single string with no separators.
    """
    without_tags = _BRACKET_TAG.sub('', sentence)
    return _DISALLOWED_CHAR.sub('', without_tags)
# Clean every raw document of the train and test splits.
train_text = [clear_character(document) for document in train_contents]
test_text = [clear_character(document) for document in test_contents]
使用了jieba分词。
import jieba

# Segment each cleaned document into a list of words.
train_seg_text = list(map(jieba.lcut, train_text))
test_seg_text = list(map(jieba.lcut, test_text))
# Load the stop-word list.
stop_words_path = "/home/kesci/work/data_loader/百度停用词列表.txt"


def get_stop_words(path=None):
    """Return the stop words stored in a GBK-encoded, one-word-per-line file.

    Args:
        path: File to read.  Defaults to the module-level ``stop_words_path``.

    Returns:
        A set holding one entry per non-empty line of the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the content is not valid GBK.
    """
    if path is None:
        path = stop_words_path
    # The context manager closes the handle (it used to be left open), and
    # text mode translates "\r\n" as well as bare "\n" line endings, so the
    # result no longer depends on how the file was saved.
    with open(path, encoding='gbk') as handle:
        words = {line.rstrip('\n') for line in handle}
    # Blank lines are not stop words.
    words.discard('')
    return words
# Load the stop-word set once; it is reused for both the train and test splits.
stopwords = get_stop_words()
# Remove stop words from a segmented document.
def drop_stopwords(line, stopwords):
    """Return the words of ``line``, in order, that are not in ``stopwords``.

    Args:
        line: Iterable of words (one segmented document).
        stopwords: Container of words to discard.

    Returns:
        A new list with the remaining words.
    """
    return [token for token in line if token not in stopwords]
# Drop stop words from every segmented document.
train_st_text = [drop_stopwords(words, stopwords) for words in train_seg_text]
test_st_text = [drop_stopwords(words, stopwords) for words in test_seg_text]

# Encode the category names as integer ids; the encoder is fitted on the
# training labels only.
le = LabelEncoder().fit(train_labels)
label_train_id = le.transform(train_labels)
label_test_id = le.transform(test_labels)

# TfidfVectorizer works on strings, so join the words back with spaces.
train_c_text = list(map(' '.join, train_st_text))
test_c_text = list(map(' '.join, test_st_text))

# The custom token_pattern also accepts single-character tokens.
tfidf_model = TfidfVectorizer(
    binary=False,
    token_pattern=r"(?u)\b\w+\b",
)
# Fit the vocabulary on the training split only, then reuse it for the test split.
train_Data = tfidf_model.fit_transform(train_c_text)
test_Data = tfidf_model.transform(test_c_text)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

'''LR模型分类训练'''
# Baseline: logistic regression with default hyper-parameters, trained on the
# TF-IDF features and scored on the test split.
classifier = LogisticRegression().fit(train_Data, label_train_id)
pred = classifier.predict(test_Data)
print(classification_report(y_true=label_test_id, y_pred=pred, digits=4))
precision recall f1-score support
0 0.9970 0.9950 0.9960 1000
1 0.9850 0.9850 0.9850 1000
2 0.9651 0.8560 0.9073 1000
3 0.8963 0.9080 0.9021 1000
4 0.9680 0.9070 0.9365 1000
5 0.9676 0.9850 0.9762 1000
6 0.9251 0.9630 0.9437 1000
7 0.9682 0.9750 0.9716 1000
8 0.9438 0.9910 0.9668 1000
9 0.9457 0.9920 0.9683 1000
accuracy 0.9557 10000
macro avg 0.9562 0.9557 0.9553 10000
weighted avg 0.9562 0.9557 0.9553 10000
time: 58.6 s
使用sklearn的GridSearchCV以及Pipeline,对参数进行网格搜索优化。在本文中调参对结果的影响比较小,只有千分之几。由于参数越多,训练时间会呈指数增长,所以这里只算个示例;具体调参还是得根据经验手工来调,或者采用更先进的调参方法(贝叶斯优化、Hyperband和随机搜索等)。
# Search space, keyed as <pipeline step>__<parameter>.  Single-element tuples
# pin a value; only the classifier penalty and solver are varied here.
# Other knobs that could be searched: tfidf__stop_words, tfidf__ngram_range,
# clf__max_iter, clf__tol.
parameters = {
    'tfidf__max_df': (0.75,),
    'tfidf__norm': ('l2',),
    'tfidf__use_idf': (True,),
    'tfidf__smooth_idf': (True,),
    'tfidf__max_features': (None,),
    'clf__penalty': ('l1', 'l2'),
    'clf__solver': ('liblinear', 'saga'),
}

# Vectoriser and classifier are chained so both are refitted for every fold.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")),
    ('clf', LogisticRegression()),
])
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    n_jobs=-1,
    verbose=1,
)

print("Performing grid search...")
print("pipeline:", [step_name for step_name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)

# Time the whole search.
t0 = time()
grid_search.fit(train_c_text, label_train_id)
print("done in %0.3fs" % (time() - t0))
print()
Performing grid search...
pipeline: ['tfidf', 'clf']
parameters:
{'clf__penalty': ('l1', 'l2'),
'clf__solver': ('liblinear', 'saga'),
'tfidf__max_df': (0.75,),
'tfidf__max_features': (None,),
'tfidf__norm': ('l2',),
'tfidf__smooth_idf': (True,),
'tfidf__use_idf': (True,)}
Fitting 3 folds for each of 4 candidates, totalling 12 fits
done in 614.930s
time: 10min 14s
打印最佳模型结果及参数。
# Report the best cross-validated score and the winning value of every
# parameter that was part of the search.
best_parameters = grid_search.best_estimator_.get_params()
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
for param_name in sorted(parameters):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
Best score: 0.907
Best parameters set:
clf__penalty: 'l2'
clf__solver: 'liblinear'
tfidf__max_df: 0.75
tfidf__max_features: None
tfidf__norm: 'l2'
tfidf__smooth_idf: True
tfidf__use_idf: True
time: 1.36 ms
# Second search: the TF-IDF settings, penalty and solver stay fixed while the
# regularisation strength C and the stopping tolerance are varied.
# Other knobs that could be searched: tfidf__stop_words, tfidf__ngram_range,
# clf__max_iter.
parameters = {
    'tfidf__max_df': (0.75,),
    'tfidf__norm': ('l2',),
    'tfidf__use_idf': (True,),
    'tfidf__smooth_idf': (True,),
    'tfidf__max_features': (None,),
    'clf__penalty': ('l2',),
    'clf__C': (0.8, 0.9, 1.0, 1.1),
    'clf__tol': (1e-3, 1e-4, 1e-5, 1e-6),
    'clf__solver': ('liblinear',),
}

pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")),
    ('clf', LogisticRegression()),
])
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    n_jobs=-1,
    verbose=1,
)

print("Performing grid search...")
print("pipeline:", [step_name for step_name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)

# Time the whole search.
t0 = time()
grid_search.fit(train_c_text, label_train_id)
print("done in %0.3fs" % (time() - t0))
print()
Performing grid search...
pipeline: ['tfidf', 'clf']
parameters:
{'clf__C': (0.8, 0.9, 1.0, 1.1),
'clf__penalty': ('l2',),
'clf__solver': ('liblinear',),
'clf__tol': (0.001, 0.0001, 1e-05, 1e-06),
'tfidf__max_df': (0.75,),
'tfidf__max_features': (None,),
'tfidf__norm': ('l2',),
'tfidf__smooth_idf': (True,),
'tfidf__use_idf': (True,)}
Fitting 3 folds for each of 16 candidates, totalling 48 fits
done in 1031.269s
time: 17min 11s
# Report the best cross-validated score and the winning value of every
# parameter that was part of the search.
best_parameters = grid_search.best_estimator_.get_params()
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
for param_name in sorted(parameters):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
Best score: 0.908
Best parameters set:
clf__C: 1.1
clf__penalty: 'l2'
clf__solver: 'liblinear'
clf__tol: 0.001
tfidf__max_df: 0.75
tfidf__max_features: None
tfidf__norm: 'l2'
tfidf__smooth_idf: True
tfidf__use_idf: True
time: 1.48 ms
from sklearn.metrics import classification_report

'''LR模型分类训练'''
# Logistic regression with C=1.1, trained on train_Data and scored on the
# test features.
classifier = LogisticRegression(C=1.1).fit(train_Data, label_train_id)
pred = classifier.predict(test_Data)
print(classification_report(y_true=label_test_id, y_pred=pred, digits=4))
precision recall f1-score support
0 0.9980 0.9950 0.9965 1000
1 0.9850 0.9840 0.9845 1000
2 0.9652 0.8600 0.9096 1000
3 0.8982 0.9090 0.9036 1000
4 0.9691 0.9090 0.9381 1000
5 0.9676 0.9850 0.9762 1000
6 0.9278 0.9640 0.9456 1000
7 0.9682 0.9750 0.9716 1000
8 0.9447 0.9910 0.9673 1000
9 0.9448 0.9920 0.9678 1000
accuracy 0.9564 10000
macro avg 0.9569 0.9564 0.9561 10000
weighted avg 0.9569 0.9564 0.9561 10000
time: 58.6 s
将分类器参数C由1.0改为1.1后,准确率由0.9557提升到0.9564,提升了0.07个百分点。因为sklearn算法的默认参数已经比较好,所以分类结果本身就不错。
# Third search: the classifier settings stay fixed while the vocabulary size
# and the n-gram range of the TF-IDF step are varied.
# Other knobs that could be searched: tfidf__stop_words, clf__max_iter,
# clf__C, clf__tol.
parameters = {
    'tfidf__max_df': (0.75,),
    'tfidf__norm': ('l2',),
    'tfidf__use_idf': (True,),
    'tfidf__smooth_idf': (True,),
    'tfidf__max_features': (50000, 100000, 150000),
    # unigrams only, unigrams + bigrams, bigrams only
    'tfidf__ngram_range': ((1, 1), (1, 2), (2, 2)),
    'clf__penalty': ('l2',),
    'clf__solver': ('liblinear',),
}

pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")),
    ('clf', LogisticRegression()),
])
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    n_jobs=-1,
    verbose=1,
)

print("Performing grid search...")
print("pipeline:", [step_name for step_name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)

# Time the whole search.
t0 = time()
grid_search.fit(train_c_text, label_train_id)
print("done in %0.3fs" % (time() - t0))
print()
Performing grid search...
pipeline: ['tfidf', 'clf']
parameters:
{'clf__penalty': ('l2',),
'clf__solver': ('liblinear',),
'tfidf__max_df': (0.75,),
'tfidf__max_features': (50000, 100000, 150000),
'tfidf__ngram_range': ((1, 1), (1, 2), (2, 2)),
'tfidf__norm': ('l2',),
'tfidf__smooth_idf': (True,),
'tfidf__use_idf': (True,)}
Fitting 3 folds for each of 9 candidates, totalling 27 fits
done in 1094.832s
time: 18min 14s
# Report the best cross-validated score and the winning value of every
# parameter that was part of the search.
best_parameters = grid_search.best_estimator_.get_params()
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
for param_name in sorted(parameters):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
Best score: 0.910
Best parameters set:
clf__penalty: 'l2'
clf__solver: 'liblinear'
tfidf__max_df: 0.75
tfidf__max_features: 50000
tfidf__ngram_range: (1, 2)
tfidf__norm: 'l2'
tfidf__smooth_idf: True
tfidf__use_idf: True
time: 1.37 ms
from sklearn.metrics import classification_report

# Rebuild the TF-IDF features with unigrams + bigrams, capped at 50000 terms.
tfidf_model = TfidfVectorizer(
    binary=False,
    token_pattern=r"(?u)\b\w+\b",
    ngram_range=(1, 2),
    max_features=50000,
)
train_Data = tfidf_model.fit_transform(train_c_text)
test_Data = tfidf_model.transform(test_c_text)

'''LR模型分类训练'''
# Default logistic regression on the new features.
classifier = LogisticRegression().fit(train_Data, label_train_id)
pred = classifier.predict(test_Data)
print(classification_report(y_true=label_test_id, y_pred=pred, digits=4))
precision recall f1-score support
0 0.9970 0.9960 0.9965 1000
1 0.9870 0.9850 0.9860 1000
2 0.9650 0.8540 0.9061 1000
3 0.8926 0.9140 0.9032 1000
4 0.9701 0.9090 0.9386 1000
5 0.9619 0.9850 0.9733 1000
6 0.9322 0.9620 0.9469 1000
7 0.9654 0.9770 0.9712 1000
8 0.9538 0.9900 0.9715 1000
9 0.9439 0.9920 0.9673 1000
accuracy 0.9564 10000
macro avg 0.9569 0.9564 0.9561 10000
weighted avg 0.9569 0.9564 0.9561 10000
time: 38.9 s
tfidf算法加入2-gram特征(ngram_range=(1,2),max_features=50000)后,准确率为0.9564,相比基线(0.9557)提升了0.07个百分点。
from sklearn.metrics import classification_report

'''LR模型分类训练'''
# Logistic regression with C=1.1, trained on train_Data and scored on the
# test features.
classifier = LogisticRegression(C=1.1).fit(train_Data, label_train_id)
pred = classifier.predict(test_Data)
print(classification_report(y_true=label_test_id, y_pred=pred, digits=4))
precision recall f1-score support
0 0.9970 0.9960 0.9965 1000
1 0.9870 0.9850 0.9860 1000
2 0.9661 0.8560 0.9077 1000
3 0.8927 0.9150 0.9037 1000
4 0.9701 0.9090 0.9386 1000
5 0.9629 0.9850 0.9738 1000
6 0.9340 0.9630 0.9483 1000
7 0.9654 0.9770 0.9712 1000
8 0.9538 0.9900 0.9715 1000
9 0.9439 0.9920 0.9673 1000
accuracy 0.9568 10000
macro avg 0.9573 0.9568 0.9565 10000
weighted avg 0.9573 0.9568 0.9565 10000
将分类器参数C调整为1.1后,效果在2-gram的基础上又提升了0.04个百分点。由于max_features=50000是网格搜索所设候选值里最小的,再试试max_features=30000的效果。
from sklearn.metrics import classification_report

# Rebuild the TF-IDF features with unigrams + bigrams, capped at 30000 terms.
tfidf_model = TfidfVectorizer(
    binary=False,
    token_pattern=r"(?u)\b\w+\b",
    ngram_range=(1, 2),
    max_features=30000,
)
train_Data = tfidf_model.fit_transform(train_c_text)
test_Data = tfidf_model.transform(test_c_text)

'''LR模型分类训练'''
# Default logistic regression on the new features.
classifier = LogisticRegression().fit(train_Data, label_train_id)
pred = classifier.predict(test_Data)
print(classification_report(y_true=label_test_id, y_pred=pred, digits=4))
precision recall f1-score support
0 0.9970 0.9960 0.9965 1000
1 0.9870 0.9850 0.9860 1000
2 0.9673 0.8580 0.9094 1000
3 0.8937 0.9160 0.9047 1000
4 0.9681 0.9110 0.9387 1000
5 0.9629 0.9870 0.9748 1000
6 0.9357 0.9600 0.9477 1000
7 0.9664 0.9780 0.9722 1000
8 0.9565 0.9900 0.9730 1000
9 0.9430 0.9920 0.9669 1000
accuracy 0.9573 10000
macro avg 0.9578 0.9573 0.9570 10000
weighted avg 0.9578 0.9573 0.9570 10000
time: 1min 53s
将max_features设置为30000后,比50000提高了0.09个百分点。将C设置为1.1:
from sklearn.metrics import classification_report

'''LR模型分类训练'''
# Logistic regression with C=1.1, trained on train_Data and scored on the
# test features.
classifier = LogisticRegression(C=1.1).fit(train_Data, label_train_id)
pred = classifier.predict(test_Data)
print(classification_report(y_true=label_test_id, y_pred=pred, digits=4))
precision recall f1-score support
0 0.9970 0.9960 0.9965 1000
1 0.9870 0.9850 0.9860 1000
2 0.9684 0.8590 0.9104 1000
3 0.8954 0.9160 0.9056 1000
4 0.9661 0.9110 0.9377 1000
5 0.9620 0.9880 0.9748 1000
6 0.9357 0.9600 0.9477 1000
7 0.9674 0.9780 0.9727 1000
8 0.9574 0.9900 0.9735 1000
9 0.9430 0.9920 0.9669 1000
accuracy 0.9575 10000
macro avg 0.9579 0.9575 0.9572 10000
weighted avg 0.9579 0.9575 0.9572 10000
time: 44.3 s
结果提升了0.02个百分点,设置为1.2:
from sklearn.metrics import classification_report

'''LR模型分类训练'''
# Logistic regression with C=1.2, trained on train_Data and scored on the
# test features.
classifier = LogisticRegression(C=1.2).fit(train_Data, label_train_id)
pred = classifier.predict(test_Data)
print(classification_report(y_true=label_test_id, y_pred=pred, digits=4))
precision recall f1-score support
0 0.9970 0.9960 0.9965 1000
1 0.9870 0.9850 0.9860 1000
2 0.9695 0.8590 0.9109 1000
3 0.8953 0.9150 0.9050 1000
4 0.9661 0.9110 0.9377 1000
5 0.9630 0.9880 0.9753 1000
6 0.9357 0.9600 0.9477 1000
7 0.9674 0.9800 0.9737 1000
8 0.9574 0.9900 0.9735 1000
9 0.9421 0.9920 0.9664 1000
accuracy 0.9576 10000
macro avg 0.9580 0.9576 0.9573 10000
weighted avg 0.9580 0.9576 0.9573 10000
time: 45.7 s
提升了0.01个百分点,进一步设置为1.3。
from sklearn.metrics import classification_report

'''LR模型分类训练'''
# Logistic regression with C=1.3, trained on train_Data and scored on the
# test features.
classifier = LogisticRegression(C=1.3).fit(train_Data, label_train_id)
pred = classifier.predict(test_Data)
print(classification_report(y_true=label_test_id, y_pred=pred, digits=4))
precision recall f1-score support
0 0.9970 0.9970 0.9970 1000
1 0.9880 0.9850 0.9865 1000
2 0.9696 0.8600 0.9115 1000
3 0.8952 0.9140 0.9045 1000
4 0.9661 0.9110 0.9377 1000
5 0.9648 0.9880 0.9763 1000
6 0.9357 0.9600 0.9477 1000
7 0.9674 0.9800 0.9737 1000
8 0.9566 0.9910 0.9735 1000
9 0.9421 0.9920 0.9664 1000
accuracy 0.9578 10000
macro avg 0.9582 0.9578 0.9575 10000
weighted avg 0.9582 0.9578 0.9575 10000
time: 46.6 s
又提升了0.02个百分点。
# Compare the true test labels with the class names of the latest predictions.
y_val = test_labels
y_pre = le.inverse_transform(pred)
categories = le.classes_

# Evaluate the predictions with a confusion matrix.
# confusion_matrix expects (y_true, y_pred): rows are true classes, columns are
# predicted classes.  The arguments used to be passed the other way round,
# which, combined with the transpose below, made the plot contradict its own
# axis labels.  Passing `labels` pins the row/column order to the tick labels.
confm = metrics.confusion_matrix(y_val, y_pre, labels=categories)

# Visualise the confusion matrix.
plt.figure(figsize=(8, 8))
# Transposed so that the x axis carries the true label and the y axis the
# predicted label, matching xlabel/ylabel below.
sns.heatmap(confm.T, square=True, annot=True,
            fmt='d', cbar=False, linewidths=.8,
            cmap="YlGnBu")
plt.xlabel('True label', size=14)
plt.ylabel('Predicted label', size=14)
# One tick per category instead of a hard-coded 10.
tick_positions = np.arange(len(categories))
plt.xticks(tick_positions + 0.5, categories, size=12)
# NOTE(review): the y ticks use a 0.3 offset rather than the cell centre 0.5 —
# presumably a manual alignment tweak; kept as-is.
plt.yticks(tick_positions + 0.3, categories, size=12)
plt.show()