THUCNews News Text Classification: TF-IDF + sklearn

The dataset used here is hosted on the Heywhale (和鲸) community, where this notebook is also public; after forking it can be run directly.
The data is a subset of the THUCNews news text classification dataset released by the Tsinghua NLP group (the full dataset contains roughly 740,000 documents and takes much longer to train on). The subset covers 10 categories — 体育 (sports), 财经 (finance), 房产 (real estate), 家居 (home), 教育 (education), 科技 (technology), 时尚 (fashion), 时政 (politics), 游戏 (games), and 娱乐 (entertainment) — with 6,500 articles per category, 65,000 in total. The project runs on the Heywhale platform and references the dataset directly from there.

The dataset is split as follows:
cnews.train.txt: training set (50,000 articles)
cnews.val.txt: validation set (5,000 articles)
cnews.test.txt: test set (10,000 articles)

This article vectorizes the text with the classical TF-IDF algorithm and classifies it with standard sklearn classifiers.
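As a quick illustration of what TF-IDF produces, here is a toy example (the three pre-segmented documents are made up for demonstration):

from sklearn.feature_extraction.text import TfidfVectorizer

toy_docs = ['北京 比赛 足球', '北京 市场 下跌', '足球 比赛 门票']  # pre-segmented toy corpus
vec = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
X = vec.fit_transform(toy_docs)        # sparse (3 docs, 6 terms) matrix
print(vec.get_feature_names_out())     # use get_feature_names() on older sklearn
print(X.toarray().round(2))            # terms shared across documents get lower idf weights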

import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from pprint import pprint
from time import time
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from data_loader.cnews_loader import *  # provides read_file
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

Preprocessing

# Paths for data, model, and results; the checkpoint paths are
# leftovers from a TextCNN experiment and are not used below
base_dir = '/home/kesci/input/new3021'
train_dir = os.path.join(base_dir, 'cnews.train.txt')
test_dir = os.path.join(base_dir, 'cnews.test.txt')
val_dir = os.path.join(base_dir, 'cnews.val.txt')
vocab_dir = os.path.join(base_dir, 'cnews.vocab.txt')
save_dir = 'checkpoints/textcnn'
save_path = os.path.join(save_dir, 'best_validation')

Data is loaded with read_file from data_loader.cnews_loader.
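Each line of the cnews files holds a category label and the article text separated by a tab, so a minimal reimplementation would look roughly like the sketch below (for orientation only; the actual implementation lives in data_loader/cnews_loader.py):

def read_file_sketch(filename):
    # Each line: '<label>\t<content>'; malformed lines are skipped
    contents, labels = [], []
    with open(filename, encoding='utf-8') as f:
        for line in f:
            try:
                label, content = line.strip().split('\t', 1)
            except ValueError:
                continue
            if content:
                contents.append(content)
                labels.append(label)
    return contents, labels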

train_contents, train_labels = read_file(train_dir)
test_contents, test_labels = read_file(test_dir)
train_label_counts = Counter(train_labels)
train_label_counts
    Counter({'体育': 5000,
             '娱乐': 5000,
             '家居': 5000,
             '房产': 5000,
             '教育': 5000,
             '时尚': 5000,
             '时政': 5000,
             '游戏': 5000,
             '科技': 5000,
             '财经': 5000})

Removing special characters

import re
# Strip bracketed emoticons and keep only Chinese characters, letters, and digits
def clear_character(sentence):
    pattern1 = r'\[.*?\]'                                 # emoticon markup such as [微笑]
    pattern2 = re.compile(r'[^\u4e00-\u9fa5a-zA-Z0-9]')   # everything else is dropped
    line1 = re.sub(pattern1, '', sentence)
    line2 = re.sub(pattern2, '', line1)
    new_sentence = ''.join(line2.split())  # remove any remaining whitespace
    return new_sentence

train_text=list(map(lambda s: clear_character(s), train_contents))
test_text=list(map(lambda s: clear_character(s), test_contents))
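For example (a made-up input):

print(clear_character('今天天气[开心]真好!Hello, 123.'))  # -> 今天天气真好Hello123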

Word segmentation

Segmentation uses jieba.

import jieba
train_seg_text=list(map(lambda s: jieba.lcut(s), train_text))
test_seg_text=list(map(lambda s: jieba.lcut(s), test_text))
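jieba.lcut returns the segmentation as a plain list of tokens, e.g. the canonical example from the jieba documentation:

print(jieba.lcut('我来到北京清华大学'))  # -> ['我', '来到', '北京', '清华大学']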

Removing stopwords

# Load the Baidu stopword list
stop_words_path = "/home/kesci/work/data_loader/百度停用词列表.txt"
def get_stop_words():
    with open(stop_words_path, 'rb') as f:
        words = f.read().decode('gbk').split('\r\n')
    return set(words)
stopwords = get_stop_words()

# Drop stopwords from a token list
def drop_stopwords(line, stopwords):
    return [word for word in line if word not in stopwords]

train_st_text=list(map(lambda s: drop_stopwords(s,stopwords), train_seg_text))
test_st_text=list(map(lambda s: drop_stopwords(s,stopwords), test_seg_text))

Label encoding

le = LabelEncoder()
le.fit(train_labels)
label_train_id=le.transform(train_labels)
label_test_id=le.transform(test_labels)
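LabelEncoder sorts the classes (here by Unicode code point), so the integer id assigned to each category can be inspected directly:

# Map each category name to its integer id
dict(zip(le.classes_, range(len(le.classes_))))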

TF-IDF

train_c_text=list(map(lambda s: ' '.join(s), train_st_text))
test_c_text=list(map(lambda s: ' '.join(s), test_st_text))
tfidf_model = TfidfVectorizer(binary=False,token_pattern=r"(?u)\b\w+\b")
train_Data = tfidf_model.fit_transform(train_c_text)
test_Data = tfidf_model.transform(test_c_text)
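A quick shape check: 50,000 training rows and 10,000 test rows over a shared vocabulary (the column count depends on the corpus):

# Rows are documents, columns are vocabulary terms
print(train_Data.shape, test_Data.shape)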

LR model

from sklearn.linear_model import LogisticRegression
# Train and evaluate an LR classifier with default parameters
classifier = LogisticRegression()
classifier.fit(train_Data, label_train_id)
pred = classifier.predict(test_Data)
from sklearn.metrics import classification_report
print(classification_report(label_test_id, pred, digits=4))
                  precision    recall  f1-score   support
    
               0     0.9970    0.9950    0.9960      1000
               1     0.9850    0.9850    0.9850      1000
               2     0.9651    0.8560    0.9073      1000
               3     0.8963    0.9080    0.9021      1000
               4     0.9680    0.9070    0.9365      1000
               5     0.9676    0.9850    0.9762      1000
               6     0.9251    0.9630    0.9437      1000
               7     0.9682    0.9750    0.9716      1000
               8     0.9438    0.9910    0.9668      1000
               9     0.9457    0.9920    0.9683      1000
    
        accuracy                         0.9557     10000
       macro avg     0.9562    0.9557    0.9553     10000
    weighted avg     0.9562    0.9557    0.9553     10000
    
    time: 58.6 s

Hyperparameter tuning

We tune hyperparameters with sklearn's GridSearchCV combined with Pipeline. In this task the gains from tuning are small, only a few tenths of a percentage point. The number of grid candidates multiplies with every added parameter value, so the grids below are kept small and serve only as examples; in practice tuning is still largely guided by experience, or by more sophisticated methods such as Bayesian optimization, Hyperband, or randomized search.
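Of those, randomized search is available directly in sklearn as RandomizedSearchCV: it samples a fixed number of candidates from parameter distributions instead of enumerating a full grid. A minimal self-contained sketch (the C range is illustrative, not tuned):

from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from scipy.stats import uniform

rand_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")),
    ('clf', LogisticRegression()),
])
# Try 10 random candidates, sampling C uniformly from [0.5, 2.0)
rand_search = RandomizedSearchCV(
    rand_pipeline, {'clf__C': uniform(0.5, 1.5)},
    n_iter=10, n_jobs=-1, verbose=1)
# rand_search.fit(train_c_text, label_train_id)

The grid search itself is set up as follows.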

parameters = {
    'tfidf__max_df': (0.75,),
#     'tfidf__stop_words':('english',stopwords),
    'tfidf__norm':('l2',),
    'tfidf__use_idf':(True,),
    'tfidf__smooth_idf':(True,),
    'tfidf__max_features':(None,),
#     'tfidf__ngram_range':((1, 1), (1, 2),(2, 2)),  # unigrams or bigrams

#     'clf__max_iter': (20,),
    'clf__penalty': ('l1','l2'),
    # 'clf__tol': (0.0001,0.00001,0.000001),
    'clf__solver': ( 'liblinear','saga',),
}
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")),
    ('clf', LogisticRegression()),
])

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(train_c_text, label_train_id)
print("done in %0.3fs" % (time() - t0))
print()
    Performing grid search...
    pipeline: ['tfidf', 'clf']
    parameters:
    {'clf__penalty': ('l1', 'l2'),
     'clf__solver': ('liblinear', 'saga'),
     'tfidf__max_df': (0.75,),
     'tfidf__max_features': (None,),
     'tfidf__norm': ('l2',),
     'tfidf__smooth_idf': (True,),
     'tfidf__use_idf': (True,)}
    Fitting 3 folds for each of 4 candidates, totalling 12 fits


    done in 614.930s
    time: 10min 14s

Print the best score and parameters. Note that best_score_ is the mean cross-validation accuracy on the training set, so it is not directly comparable to the test-set accuracy reported above.

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
    Best score: 0.907
    Best parameters set:
    	clf__penalty: 'l2'
    	clf__solver: 'liblinear'
    	tfidf__max_df: 0.75
    	tfidf__max_features: None
    	tfidf__norm: 'l2'
    	tfidf__smooth_idf: True
    	tfidf__use_idf: True
    time: 1.36 ms
parameters = {
    'tfidf__max_df': (0.75,),
#     'tfidf__stop_words':('english',stopwords),
    'tfidf__norm':('l2',),
    'tfidf__use_idf':(True,),
    'tfidf__smooth_idf':(True,),
    'tfidf__max_features':(None,),
    # 'tfidf__ngram_range':((1, 1), (1, 2),(2, 2)),  # unigrams or bigrams

#     'clf__max_iter': (20,),
    'clf__penalty': ('l2',),
    'clf__C':(0.8,0.9,1.0,1.1,),
    'clf__tol': (0.001,0.0001,0.00001,0.000001,),
    'clf__solver': ( 'liblinear',),
}
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")),
    ('clf', LogisticRegression()),
])

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(train_c_text, label_train_id)
print("done in %0.3fs" % (time() - t0))
print()
    Performing grid search...
    pipeline: ['tfidf', 'clf']
    parameters:
    {'clf__C': (0.8, 0.9, 1.0, 1.1),
     'clf__penalty': ('l2',),
     'clf__solver': ('liblinear',),
     'clf__tol': (0.001, 0.0001, 1e-05, 1e-06),
     'tfidf__max_df': (0.75,),
     'tfidf__max_features': (None,),
     'tfidf__norm': ('l2',),
     'tfidf__smooth_idf': (True,),
     'tfidf__use_idf': (True,)}
    Fitting 3 folds for each of 16 candidates, totalling 48 fits


    done in 1031.269s
    time: 17min 11s
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
    Best score: 0.908
    Best parameters set:
    	clf__C: 1.1
    	clf__penalty: 'l2'
    	clf__solver: 'liblinear'
    	clf__tol: 0.001
    	tfidf__max_df: 0.75
    	tfidf__max_features: None
    	tfidf__norm: 'l2'
    	tfidf__smooth_idf: True
    	tfidf__use_idf: True
    time: 1.48 ms
# Retrain LR with the tuned C=1.1
classifier=LogisticRegression(C=1.1)
classifier.fit(train_Data, label_train_id)
pred = classifier.predict(test_Data)
from sklearn.metrics import classification_report
print(classification_report(label_test_id, pred,digits=4))
                  precision    recall  f1-score   support
    
               0     0.9980    0.9950    0.9965      1000
               1     0.9850    0.9840    0.9845      1000
               2     0.9652    0.8600    0.9096      1000
               3     0.8982    0.9090    0.9036      1000
               4     0.9691    0.9090    0.9381      1000
               5     0.9676    0.9850    0.9762      1000
               6     0.9278    0.9640    0.9456      1000
               7     0.9682    0.9750    0.9716      1000
               8     0.9447    0.9910    0.9673      1000
               9     0.9448    0.9920    0.9678      1000
    
        accuracy                         0.9564     10000
       macro avg     0.9569    0.9564    0.9561     10000
    weighted avg     0.9569    0.9564    0.9561     10000
    
    time: 58.6 s

Raising the classifier's C from the default 1.0 to 1.1 lifted test accuracy from 0.9557 to 0.9564, about 0.07 percentage points. sklearn's defaults are already well chosen, which is why the baseline result is strong to begin with.

parameters = {
    'tfidf__max_df': (0.75,),
#     'tfidf__stop_words':('english',stopwords),
    'tfidf__norm':('l2',),
    'tfidf__use_idf':(True,),
    'tfidf__smooth_idf':(True,),
    'tfidf__max_features':(50000,100000,150000,),
    'tfidf__ngram_range':((1, 1), (1, 2),(2, 2)),  # unigrams or bigrams
#     'clf__max_iter': (20,),
    'clf__penalty': ('l2',),
    # 'clf__C':(0.8,0.9,1.0,1.1,),
    # 'clf__tol': (0.001,0.0001,0.00001,0.000001,),
    'clf__solver': ( 'liblinear',),
}
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")),
    ('clf', LogisticRegression()),
])

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(train_c_text, label_train_id)
print("done in %0.3fs" % (time() - t0))
print()
    Performing grid search...
    pipeline: ['tfidf', 'clf']
    parameters:
    {'clf__penalty': ('l2',),
     'clf__solver': ('liblinear',),
     'tfidf__max_df': (0.75,),
     'tfidf__max_features': (50000, 100000, 150000),
     'tfidf__ngram_range': ((1, 1), (1, 2), (2, 2)),
     'tfidf__norm': ('l2',),
     'tfidf__smooth_idf': (True,),
     'tfidf__use_idf': (True,)}
    Fitting 3 folds for each of 9 candidates, totalling 27 fits


    done in 1094.832s
    time: 18min 14s
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
    Best score: 0.910
    Best parameters set:
    	clf__penalty: 'l2'
    	clf__solver: 'liblinear'
    	tfidf__max_df: 0.75
    	tfidf__max_features: 50000
    	tfidf__ngram_range: (1, 2)
    	tfidf__norm: 'l2'
    	tfidf__smooth_idf: True
    	tfidf__use_idf: True
    time: 1.37 ms
tfidf_model = TfidfVectorizer(binary=False,token_pattern=r"(?u)\b\w+\b",ngram_range=(1,2),max_features=50000)
train_Data = tfidf_model.fit_transform(train_c_text)
test_Data = tfidf_model.transform(test_c_text)
# Retrain LR on the bigram features
classifier=LogisticRegression()
classifier.fit(train_Data, label_train_id)
pred = classifier.predict(test_Data)
from sklearn.metrics import classification_report
print(classification_report(label_test_id, pred,digits=4))
                  precision    recall  f1-score   support
    
               0     0.9970    0.9960    0.9965      1000
               1     0.9870    0.9850    0.9860      1000
               2     0.9650    0.8540    0.9061      1000
               3     0.8926    0.9140    0.9032      1000
               4     0.9701    0.9090    0.9386      1000
               5     0.9619    0.9850    0.9733      1000
               6     0.9322    0.9620    0.9469      1000
               7     0.9654    0.9770    0.9712      1000
               8     0.9538    0.9900    0.9715      1000
               9     0.9439    0.9920    0.9673      1000
    
        accuracy                         0.9564     10000
       macro avg     0.9569    0.9564    0.9561     10000
    weighted avg     0.9569    0.9564    0.9561     10000
    
    time: 38.9 s

With bigram features (ngram_range=(1,2), max_features=50000), TF-IDF again lifts accuracy from the 0.9557 baseline to 0.9564, about 0.07 percentage points.

# LR with C=1.1 on the bigram features
classifier=LogisticRegression(C=1.1)
classifier.fit(train_Data, label_train_id)
pred = classifier.predict(test_Data)
from sklearn.metrics import classification_report
print(classification_report(label_test_id, pred,digits=4))
                  precision    recall  f1-score   support
    
               0     0.9970    0.9960    0.9965      1000
               1     0.9870    0.9850    0.9860      1000
               2     0.9661    0.8560    0.9077      1000
               3     0.8927    0.9150    0.9037      1000
               4     0.9701    0.9090    0.9386      1000
               5     0.9629    0.9850    0.9738      1000
               6     0.9340    0.9630    0.9483      1000
               7     0.9654    0.9770    0.9712      1000
               8     0.9538    0.9900    0.9715      1000
               9     0.9439    0.9920    0.9673      1000
    
        accuracy                         0.9568     10000
       macro avg     0.9573    0.9568    0.9565     10000
    weighted avg     0.9573    0.9568    0.9565     10000

Setting C=1.1 on top of the bigram features gains another 0.04 percentage points (0.9564 → 0.9568). Since max_features=50000 was the smallest value in the grid, it is worth trying max_features=30000:

tfidf_model = TfidfVectorizer(binary=False,token_pattern=r"(?u)\b\w+\b",ngram_range=(1,2),max_features=30000)
train_Data = tfidf_model.fit_transform(train_c_text)
test_Data = tfidf_model.transform(test_c_text)

# LR with default parameters on the 30000-feature matrix
classifier=LogisticRegression()
classifier.fit(train_Data, label_train_id)
pred = classifier.predict(test_Data)
from sklearn.metrics import classification_report
print(classification_report(label_test_id, pred,digits=4))
                  precision    recall  f1-score   support
    
               0     0.9970    0.9960    0.9965      1000
               1     0.9870    0.9850    0.9860      1000
               2     0.9673    0.8580    0.9094      1000
               3     0.8937    0.9160    0.9047      1000
               4     0.9681    0.9110    0.9387      1000
               5     0.9629    0.9870    0.9748      1000
               6     0.9357    0.9600    0.9477      1000
               7     0.9664    0.9780    0.9722      1000
               8     0.9565    0.9900    0.9730      1000
               9     0.9430    0.9920    0.9669      1000
    
        accuracy                         0.9573     10000
       macro avg     0.9578    0.9573    0.9570     10000
    weighted avg     0.9578    0.9573    0.9570     10000
    
    time: 1min 53s

With max_features=30000, accuracy reaches 0.9573, 0.09 percentage points above the 50000-feature run. Setting C to 1.1:

# LR with C=1.1
classifier=LogisticRegression(C=1.1)
classifier.fit(train_Data, label_train_id)
pred = classifier.predict(test_Data)
from sklearn.metrics import classification_report
print(classification_report(label_test_id, pred,digits=4))
                  precision    recall  f1-score   support
    
               0     0.9970    0.9960    0.9965      1000
               1     0.9870    0.9850    0.9860      1000
               2     0.9684    0.8590    0.9104      1000
               3     0.8954    0.9160    0.9056      1000
               4     0.9661    0.9110    0.9377      1000
               5     0.9620    0.9880    0.9748      1000
               6     0.9357    0.9600    0.9477      1000
               7     0.9674    0.9780    0.9727      1000
               8     0.9574    0.9900    0.9735      1000
               9     0.9430    0.9920    0.9669      1000
    
        accuracy                         0.9575     10000
       macro avg     0.9579    0.9575    0.9572     10000
    weighted avg     0.9579    0.9575    0.9572     10000
    
    time: 44.3 s

That gains another 0.02 percentage points (0.9573 → 0.9575). Trying C=1.2:

# LR with C=1.2
classifier=LogisticRegression(C=1.2)
classifier.fit(train_Data, label_train_id)
pred = classifier.predict(test_Data)
from sklearn.metrics import classification_report
print(classification_report(label_test_id, pred,digits=4))
                  precision    recall  f1-score   support
    
               0     0.9970    0.9960    0.9965      1000
               1     0.9870    0.9850    0.9860      1000
               2     0.9695    0.8590    0.9109      1000
               3     0.8953    0.9150    0.9050      1000
               4     0.9661    0.9110    0.9377      1000
               5     0.9630    0.9880    0.9753      1000
               6     0.9357    0.9600    0.9477      1000
               7     0.9674    0.9800    0.9737      1000
               8     0.9574    0.9900    0.9735      1000
               9     0.9421    0.9920    0.9664      1000
    
        accuracy                         0.9576     10000
       macro avg     0.9580    0.9576    0.9573     10000
    weighted avg     0.9580    0.9576    0.9573     10000
    
    time: 45.7 s

Another 0.01 percentage points (0.9575 → 0.9576). Pushing on to C=1.3:

# LR with C=1.3
classifier=LogisticRegression(C=1.3)
classifier.fit(train_Data, label_train_id)
pred = classifier.predict(test_Data)
from sklearn.metrics import classification_report
print(classification_report(label_test_id, pred,digits=4))
                  precision    recall  f1-score   support
    
               0     0.9970    0.9970    0.9970      1000
               1     0.9880    0.9850    0.9865      1000
               2     0.9696    0.8600    0.9115      1000
               3     0.8952    0.9140    0.9045      1000
               4     0.9661    0.9110    0.9377      1000
               5     0.9648    0.9880    0.9763      1000
               6     0.9357    0.9600    0.9477      1000
               7     0.9674    0.9800    0.9737      1000
               8     0.9566    0.9910    0.9735      1000
               9     0.9421    0.9920    0.9664      1000
    
        accuracy                         0.9578     10000
       macro avg     0.9582    0.9578    0.9575     10000
    weighted avg     0.9582    0.9578    0.9575     10000
    
    time: 46.6 s

That adds a further 0.02 percentage points, reaching 0.9578.
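The repeated cells above amount to a manual sweep over C; the same experiment can be written as a loop over the matrices already built. Strictly speaking, C should be selected on the validation set (cnews.val.txt) rather than the test set, to avoid tuning against the test data:

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Sweep the C values tried above and report test accuracy
for C in (1.0, 1.1, 1.2, 1.3):
    clf = LogisticRegression(C=C)
    clf.fit(train_Data, label_train_id)
    acc = accuracy_score(label_test_id, clf.predict(test_Data))
    print('C=%.1f  accuracy=%.4f' % (C, acc))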

Plotting the confusion matrix

y_val = test_labels
y_pre = le.inverse_transform(pred)
## Compute the confusion matrix (rows: true labels, columns: predicted labels)
confm = metrics.confusion_matrix(y_val, y_pre, labels=le.classes_)
categories = le.classes_
## Visualize the confusion matrix
plt.figure(figsize=(8,8))
sns.heatmap(confm, square=True, annot=True,
            fmt='d', cbar=False, linewidths=.8,
            cmap="YlGnBu")
plt.xlabel('Predicted label', size=14)
plt.ylabel('True label', size=14)
plt.xticks(np.arange(10)+0.5, categories, size=12)
plt.yticks(np.arange(10)+0.5, categories, size=12)
plt.show()
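Reading the heatmap programmatically, the largest off-diagonal entries give the most-confused (true → predicted) pairs:

# Top 5 off-diagonal confusions (true -> predicted)
off_diag = confm - np.diag(np.diag(confm))
for idx in np.argsort(off_diag, axis=None)[::-1][:5]:
    i, j = np.unravel_index(idx, off_diag.shape)
    print('%s -> %s: %d' % (categories[i], categories[j], off_diag[i, j]))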
[Figure: 10×10 confusion-matrix heatmap for the ten categories on the test set]
