Natural Language Processing (NLP): 08-05 A TextCNN Short-Text Classification Case Study

This post shares a hands-on TextCNN short-text classification case and more industry-style material: data analysis and visualization, the TextCNN paper and its principles, word-vector techniques, result visualization with tensorboardx, and a wrap-up of the case study, so that you can fully master TextCNN for text classification. If you have any questions, leave a comment or join the NLP discussion group and let's study NLP together.

Natural Language Processing (NLP): 08 Introduction to TextCNN for Short-Text Classification
Natural Language Processing (NLP): 08-01 Data Analysis and Text Classification Comparison
Natural Language Processing (NLP): 08-02 The CNN Text Classification Paper and TextCNN Principles
Natural Language Processing (NLP): 08-03 Word Vectors with word2vec
Natural Language Processing (NLP): 08-04 Visualization with tensorboardx
Natural Language Processing (NLP): 08-05 A TextCNN Short-Text Classification Case Study

Contents

  • Data preprocessing
  • Building the vocabulary from all data
  • Text-to-index conversion
  • Splitting the dataset
  • Custom Dataset
  • Batch loading with DataLoader
  • Building word vectors
    • Word vector visualization
      • Loading the word vector model with gensim
      • Word vector applications
      • Converting word vectors for visualization
    • Building domain word vectors
    • Verifying the word vectors
  • Model training
    • Model structure
    • Defining the training procedure

import warnings
warnings.filterwarnings('ignore')
import jieba
from data_processing import load_data
from collections import Counter
import numpy as np 

from dataset import MLDataset
from torch.utils.data import DataLoader
#from tqdm import tqdm

Data preprocessing

data_path = 'data/news/data.txt'
tokenizer = lambda x: jieba.lcut(x)
data, labels, max_seq_len = load_data(data_path, tokenizer)
0it [00:00, ?it/s]Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/y4/m077y_dj1pd7blyvjrx9y30w0000gn/T/jieba.cache
Loading model cost 1.056 seconds.
Prefix dict has been built succesfully.
200000it [00:30, 6658.56it/s]

total_examples =  200000
label_count =  10
sentences example =  [['男子', '因', '家庭', '积怨', '杀死', '3', '名', '亲人', '重伤', '1', '人'], ['郭鹏', ':', '三亚', '房价', '不会', '下跌', '眼下', '正是', '买房', '最佳时机']]
labels example =  [5, 1]
max_seq_len =  21
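
The load_data function above comes from the local data_processing module, which is not shown in this post. As a rough, hypothetical sketch of what it does, assuming each line of data.txt is a tab-separated "title<TAB>label id" pair (the file format and the exact signature are assumptions), it could look like this:

# Hypothetical sketch of data_processing.load_data; the real module may differ
from tqdm import tqdm

def load_data(data_path, tokenizer):
    data, labels = [], []
    max_seq_len = 0
    with open(data_path, 'r', encoding='utf-8') as f:
        for line in tqdm(f):
            line = line.strip()
            if not line:
                continue
            text, label = line.split('\t')
            words = tokenizer(text)                    # e.g. jieba.lcut
            max_seq_len = max(max_seq_len, len(words))
            data.append(words)
            labels.append(int(label))
    print('total_examples = ', len(data))
    print('label_count = ', len(set(labels)))
    return data, labels, max_seq_len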

Building the vocabulary from all data

def build_vocab(data, max_size, min_freq=3):
    """
    Build the vocabulary from all of the data.
    :param data:
            sentences in the form [[word1, word2, word3, ...], [word11, word12, word13, ...]]
    :param max_size: maximum vocabulary size
    :param min_freq: minimum frequency for a word to be kept
    :return:
           word_freqs: word frequencies, e.g. [('<UNK>', -1), ('<PAD>', -1), ('创业', 4), ('风景', 3), ...]
           dict_word2index: index of each word, e.g. {'<UNK>': 0, '<PAD>': 1, '创业': 2, ...}
           dict_index2word: word at each index, e.g. {0: '<UNK>', 1: '<PAD>', 2: '创业', ...}
    """

    # two special tokens: <UNK> for out-of-vocabulary words, <PAD> for padding
    word_freqs = [('<UNK>', -1), ('<PAD>', -1)]
    words = []
    for line in data:
        words.extend(line)  # extend vs. append: extend adds every word of the sentence individually

    counter = Counter(words)
    # sort by frequency (descending) and keep at most max_size words
    counter_list = counter.most_common()[:max_size]
    for word, freq in counter_list:
        if freq >= min_freq:
            word_freqs.append((word, freq))

    # build word2index and index2word
    dict_word2index = dict()
    for word, freq in word_freqs:
        dict_word2index[word] = len(dict_word2index)
    dict_index2word = dict(zip(dict_word2index.values(), dict_word2index.keys()))

    vocab_size = len(dict_index2word)
    print('vocab_size = ', vocab_size)
    return vocab_size, word_freqs, dict_word2index, dict_index2word

max_size = 100000  # maximum vocabulary size
min_freq = 1
vocab_size,word_freqs,dict_word2index,dict_index2word = build_vocab(data,max_size,min_freq)
vocab_size =  100002

Text-to-index conversion

def build_dataset(data, labels, dict_word2index, max_seq_len):
    """
    Convert the text into index sequences using the vocabulary.
    :param data:
            sentences in the form [[word1, word2, word3, ...], [word11, word12, word13, ...]]
    :param labels: labels in the form [5, 3]
    :param dict_word2index: word-to-index mapping
    :param max_seq_len: maximum text length in data (shorter sequences are padded,
                        longer ones truncated, so every sample has the same length)
    :return:
            the discretized result
           datasets: [[3899, 2, 62, ...], [3, 4, 1, ...]]
           labels: [5, 3]
    """
    dataset = []
    indices = np.arange(len(labels))
    for i in indices:
        # i-th sample
        new_line = []
        for word in data[i]:  # data[i] is the i-th sentence (a list of words)
            if word in dict_word2index:
                index = dict_word2index.get(word)
            else:
                index = dict_word2index.get('<UNK>')
            new_line.append(index)

        # short sentences: pad up to the maximum length
        pad_num = max_seq_len - len(new_line)
        while pad_num > 0:
            new_line.append(dict_word2index.get('<PAD>'))
            pad_num -= 1

        # long sentences: truncate
        dataset.append(new_line[:max_seq_len])

    # return the final result
    datasets, labels = np.array(dataset, dtype=np.int64), np.array(labels, dtype=np.int64)
    return datasets, labels


datasets,labels = build_dataset(data,labels,dict_word2index,max_seq_len)
datasets[:2]
array([[   37,    79,  1139, 28947,   917,    18,    60,  7961,  1469,
           26,    23,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1],
       [63425,     3,  6084,   354,   362,   324, 63426, 16772,  1750,
        15095,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1]])

Take one sample and map it back to words to see what the discretized data looks like:

data0,label0 = datasets[0],labels[0]

data0_0 = [ dict_index2word.get(idx) for idx in data0 ]
print(data0_0)
['男子', '因', '家庭', '积怨', '杀死', '3', '名', '亲人', '重伤', '1', '人', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
data[:2]
[['男子', '因', '家庭', '积怨', '杀死', '3', '名', '亲人', '重伤', '1', '人'],
 ['郭鹏', ':', '三亚', '房价', '不会', '下跌', '眼下', '正是', '买房', '最佳时机']]
dict_word2index.get('积怨')
28947
max_seq_len
21

Splitting the dataset

  • Define how the data is split
  • Split it into a training set and a validation set
datasets
array([[   37,    79,  1139, ...,     1,     1,     1],
       [63425,     3,  6084, ...,     1,     1,     1],
       [  366,  2482,   814, ...,     1,     1,     1],
       ...,
       [25006,   904,  6164, ...,     1,     1,     1],
       [ 1265,   594,  7558, ...,     1,     1,     1],
       [   17,   412,   952, ...,     1,     1,     1]])
labels
array([5, 1, 7, ..., 2, 4, 0])
# split with sklearn's train_test_split
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split( datasets, labels, test_size=0.3, random_state=42)

print('len(X_train) = ',len(X_train))
print('len(X_val) = ',len(X_val))
print('*' * 60)
print('X_train = ',X_train[:2])
print('y_train = ',y_train[:2])
len(X_train) =  140000
len(X_val) =  60000
************************************************************
X_train =  [[   42     9    43  2376 14912     4  1208  1419     5     1     1     1
      1     1     1     1     1     1     1     1     1]
 [37521 14174     3 12932  1341  4077     2   893  6055   484  3008     1
      1     1     1     1     1     1     1     1     1]]
y_train =  [3 2]

Let's look at the label distribution after the split.

import numpy as np
from collections import Counter
train_data = dict(Counter(y_train))
val_data = dict(Counter(y_val))


# sort the dicts by label id
train_data = dict(  sorted(train_data.items(),key=lambda x:x[0],reverse=False)  )
val_data = dict(  sorted(val_data.items(),key=lambda x:x[0],reverse=False)  ) 

print(train_data)
print(val_data)
{0: 13935, 1: 14038, 2: 13982, 3: 14025, 4: 14021, 5: 14038, 6: 13952, 7: 13956, 8: 14048, 9: 14005}
{0: 6065, 1: 5962, 2: 6018, 3: 5975, 4: 5979, 5: 5962, 6: 6048, 7: 6044, 8: 5952, 9: 5995}
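
To see the class balance at a glance, a quick bar chart can be drawn from the two dicts above (a minimal matplotlib sketch; the plotting details are just one possible choice):

# Minimal sketch: plot the train/val label distributions computed above
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 4))
plt.bar([k - 0.2 for k in train_data.keys()], train_data.values(), width=0.4, label='train')
plt.bar([k + 0.2 for k in val_data.keys()], val_data.values(), width=0.4, label='val')
plt.xlabel('label id')
plt.ylabel('count')
plt.legend()
plt.show()

Both splits are close to uniform across the 10 classes, so plain accuracy is a reasonable evaluation metric here.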

Custom Dataset

type(X_train)
numpy.ndarray
X_train.shape
(140000, 21)
X_train
array([[   42,     9,    43, ...,     1,     1,     1],
       [37521, 14174,     3, ...,     1,     1,     1],
       [14386,   205,    24, ...,     1,     1,     1],
       ...,
       [27693,    28,    78, ...,     1,     1,     1],
       [   30,   687,    63, ...,     1,     1,     1],
       [  207,   666,    44, ...,     1,     1,     1]])
train_dataset = MLDataset(X_train,y_train)
val_dataset = MLDataset(X_val,y_val)
train_dataset.x_data
array([[   42,     9,    43, ...,     1,     1,     1],
       [37521, 14174,     3, ...,     1,     1,     1],
       [14386,   205,    24, ...,     1,     1,     1],
       ...,
       [27693,    28,    78, ...,     1,     1,     1],
       [   30,   687,    63, ...,     1,     1,     1],
       [  207,   666,    44, ...,     1,     1,     1]])
train_dataset.__len__()
140000
train_dataset.__getitem__(0)
(array([   42,     9,    43,  2376, 14912,     4,  1208,  1419,     5,
            1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1]), 3)
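
MLDataset is imported from the local dataset module, whose source is not included here. Judging from the attributes used above (x_data, __len__, __getitem__), a minimal sketch could look like the following (the real class may differ in details):

# Hypothetical sketch of dataset.MLDataset, consistent with how it is used above
from torch.utils.data import Dataset

class MLDataset(Dataset):
    def __init__(self, x_data, y_data):
        self.x_data = x_data      # np.ndarray of shape [n_samples, max_seq_len]
        self.y_data = y_data      # np.ndarray of shape [n_samples]

    def __len__(self):
        return len(self.y_data)

    def __getitem__(self, index):
        # one sample: (token index sequence, label)
        return self.x_data[index], self.y_data[index]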

Batch loading with DataLoader

The dataset can now be loaded in batches of batch_size.

help(DataLoader)
Help on class DataLoader in module torch.utils.data.dataloader:

class DataLoader(builtins.object)
 |  DataLoader(dataset, batch_size=1, shuffle=False, sampler=None, batch_sampler=None, num_workers=0, collate_fn=None, pin_memory=False, drop_last=False, timeout=0, worker_init_fn=None, multiprocessing_context=None)
 |  
 |  Data loader. Combines a dataset and a sampler, and provides an iterable over
 |  the given dataset.
 |  
 |  The :class:`~torch.utils.data.DataLoader` supports both map-style and
 |  iterable-style datasets with single- or multi-process loading, customizing
 |  loading order and optional automatic batching (collation) and memory pinning.
 |  
 |  See :py:mod:`torch.utils.data` documentation page for more details.
 |  
 |  Arguments:
 |      dataset (Dataset): dataset from which to load the data.
 |      batch_size (int, optional): how many samples per batch to load
 |          (default: ``1``).
 |      shuffle (bool, optional): set to ``True`` to have the data reshuffled
 |          at every epoch (default: ``False``).
 |      sampler (Sampler, optional): defines the strategy to draw samples from
 |          the dataset. If specified, :attr:`shuffle` must be ``False``.
 |      batch_sampler (Sampler, optional): like :attr:`sampler`, but returns a batch of
 |          indices at a time. Mutually exclusive with :attr:`batch_size`,
 |          :attr:`shuffle`, :attr:`sampler`, and :attr:`drop_last`.
 |      num_workers (int, optional): how many subprocesses to use for data
 |          loading. ``0`` means that the data will be loaded in the main process.
 |          (default: ``0``)
 |      collate_fn (callable, optional): merges a list of samples to form a
 |          mini-batch of Tensor(s).  Used when using batched loading from a
 |          map-style dataset.
 |      pin_memory (bool, optional): If ``True``, the data loader will copy Tensors
 |          into CUDA pinned memory before returning them.  If your data elements
 |          are a custom type, or your :attr:`collate_fn` returns a batch that is a custom type,
 |          see the example below.
 |      drop_last (bool, optional): set to ``True`` to drop the last incomplete batch,
 |          if the dataset size is not divisible by the batch size. If ``False`` and
 |          the size of dataset is not divisible by the batch size, then the last batch
 |          will be smaller. (default: ``False``)
 |      timeout (numeric, optional): if positive, the timeout value for collecting a batch
 |          from workers. Should always be non-negative. (default: ``0``)
 |      worker_init_fn (callable, optional): If not ``None``, this will be called on each
 |          worker subprocess with the worker id (an int in ``[0, num_workers - 1]``) as
 |          input, after seeding and before data loading. (default: ``None``)
 |  
 |  
 |  .. warning:: If the ``spawn`` start method is used, :attr:`worker_init_fn`
 |               cannot be an unpicklable object, e.g., a lambda function. See
 |               :ref:`multiprocessing-best-practices` on more details related
 |               to multiprocessing in PyTorch.
 |  
 |  .. note:: ``len(dataloader)`` heuristic is based on the length of the sampler used.
 |            When :attr:`dataset` is an :class:`~torch.utils.data.IterableDataset`,
 |            an infinite sampler is used, whose :meth:`__len__` is not
 |            implemented, because the actual length depends on both the
 |            iterable as well as multi-process loading configurations. So one
 |            should not query this method unless they work with a map-style
 |            dataset. See `Dataset Types`_ for more details on these two types
 |            of datasets.
 |  
 |  Methods defined here:
 |  
 |  __init__(self, dataset, batch_size=1, shuffle=False, sampler=None, batch_sampler=None, num_workers=0, collate_fn=None, pin_memory=False, drop_last=False, timeout=0, worker_init_fn=None, multiprocessing_context=None)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  __iter__(self)
 |  
 |  __len__(self)
 |  
 |  __setattr__(self, attr, val)
 |      Implement setattr(self, name, value).
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors defined here:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)
 |  
 |  multiprocessing_context
batch_size = 32
num_workers = 2
train_loader = DataLoader(dataset=train_dataset,batch_size=batch_size,shuffle=True,num_workers=num_workers)
val_loader = DataLoader(dataset=val_dataset,batch_size=batch_size,shuffle=False,num_workers=num_workers)
train_loader

next(iter(train_loader))
[tensor([[  490,  3710,   600,  1979,     2,   203, 20151,  8151,   850,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1],
         [  102,    94, 12099, 16709,   126,   107, 38082,    15,    39,  2544,
             16,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1],
         [42638,   839,  5814, 31617, 78419,     2,  3001,   136,  1071,   638,
           4954,     4,     6,     5,     1,     1,     1,     1,     1,     1,
              1],
         [10757,   178,  1470,     7,  6113,  5486, 10967,     8,   799,  1674,
            226,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1],
         [25144,    72, 11991, 11992,   154,    82,   103,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1],
         [  255,    27,  1221,  2947,   446,   236,   635,  2452,   324,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1],
         [ 1812,  1612,    77,  4276,  1076,   279,    36,  1708,    52, 24206,
              4,     6,     5,     1,     1,     1,     1,     1,     1,     1,
              1],
         [ 7406,  1935,  1510,   795,   659,  6182,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1],
         [13437,   110, 17048,    79,  6250, 91282, 58400, 25647, 17281,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1],
         [  448,    17,   728,  1210,     2,   909,   136,    17,    63,   728,
           7314,  2643, 13756,     1,     1,     1,     1,     1,     1,     1,
              1],
         [   98,  1618,     3,  1849,   949,   128,     2,  6618,   698,   185,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1],
         [ 4082,   445,  1458,   249,  1635,   153,   248,  4093,  1259,  8847,
              4,     6,     5,     1,     1,     1,     1,     1,     1,     1,
              1],
         [  367, 15823,  6347,    14,  2864, 23390,  4385, 58791,     2,  2732,
          20500,   152,   363,     9,   460,     1,     1,     1,     1,     1,
              1],
         [  398,    43,   312,     3,  1329, 84239,  2957,   139,     4,    21,
              5,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1],
         [ 3746,   930,  3867,   163, 97935,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1],
         [  236,  1867,   874,  1142,     2,  5726,  3096,    61,   288, 35933,
             11,   218, 25644,     1,     1,     1,     1,     1,     1,     1,
              1],
         [ 2395,    12,  3977,   242,  2003,  6120,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1],
         [ 1439,  1613, 20255, 80303,   789,  1459,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1],
         [ 1918,  1260,   118,  1738,  9191,  1461,  3616,  1142,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1],
         [   96,     9,    69, 11876,     7,  4169,   802,  2824,     8,   792,
            104,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1],
         [   62,  1574,   909,  4100,     2,  1215,   154, 17126,  4937,    15,
          77221,    16,     4,     6,     5,     1,     1,     1,     1,     1,
              1],
         [27321, 20215,    24,  8095,  3352,     2,    27,  7435,  3990,  1740,
          20567,  8057,     1,     1,     1,     1,     1,     1,     1,     1,
              1],
         [ 6017,  1632, 12961,    61,  2487,  2257,   970,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1],
         [ 3833,    13, 26354,  2149,   383,     0,    14,  1924,     2,  1468,
           8626,  7989,     4,    21,     5,     1,     1,     1,     1,     1,
              1],
         [ 1014,   311,    32,  8007,  4106,     2,  1579, 25251, 23017,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1],
         [ 3626, 36881,   463,   890,  2364,  2840,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1],
         [ 1169,  1077,    47, 16051,   236,     2,  1013,  3272, 19092, 64410,
           9624,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1],
         [  177,   973, 32868,   315,   533,  4653, 25475,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1],
         [ 5305,    60, 17951,     3,    96,     9,   678, 19757, 10915,  4860,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1],
         [ 7859,   191,   274,  1320,    14,   331,  2312,     2,   675, 15580,
           4414,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1],
         [   19,    60,  1316,  3890,  3718,  7834,    53,    23,  8950,     2,
           4174,  3856,  6643,    23,     1,     1,     1,     1,     1,     1,
              1],
         [ 4193,    14,  6945,  1693, 12750,    15,   655,   244, 97051,    16,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1]]),
 tensor([4, 3, 9, 8, 3, 1, 1, 4, 5, 0, 0, 5, 7, 3, 6, 1, 2, 6, 0, 8, 3, 9, 0, 9,
         4, 1, 1, 2, 3, 5, 5, 9])]
for i,(words,labels)  in enumerate(train_loader):
    print(i)
    print('*' * 60)
    print(words)
    print('*' * 60)
    print(labels)
    break
0
************************************************************
tensor([[ 3928, 26941,  1738,     2,    17,   368, 26118, 25529,  1519,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1],
        [  472,   161,  5764,   403,    44,    11,    87,   339,  1386,   507,
          5702,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1],
        [ 2455,    27, 21532,  3439, 62060, 21748,   147,  4432,  6434,     2,
          4976,   205,     1,     1,     1,     1,     1,     1,     1,     1,
             1],
        [ 3139,   914, 27201,   145,     2,    82,   154,  1309,   126,    23,
          4521,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1],
        [ 1720,  3149,   405,    91,   662,  2003, 28034,    23, 56911,    23,
           364,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1],
        [  489,     0,   323,   422,     0,  1460,     2,   304, 11616,    11,
          4881,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1],
        [ 4068,  2608,     0,   513,     2,    90,   436, 46225,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1],
        [  196,    35,   354,   185,    10,  3127,  4268,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1],
        [ 4539,   331, 10018,   179,  8652,  1937,    15, 16285,    16,  2104,
          1093,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1],
        [ 7236,    27,  2143, 18043,  1977, 15836,     2, 16123,  3125,  1258,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1],
        [15840,  4197,  2902,  5070,  9259,   905,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1],
        [24745,  4045,  2892,  2538,   363,    60,    81,   638,   207,  3012,
             4,     6,     5,     1,     1,     1,     1,     1,     1,     1,
             1],
        [   62,  5192,  1662,   145,  1620,   313,  8675,     2,  2631,  1569,
          2288,  6888,     1,     1,     1,     1,     1,     1,     1,     1,
             1],
        [ 1037,  4979,  3698,   238,    14,  4065, 32789,  1129,    45,  2081,
           318,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1],
        [ 1350,   951,   918,  2133,    26,  1147,     2, 18428,  1034,    12,
           141,   241,  1112,     1,     1,     1,     1,     1,     1,     1,
             1],
        [ 7047,   198,    13, 51933,  2912,     2,   203,  6302,  1024,   509,
         57170,    22,     1,     1,     1,     1,     1,     1,     1,     1,
             1],
        [  247,     9,  2448, 69387,  3000, 11227,     2,  2565,   463, 17930,
         27366,  1735, 12981,     1,     1,     1,     1,     1,     1,     1,
             1],
        [ 3051,    40,  6563, 16309,    13, 18391,   143,    92,  2605,  5104,
           477,    54,     1,     1,     1,     1,     1,     1,     1,     1,
             1],
        [ 1275,   392,  7925,  2858, 35808,     2,  1912,   196,  2530, 61561,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1],
        [   20,  1429,    95,    24,  1635, 27190,     2,  7978,   137,    69,
         34417,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1],
        [  650,   118,  7415,  1306,     2,   433,  3870,  3002,   633, 11879,
             0,    22,     1,     1,     1,     1,     1,     1,     1,     1,
             1],
        [  367,  3102,  5235,  2303,   853,     2,  2566,    36,  2552,   713,
         14612,     2,    31,    41,  2043,    74, 40140,     1,     1,     1,
             1],
        [ 2616, 45481,   745,  1621,   113,  4618,   209,  6496,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1],
        [   15,     0,    16,   383,  1153, 25143,     2,  6092,   492,  2838,
             4,     6,     5,     1,     1,     1,     1,     1,     1,     1,
             1],
        [27693,    28,    78,   406,     3,  4320,  2033,  1664,  3310,  5000,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1],
        [  302,   435,    12, 89376,   726,     2, 42645, 45168,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1],
        [  115,   514,     3,  4212,  2595,  3318,  4240,  4100,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1],
        [ 6300, 56493, 10786,     2,  4245,    29,   120,  1979,   188,   112,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1],
        [ 8814, 18649, 17074,    48,    12,   792, 78330,   726,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1],
        [  541,  1264,   221,     2,  1966,     2, 11420,   664,  9616,    22,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1],
        [   95, 17075, 11990, 56400,     3,   305,  7932, 17075,  2411, 13298,
          2838,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1],
        [ 1965,  3342,     3,   338,  2459,    20, 38637, 17655,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1]])
************************************************************
tensor([0, 3, 5, 3, 6, 5, 4, 1, 4, 6, 1, 5, 3, 5, 7, 4, 7, 1, 2, 8, 4, 7, 0, 9,
        1, 4, 3, 4, 8, 4, 7, 2])

Building word vectors

The pretrained Chinese word vectors come from https://github.com/Embedding/Chinese-Word-Vectors

Word vector visualization

First, let's get familiar with word vectors; to make their purpose easier to grasp, we show them visually.

Loading the word vector model with gensim

import gensim
from gensim.models import KeyedVectors
import warnings
warnings.filterwarnings('ignore')
sogou_word_model = KeyedVectors.load_word2vec_format('data/news/sgns.sogou.char',binary=False)
number_words = len(sogou_word_model.vocab)
print('number_words= ',number_words)
items = sogou_word_model.vocab.items()
number_words=  365076

Word vector applications

Let's see what word vectors can do:

  • find the words most similar to a given word
  • look up a word's raw vector, giving a vectorized representation of the text (see the lookup sketch after the similarity output below)
sogou_word_model.most_similar('北京大学')
[('北大', 0.6751739978790283),
 ('中国北京大学', 0.6405676603317261),
 ('北京大学经济系', 0.6353614330291748),
 ('北京大学化学系', 0.6258565187454224),
 ('北京大学经济学院', 0.6239113211631775),
 ('清华大学', 0.623389720916748),
 ('北京大学数学系', 0.6190596222877502),
 ('北京联合大学', 0.6075736880302429),
 ('北京大学国家发展研究院', 0.6050190329551697),
 ('北京大学社会学系', 0.6039434671401978)]
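
We can also read out the raw vector of a word directly, which is the building block for vectorizing text:

# Look up the 300-dimensional vector for a single word
vec = sogou_word_model['北京大学']
print(vec.shape)   # (300,)
print(vec[:5])     # first few components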

Converting word vectors for visualization

You can set number_words higher to export more entries; for this demo we only take 1,000 words.

"""
Convert word2vec models to JSON database by cosine distance metric
"""
import json
# Name of output file
with open('custom_cosine_simialrity.json', 'w') as f:
    # number_words = len(model.vocab) # 考虑数据量大,这里我们取10000个词展示效果,了解
    number_words = 1000
    dic = {
     }
    for i in range(0, number_words):
        if i%100==0:
            print(i)
        stringA = list(items)[i][0]
        dic[stringA] = []
        nearest_words = sogou_word_model.most_similar(positive=[stringA], negative=[], topn=20)
        number_nearest_words = len(nearest_words)

        for j in range(0, number_nearest_words):
            dic[stringA].append({
     
                'w' : nearest_words[j][0],
                'd' : str(round(nearest_words[j][1], 3))
            })
    json.dump(dic, f, ensure_ascii=False, indent=4)
print("Finished!")
0
100
200
300
400
500
600
700
800
900
Finished!

Copy custom_cosine_simialrity.json into the visualization project:

word2vec-visualization/frontend/data/custom_cosine_simialrity.json

Then start a local server with python -m http.server 8081 and open it in the browser:

http://127.0.0.1:8081/

Building domain word vectors

Pipeline: raw data -> vocabulary (word2index) -> look up each word of our vocabulary in the external pretrained vectors to get its embedding -> save the result.

from tqdm import tqdm

ROOT_PATH = 'data/news/'
# extract the pretrained word vectors
emb_dim = 300
pretrain_dir = ROOT_PATH + 'sgns.sogou.char'
filename_trimmed_dir = ROOT_PATH + 'embedding_SougouNews'

# randomly initialize the embedding matrix
print("vocab_size = ", vocab_size)
print('emb_dim = ', emb_dim)
print('pretrain_dir = ', pretrain_dir)
print('filename_trimmed_dir = ', filename_trimmed_dir)
embeddings = np.random.rand(vocab_size, emb_dim)
print("embeddings shape=", embeddings.shape)

word_embedding = []
# build the domain word vectors
with open(pretrain_dir, 'r', encoding='utf-8') as f:
    for i, line in enumerate(tqdm(f)):
        if i == 0:  # skip the header line if present
            continue
        splits = line.strip().split(" ")
        word = splits[0]
        if word in dict_word2index:
            idx = dict_word2index[word]  # index of the word in our vocabulary
            feat = splits[1:]  # pretrained embedding values for the word
            emb = [float(x) for x in feat]

            # overwrite the random row with the pretrained vector
            embeddings[idx] = np.asarray(emb, dtype='float32')

            # keep a textual <word, vector> record
            word_embedding.append("{} {}".format(word, feat))
print('final embeddings = ', embeddings.shape)
# save the embedding matrix: one vector per word index
np.savez_compressed(filename_trimmed_dir, embeddings=embeddings)
vocab_size =  100002
emb_dim =  300
pretrain_dir =  data/news/sgns.sogou.char
filename_trimmed_dir =  data/news/embedding_SougouNews


1238it [00:00, 6036.30it/s]

embeddings shape= (100002, 300)


365077it [00:20, 17690.22it/s]


final embeddings =  (100002, 300)

Verifying the word vectors

embedding_npz = np.load('data/news/embedding_SougouNews.npz')
print(embedding_npz['embeddings'].shape)
embedding_npz['embeddings']
(100002, 300)





array([[ 0.86068087,  0.88799588,  0.32389122, ...,  0.57952255,
         0.40938491,  0.34478832],
       [ 0.30103572,  0.9029473 ,  0.83626528, ...,  0.7544311 ,
         0.0582625 ,  0.88511525],
       [ 0.78581959,  0.33171605,  0.33091543, ...,  0.96064628,
         0.6581095 ,  0.51953195],
       ...,
       [-0.67065603,  0.18955401,  0.29267699, ...,  0.007438  ,
        -0.50046903, -0.603239  ],
       [ 0.27142355,  0.29432985,  0.16363384, ...,  0.42247807,
         0.19791253,  0.56313068],
       [ 0.99596462,  0.92187184,  0.53073177, ...,  0.78373915,
         0.0539841 ,  0.77132351]])
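
As an extra sanity check (a sketch assuming the gensim model loaded earlier is still in memory), we can compare one row of the saved matrix against the original pretrained vector; rows for words not covered by the pretrained file simply keep their random initialization:

# Sanity check: a word covered by the pretrained file should match the gensim vector
word = '北京大学'
if word in dict_word2index and word in sogou_word_model:
    idx = dict_word2index[word]
    print(np.allclose(embedding_npz['embeddings'][idx],
                      np.asarray(sogou_word_model[word], dtype='float32'),
                      atol=1e-6))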

Model training

Model structure

from importlib import import_module  

embedding = 'random'  
model_name = 'TextCNN'
num_epochs = 1        
dataset = 'data/news/'

x = import_module("models." + model_name)
config = x.Config(dataset,embedding)


model = x.Model(config)                            
model.to(config.device)  # run the model on config.device (cuda/cpu)

print(model)
rando init embedding
Model(
  (embedding): Embedding(100002, 300)
  (convs): ModuleList(
    (0): Sequential(
      (0): Conv1d(300, 256, kernel_size=(3,), stride=(1,))
      (1): ReLU()
      (2): MaxPool1d(kernel_size=19, stride=19, padding=0, dilation=1, ceil_mode=False)
    )
    (1): Sequential(
      (0): Conv1d(300, 256, kernel_size=(4,), stride=(1,))
      (1): ReLU()
      (2): MaxPool1d(kernel_size=18, stride=18, padding=0, dilation=1, ceil_mode=False)
    )
    (2): Sequential(
      (0): Conv1d(300, 256, kernel_size=(5,), stride=(1,))
      (1): ReLU()
      (2): MaxPool1d(kernel_size=17, stride=17, padding=0, dilation=1, ceil_mode=False)
    )
  )
  (fc): Linear(in_features=768, out_features=10, bias=True)
)
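
Config and Model live in models/TextCNN.py and are not reproduced in this post. A minimal sketch consistent with the structure printed above (three Conv1d branches with 256 filters each, max-pooling over the remaining time steps, and a 768 -> 10 linear classifier; the config attribute names used below are assumptions) might look like:

# Hypothetical sketch of models/TextCNN.Model, matching the printed architecture
import torch
import torch.nn as nn
import torch.nn.functional as F

class Model(nn.Module):
    def __init__(self, config):
        super(Model, self).__init__()
        # assumed config attributes: n_vocab=100002, embed=300, num_filters=256,
        # filter_sizes=(3, 4, 5), max_seq_len=21, num_classes=10, dropout=0.5
        self.embedding = nn.Embedding(config.n_vocab, config.embed)
        self.convs = nn.ModuleList([
            nn.Sequential(
                nn.Conv1d(config.embed, config.num_filters, kernel_size=k),
                nn.ReLU(),
                nn.MaxPool1d(config.max_seq_len - k + 1))
            for k in config.filter_sizes
        ])
        self.dropout = config.dropout
        self.fc = nn.Linear(config.num_filters * len(config.filter_sizes), config.num_classes)

    def forward(self, x):
        out = self.embedding(x)          # [batch, seq_len, embed]
        out = out.permute(0, 2, 1)       # [batch, embed, seq_len] for Conv1d
        out = [conv(out) for conv in self.convs]
        out = torch.cat(out, dim=1)      # [batch, num_filters * 3, 1]
        out = out.view(-1, out.size(1))  # [batch, num_filters * 3]
        out = F.dropout(out, p=self.dropout)
        out = self.fc(out)               # [batch, num_classes]
        return out

The walkthrough in the next section calls exactly these pieces (model.embedding, model.convs, model.dropout, model.fc) by hand to inspect the intermediate shapes.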

Defining the training procedure

import torch
import torch.nn.functional as F

for i, (feats, labels) in enumerate(train_loader):

    feats = feats.to(config.device)
    labels = labels.to(config.device)

    print('feats = ', feats.shape)
    print('labels = ', labels.shape)
    outputs = model(feats)
    print('outputs = ', outputs.shape)

    # <outputs, labels> -> CrossEntropy(outputs, labels) -> loss
    # step through the forward pass by hand to inspect the intermediate shapes
    embed_x = model.embedding(feats)          # [batch, seq_len, emb_dim]
    embed_x = embed_x.permute(0, 2, 1)        # [batch, emb_dim, seq_len] for Conv1d
    print(embed_x.shape)

    # each conv branch: Conv1d + ReLU + MaxPool1d
    out = [conv(embed_x) for conv in model.convs]

    out = torch.cat(out, dim=1)               # concatenate the three branches
    print(out.shape)
    out = out.view(-1, out.size(1))           # flatten to [batch, num_filters * 3]
    print(out.shape)

    out = F.dropout(input=out, p=model.dropout)
    print(out.shape)

    out = model.fc(out)                       # final linear classifier
    print(out.shape)
    break
feats =  torch.Size([32, 21])
labels =  torch.Size([32])
outputs =  torch.Size([32, 10])
torch.Size([32, 300, 21])
torch.Size([32, 768, 1])
torch.Size([32, 768])
torch.Size([32, 768])
torch.Size([32, 10])
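
The walkthrough above only runs a single forward pass to check shapes. A minimal training loop sketch on top of it (cross-entropy loss with Adam; the hyper-parameters and the periodic accuracy print are assumptions, not the project's original train() function) could be:

# Minimal training-loop sketch; the original project wraps this in a train() function
import torch.nn as nn

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

model.train()
for epoch in range(num_epochs):
    for step, (feats, batch_labels) in enumerate(train_loader):
        feats = feats.to(config.device)
        batch_labels = batch_labels.to(config.device)

        outputs = model(feats)                      # [batch, num_classes]
        loss = criterion(outputs, batch_labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if step % 100 == 0:
            acc = (outputs.argmax(dim=1) == batch_labels).float().mean().item()
            print('epoch {} step {} loss {:.4f} acc {:.4f}'.format(epoch, step, loss.item(), acc))

After training, validation accuracy can be computed the same way by iterating over val_loader with model.eval() and torch.no_grad().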

# -*- coding: UTF-8 -*-
import pickle
from collections import OrderedDict
from importlib import import_module

import jieba
# model training and evaluation modules
import torch
from flask import Flask, render_template
from flask import request

from data_processing import build_dataset_online
from json_utils import jsonify
from utils import load_config

# run inference on GPU if available, otherwise on CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
app = Flask(__name__)

d_config = load_config('config_file')
print('d_config = ', d_config)
max_seq_len = d_config['max_seq_len']
class_list = d_config['class_list']
print('class_list = ', class_list)
print('max_seq_len = ', max_seq_len)

# load the vocabulary file
with open(d_config['vocab_file'], "rb") as f:
    dict_word2index = pickle.load(f)
d_config =  {'data_path': 'data/news/data.txt', 'vocab_file': 'data/news/word2index.pkl', 'model_name': 'TextCNN', 'learning_rate': 0.001, 'batch_size': 256, 'embedding_size': 300, 'num_classes': 10, 'dropout': 0.5, 'num_filters': 256, 'max_vocab_size': 100000, 'min_freq': 1, 'log_path': 'data/news/ckpts/TextCNN', 'ckpt_path': 'data/news/ckpts/TextCNN.ckpt', 'vocab_size': 100002, 'max_seq_len': 21, 'class_list': ['财经', '房产', '股票', '教育', '科技', '社会', '时政', '体育', '游戏', '娱乐']}
class_list =  ['财经', '房产', '股票', '教育', '科技', '社会', '时政', '体育', '游戏', '娱乐']
max_seq_len =  21
def load_model():
    """
    初始化cnn 网络 ,这里我们 仅支持  外部词向量 + 分词模式下的预测
    其他的模型方式: 大家尝试自己学更改

    :return:
    """
    embedding = 'embedding_SougouNews.npz'
    model_name = "TextCNN"
    dataset = "data/news/"
    m_file = import_module("models." + model_name)
    config = m_file.Config(dataset, embedding)

    # build the model and load the trained weights
    model = m_file.Model(config).to(device)
    model.load_state_dict(torch.load(config.save_path, map_location='cpu'))
    model.eval()
    return model
# load the CNN model and its configuration
model = load_model()
word = True
if word:
    tokenizer = lambda x: jieba.lcut(x)  # word-level: tokenize Chinese with jieba
else:
    tokenizer = lambda x: [y for y in x]

tokenizer = lambda x: jieba.lcut(x)
pre_trained init embedding
model
Model(
  (embedding): Embedding(100002, 300)
  (convs): ModuleList(
    (0): Sequential(
      (0): Conv1d(300, 256, kernel_size=(2,), stride=(1,))
      (1): ReLU()
      (2): MaxPool1d(kernel_size=20, stride=20, padding=0, dilation=1, ceil_mode=False)
    )
    (1): Sequential(
      (0): Conv1d(300, 256, kernel_size=(3,), stride=(1,))
      (1): ReLU()
      (2): MaxPool1d(kernel_size=19, stride=19, padding=0, dilation=1, ceil_mode=False)
    )
    (2): Sequential(
      (0): Conv1d(300, 256, kernel_size=(4,), stride=(1,))
      (1): ReLU()
      (2): MaxPool1d(kernel_size=18, stride=18, padding=0, dilation=1, ceil_mode=False)
    )
  )
  (fc): Linear(in_features=768, out_features=10, bias=True)
)
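
The prediction route itself is not shown above. A hedged sketch of what it might look like, assuming build_dataset_online(tokens, dict_word2index, max_seq_len) returns a padded index sequence the same way build_dataset does (its real signature is an assumption, as is the route name):

# Hypothetical sketch of the prediction endpoint; the real route and helper signature may differ
@app.route('/predict', methods=['GET', 'POST'])
def predict():
    text = request.values.get('text', '')
    tokens = tokenizer(text)                                          # jieba word segmentation
    ids = build_dataset_online(tokens, dict_word2index, max_seq_len)  # assumed helper behaviour
    feats = torch.tensor([ids], dtype=torch.long).to(device)
    with torch.no_grad():
        logits = model(feats)
        pred = torch.argmax(logits, dim=1).item()
    return jsonify({'text': text, 'label': class_list[pred]})

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)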

For more detailed TextCNN short-text classification cases, join the discussion group to learn together; see the blog homepage for the group, or send a private message. Thanks!
