A case study of TextCNN for short-text classification, sharing more industry-style examples: data analysis and visualization, the TextCNN paper and its principles, word-vector techniques, the tensorboardX visualization tool, and a closing summary of the TextCNN short-text classification case, so that you can fully master TextCNN for text classification. If you have any questions, please leave a comment or join the NLP技术交流群 discussion group and let's learn NLP together.
Natural Language Processing (NLP): 08 Introduction to TextCNN for Short-Text Classification
Natural Language Processing (NLP): 08-01 Data Analysis and Text Classification Comparison
Natural Language Processing (NLP): 08-02 The CNN Text Classification Paper and TextCNN Principles
Natural Language Processing (NLP): 08-03 Word Vectors with word2vec
Natural Language Processing (NLP): 08-04 Visualization with tensorboardX
Natural Language Processing (NLP): 08-05 TextCNN Short-Text Classification Case Study
import warnings
warnings.filterwarnings('ignore')
import jieba
from data_processing import load_data
from collections import Counter
import numpy as np
from dataset import MLDataset
from torch.utils.data import DataLoader
#from tqdm import tqdm
data_path = 'data/news/data.txt'
tokenizer = lambda x: jieba.lcut(x)
data, labels, max_seq_len = load_data(data_path, tokenizer)
0it [00:00, ?it/s]Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/y4/m077y_dj1pd7blyvjrx9y30w0000gn/T/jieba.cache
Loading model cost 1.056 seconds.
Prefix dict has been built succesfully.
200000it [00:30, 6658.56it/s]
total_examples = 200000
label_count = 10
sentences example = [['男子', '因', '家庭', '积怨', '杀死', '3', '名', '亲人', '重伤', '1', '人'], ['郭鹏', ':', '三亚', '房价', '不会', '下跌', '眼下', '正是', '买房', '最佳时机']]
labels example = [5, 1]
max_seq_len = 21
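The load_data function lives in data_processing.py and is not reproduced in this notebook. As a hedged reference, a minimal sketch of what it likely does, assuming each line of data.txt is a tab-separated "text<TAB>label_index" pair, could look like the following (the function name and file format here are assumptions, not the project's actual code):

# Hypothetical sketch of load_data (the real data_processing.load_data is not shown here).
# Assumes each line of data.txt looks like "男子因家庭积怨杀死3名亲人重伤1人\t5".
from tqdm import tqdm

def load_data_sketch(data_path, tokenizer):
    data, labels = [], []
    max_seq_len = 0
    label_set = set()
    with open(data_path, 'r', encoding='utf-8') as f:
        for line in tqdm(f):
            line = line.strip()
            if not line:
                continue
            text, label = line.split('\t')      # assumed tab-separated format
            words = tokenizer(text)              # word-level segmentation with jieba
            max_seq_len = max(max_seq_len, len(words))
            data.append(words)
            labels.append(int(label))
            label_set.add(int(label))
    print('total_examples = ', len(data))
    print('label_count = ', len(label_set))
    return data, labels, max_seq_len

Next, we build the vocabulary from these tokenized sentences.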
def build_vocab(data, max_size, min_freq=3):
    """
    Build the vocabulary from all of the data.
    :param data:
        sentences, e.g. [[word1, word2, word3, ...], [word11, word12, word13, ...]]
    :param max_size: maximum vocabulary size
    :param min_freq: minimum frequency for a word to be kept
    :return:
        word_freqs: frequency of each word, e.g. [('<UNK>', -1), ('<PAD>', -1), ('创业', 4), ('风景', 3), ...]
        dict_word2index: index of each word, e.g. {'<UNK>': 0, '<PAD>': 1, '创业': 2, ...}
        dict_index2word: word at each index, e.g. {0: '<UNK>', 1: '<PAD>', 2: '创业', ...}
    """
    word_freqs = [('<UNK>', -1), ('<PAD>', -1)]
    words = []
    for line in data:
        words.extend(line)  # extend vs. append: extend adds the words one by one, not the whole list
    counter = Counter(words)
    # sort by frequency, from high to low
    counter_list = counter.most_common()[:max_size]
    for word, freq in counter_list:
        if freq >= min_freq:
            word_freqs.append((word, freq))
    # build word2index and index2word
    dict_word2index = dict()
    for word, freq in word_freqs:
        dict_word2index[word] = len(dict_word2index)
    dict_index2word = dict(zip(dict_word2index.values(), dict_word2index.keys()))
    vocab_size = len(dict_index2word)
    print('vocab_size = ', vocab_size)
    return vocab_size, word_freqs, dict_word2index, dict_index2word
max_size = 100000  # maximum vocabulary size
min_freq = 1
vocab_size,word_freqs,dict_word2index,dict_index2word = build_vocab(data,max_size,min_freq)
vocab_size = 100002
def build_dataset(data, labels, dict_word2index, max_seq_len):
    """
    Discretize the data using the vocabulary.
    :param data:
        sentences, e.g. [[word1, word2, word3, ...], [word11, word12, word13, ...]]
    :param labels:
        labels, e.g. [5, 3]
    :param dict_word2index: word-to-index mapping
    :param max_seq_len: maximum text length in data (shorter sequences are padded, longer ones truncated,
        so that every example has the same length)
    :return:
        the discretized result
        datasets: [[3899, 2, 62, ...], [3, 4, 1, ...]]
        labels: [5, 3]
    """
    dataset = []
    indices = np.arange(len(labels))
    for i in indices:
        # i: index of the i-th example
        new_line = []
        for word in data[i]:  # data[i] is the i-th sentence (a list of words)
            if word in dict_word2index:
                index = dict_word2index.get(word)
            else:
                index = dict_word2index.get('<UNK>')
            new_line.append(index)
        # short sentences: pad up to the maximum length
        pad_num = max_seq_len - len(new_line)
        while pad_num > 0:
            new_line.append(dict_word2index.get('<PAD>'))
            pad_num -= 1
        # long sentences: truncate
        dataset.append(new_line[:max_seq_len])
    # return the final result
    datasets, labels = np.array(dataset, dtype=np.int64), np.array(labels, dtype=np.int64)
    return datasets, labels
datasets,labels = build_dataset(data,labels,dict_word2index,max_seq_len)
datasets[:2]
array([[ 37, 79, 1139, 28947, 917, 18, 60, 7961, 1469,
26, 23, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1],
[63425, 3, 6084, 354, 362, 324, 63426, 16772, 1750,
15095, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1]])
Let's take one example and map it back to words to see what the original text looks like.
data0,label0 = datasets[0],labels[0]
data0_0 = [ dict_index2word.get(idx) for idx in data0 ]
print(data0_0)
['男子', '因', '家庭', '积怨', '杀死', '3', '名', '亲人', '重伤', '1', '人', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
data[:2]
[['男子', '因', '家庭', '积怨', '杀死', '3', '名', '亲人', '重伤', '1', '人'],
['郭鹏', ':', '三亚', '房价', '不会', '下跌', '眼下', '正是', '买房', '最佳时机']]
dict_word2index.get('积怨')
28947
max_seq_len
21
datasets
array([[ 37, 79, 1139, ..., 1, 1, 1],
[63425, 3, 6084, ..., 1, 1, 1],
[ 366, 2482, 814, ..., 1, 1, 1],
...,
[25006, 904, 6164, ..., 1, 1, 1],
[ 1265, 594, 7558, ..., 1, 1, 1],
[ 17, 412, 952, ..., 1, 1, 1]])
labels
array([5, 1, 7, ..., 2, 4, 0])
# use sklearn to split the data into train/validation sets
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split( datasets, labels, test_size=0.3, random_state=42)
print('len(X_train) = ',len(X_train))
print('len(X_val) = ',len(X_val))
print('*' * 60)
print('X_train = ',X_train[:2])
print('y_train = ',y_train[:2])
len(X_train) = 140000
len(X_val) = 60000
************************************************************
X_train = [[ 42 9 43 2376 14912 4 1208 1419 5 1 1 1
1 1 1 1 1 1 1 1 1]
[37521 14174 3 12932 1341 4077 2 893 6055 484 3008 1
1 1 1 1 1 1 1 1 1]]
y_train = [3 2]
Let's look at the label distribution after the split.
import numpy as np
from collections import Counter
train_data = dict(Counter(y_train))
val_data = dict(Counter(y_val))
# sort the dicts by label id
train_data = dict( sorted(train_data.items(),key=lambda x:x[0],reverse=False) )
val_data = dict( sorted(val_data.items(),key=lambda x:x[0],reverse=False) )
print(train_data)
print(val_data)
{0: 13935, 1: 14038, 2: 13982, 3: 14025, 4: 14021, 5: 14038, 6: 13952, 7: 13956, 8: 14048, 9: 14005}
{0: 6065, 1: 5962, 2: 6018, 3: 5975, 4: 5979, 5: 5962, 6: 6048, 7: 6044, 8: 5952, 9: 5995}
type(X_train)
numpy.ndarray
X_train.shape
(140000, 21)
X_train
array([[ 42, 9, 43, ..., 1, 1, 1],
[37521, 14174, 3, ..., 1, 1, 1],
[14386, 205, 24, ..., 1, 1, 1],
...,
[27693, 28, 78, ..., 1, 1, 1],
[ 30, 687, 63, ..., 1, 1, 1],
[ 207, 666, 44, ..., 1, 1, 1]])
train_dataset = MLDataset(X_train,y_train)
val_dataset = MLDataset(X_val,y_val)
train_dataset.x_data
array([[ 42, 9, 43, ..., 1, 1, 1],
[37521, 14174, 3, ..., 1, 1, 1],
[14386, 205, 24, ..., 1, 1, 1],
...,
[27693, 28, 78, ..., 1, 1, 1],
[ 30, 687, 63, ..., 1, 1, 1],
[ 207, 666, 44, ..., 1, 1, 1]])
train_dataset.__len__()
140000
train_dataset.__getitem__(0)
(array([ 42, 9, 43, 2376, 14912, 4, 1208, 1419, 5,
1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1]), 3)
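MLDataset is imported from dataset.py, which is not reproduced above. Judging from the attributes and methods used here (x_data, __len__, __getitem__), it is a thin map-style Dataset wrapper; a minimal sketch, assuming it simply stores the two numpy arrays, might look like this (the class below is illustrative, not the project's actual code):

# Hypothetical sketch of MLDataset (the real dataset.MLDataset is not shown here).
from torch.utils.data import Dataset

class MLDatasetSketch(Dataset):
    def __init__(self, x_data, y_data):
        self.x_data = x_data  # (num_examples, max_seq_len) int64 array
        self.y_data = y_data  # (num_examples,) int64 array

    def __len__(self):
        return len(self.y_data)

    def __getitem__(self, index):
        # returns (token_id_array, label) for one example;
        # DataLoader's default collate_fn stacks these into batch tensors
        return self.x_data[index], self.y_data[index]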
The data can now be loaded batch by batch (batch_size at a time) through a DataLoader.
help(DataLoader)
Help on class DataLoader in module torch.utils.data.dataloader:
class DataLoader(builtins.object)
| DataLoader(dataset, batch_size=1, shuffle=False, sampler=None, batch_sampler=None, num_workers=0, collate_fn=None, pin_memory=False, drop_last=False, timeout=0, worker_init_fn=None, multiprocessing_context=None)
|
| Data loader. Combines a dataset and a sampler, and provides an iterable over
| the given dataset.
|
| The :class:`~torch.utils.data.DataLoader` supports both map-style and
| iterable-style datasets with single- or multi-process loading, customizing
| loading order and optional automatic batching (collation) and memory pinning.
|
| See :py:mod:`torch.utils.data` documentation page for more details.
|
| Arguments:
| dataset (Dataset): dataset from which to load the data.
| batch_size (int, optional): how many samples per batch to load
| (default: ``1``).
| shuffle (bool, optional): set to ``True`` to have the data reshuffled
| at every epoch (default: ``False``).
| sampler (Sampler, optional): defines the strategy to draw samples from
| the dataset. If specified, :attr:`shuffle` must be ``False``.
| batch_sampler (Sampler, optional): like :attr:`sampler`, but returns a batch of
| indices at a time. Mutually exclusive with :attr:`batch_size`,
| :attr:`shuffle`, :attr:`sampler`, and :attr:`drop_last`.
| num_workers (int, optional): how many subprocesses to use for data
| loading. ``0`` means that the data will be loaded in the main process.
| (default: ``0``)
| collate_fn (callable, optional): merges a list of samples to form a
| mini-batch of Tensor(s). Used when using batched loading from a
| map-style dataset.
| pin_memory (bool, optional): If ``True``, the data loader will copy Tensors
| into CUDA pinned memory before returning them. If your data elements
| are a custom type, or your :attr:`collate_fn` returns a batch that is a custom type,
| see the example below.
| drop_last (bool, optional): set to ``True`` to drop the last incomplete batch,
| if the dataset size is not divisible by the batch size. If ``False`` and
| the size of dataset is not divisible by the batch size, then the last batch
| will be smaller. (default: ``False``)
| timeout (numeric, optional): if positive, the timeout value for collecting a batch
| from workers. Should always be non-negative. (default: ``0``)
| worker_init_fn (callable, optional): If not ``None``, this will be called on each
| worker subprocess with the worker id (an int in ``[0, num_workers - 1]``) as
| input, after seeding and before data loading. (default: ``None``)
|
|
| .. warning:: If the ``spawn`` start method is used, :attr:`worker_init_fn`
| cannot be an unpicklable object, e.g., a lambda function. See
| :ref:`multiprocessing-best-practices` on more details related
| to multiprocessing in PyTorch.
|
| .. note:: ``len(dataloader)`` heuristic is based on the length of the sampler used.
| When :attr:`dataset` is an :class:`~torch.utils.data.IterableDataset`,
| an infinite sampler is used, whose :meth:`__len__` is not
| implemented, because the actual length depends on both the
| iterable as well as multi-process loading configurations. So one
| should not query this method unless they work with a map-style
| dataset. See `Dataset Types`_ for more details on these two types
| of datasets.
|
| Methods defined here:
|
| __init__(self, dataset, batch_size=1, shuffle=False, sampler=None, batch_sampler=None, num_workers=0, collate_fn=None, pin_memory=False, drop_last=False, timeout=0, worker_init_fn=None, multiprocessing_context=None)
| Initialize self. See help(type(self)) for accurate signature.
|
| __iter__(self)
|
| __len__(self)
|
| __setattr__(self, attr, val)
| Implement setattr(self, name, value).
|
| ----------------------------------------------------------------------
| Data descriptors defined here:
|
| __dict__
| dictionary for instance variables (if defined)
|
| __weakref__
| list of weak references to the object (if defined)
|
| multiprocessing_context
batch_size = 32
num_workers = 2
train_loader = DataLoader(dataset=train_dataset,batch_size=batch_size,shuffle=True,num_workers=num_workers)
val_loader = DataLoader(dataset=val_dataset,batch_size=batch_size,shuffle=False,num_workers=num_workers)
train_loader
next(iter(train_loader))
[tensor([[ 490, 3710, 600, 1979, 2, 203, 20151, 8151, 850, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1],
[ 102, 94, 12099, 16709, 126, 107, 38082, 15, 39, 2544,
16, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1],
[42638, 839, 5814, 31617, 78419, 2, 3001, 136, 1071, 638,
4954, 4, 6, 5, 1, 1, 1, 1, 1, 1,
1],
[10757, 178, 1470, 7, 6113, 5486, 10967, 8, 799, 1674,
226, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1],
[25144, 72, 11991, 11992, 154, 82, 103, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1],
[ 255, 27, 1221, 2947, 446, 236, 635, 2452, 324, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1],
[ 1812, 1612, 77, 4276, 1076, 279, 36, 1708, 52, 24206,
4, 6, 5, 1, 1, 1, 1, 1, 1, 1,
1],
[ 7406, 1935, 1510, 795, 659, 6182, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1],
[13437, 110, 17048, 79, 6250, 91282, 58400, 25647, 17281, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1],
[ 448, 17, 728, 1210, 2, 909, 136, 17, 63, 728,
7314, 2643, 13756, 1, 1, 1, 1, 1, 1, 1,
1],
[ 98, 1618, 3, 1849, 949, 128, 2, 6618, 698, 185,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1],
[ 4082, 445, 1458, 249, 1635, 153, 248, 4093, 1259, 8847,
4, 6, 5, 1, 1, 1, 1, 1, 1, 1,
1],
[ 367, 15823, 6347, 14, 2864, 23390, 4385, 58791, 2, 2732,
20500, 152, 363, 9, 460, 1, 1, 1, 1, 1,
1],
[ 398, 43, 312, 3, 1329, 84239, 2957, 139, 4, 21,
5, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1],
[ 3746, 930, 3867, 163, 97935, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1],
[ 236, 1867, 874, 1142, 2, 5726, 3096, 61, 288, 35933,
11, 218, 25644, 1, 1, 1, 1, 1, 1, 1,
1],
[ 2395, 12, 3977, 242, 2003, 6120, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1],
[ 1439, 1613, 20255, 80303, 789, 1459, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1],
[ 1918, 1260, 118, 1738, 9191, 1461, 3616, 1142, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1],
[ 96, 9, 69, 11876, 7, 4169, 802, 2824, 8, 792,
104, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1],
[ 62, 1574, 909, 4100, 2, 1215, 154, 17126, 4937, 15,
77221, 16, 4, 6, 5, 1, 1, 1, 1, 1,
1],
[27321, 20215, 24, 8095, 3352, 2, 27, 7435, 3990, 1740,
20567, 8057, 1, 1, 1, 1, 1, 1, 1, 1,
1],
[ 6017, 1632, 12961, 61, 2487, 2257, 970, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1],
[ 3833, 13, 26354, 2149, 383, 0, 14, 1924, 2, 1468,
8626, 7989, 4, 21, 5, 1, 1, 1, 1, 1,
1],
[ 1014, 311, 32, 8007, 4106, 2, 1579, 25251, 23017, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1],
[ 3626, 36881, 463, 890, 2364, 2840, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1],
[ 1169, 1077, 47, 16051, 236, 2, 1013, 3272, 19092, 64410,
9624, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1],
[ 177, 973, 32868, 315, 533, 4653, 25475, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1],
[ 5305, 60, 17951, 3, 96, 9, 678, 19757, 10915, 4860,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1],
[ 7859, 191, 274, 1320, 14, 331, 2312, 2, 675, 15580,
4414, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1],
[ 19, 60, 1316, 3890, 3718, 7834, 53, 23, 8950, 2,
4174, 3856, 6643, 23, 1, 1, 1, 1, 1, 1,
1],
[ 4193, 14, 6945, 1693, 12750, 15, 655, 244, 97051, 16,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1]]),
tensor([4, 3, 9, 8, 3, 1, 1, 4, 5, 0, 0, 5, 7, 3, 6, 1, 2, 6, 0, 8, 3, 9, 0, 9,
4, 1, 1, 2, 3, 5, 5, 9])]
for i, (words, labels) in enumerate(train_loader):
    print(i)
    print('*' * 60)
    print(words)
    print('*' * 60)
    print(labels)
    break
0
************************************************************
tensor([[ 3928, 26941, 1738, 2, 17, 368, 26118, 25529, 1519, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1],
[ 472, 161, 5764, 403, 44, 11, 87, 339, 1386, 507,
5702, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1],
[ 2455, 27, 21532, 3439, 62060, 21748, 147, 4432, 6434, 2,
4976, 205, 1, 1, 1, 1, 1, 1, 1, 1,
1],
[ 3139, 914, 27201, 145, 2, 82, 154, 1309, 126, 23,
4521, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1],
[ 1720, 3149, 405, 91, 662, 2003, 28034, 23, 56911, 23,
364, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1],
[ 489, 0, 323, 422, 0, 1460, 2, 304, 11616, 11,
4881, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1],
[ 4068, 2608, 0, 513, 2, 90, 436, 46225, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1],
[ 196, 35, 354, 185, 10, 3127, 4268, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1],
[ 4539, 331, 10018, 179, 8652, 1937, 15, 16285, 16, 2104,
1093, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1],
[ 7236, 27, 2143, 18043, 1977, 15836, 2, 16123, 3125, 1258,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1],
[15840, 4197, 2902, 5070, 9259, 905, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1],
[24745, 4045, 2892, 2538, 363, 60, 81, 638, 207, 3012,
4, 6, 5, 1, 1, 1, 1, 1, 1, 1,
1],
[ 62, 5192, 1662, 145, 1620, 313, 8675, 2, 2631, 1569,
2288, 6888, 1, 1, 1, 1, 1, 1, 1, 1,
1],
[ 1037, 4979, 3698, 238, 14, 4065, 32789, 1129, 45, 2081,
318, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1],
[ 1350, 951, 918, 2133, 26, 1147, 2, 18428, 1034, 12,
141, 241, 1112, 1, 1, 1, 1, 1, 1, 1,
1],
[ 7047, 198, 13, 51933, 2912, 2, 203, 6302, 1024, 509,
57170, 22, 1, 1, 1, 1, 1, 1, 1, 1,
1],
[ 247, 9, 2448, 69387, 3000, 11227, 2, 2565, 463, 17930,
27366, 1735, 12981, 1, 1, 1, 1, 1, 1, 1,
1],
[ 3051, 40, 6563, 16309, 13, 18391, 143, 92, 2605, 5104,
477, 54, 1, 1, 1, 1, 1, 1, 1, 1,
1],
[ 1275, 392, 7925, 2858, 35808, 2, 1912, 196, 2530, 61561,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1],
[ 20, 1429, 95, 24, 1635, 27190, 2, 7978, 137, 69,
34417, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1],
[ 650, 118, 7415, 1306, 2, 433, 3870, 3002, 633, 11879,
0, 22, 1, 1, 1, 1, 1, 1, 1, 1,
1],
[ 367, 3102, 5235, 2303, 853, 2, 2566, 36, 2552, 713,
14612, 2, 31, 41, 2043, 74, 40140, 1, 1, 1,
1],
[ 2616, 45481, 745, 1621, 113, 4618, 209, 6496, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1],
[ 15, 0, 16, 383, 1153, 25143, 2, 6092, 492, 2838,
4, 6, 5, 1, 1, 1, 1, 1, 1, 1,
1],
[27693, 28, 78, 406, 3, 4320, 2033, 1664, 3310, 5000,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1],
[ 302, 435, 12, 89376, 726, 2, 42645, 45168, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1],
[ 115, 514, 3, 4212, 2595, 3318, 4240, 4100, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1],
[ 6300, 56493, 10786, 2, 4245, 29, 120, 1979, 188, 112,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1],
[ 8814, 18649, 17074, 48, 12, 792, 78330, 726, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1],
[ 541, 1264, 221, 2, 1966, 2, 11420, 664, 9616, 22,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1],
[ 95, 17075, 11990, 56400, 3, 305, 7932, 17075, 2411, 13298,
2838, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1],
[ 1965, 3342, 3, 338, 2459, 20, 38637, 17655, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1]])
************************************************************
tensor([0, 3, 5, 3, 6, 5, 4, 1, 4, 6, 1, 5, 3, 5, 7, 4, 7, 1, 2, 8, 4, 7, 0, 9,
1, 4, 3, 4, 8, 4, 7, 2])
https://github.com/Embedding/Chinese-Word-Vectors
First, let's get familiar with word vectors. To make their use easier to understand, we will also show them through visualization.
import gensim
from gensim.models import KeyedVectors
import warnings
warnings.filterwarnings('ignore')
sogou_word_model = KeyedVectors.load_word2vec_format('data/news/sgns.sogou.char',binary=False)
number_words = len(sogou_word_model.vocab)
print('number_words= ',number_words)
items = sogou_word_model.vocab.items()
number_words= 365076
Let's see what word vectors can do.
sogou_word_model.most_similar('北京大学')
[('北大', 0.6751739978790283),
('中国北京大学', 0.6405676603317261),
('北京大学经济系', 0.6353614330291748),
('北京大学化学系', 0.6258565187454224),
('北京大学经济学院', 0.6239113211631775),
('清华大学', 0.623389720916748),
('北京大学数学系', 0.6190596222877502),
('北京联合大学', 0.6075736880302429),
('北京大学国家发展研究院', 0.6050190329551697),
('北京大学社会学系', 0.6039434671401978)]
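Besides nearest-neighbor lookup, gensim's KeyedVectors also exposes pairwise similarity and multi-word queries. As a small hedged example (both words below appear in the most_similar output above, so they are in the vocabulary; the exact scores depend on the vectors):

# cosine similarity between two words
print(sogou_word_model.similarity('北京大学', '清华大学'))
# query with several positive words at once
print(sogou_word_model.most_similar(positive=['北京大学', '清华大学'], topn=5))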
You can set number_words to a larger value to export more data and display more content; for this demo we only take 1000 words.
"""
Convert word2vec models to JSON database by cosine distance metric
"""
import json
# Name of output file
with open('custom_cosine_simialrity.json', 'w') as f:
# number_words = len(model.vocab) # 考虑数据量大,这里我们取10000个词展示效果,了解
number_words = 1000
dic = {
}
for i in range(0, number_words):
if i%100==0:
print(i)
stringA = list(items)[i][0]
dic[stringA] = []
nearest_words = sogou_word_model.most_similar(positive=[stringA], negative=[], topn=20)
number_nearest_words = len(nearest_words)
for j in range(0, number_nearest_words):
dic[stringA].append({
'w' : nearest_words[j][0],
'd' : str(round(nearest_words[j][1], 3))
})
json.dump(dic, f, ensure_ascii=False, indent=4)
print("Finished!")
0
100
200
300
400
500
600
700
800
900
Finished!
Copy custom_cosine_simialrity.json into the visualization project:
word2vec-visualization/frontend/data/custom_cosine_simialrity.json
Then start the server with python -m http.server 8081 and open it in a browser:
http://127.0.0.1:8081/
Raw data -> vocabulary (word2index) -> look up each word's vector in the external pre-trained embeddings, then save the resulting matrix.
from tqdm import tqdm
ROOT_PATH = 'data/news/'
# extract pre-trained word vectors
emb_dim = 300
pretrain_dir = ROOT_PATH + 'sgns.sogou.char'
filename_trimmed_dir = ROOT_PATH + 'embedding_SougouNews'
# randomly initialize the embedding matrix
print("vocab_size = ", vocab_size)
print('emb_dim = ', emb_dim)
print('pretrain_dir = ', pretrain_dir)
print('filename_trimmed_dir = ', filename_trimmed_dir)
embeddings = np.random.rand(vocab_size, emb_dim)
print("embeddings shape=", embeddings.shape)
word_embedding = []
# fill in the domain word vectors for in-vocabulary words
with open(pretrain_dir, 'r', encoding='utf-8') as f:
    for i, line in enumerate(tqdm(f)):
        if i == 0:  # the first line is a header, skip it
            continue
        splits = line.strip().split(" ")
        word = splits[0]
        if word in dict_word2index:
            idx = dict_word2index[word]  # index of the word in our vocabulary
            feat = splits[1:]  # the pre-trained embedding values for this word
            emb = [float(x) for x in feat]
            embeddings[idx] = np.asarray(emb, dtype='float32')
            word_embedding.append("{} {}".format(word, feat))
print('final embeddings = ', embeddings.shape)
# save the vector of every word in the vocabulary as <word_idx, vector>
np.savez_compressed(filename_trimmed_dir, embeddings=embeddings)
vocab_size = 100002
emb_dim = 300
pretrain_dir = data/news/sgns.sogou.char
filename_trimmed_dir = data/news/embedding_SougouNews
1238it [00:00, 6036.30it/s]
embeddings shape= (100002, 300)
365077it [00:20, 17690.22it/s]
final embeddings = (100002, 300)
embedding_npz = np.load('data/news/embedding_SougouNews.npz')
print(embedding_npz['embeddings'].shape)
embedding_npz['embeddings']
(100002, 300)
array([[ 0.86068087, 0.88799588, 0.32389122, ..., 0.57952255,
0.40938491, 0.34478832],
[ 0.30103572, 0.9029473 , 0.83626528, ..., 0.7544311 ,
0.0582625 , 0.88511525],
[ 0.78581959, 0.33171605, 0.33091543, ..., 0.96064628,
0.6581095 , 0.51953195],
...,
[-0.67065603, 0.18955401, 0.29267699, ..., 0.007438 ,
-0.50046903, -0.603239 ],
[ 0.27142355, 0.29432985, 0.16363384, ..., 0.42247807,
0.19791253, 0.56313068],
[ 0.99596462, 0.92187184, 0.53073177, ..., 0.78373915,
0.0539841 , 0.77132351]])
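This saved matrix is what the model's embedding layer gets initialized from when 'embedding_SougouNews.npz' is passed as the embedding argument. As a hedged illustration of that step (the actual loading happens inside models/TextCNN.py, which may differ in details), a pre-trained nn.Embedding layer can be built like this:

# Sketch: initialize an nn.Embedding layer from the saved matrix (illustrative only).
import torch
import torch.nn as nn

pretrained = torch.tensor(embedding_npz['embeddings'], dtype=torch.float32)
embedding_layer = nn.Embedding.from_pretrained(pretrained, freeze=False)
print(embedding_layer)  # same shape as the model's embedding below: Embedding(100002, 300)

Next, we build the TextCNN model itself.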
from importlib import import_module
embedding = 'random'
model_name = 'TextCNN'
num_epochs = 1
dataset = 'data/news/'
x = import_module("models." + model_name)
config = x.Config(dataset,embedding)
model = x.Model(config)
model.to(config.device)  # move the model to the configured device (cuda/cpu)
print(model)
rando init embedding
Model(
(embedding): Embedding(100002, 300)
(convs): ModuleList(
(0): Sequential(
(0): Conv1d(300, 256, kernel_size=(3,), stride=(1,))
(1): ReLU()
(2): MaxPool1d(kernel_size=19, stride=19, padding=0, dilation=1, ceil_mode=False)
)
(1): Sequential(
(0): Conv1d(300, 256, kernel_size=(4,), stride=(1,))
(1): ReLU()
(2): MaxPool1d(kernel_size=18, stride=18, padding=0, dilation=1, ceil_mode=False)
)
(2): Sequential(
(0): Conv1d(300, 256, kernel_size=(5,), stride=(1,))
(1): ReLU()
(2): MaxPool1d(kernel_size=17, stride=17, padding=0, dilation=1, ceil_mode=False)
)
)
(fc): Linear(in_features=768, out_features=10, bias=True)
)
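models/TextCNN.py is loaded dynamically via import_module and is not reproduced above. Based on the printed architecture (three Conv1d branches with kernel sizes 3/4/5, each followed by ReLU and a max-pool over the remaining time steps, then a 768 -> 10 linear layer), a rough sketch of what that Model looks like is given below; it is illustrative and the real models/TextCNN.py may differ in details such as Config handling.

# Rough sketch of the TextCNN Model consistent with the architecture printed above (illustrative only).
import torch
import torch.nn as nn
import torch.nn.functional as F

class TextCNNSketch(nn.Module):
    def __init__(self, vocab_size=100002, emb_dim=300, num_filters=256,
                 kernel_sizes=(3, 4, 5), max_seq_len=21, num_classes=10, dropout=0.5):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        # one Conv1d + ReLU + MaxPool1d branch per kernel size
        self.convs = nn.ModuleList([
            nn.Sequential(
                nn.Conv1d(emb_dim, num_filters, kernel_size=k),
                nn.ReLU(),
                nn.MaxPool1d(kernel_size=max_seq_len - k + 1),
            )
            for k in kernel_sizes
        ])
        self.dropout = dropout
        self.fc = nn.Linear(num_filters * len(kernel_sizes), num_classes)

    def forward(self, x):                        # x: (batch, max_seq_len)
        out = self.embedding(x)                  # (batch, max_seq_len, emb_dim)
        out = out.permute(0, 2, 1)               # (batch, emb_dim, max_seq_len) for Conv1d
        out = torch.cat([conv(out) for conv in self.convs], dim=1)  # (batch, 768, 1)
        out = out.view(-1, out.size(1))          # (batch, 768)
        out = F.dropout(out, p=self.dropout)
        return self.fc(out)                      # (batch, num_classes)

To verify these shapes, we can walk through the forward pass step by step on one batch: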
import torch
import torch.nn.functional as F
for i, (feats, labels) in enumerate(train_loader):
    feats = feats.to(config.device)
    labels = labels.to(config.device)
    print('feats = ', feats.shape)
    print('labels = ', labels.shape)
    outputs = model(feats)
    print('outputs = ', outputs.shape)
    # <outputs, labels> -> CrossEntropyLoss(outputs, labels) -> loss
    embed_x = model.embedding(feats)
    embed_x = embed_x.permute(0, 2, 1)
    print(embed_x.shape)
    out = [conv(embed_x) for conv in model.convs]
    out = torch.cat(out, dim=1)
    print(out.shape)
    out = out.view(-1, out.size(1))
    print(out.shape)
    out = F.dropout(input=out, p=model.dropout)
    print(out.shape)
    out = model.fc(out)
    print(out.shape)
    break
feats = torch.Size([32, 21])
labels = torch.Size([32])
outputs = torch.Size([32, 10])
torch.Size([32, 300, 21])
torch.Size([32, 768, 1])
torch.Size([32, 768])
torch.Size([32, 768])
torch.Size([32, 10])
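The notebook sets num_epochs = 1 but does not show the training loop itself. A minimal hedged sketch of how this model would typically be trained with the train_loader above (plain CrossEntropyLoss + Adam; the project's own train/evaluate utilities may differ) is:

# Minimal training-loop sketch (illustrative; the project's real training utilities may differ).
import torch
import torch.nn as nn

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

model.train()
for epoch in range(num_epochs):
    for step, (feats, batch_labels) in enumerate(train_loader):
        feats = feats.to(config.device)
        batch_labels = batch_labels.to(config.device)
        outputs = model(feats)                   # (batch, 10) class scores
        loss = criterion(outputs, batch_labels)  # cross-entropy over the 10 classes
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if step % 500 == 0:
            print('epoch {} step {} loss {:.4f}'.format(epoch, step, loss.item()))

Finally, once the model has been trained and saved, it can be served online with Flask: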
# -*- coding: UTF-8 -*-
import pickle
from collections import OrderedDict
from importlib import import_module
import jieba
# modules used for model loading and inference
import torch
from flask import Flask, render_template
from flask import request
from data_processing import build_dataset_online
from json_utils import jsonify
from utils import load_config
# device to run the model on
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
app = Flask(__name__)
d_config = load_config('config_file')
print('d_config = ', d_config)
max_seq_len = d_config['max_seq_len']
class_list = d_config['class_list']
print('class_list = ', class_list)
print('max_seq_len = ', max_seq_len)
# load the vocabulary file
with open(d_config['vocab_file'], "rb") as f:
    dict_word2index = pickle.load(f)
d_config = {'data_path': 'data/news/data.txt', 'vocab_file': 'data/news/word2index.pkl', 'model_name': 'TextCNN', 'learning_rate': 0.001, 'batch_size': 256, 'embedding_size': 300, 'num_classes': 10, 'dropout': 0.5, 'num_filters': 256, 'max_vocab_size': 100000, 'min_freq': 1, 'log_path': 'data/news/ckpts/TextCNN', 'ckpt_path': 'data/news/ckpts/TextCNN.ckpt', 'vocab_size': 100002, 'max_seq_len': 21, 'class_list': ['财经', '房产', '股票', '教育', '科技', '社会', '时政', '体育', '游戏', '娱乐']}
class_list = ['财经', '房产', '股票', '教育', '科技', '社会', '时政', '体育', '游戏', '娱乐']
max_seq_len = 21
def load_model():
    """
    Initialize the CNN network. Here we only support prediction with
    external pre-trained word vectors + word-level tokenization;
    you can adapt it to the other model settings yourself.
    :return:
    """
    embedding = 'embedding_SougouNews.npz'
    model_name = "TextCNN"
    dataset = "data/news/"
    m_file = import_module("models." + model_name)
    config = m_file.Config(dataset, embedding)
    # build the model and load the trained weights
    model = m_file.Model(config).to(device)
    model.load_state_dict(torch.load(config.save_path, map_location='cpu'))
    model.eval()
    return model
# load the CNN model and its configuration
model = load_model()
word = True
if word:
    tokenizer = lambda x: jieba.lcut(x)  # for Chinese we tokenize at the word level
else:
    tokenizer = lambda x: [y for y in x]  # character-level tokenization
tokenizer = lambda x: jieba.lcut(x)
pre_trained init embedding
model
Model(
(embedding): Embedding(100002, 300)
(convs): ModuleList(
(0): Sequential(
(0): Conv1d(300, 256, kernel_size=(2,), stride=(1,))
(1): ReLU()
(2): MaxPool1d(kernel_size=20, stride=20, padding=0, dilation=1, ceil_mode=False)
)
(1): Sequential(
(0): Conv1d(300, 256, kernel_size=(3,), stride=(1,))
(1): ReLU()
(2): MaxPool1d(kernel_size=19, stride=19, padding=0, dilation=1, ceil_mode=False)
)
(2): Sequential(
(0): Conv1d(300, 256, kernel_size=(4,), stride=(1,))
(1): ReLU()
(2): MaxPool1d(kernel_size=18, stride=18, padding=0, dilation=1, ceil_mode=False)
)
)
(fc): Linear(in_features=768, out_features=10, bias=True)
)
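The prediction route itself is not shown above, and the signature of data_processing.build_dataset_online is unknown, so the sketch below re-implements the index-mapping inline instead; the route name, request field, padding logic, and the assumption that json_utils.jsonify behaves like Flask's jsonify are all hypothetical.

# Hedged sketch of a prediction endpoint (route name, request field and inline
# index-mapping are assumptions; the real app uses data_processing.build_dataset_online).
@app.route('/predict', methods=['POST'])
def predict():
    text = request.form.get('text', '')
    words = tokenizer(text)
    # map words to ids and pad/truncate to max_seq_len, just like build_dataset above
    ids = [dict_word2index.get(w, dict_word2index.get('<UNK>')) for w in words]
    ids = ids[:max_seq_len] + [dict_word2index.get('<PAD>')] * max(0, max_seq_len - len(ids))
    feats = torch.tensor([ids], dtype=torch.int64).to(device)
    with torch.no_grad():
        logits = model(feats)
        pred = torch.argmax(logits, dim=1).item()
    return jsonify({'text': text, 'label': class_list[pred]})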
For a more detailed TextCNN short-text classification case study, you are welcome to join the discussion group and learn together.
Visit the group linked on the blog homepage, or leave a private message. Thanks!