self.label_vocab
Out[6]:
{
'POS': Vocabulary(['S-root', 'B-NR', 'M-NR', 'E-NR', 'B-NN']...),
'CWS': Vocabulary(['S', 'B', 'E', 'M']...),
'NER': Vocabulary(['O', 'B-NT', 'M-NT', 'E-NT', 'B-NR']...),
'Parsing': Vocabulary(['APP', 'nn', 'nsubj', 'rcmod', 'cpm']...),
'pos': Vocabulary(['root', 'NR', 'NN', 'VV', 'DEC']...)}
self.char_vocab
Out[10]: Vocabulary(['[unused12]', '有', '的', '厂', '长']...)
len(self.char_vocab)
Out[11]: 8675
There is a lot of content here and I'm running out of steam, so I'll put this thread on hold for now.
https://zhuanlan.zhihu.com/p/67106791
txt = ["中华 人民 共和国",
"中央 人民 政府"]
words
Out[2]:
tensor([[3, 2, 4],
[5, 2, 6]])
Convert the words to chars via the words_to_chars_embedding lookup table:
chars.shape
Out[3]: torch.Size([2, 3, 7])
# batch=2, 3 words per sentence, up to 7 chars per word
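A minimal sketch of what that lookup table amounts to (illustrative ids, not fastNLP's internals): each row stores the padded char ids of one vocabulary word, so plain tensor indexing turns a batch of word ids into a batch of char-id matrices.

import torch

# hypothetical lookup table: one row of (padded) char ids per word in the vocab
words_to_chars = torch.tensor([
    [0, 0, 0],     # <pad>
    [0, 0, 0],     # <unk>
    [10, 11, 0],   # "人民"  -> two chars
    [12, 13, 0],   # "中华"  -> two chars
    [14, 15, 16],  # "共和国" -> three chars
])
words = torch.tensor([[3, 2, 4]])   # [batch=1, 3 words]
chars = words_to_chars[words]       # index into the table -> [1, 3, max_word_len]
print(chars.shape)                  # torch.Size([1, 3, 3])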
Truncate to the maximum word length in the batch.
Honestly, I don't know where the +2 comes from.
max_word_len = word_lengths.max()
chars = chars[:, :, :max_word_len]
chars.shape
Out[4]: torch.Size([2, 3, 5])
Apply the char embedding:
chars = self.char_embedding(chars)
chars.shape
Out[6]: torch.Size([2, 3, 5, 50])
Merge the word dimension into the batch dimension:
reshaped_chars = chars.reshape(batch_size * max_len, max_word_len, -1)
reshaped_chars.shape
Out[5]: torch.Size([6, 5, 50])
Run the chars through 3 CNNs and concatenate the outputs.
Isn't this basically a grouped convolution?
conv_chars.shape
Out[7]: torch.Size([2, 3, 5, 90])
Then pool over the char dimension according to the mask and pass through a fully connected layer:
chars.shape
Out[8]: torch.Size([2, 3, 50])
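Putting those steps together, here is a minimal sketch of such a char-CNN word encoder (a simplified stand-in for fastNLP's CNNCharEmbedding; the kernel sizes are made up, and the filter counts are chosen so the concatenated channels total 90 as in the output above):

import torch
from torch import nn

class TinyCharCNN(nn.Module):
    """Simplified char-CNN word encoder: embed chars, run parallel 1D convs,
    mask-aware max-pool over chars, then project with a linear layer."""
    def __init__(self, n_chars, char_dim=50, filters=(30, 30, 30),
                 kernel_sizes=(1, 3, 5), out_dim=50, pad_idx=0):
        super().__init__()
        self.pad_idx = pad_idx
        self.char_embedding = nn.Embedding(n_chars, char_dim, padding_idx=pad_idx)
        self.convs = nn.ModuleList([
            nn.Conv1d(char_dim, f, k, padding=k // 2)
            for f, k in zip(filters, kernel_sizes)
        ])
        self.fc = nn.Linear(sum(filters), out_dim)

    def forward(self, chars):                        # chars: [batch, words, max_word_len]
        batch, words, wlen = chars.shape
        mask = chars.ne(self.pad_idx)                # [batch, words, wlen]
        x = self.char_embedding(chars)               # [batch, words, wlen, char_dim]
        x = x.reshape(batch * words, wlen, -1).transpose(1, 2)          # [B*W, char_dim, wlen]
        conv_out = torch.cat([conv(x) for conv in self.convs], dim=1)   # [B*W, sum(filters), wlen]
        conv_out = conv_out.reshape(batch, words, -1, wlen)
        # masked max-pooling over the char dimension
        conv_out = conv_out.masked_fill(~mask.unsqueeze(2), float('-inf'))
        pooled, _ = conv_out.max(dim=-1)             # [batch, words, sum(filters)]
        return self.fc(pooled)                       # [batch, words, out_dim]

# chars: batch of 2 sentences, 3 words each, up to 5 chars per word
chars = torch.randint(1, 100, (2, 3, 5))
print(TinyCharCNN(n_chars=100)(chars).shape)         # torch.Size([2, 3, 50])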
The charCNN is essentially unigram-based; it could be switched to bigrams (pre-trained bigram embeddings are available).
fastNLP.modules.tokenizer.bert_tokenizer.WordpieceTokenizer#tokenize
word_to_wordpieces
Out[14]:
[[0],
[100],
[782, 16753],
[704, 14347],
[1066, 14526, 14801],
[704, 14982],
[3124, 15481],
[791, 14978],
[2768, 18046],
[749]]
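This mapping can be reproduced with any WordPiece tokenizer: tokenize every word of the word vocabulary into pieces and record their ids. A hedged sketch, using the HuggingFace BertTokenizer as a stand-in for fastNLP's own WordpieceTokenizer, with a made-up word list matching the example below:

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
word_vocab = ['[PAD]', '[UNK]', '人民', '中华', '共和国', '中央', '政府', '今天', '成立', '了']

word_to_wordpieces = []
for word in word_vocab:
    pieces = tokenizer.wordpiece_tokenizer.tokenize(word)   # e.g. ['中', '##华']
    word_to_wordpieces.append(tokenizer.convert_tokens_to_ids(pieces))
print(word_to_wordpieces)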
txt = ["中华 人民 共和国",
"中央 人民 政府",
"今天 成立 了"]
Map the words to vocabulary indices:
words
Out[16]:
tensor([[3, 2, 4],
[5, 2, 6],
[7, 8, 9]])
Count the total word-piece length of each sentence:
batch_word_pieces_length.sum(dim=-1)
Out[15]: tensor([7, 6, 5])
The maximum length is 7; adding [CLS] and [SEP] makes it 9.
word_pieces.size()
Out[18]: torch.Size([3, 9])
Each sample in the batch is then processed word piece by word piece (for Chinese, essentially char by char):
word_pieces_i
Out[19]: [704, 14347, 782, 16753, 1066, 14526, 14801]
len(word_pieces_i)
Out[20]: 7
word_pieces_i
Out[21]: [704, 14982, 782, 16753, 3124, 15481]
len(word_pieces_i)
Out[22]: 6
At first I thought this was written wrong. Nope, it wasn't.
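A minimal sketch of how the flat word_pieces tensor could be assembled from word_to_wordpieces (illustrative, not fastNLP's exact code; the [CLS]/[SEP]/[PAD] ids are the usual bert-base-chinese ones, assumed here):

import torch

CLS, SEP, PAD = 101, 102, 0  # assumed special-token ids

# word_to_wordpieces copied from the output above
word_to_wordpieces = [
    [0], [100], [782, 16753], [704, 14347], [1066, 14526, 14801],
    [704, 14982], [3124, 15481], [791, 14978], [2768, 18046], [749],
]

def build_word_pieces(words, word_to_wordpieces):
    """words: [batch, n_words] word ids -> padded [batch, max_len] word-piece ids."""
    rows = []
    for sent in words.tolist():
        pieces = [CLS]
        for w in sent:
            pieces.extend(word_to_wordpieces[w])
        pieces.append(SEP)
        rows.append(pieces)
    max_len = max(len(r) for r in rows)
    return torch.tensor([r + [PAD] * (max_len - len(r)) for r in rows])

words = torch.tensor([[3, 2, 4], [5, 2, 6], [7, 8, 9]])
print(build_word_pieces(words, word_to_wordpieces).shape)  # torch.Size([3, 9])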
fastNLP.modules.encoder.bert.BertModel#forward
In BertModel, the input first goes through the embedding layer and then the encoder.
Position embeddings, with a maximum length of 512:
self.position_embeddings
Out[3]: Embedding(512, 768)
self.word_embeddings
Out[4]: Embedding(21128, 768, padding_idx=0)
self.token_type_embeddings
Out[5]: Embedding(2, 768)
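These three tables are combined the standard BERT way: word, position, and token-type embeddings are summed and layer-normalized (a sketch of the idea, not fastNLP's exact code; dropout omitted):

import torch
from torch import nn

hidden = 768
word_embeddings = nn.Embedding(21128, hidden, padding_idx=0)
position_embeddings = nn.Embedding(512, hidden)
token_type_embeddings = nn.Embedding(2, hidden)
layer_norm = nn.LayerNorm(hidden)

word_pieces = torch.randint(0, 21128, (3, 9))                 # [batch, seq_len]
positions = torch.arange(word_pieces.size(1)).unsqueeze(0)    # [1, seq_len]
token_types = torch.zeros_like(word_pieces)                   # single-segment input

embeddings = layer_norm(
    word_embeddings(word_pieces)
    + position_embeddings(positions)
    + token_type_embeddings(token_types)
)                                                             # [3, 9, 768]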
Finally, some pooling and other dirty-work operations produce the result:
outputs.shape
Out[11]: torch.Size([3, 3, 768])
The layer outputs are mixed with an ELMo-like mechanism (sketched below).
Training supports ensembling the best group of models.
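For reference, an ELMo-style scalar mix is just a learned softmax-weighted sum of the per-layer hidden states (a generic sketch, not fastNLP's exact implementation):

import torch
from torch import nn

class ScalarMix(nn.Module):
    """ELMo-style mix: a softmax-weighted sum of the per-layer hidden states."""
    def __init__(self, n_layers):
        super().__init__()
        self.weights = nn.Parameter(torch.zeros(n_layers))
        self.gamma = nn.Parameter(torch.ones(1))

    def forward(self, layer_outputs):        # list of [batch, seq, hidden]
        w = torch.softmax(self.weights, dim=0)
        return self.gamma * sum(wi * h for wi, h in zip(w, layer_outputs))

layers = [torch.randn(3, 9, 768) for _ in range(12)]
print(ScalarMix(12)(layers).shape)           # torch.Size([3, 9, 768])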
Now look at the forward pass of the CRF:
fastNLP.modules.decoder.crf.ConditionalRandomField#forward
batch 2, seq 26, n_tags 17
feats.shape
Out[17]: torch.Size([2, 26, 17])
torch.exp(feats).sum(2)
Out[15]:
tensor([[1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
Every position sums to 1 after exp, so feats are already log-softmax-normalized scores.
fastNLP.modules.decoder.crf.ConditionalRandomField#_normalizer_likelihood
seq_len, batch_size, n_tags = logits.size()
logits.shape
Out[6]: torch.Size([26, 2, 17])
start_scores holds the initial (start-transition) weights, and alpha is the initial score (think of it as $x$). It looks like $w + x$, but it is effectively $w \cdot x$, because $x$ is a logit: everything lives in log space, where addition corresponds to multiplication.
# alpha [2, 17]
if self.include_start_end_trans:
    alpha = alpha + self.start_scores.view(1, -1)
emit_score is the score of the current tag:
emit_score = logits[i].view(batch_size, 1, n_tags)
# [2, 1, 17]
alpha is the accumulated score up to the previous tag:
alpha.view(batch_size, n_tags, 1)
# [2, 17, 1]
trans_score is the tag-to-tag transition matrix, broadcast to [1, 17, 17]. The reshaped emit_score, alpha, and trans_score are then added together, which, since everything is in log space, is really a cumulative product; see the sketch below.
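A minimal sketch of that forward recursion in log space (a generic CRF normalizer using the same shapes as above; start/end transitions and masking are omitted, unlike fastNLP's real _normalizer_likelihood):

import torch

def log_partition(logits, trans):
    """logits: [seq_len, batch, n_tags] emission scores,
    trans: [n_tags, n_tags] transition scores (from-tag -> to-tag).
    Returns log Z for each sequence in the batch."""
    seq_len, batch, n_tags = logits.size()
    alpha = logits[0]                                   # [batch, n_tags]
    for i in range(1, seq_len):
        emit = logits[i].view(batch, 1, n_tags)         # current tag
        prev = alpha.view(batch, n_tags, 1)             # previous tag
        # add = multiply in probability space; logsumexp = sum over previous tags
        alpha = torch.logsumexp(prev + trans.view(1, n_tags, n_tags) + emit, dim=1)
    return torch.logsumexp(alpha, dim=1)                # [batch]

logits = torch.randn(26, 2, 17)
trans = torch.randn(17, 17)
print(log_partition(logits, trans).shape)               # torch.Size([2])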
Compute the transition scores along the gold tag sequence:
trans_score = self.trans_m[tags[:seq_len - 1], tags[1:]].masked_fill(flip_mask[1:, :], 0)
Compute the emission scores along the gold tag sequence:
emit_score = logits[seq_idx.view(-1, 1), batch_idx.view(1, -1), tags].masked_fill(flip_mask, 0)
Essentially, gold_score is the (log-space) score of the path where the states are y_true, while _normalizer_likelihood has the model "guess" every possible state sequence and sum up their scores, i.e. the log partition function; the loss is the difference of the two.
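In equation form, with $W$ the transition matrix and $\mathrm{logit}_t$ the emission scores, these two quantities implement the standard linear-chain CRF loss:

\mathrm{score}(x, y) = \sum_t \big( W_{y_{t-1}, y_t} + \mathrm{logit}_t(y_t) \big), \qquad \log Z(x) = \log \sum_{y'} \exp\big( \mathrm{score}(x, y') \big)

\mathcal{L}(x, y_{\mathrm{true}}) = \log Z(x) - \mathrm{score}(x, y_{\mathrm{true}})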
The official fastNLP example uses only chars and does no word segmentation at all. So I developed MyChnSentiCorpPipe with ChnSentiCorpPipe as the base class, added a few fields to the dataset, and rewrote the model with two LSTMs (one over chars, one over words).
TODO: try attention, CNN, and charCNN.
It runs well, with results close to BERT's.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author : qichun tang
# @Date : 2021-01-06
# @Contact : [email protected]
import os
import pkuseg
import torch
from joblib import load, dump
from torch import nn
from torch.optim import Adam
from fastNLP import AccuracyMetric
from fastNLP import Const
from fastNLP import CrossEntropyLoss
from fastNLP import Trainer
from fastNLP.embeddings import StaticEmbedding
from fastNLP.io import ChnSentiCorpLoader, DataBundle
from fastNLP.io import ChnSentiCorpPipe
from fastNLP.io.pipe.utils import _indexize
from fastNLP.modules import LSTM
loader = ChnSentiCorpLoader()  # initialize a loader for the Chinese sentiment classification corpus
data_dir = loader.download()  # automatically download the data to the default cache directory and return that path
data_bundle = loader.load(data_dir)  # read the data from {data_dir} into a DataBundle
WORDS_SEQ_LEN = "words_seq_len"
class MyChnSentiCorpPipe(ChnSentiCorpPipe):
    def process(self, data_bundle: DataBundle):
        seg = pkuseg.pkuseg()
        data_bundle = super(MyChnSentiCorpPipe, self).process(data_bundle)
        data_bundle.apply_field(seg.cut, field_name=Const.RAW_CHAR, new_field_name=Const.INPUT)
        data_bundle.set_input(Const.INPUT)
        _indexize(data_bundle)
        for name, dataset in data_bundle.datasets.items():
            dataset.add_seq_len(Const.INPUT, new_field_name=WORDS_SEQ_LEN)
        data_bundle.set_input(WORDS_SEQ_LEN)
        return data_bundle
# define the model
class BiLSTMMaxPoolCls(nn.Module):
    def __init__(self, char_embed, word_embed, num_classes, hidden_size=400, num_layers=1, dropout=0.3):
        super().__init__()
        self.char_embed = char_embed
        self.word_embed = word_embed
        self.lstm1 = LSTM(self.char_embed.embedding_dim, hidden_size=hidden_size // 4, num_layers=num_layers,
                          batch_first=True, bidirectional=True)
        self.lstm2 = LSTM(self.word_embed.embedding_dim, hidden_size=hidden_size // 4, num_layers=num_layers,
                          batch_first=True, bidirectional=True)
        self.dropout_layer = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, chars, words, seq_len, words_seq_len):
        # parameter names must match the field names in the DataSet; the DataSet has a
        # `chars` field, so the parameter here must also be called `chars`
        # chars: [batch_size, max_len]
        # seq_len: [batch_size, ]
        chars = self.char_embed(chars)
        words = self.word_embed(words)
        outputs1, _ = self.lstm1(chars, seq_len)
        outputs2, _ = self.lstm2(words, words_seq_len)
        outputs1 = self.dropout_layer(outputs1)
        outputs2 = self.dropout_layer(outputs2)
        outputs1, _ = torch.max(outputs1, dim=1)
        outputs2, _ = torch.max(outputs2, dim=1)
        outputs = torch.cat([outputs1, outputs2], dim=1)
        outputs = self.fc(outputs)
        # the return value must be a dict; the key for predictions should be 'pred'
        return {'pred': outputs}  # [batch_size, num_classes]
# load and preprocess the data (cache the processed DataBundle)
cache_name = "ChnSentiCorp.pkl"
if os.path.exists(cache_name):
    data_bundle = load(cache_name)
else:
    pipe = MyChnSentiCorpPipe(bigrams=False)
    data_bundle = pipe.process(data_bundle)  # every Pipe implements process(), taking and returning a DataBundle
    dump(data_bundle, cache_name)
# initialize the model
char_embed = StaticEmbedding(data_bundle.get_vocab("chars"), model_dir_or_name='cn-char-fastnlp-100d')
word_embed = StaticEmbedding(data_bundle.get_vocab("words"),
model_dir_or_name='/media/tqc/doc/dataset/fastNLP/70000-small.txt')
model = BiLSTMMaxPoolCls(char_embed, word_embed, len(data_bundle.get_vocab('target')))
loss = CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=0.001)
metric = AccuracyMetric()
device = 0 if torch.cuda.is_available() else 'cpu'  # run on the GPU if one is available; training is much faster
trainer = Trainer(train_data=data_bundle.get_dataset('train'), model=model, loss=loss,
optimizer=optimizer, batch_size=32, dev_data=data_bundle.get_dataset('dev'),
metrics=metric, device=device)
trainer.train()  # start training; once finished, the model that performed best on dev is loaded by default
# evaluate the model's performance on the test set
from fastNLP import Tester
print("Performance on test is:")
tester = Tester(data=data_bundle.get_dataset('test'), model=model, metrics=metric, batch_size=64, device=device)
tester.test()