论文:Effective Approaches to Attention-based Neural Machine Translation
作者:Minh-Thang Luong, Hieu Pham, Christopher D. Manning
时间:2015
这里我们使用 tensorflow 实现,代码如下:
# 完整代码在这里
import tensorflow as tf
import keras_nlp
import matplotlib.pyplot as plt
import numpy as np
import os
import random
# Select SimHei as the sans-serif font (presumably so CJK labels render in figures).
plt.rcParams['font.sans-serif']=['SimHei']
# Turn off the Unicode minus glyph for axis tick labels.
plt.rcParams['axes.unicode_minus']=False
# Data processing
def process_data(x):
    """Split one tab-separated line and return its 2nd and 4th fields.

    Downstream code in this file feeds the first returned value to the
    Chinese tokenizer and the second one to the English tokenizer.
    """
    fields = tf.strings.split(x, sep='\t')
    source_text = fields[1]
    target_text = fields[3]
    return source_text, target_text
# Load the data: one tab-separated sentence pair per line.
dataset = tf.data.TextLineDataset('./data/transformer_data.tsv')
dataset = dataset.map(process_data)
# Learn a WordPiece vocabulary for each side of the pair.
vocab_chinese = keras_nlp.tokenizers.compute_word_piece_vocabulary(
    dataset.map(lambda x, y: x),
    vocabulary_size=20_000,
    lowercase=True,
    strip_accents=True,
    split_on_cjk=True,
    reserved_tokens=["[PAD]", "[START]", "[END]", "[MASK]", "[UNK]"],
)
vocab_english = keras_nlp.tokenizers.compute_word_piece_vocabulary(
    dataset.map(lambda x, y: y),
    vocabulary_size=20_000,
    lowercase=True,
    strip_accents=True,
    split_on_cjk=True,
    reserved_tokens=["[PAD]", "[START]", "[END]", "[MASK]", "[UNK]"],
)
# Build the tokenizers.
# The vocabularies above were learned from lower-cased, accent-stripped text,
# so the tokenizers must normalise their input the same way; otherwise cased
# or accented words cannot be found in the vocabulary and become [UNK].
chinese_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=vocab_chinese,
    lowercase=True,
    strip_accents=True,
    oov_token="[UNK]",
)
english_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=vocab_english,
    lowercase=True,
    strip_accents=True,
    oov_token="[UNK]",
)
# Second preprocessing pass: tokenize a batch and build teacher-forcing pairs.
def _add_start_end(tokens):
    """Surround every row of a ragged token batch with id 1 and id 2.

    Ids 1 and 2 are the positions of "[START]" and "[END]" in the
    ``reserved_tokens`` list used when the vocabularies were built.
    """
    # Take the row count from the batch itself instead of assuming 64 rows,
    # so the last (smaller) batch of the dataset is handled too.
    rows = tokens.nrows(out_type=tf.int32)
    start = tf.ones(shape=[rows, 1], dtype='int32')
    end = tf.ones(shape=[rows, 1], dtype='int32') * 2
    return tf.concat([start, tokens, end], axis=-1)


def process_data_(ch, en, maxtoken=128):
    """Turn a batch of (Chinese, English) strings into model inputs and labels.

    Args:
        ch: batch of source strings.
        en: batch of target strings (lower-cased before tokenization).
        maxtoken: maximum number of tokens kept per sentence, before the
            start/end markers are added.

    Returns:
        ``((ch, en_inputs), en_labels)`` as dense tensors; ``en_inputs`` is the
        target sequence without its last token and ``en_labels`` is the same
        sequence without its first token.
    """
    ch = chinese_tokenizer(ch)[:, :maxtoken]
    en = english_tokenizer(tf.strings.lower(en))[:, :maxtoken]
    ch = _add_start_end(ch).to_tensor()
    en = _add_start_end(en)
    en_inputs = en[:, :-1].to_tensor()  # Drop the [END] tokens
    en_labels = en[:, 1:].to_tensor()  # Drop the [START] tokens
    return (ch, en_inputs), en_labels
dataset = dataset.batch(64).map(process_data_)
# Keep the validation and training batches disjoint: the first 300 batches
# are held out for validation and the next (up to) 1000 are used for training.
# The previous split (take(1000) / skip(500).take(300)) validated on batches
# that were also trained on.
val_dataset = dataset.take(300)
train_dataset = dataset.skip(300).take(1000)
# Data is ready: pull one batch to inspect the shapes (also reused later to
# build the model).
for (pt, en), en_labels in dataset.take(1):
    break
print(pt.shape)
print(en.shape)
print(en_labels.shape)
# Build the encoder
class Encoder(tf.keras.layers.Layer):
    """Token embedding followed by a bidirectional LSTM.

    The forward and backward LSTM outputs are merged by summation
    (``merge_mode='sum'``) and the full per-timestep sequence is returned.
    """

    def __init__(self, vocabulary_size, d_model, units):
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(vocabulary_size, d_model)
        recurrent = tf.keras.layers.LSTM(
            units=units,
            return_sequences=True,
            return_state=False,
        )
        self.rnn = tf.keras.layers.Bidirectional(layer=recurrent, merge_mode='sum')

    def call(self, inputs):
        """Embed the token ids in ``inputs`` and run them through the BiLSTM."""
        embedded = self.embedding(inputs)
        return self.rnn(embedded)
# Build the cross-attention layer
class CrossAttention(tf.keras.layers.Layer):
    """Single-head attention of a query sequence over a context sequence.

    The attention output is added back onto the query and the sum is
    layer-normalised. Extra keyword arguments are forwarded to
    ``tf.keras.layers.MultiHeadAttention``.
    """

    def __init__(self, units, **kwargs):
        super().__init__()
        self.mha = tf.keras.layers.MultiHeadAttention(num_heads=1, key_dim=units, **kwargs)
        self.add = tf.keras.layers.Add()
        self.norm = tf.keras.layers.LayerNormalization()

    def call(self, inputs):
        """Attend from ``inputs[0]`` (query) over ``inputs[1]`` (context)."""
        query, context = inputs
        attended, scores = self.mha(
            query=query,
            key=context,
            value=context,
            return_attention_scores=True,
        )
        # Remember the latest attention weights so they can be inspected later.
        self.last_attention_score = scores
        residual = self.add([query, attended])
        return self.norm(residual)
# Build the decoder
class Decoder(tf.keras.layers.Layer):
    """Embedding -> LSTM -> cross-attention -> softmax over the vocabulary.

    Extra keyword arguments are passed through to ``CrossAttention``.
    """

    def __init__(self, vocabulary_size, d_model, units, **kwargs):
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(vocabulary_size, d_model)
        self.rnn = tf.keras.layers.LSTM(units, return_sequences=True)
        self.attention = CrossAttention(units, **kwargs)
        self.dense = tf.keras.layers.Dense(vocabulary_size, activation='softmax')

    def call(self, inputs):
        """Return per-position token probabilities for ``(tokens, context)``."""
        tokens, context = inputs
        hidden = self.rnn(self.embedding(tokens))
        attended = self.attention((hidden, context))
        return self.dense(attended)
# Build the final model
class Seq2Seq(tf.keras.models.Model):
    """Encoder-decoder translation model.

    Args:
        vocabulary_size_1: vocabulary size used by the encoder embedding.
        vocabulary_size_2: vocabulary size used by the decoder embedding and
            by the output softmax.
        d_model: embedding dimension of both embeddings.
        units: LSTM width of the encoder and the decoder.
        **kwargs: forwarded to ``Decoder`` (which hands them on to its
            attention layer). They were previously accepted but ignored.
    """

    def __init__(self, vocabulary_size_1, vocabulary_size_2, d_model, units, **kwargs):
        super().__init__()
        self.encoder = Encoder(vocabulary_size=vocabulary_size_1, d_model=d_model, units=units)
        self.decoder = Decoder(vocabulary_size=vocabulary_size_2, d_model=d_model, units=units, **kwargs)

    def call(self, inputs):
        """Run ``(source_tokens, target_tokens)`` through encoder then decoder."""
        pt, en = inputs
        context = self.encoder(pt)
        output = self.decoder((en, context))
        return output
seq2seq = Seq2Seq(
    vocabulary_size_1=chinese_tokenizer.vocabulary_size(),
    vocabulary_size_2=english_tokenizer.vocabulary_size(),
    d_model=512,
    units=30,
)
# Model overview: call the model once on the sample batch so that its
# weights exist, then print the layer summary.
seq2seq((pt, en))
seq2seq.summary()
# Model configuration
def masked_loss(y_true, y_pred):
    """Mean sparse categorical cross-entropy over positions where y_true != 0."""
    cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(reduction='none')
    per_token = cross_entropy(y_true, y_pred)
    # Label id 0 marks positions that must not contribute to the loss.
    keep = tf.cast(y_true != 0, per_token.dtype)
    total = tf.reduce_sum(per_token * keep)
    count = tf.reduce_sum(keep)
    return total / count
def masked_acc(y_true, y_pred):
    """Token accuracy computed only over positions where ``y_true != 0``.

    Args:
        y_true: integer label ids; id 0 marks positions to ignore.
        y_pred: per-class scores whose last axis is the class axis.

    Returns:
        Number of correctly predicted non-zero-label positions divided by the
        number of non-zero-label positions.
    """
    y_pred = tf.argmax(y_pred, axis=-1)
    y_pred = tf.cast(y_pred, y_true.dtype)
    mask = tf.cast(y_true != 0, tf.float32)
    # Apply the mask to the matches as well: without it, ignored positions
    # predicted as 0 would be counted as hits and inflate the accuracy.
    match = tf.cast(y_true == y_pred, tf.float32) * mask
    return tf.reduce_sum(match) / tf.reduce_sum(mask)
# Configure training: Adam optimizer, the masked loss as objective, and the
# masked accuracy plus the masked loss reported as metrics.
seq2seq.compile(
optimizer='adam',
loss=masked_loss,
metrics=[masked_acc, masked_loss]
)
# Train the model
seq2seq.fit(train_dataset, epochs=20, validation_data=val_dataset)
# Inference
class Inference(tf.Module):
    """Greedy, token-by-token translation of one sentence.

    Args:
        model: callable taking ``(encoder_tokens, decoder_tokens)`` and
            returning scores of shape ``(batch_size, seq_len, vocab_size)``.
        tokenizer_1: tokenizer applied to the input sentence.
        tokenizer_2: tokenizer whose ``detokenize`` turns the generated ids
            back into text.
    """

    def __init__(self, model, tokenizer_1, tokenizer_2):
        self.model = model
        self.tokenizer_1 = tokenizer_1
        self.tokenizer_2 = tokenizer_2

    def __call__(self, sentence, MAX_TOKEN=128):
        """Translate ``sentence`` and return the detokenized result.

        Args:
            sentence: a string tensor (a plain Python string is converted).
                The decoding state below is set up for a single sentence.
            MAX_TOKEN: maximum number of decoding steps.

        Returns:
            ``tokenizer_2.detokenize`` applied to all ids written during
            decoding, beginning with the start id (1).
        """
        # Accept plain Python strings as well; tensors pass through unchanged.
        sentence = tf.convert_to_tensor(sentence)
        if len(sentence.shape) == 0:
            sentence = sentence[tf.newaxis]
        sentence = self.tokenizer_1(sentence)
        batch_size = sentence.shape[0]
        start_column = tf.ones(shape=[batch_size, 1], dtype='int32')
        end_column = tf.ones(shape=[batch_size, 1], dtype='int32') * 2
        encoder_input = tf.concat([start_column, sentence, end_column], axis=-1).to_tensor()
        start = tf.constant(1, dtype='int64')[tf.newaxis]
        end = tf.constant(2, dtype='int64')[tf.newaxis]
        # tf.TensorArray is used here like a growable Python list.
        output_array = tf.TensorArray(dtype=tf.int64, size=0, dynamic_size=True)
        # Write the start id at index 0.
        output_array = output_array.write(0, start)
        for i in tf.range(MAX_TOKEN):
            output = tf.transpose(output_array.stack())
            # Call the model directly instead of `predict`: this is a single
            # small batch evaluated in a loop with a growing sequence length.
            predictions = self.model((encoder_input, output), training=False)  # Shape `(batch_size, seq_len, vocab_size)`
            # Keep only the scores of the last position along seq_len.
            predictions = predictions[:, -1:, :]  # Shape `(batch_size, 1, vocab_size)`.
            predicted_id = tf.argmax(predictions, axis=-1)
            # Append `predicted_id` so it becomes part of the next decoder input.
            output_array = output_array.write(i + 1, predicted_id[0])
            # Stop as soon as the end id is produced.
            if predicted_id == end:
                break
        output = tf.squeeze(output_array.stack())
        output = self.tokenizer_2.detokenize(output)
        return output
inference = Inference(
    model=seq2seq,
    tokenizer_1=chinese_tokenizer,
    tokenizer_2=english_tokenizer,
)
# Run inference on a sample sentence
sentence = tf.constant('你好')
inference(sentence)
# 输出
#
介绍 seq2seq 之前我们需要介绍一下RNN模型,RNN模型表示循环神经网络,具有代表性的有SimpleRNN,GRU,LSTM;其基本实现原理如图:
用公式表达如下:
$$\begin{aligned} & O_t = g(V \cdot S_t + B_1) \\ & S_t = f(U \cdot X_t + W \cdot S_{t-1} + B_2) \end{aligned}$$
其中 $W$, $U$, $V$ 三者权重是共享的,所有RNN的参数数量是与 $X_t$ 的最后一个维度有关的,维度变化公式如下:
$$\begin{aligned} & O_{[output_{dim} \times 1]} = V_{[output_{dim} \times units]} \cdot S_{[units \times 1]} + B_{[output_{dim} \times 1]} \\ & S_{[units \times 1]} = U_{[units \times x_{dim}]} \cdot X_{[x_{dim} \times 1]} + W_{[units \times units]} \cdot S_{[units \times 1]} + B_{[units \times 1]} \end{aligned}$$
所以,RNN需要的参数数量为(units+x_dim+1)*units + (units+1)*output_dim
论文中提出了两种Attention-based Model,分别是全局注意力模型和局部注意力模型;其结构图如下:
从图中可以看到,全局和局部的区别在于 $a_t$ 和 $c_t$ 的不同。在分析之前,我们先定义一些变量:$t$ 是时间步;$h_t$ 是decoder在第 $t$ 个时间步的隐状态;$a_t$ 是模型的对齐权重向量,主要由 $h_t$ 和 $\overline{h}_s$ 计算得到;$\overline{h}_s$ 是encoder(源端)中第 $s$ 个位置的state;$c_t$ 被称作内容向量(context vector),由 $a_t$ 和 $\overline{h}_s$ 计算得到;
接下来我们依次对全局注意力和局部注意力进行分析;
如图,$c_t$ 是由 $a_t$ 和 $\overline{h}_s$ 计算得到,这里首先定义 $a_t(s)$ 的计算公式为:
$$\begin{aligned} a_t(s) &= \mathrm{align}(h_t, \overline{h}_s) \\ &= \frac{\exp(\mathrm{score}(h_t, \overline{h}_s))}{\sum_{s'} \exp(\mathrm{score}(h_t, \overline{h}_{s'}))} \end{aligned}$$
论文中这里定义 $\mathrm{score}(h_t, \overline{h}_s)$ 有三种方式:
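$$\mathrm{score}(h_t, \overline{h}_s) = \begin{cases} h_t^\top \overline{h}_s & \text{dot} \\ h_t^\top W_a \overline{h}_s & \text{general} \\ v_a^\top \tanh\left(W_a [h_t; \overline{h}_s]\right) & \text{concat} \end{cases}$$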
这里用 $query$, $key$, $value$ 来解释就相当于 $h_t$ 做 $query$,$\overline{h}_s$ 做 $key$ 和 $value$;其流程为 $h_t \rightarrow a_t \rightarrow c_t \rightarrow \tilde{h}_t$
全局注意力机制有一个缺点,即它必须关注每个目标词的源端的所有单词,这是昂贵的,并可能使翻译更长的序列不切实际,例如段落或文档。这里使用局部注意力机制进行优化;
所谓局部注意力机制就是说我们不去计算所有位置,而是只计算部分位置。那么这部分位置该怎么选择呢?在语言翻译模型中,某部分的target是由某部分的source构成的,在已知target的位置 $t$ 时找到source的位置 $p_t$,论文中有两种方式去实现:
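单调对齐(local-m):$p_t = t$;预测对齐(local-p):$p_t = S \cdot \mathrm{sigmoid}\left(v_p^\top \tanh(W_p h_t)\right)$,其中 $S$ 是源句长度。这里的 $v_p$, $W_p$ 都是参数;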
在找到 $p_t$ 之后,我们对 $[p_t - D, p_t + D]$ 这些位置上的 $\overline{h}_s$ 进行注意力机制计算 $a_t, c_t$;
同时由于词距离 $p_t$ 越远,则其影响越弱,这里论文中使用高斯分布的方式对 $a_t$ 取值,取值方式如下:$$a_t(s) = \mathrm{align}(h_t, \overline{h}_s)\exp\left(-\frac{(s-p_t)^2}{2\sigma^2}\right)$$
根据经验我们一般把 $\sigma$ 设置为 $\frac{D}{2}$,这就是局部注意力机制;
在全局和局部注意力模型中,各个时间步的注意力决策都是独立进行的,并没有对下一个时间步的过程产生影响,这并不合理。在标准的MT中,通常在翻译过程中会维护一个覆盖集,以跟踪哪些源词已经被翻译过。同样地,在基于注意力的NMT中,对齐决策应该共同考虑到过去的对齐信息。我们可以优化一下,把每个时间步的注意力输出 $\tilde{h}_t$ 与下一个时间步的输入拼接后一起送入decoder(即论文中的 input-feeding 方法);如图所示:
论文中模型效果如图所示:
这里要用到的包有:tensorflow, keras_nlp, matplotlib, numpy
import tensorflow as tf
import keras_nlp
import matplotlib.pyplot as plt
import numpy as np
# Select SimHei as the sans-serif font (presumably so CJK labels render in figures).
plt.rcParams['font.sans-serif']=['SimHei']
# Turn off the Unicode minus glyph for axis tick labels.
plt.rcParams['axes.unicode_minus']=False
这里使用的是中英文翻译数据集,进行清洗和 dataset 构造
# Data processing
def process_data(x):
    """Split one tab-separated line and return its 2nd and 4th fields.

    Downstream code in this file feeds the first returned value to the
    Chinese tokenizer and the second one to the English tokenizer.
    """
    fields = tf.strings.split(x, sep='\t')
    source_text = fields[1]
    target_text = fields[3]
    return source_text, target_text
# Load the data: one tab-separated sentence pair per line.
dataset = tf.data.TextLineDataset('./data/transformer_data.tsv')
dataset = dataset.map(process_data)
# Learn a WordPiece vocabulary for each side of the pair.
vocab_chinese = keras_nlp.tokenizers.compute_word_piece_vocabulary(
    dataset.map(lambda x, y: x),
    vocabulary_size=20_000,
    lowercase=True,
    strip_accents=True,
    split_on_cjk=True,
    reserved_tokens=["[PAD]", "[START]", "[END]", "[MASK]", "[UNK]"],
)
vocab_english = keras_nlp.tokenizers.compute_word_piece_vocabulary(
    dataset.map(lambda x, y: y),
    vocabulary_size=20_000,
    lowercase=True,
    strip_accents=True,
    split_on_cjk=True,
    reserved_tokens=["[PAD]", "[START]", "[END]", "[MASK]", "[UNK]"],
)
# Build the tokenizers.
# The vocabularies above were learned from lower-cased, accent-stripped text,
# so the tokenizers must normalise their input the same way; otherwise cased
# or accented words cannot be found in the vocabulary and become [UNK].
chinese_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=vocab_chinese,
    lowercase=True,
    strip_accents=True,
    oov_token="[UNK]",
)
english_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=vocab_english,
    lowercase=True,
    strip_accents=True,
    oov_token="[UNK]",
)
# Second preprocessing pass: tokenize a batch and build teacher-forcing pairs.
def _add_start_end(tokens):
    """Surround every row of a ragged token batch with id 1 and id 2.

    Ids 1 and 2 are the positions of "[START]" and "[END]" in the
    ``reserved_tokens`` list used when the vocabularies were built.
    """
    # Take the row count from the batch itself instead of assuming 64 rows,
    # so the last (smaller) batch of the dataset is handled too.
    rows = tokens.nrows(out_type=tf.int32)
    start = tf.ones(shape=[rows, 1], dtype='int32')
    end = tf.ones(shape=[rows, 1], dtype='int32') * 2
    return tf.concat([start, tokens, end], axis=-1)


def process_data_(ch, en, maxtoken=128):
    """Turn a batch of (Chinese, English) strings into model inputs and labels.

    Args:
        ch: batch of source strings.
        en: batch of target strings (lower-cased before tokenization).
        maxtoken: maximum number of tokens kept per sentence, before the
            start/end markers are added.

    Returns:
        ``((ch, en_inputs), en_labels)`` as dense tensors; ``en_inputs`` is the
        target sequence without its last token and ``en_labels`` is the same
        sequence without its first token.
    """
    ch = chinese_tokenizer(ch)[:, :maxtoken]
    en = english_tokenizer(tf.strings.lower(en))[:, :maxtoken]
    ch = _add_start_end(ch).to_tensor()
    en = _add_start_end(en)
    en_inputs = en[:, :-1].to_tensor()  # Drop the [END] tokens
    en_labels = en[:, 1:].to_tensor()  # Drop the [START] tokens
    return (ch, en_inputs), en_labels
dataset = dataset.batch(64).map(process_data_)
# Keep the validation and training batches disjoint: the first 300 batches
# are held out for validation and the next (up to) 1000 are used for training.
# The previous split (take(1000) / skip(500).take(300)) validated on batches
# that were also trained on.
val_dataset = dataset.take(300)
train_dataset = dataset.skip(300).take(1000)
# Data is ready: pull one batch to inspect the shapes (also reused later to
# build the model).
for (pt, en), en_labels in dataset.take(1):
    break
print(pt.shape)
print(en.shape)
print(en_labels.shape)
encoder:
class Encoder(tf.keras.layers.Layer):
    """Token embedding followed by a bidirectional LSTM.

    The forward and backward LSTM outputs are merged by summation
    (``merge_mode='sum'``) and the full per-timestep sequence is returned.
    """

    def __init__(self, vocabulary_size, d_model, units):
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(vocabulary_size, d_model)
        recurrent = tf.keras.layers.LSTM(
            units=units,
            return_sequences=True,
            return_state=False,
        )
        self.rnn = tf.keras.layers.Bidirectional(layer=recurrent, merge_mode='sum')

    def call(self, inputs):
        """Embed the token ids in ``inputs`` and run them through the BiLSTM."""
        embedded = self.embedding(inputs)
        return self.rnn(embedded)
crossattention:
class CrossAttention(tf.keras.layers.Layer):
    """Single-head attention of a query sequence over a context sequence.

    The attention output is added back onto the query and the sum is
    layer-normalised. Extra keyword arguments are forwarded to
    ``tf.keras.layers.MultiHeadAttention``.
    """

    def __init__(self, units, **kwargs):
        super().__init__()
        self.mha = tf.keras.layers.MultiHeadAttention(num_heads=1, key_dim=units, **kwargs)
        self.add = tf.keras.layers.Add()
        self.norm = tf.keras.layers.LayerNormalization()

    def call(self, inputs):
        """Attend from ``inputs[0]`` (query) over ``inputs[1]`` (context)."""
        query, context = inputs
        attended, scores = self.mha(
            query=query,
            key=context,
            value=context,
            return_attention_scores=True,
        )
        # Remember the latest attention weights so they can be inspected later.
        self.last_attention_score = scores
        residual = self.add([query, attended])
        return self.norm(residual)
decoder:
class Decoder(tf.keras.layers.Layer):
    """Embedding -> LSTM -> cross-attention -> softmax over the vocabulary.

    Extra keyword arguments are passed through to ``CrossAttention``.
    """

    def __init__(self, vocabulary_size, d_model, units, **kwargs):
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(vocabulary_size, d_model)
        self.rnn = tf.keras.layers.LSTM(units, return_sequences=True)
        self.attention = CrossAttention(units, **kwargs)
        self.dense = tf.keras.layers.Dense(vocabulary_size, activation='softmax')

    def call(self, inputs):
        """Return per-position token probabilities for ``(tokens, context)``."""
        tokens, context = inputs
        hidden = self.rnn(self.embedding(tokens))
        attended = self.attention((hidden, context))
        return self.dense(attended)
seq2seq:
class Seq2Seq(tf.keras.models.Model):
    """Encoder-decoder translation model.

    Args:
        vocabulary_size_1: vocabulary size used by the encoder embedding.
        vocabulary_size_2: vocabulary size used by the decoder embedding and
            by the output softmax.
        d_model: embedding dimension of both embeddings.
        units: LSTM width of the encoder and the decoder.
        **kwargs: forwarded to ``Decoder`` (which hands them on to its
            attention layer). They were previously accepted but ignored.
    """

    def __init__(self, vocabulary_size_1, vocabulary_size_2, d_model, units, **kwargs):
        super().__init__()
        self.encoder = Encoder(vocabulary_size=vocabulary_size_1, d_model=d_model, units=units)
        self.decoder = Decoder(vocabulary_size=vocabulary_size_2, d_model=d_model, units=units, **kwargs)

    def call(self, inputs):
        """Run ``(source_tokens, target_tokens)`` through encoder then decoder."""
        pt, en = inputs
        context = self.encoder(pt)
        output = self.decoder((en, context))
        return output
构建模型如下:
seq2seq = Seq2Seq(
    vocabulary_size_1=chinese_tokenizer.vocabulary_size(),
    vocabulary_size_2=english_tokenizer.vocabulary_size(),
    d_model=512,
    units=30,
)
# Build the model: call it once on the sample batch so that its weights
# exist, then print the layer summary.
seq2seq((pt, en))
seq2seq.summary()
模型配置:
def masked_loss(y_true, y_pred):
    """Mean sparse categorical cross-entropy over positions where y_true != 0."""
    cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(reduction='none')
    per_token = cross_entropy(y_true, y_pred)
    # Label id 0 marks positions that must not contribute to the loss.
    keep = tf.cast(y_true != 0, per_token.dtype)
    total = tf.reduce_sum(per_token * keep)
    count = tf.reduce_sum(keep)
    return total / count
def masked_acc(y_true, y_pred):
    """Token accuracy computed only over positions where ``y_true != 0``.

    Args:
        y_true: integer label ids; id 0 marks positions to ignore.
        y_pred: per-class scores whose last axis is the class axis.

    Returns:
        Number of correctly predicted non-zero-label positions divided by the
        number of non-zero-label positions.
    """
    y_pred = tf.argmax(y_pred, axis=-1)
    y_pred = tf.cast(y_pred, y_true.dtype)
    mask = tf.cast(y_true != 0, tf.float32)
    # Apply the mask to the matches as well: without it, ignored positions
    # predicted as 0 would be counted as hits and inflate the accuracy.
    match = tf.cast(y_true == y_pred, tf.float32) * mask
    return tf.reduce_sum(match) / tf.reduce_sum(mask)
# Configure training: Adam optimizer, the masked loss as objective, and the
# masked accuracy plus the masked loss reported as metrics.
seq2seq.compile(
optimizer='adam',
loss=masked_loss,
metrics=[masked_acc, masked_loss]
)
# Train for 10 epochs, evaluating on the validation set after each epoch.
seq2seq.fit(train_dataset, epochs=10, validation_data=val_dataset)
模型训练结果如下:
作图:
# Plot the per-epoch training and validation curves recorded by `fit`.
history = seq2seq.history.history
plt.plot(history['masked_loss'], label='loss')
plt.plot(history['val_masked_loss'], label='val_loss')
plt.plot(history['masked_acc'], label='accuracy')
plt.plot(history['val_masked_acc'], label='val_accuracy')
# The `label` arguments above are only drawn once a legend is requested.
plt.legend()
构建推理类:
class Inference(tf.Module):
    """Greedy, token-by-token translation of one sentence.

    Args:
        model: callable taking ``(encoder_tokens, decoder_tokens)`` and
            returning scores of shape ``(batch_size, seq_len, vocab_size)``.
        tokenizer_1: tokenizer applied to the input sentence.
        tokenizer_2: tokenizer whose ``detokenize`` turns the generated ids
            back into text.
    """

    def __init__(self, model, tokenizer_1, tokenizer_2):
        self.model = model
        self.tokenizer_1 = tokenizer_1
        self.tokenizer_2 = tokenizer_2

    def __call__(self, sentence, MAX_TOKEN=128):
        """Translate ``sentence`` and return the detokenized result.

        Args:
            sentence: a string tensor (a plain Python string is converted).
                The decoding state below is set up for a single sentence.
            MAX_TOKEN: maximum number of decoding steps.

        Returns:
            ``tokenizer_2.detokenize`` applied to all ids written during
            decoding, beginning with the start id (1).
        """
        # Accept plain Python strings as well; tensors pass through unchanged.
        sentence = tf.convert_to_tensor(sentence)
        if len(sentence.shape) == 0:
            sentence = sentence[tf.newaxis]
        sentence = self.tokenizer_1(sentence)
        batch_size = sentence.shape[0]
        start_column = tf.ones(shape=[batch_size, 1], dtype='int32')
        end_column = tf.ones(shape=[batch_size, 1], dtype='int32') * 2
        encoder_input = tf.concat([start_column, sentence, end_column], axis=-1).to_tensor()
        start = tf.constant(1, dtype='int64')[tf.newaxis]
        end = tf.constant(2, dtype='int64')[tf.newaxis]
        # tf.TensorArray is used here like a growable Python list.
        output_array = tf.TensorArray(dtype=tf.int64, size=0, dynamic_size=True)
        # Write the start id at index 0.
        output_array = output_array.write(0, start)
        for i in tf.range(MAX_TOKEN):
            output = tf.transpose(output_array.stack())
            # Call the model directly instead of `predict`: this is a single
            # small batch evaluated in a loop with a growing sequence length.
            predictions = self.model((encoder_input, output), training=False)  # Shape `(batch_size, seq_len, vocab_size)`
            # Keep only the scores of the last position along seq_len.
            predictions = predictions[:, -1:, :]  # Shape `(batch_size, 1, vocab_size)`.
            predicted_id = tf.argmax(predictions, axis=-1)
            # Append `predicted_id` so it becomes part of the next decoder input.
            output_array = output_array.write(i + 1, predicted_id[0])
            # Stop as soon as the end id is produced.
            if predicted_id == end:
                break
        output = tf.squeeze(output_array.stack())
        output = self.tokenizer_2.detokenize(output)
        return output
开始推理:
inference = Inference(
    model=seq2seq,
    tokenizer_1=chinese_tokenizer,
    tokenizer_2=english_tokenizer,
)
sentence = tf.constant('你好呀')
inference(sentence)
# 输出
#
效果还不错!训练一定时长后能够正确地翻译,好像相较于Transformer逊色了一点,但是毕竟这个模型结构比Transformer早两年;