Paper: Attention Is All You Need
Authors: Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, Illia Polosukhin
Year: 2017
The Transformer architecture was first proposed by Vaswani et al. in the 2017 paper "Attention Is All You Need". At its core it is a sequence-to-sequence, encoder-decoder neural network that replaces recurrent (RNN) and convolutional (CNN) layers with self-attention.
Below we build a Transformer model using basic TensorFlow code.
import tensorflow as tf
import keras_nlp
import matplotlib.pyplot as plt
import numpy as np
plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus']=False
# Prepare the dataset
dataset = tf.data.TextLineDataset('data.tsv')
def process_data(x):
res = tf.strings.split(x, '\t')
return res[1], res[3]
dataset.map(process_data).take(1).get_single_element()  # peek at one raw (chinese, english) pair
dataset = dataset.map(process_data)
vocab_chinese = keras_nlp.tokenizers.compute_word_piece_vocabulary(
dataset.map(lambda x, y: x),
vocabulary_size=20000,
lowercase=True,
strip_accents=True,
split_on_cjk=True,
reserved_tokens=["[PAD]", "[START]", "[END]", "[MASK]", "[UNK]"],
)
vocab_english = keras_nlp.tokenizers.compute_word_piece_vocabulary(
dataset.map(lambda x, y: y),
vocabulary_size=20000,
lowercase=True,
strip_accents=True,
split_on_cjk=True,
reserved_tokens=["[PAD]", "[START]", "[END]", "[MASK]", "[UNK]"],
)
chinese_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(vocabulary=vocab_chinese, oov_token="[UNK]")
english_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(vocabulary=vocab_english, oov_token="[UNK]")
def process_data_(ch, en, maxtoken=128):
ch = chinese_tokenizer(ch)[:,:maxtoken]
en = english_tokenizer(tf.strings.lower(en))[:,:maxtoken]
    ch = tf.concat([tf.ones(shape=(64,1), dtype='int32'), ch, tf.ones(shape=(64,1), dtype='int32')*2], axis=-1).to_tensor()  # prepend [START] (id 1), append [END] (id 2)
    en = tf.concat([tf.ones(shape=(64,1), dtype='int32'), en, tf.ones(shape=(64,1), dtype='int32')*2], axis=-1)
en_inputs = en[:, :-1].to_tensor() # Drop the [END] tokens
en_labels = en[:, 1:].to_tensor() # Drop the [START] tokens
return (ch, en_inputs), en_labels
dataset = dataset.batch(64, drop_remainder=True).map(process_data_)  # drop_remainder keeps the hardcoded batch size of 64 valid
train_dataset = dataset.take(1000)
val_dataset = dataset.skip(1000).take(300)  # skip past the training batches so train and validation don't overlap
# Define the Transformer
def positional_encoding(length, depth):
depth = depth/2
positions = np.arange(length)[:, np.newaxis] # (seq, 1)
depths = np.arange(depth)[np.newaxis, :]/depth # (1, depth)
angle_rates = 1 / (10000**depths) # (1, depth)
angle_rads = positions * angle_rates # (pos, depth)
pos_encoding = np.concatenate([np.sin(angle_rads), np.cos(angle_rads)],axis=-1)
return tf.cast(pos_encoding, dtype=tf.float32)
class PositionEmbedding(tf.keras.layers.Layer):
def __init__(self, vocabulary_size, d_model):
super().__init__()
self.d_model = d_model
self.embedding = tf.keras.layers.Embedding(vocabulary_size, d_model, mask_zero=True)
self.pos_encoding = positional_encoding(length=2048, depth=d_model)
def compute_mask(self, *args, **kwargs):
return self.embedding.compute_mask(*args, **kwargs)
def call(self, x):
length = tf.shape(x)[1]
x = self.embedding(x)
x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
x = x + self.pos_encoding[tf.newaxis, :length, :]
return x
class BaseAttention(tf.keras.layers.Layer):
def __init__(self, **kwargs):
super().__init__()
self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
self.layernorm = tf.keras.layers.LayerNormalization()
self.add = tf.keras.layers.Add()
class CrossAttention(BaseAttention):
def call(self, x, context):
attn_output, attn_scores = self.mha(
query=x,
key=context,
value=context,
return_attention_scores=True)
# Cache the attention scores for plotting later.
self.last_attn_scores = attn_scores
x = self.add([x, attn_output])
x = self.layernorm(x)
return x
class GlobalSelfAttention(BaseAttention):
def call(self, x):
attn_output = self.mha(
query=x,
value=x,
key=x)
x = self.add([x, attn_output])
x = self.layernorm(x)
return x
class CausalSelfAttention(BaseAttention):
def call(self, x):
attn_output = self.mha(
query=x,
value=x,
key=x,
use_causal_mask = True)
x = self.add([x, attn_output])
x = self.layernorm(x)
return x
class FeedForward(tf.keras.layers.Layer):
def __init__(self, d_model, dff, dropout_rate=0.1):
super().__init__()
self.seq = tf.keras.Sequential([
tf.keras.layers.Dense(dff, activation='relu'),
tf.keras.layers.Dense(d_model),
tf.keras.layers.Dropout(dropout_rate)
])
self.add = tf.keras.layers.Add()
self.layer_norm = tf.keras.layers.LayerNormalization()
def call(self, x):
x = self.add([x, self.seq(x)])
x = self.layer_norm(x)
return x
class EncoderLayer(tf.keras.layers.Layer):
def __init__(self, *, d_model, num_heads, dff, dropout=0.1):
super().__init__()
self.self_attention = GlobalSelfAttention(
num_heads = num_heads,
key_dim = d_model,
dropout = dropout
)
self.ffn = FeedForward(d_model, dff)
def call(self, x):
x = self.self_attention(x)
x = self.ffn(x)
return x
class Encoder(tf.keras.layers.Layer):
def __init__(self, *, vocabulary_size, d_model, num_heads, dff, num_layers=6, dropout=0.1):
super().__init__()
        # Expose sizes as attributes of the Encoder for easy inspection
self.d_model = d_model
self.num_layers = num_layers
self.pos_embedding = PositionEmbedding(vocabulary_size, d_model)
self.encoder_layers = [EncoderLayer(d_model=d_model, num_heads=num_heads, dff=dff, dropout=dropout) for _ in range(num_layers)]
self.dropout = tf.keras.layers.Dropout(dropout)
def call(self, x):
x = self.pos_embedding(x)
x = self.dropout(x)
for encoder_layer in self.encoder_layers:
x = encoder_layer(x)
return x
class DecoderLayer(tf.keras.layers.Layer):
def __init__(self, *, d_model, num_heads, dff, dropout=0.1):
super().__init__()
self.causal_self_attention = CausalSelfAttention(num_heads=num_heads, key_dim=d_model, dropout=dropout)
self.cross_attention = CrossAttention(num_heads=num_heads, key_dim=d_model, dropout=dropout)
self.ffn = FeedForward(d_model, dff)
def call(self, x, context):
x = self.causal_self_attention(x)
x = self.cross_attention(x, context)
        # Cache the last attention scores for plotting later
self.last_attn_scores = self.cross_attention.last_attn_scores
x = self.ffn(x)
return x
class Decoder(tf.keras.layers.Layer):
def __init__(self, *, vocabulary_size, d_model, num_heads, dff, num_layers=6, dropout=0.1):
super(Decoder, self).__init__()
self.d_model = d_model
self.num_layers = num_layers
self.pos_embedding = PositionEmbedding(vocabulary_size=vocabulary_size, d_model=d_model)
self.decoder_layers = [DecoderLayer(d_model=d_model, num_heads=num_heads, dff=dff, dropout=dropout) for _ in range(num_layers)]
self.dropout = tf.keras.layers.Dropout(rate=dropout)
self.last_attn_scores = None
    def call(self, x, context):
        x = self.pos_embedding(x)
        x = self.dropout(x)
        for decoder_layer in self.decoder_layers:
            x = decoder_layer(x, context)
self.last_attn_scores = self.decoder_layers[-1].last_attn_scores
return x
class Transformer(tf.keras.Model):
def __init__(self, *, num_layers, d_model, num_heads, dff, input_vocabulary_size, target_vocabulary_size, dropout=0.1):
super().__init__()
self.encoder = Encoder(vocabulary_size=input_vocabulary_size, d_model=d_model, num_layers=num_layers, num_heads=num_heads, dff=dff)
self.decoder = Decoder(vocabulary_size=target_vocabulary_size, d_model=d_model, num_layers=num_layers, num_heads=num_heads, dff=dff)
self.final_layer = tf.keras.layers.Dense(target_vocabulary_size, activation='softmax')
def call(self, inputs):
context, x = inputs
context = self.encoder(context)
x = self.decoder(x, context)
logits = self.final_layer(x)
        # Not entirely clear to me:
try:
# Drop the keras mask, so it doesn't scale the losses/metrics.
# b/250038731
del logits._keras_mask
except AttributeError:
pass
return logits
# Hyperparameters
num_layers = 4
d_model = 128
dff = 512
num_heads = 8
dropout = 0.1
MAX_TOKENS = 128
# Instantiate the model
model = Transformer(num_layers=num_layers, d_model=d_model, num_heads=num_heads, dff=dff, input_vocabulary_size=chinese_tokenizer.vocabulary_size(), target_vocabulary_size=english_tokenizer.vocabulary_size(), dropout=dropout)
# Build the model by running one batch through it
(ch, input_en), output_en = dataset.take(1).get_single_element()
model.predict((ch, input_en)).shape
model.summary()
# Define the loss and the optimizer
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
def __init__(self, d_model, warmup_steps=4000):
super().__init__()
self.d_model = d_model
self.d_model = tf.cast(self.d_model, tf.float32)
self.warmup_steps = warmup_steps
def __call__(self, step):
step = tf.cast(step, dtype=tf.float32)
arg1 = tf.math.rsqrt(step)
arg2 = step * (self.warmup_steps ** -1.5)
return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)
learning_rate = CustomSchedule(d_model)
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
def masked_loss(label, pred):
mask = label != 0
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(reduction='none')
loss = loss_object(label, pred)
mask = tf.cast(mask, dtype=loss.dtype)
loss *= mask
loss = tf.reduce_sum(loss)/tf.reduce_sum(mask)
return loss
def masked_accuracy(label, pred):
pred = tf.argmax(pred, axis=2)
label = tf.cast(label, pred.dtype)
match = label == pred
mask = label != 0
match = match & mask
match = tf.cast(match, dtype=tf.float32)
mask = tf.cast(mask, dtype=tf.float32)
return tf.reduce_sum(match)/tf.reduce_sum(mask)
model.compile(
loss=masked_loss,
optimizer=optimizer,
metrics=[masked_accuracy])
# Train the model
model.fit(train_dataset, epochs=10, validation_data=val_dataset)
# Inference
class Translator(tf.Module):
def __init__(self, tokenizers, transformer):
self.tokenizers = tokenizers
self.transformer = transformer
def __call__(self, sentence, max_length=MAX_TOKENS):
        # The sentence is Chinese, so tokenize it and add the [START] (1) and [END] (2) ids
assert isinstance(sentence, tf.Tensor)
if len(sentence.shape) == 0:
sentence = sentence[tf.newaxis]
sentence = self.tokenizers(sentence)
sentence = tf.concat([tf.ones(shape=[sentence.shape[0], 1], dtype='int32'), sentence, tf.ones(shape=[sentence.shape[0], 1], dtype='int32')*2], axis=-1).to_tensor()
encoder_input = sentence
        # As the output language is English, initialize the output with the English [START] token (id 1).
start = tf.constant(1, dtype='int64')[tf.newaxis]
end = tf.constant(2, dtype='int64')[tf.newaxis]
        # tf.TensorArray is similar to a Python list
output_array = tf.TensorArray(dtype=tf.int64, size=0, dynamic_size=True)
        # Write the start token at index 0
output_array = output_array.write(0, start)
for i in tf.range(max_length):
output = tf.transpose(output_array.stack())
predictions = self.transformer([encoder_input, output], training=False) # Shape `(batch_size, seq_len, vocab_size)`
            # Select the last token along the seq_len dimension
predictions = predictions[:, -1:, :] # Shape `(batch_size, 1, vocab_size)`.
predicted_id = tf.argmax(predictions, axis=-1)
            # Append `predicted_id` to output_array; it becomes part of the next input
output_array = output_array.write(i+1, predicted_id[0])
            # Stop once the end token is produced
if predicted_id == end:
break
output = tf.transpose(output_array.stack())
        # Run the model once more on the full output to recompute the final attention scores
self.transformer([encoder_input, output[:,:-1]], training=False)
attention_weights = self.transformer.decoder.last_attn_scores
lst = []
for item in output[0].numpy():
            lst.append(vocab_english[item])  # map token ids back to wordpieces via the vocabulary list
translated_text = ' '.join(lst)
translated_tokens = output[0]
return translated_text, translated_tokens, attention_weights
translator = Translator(chinese_tokenizer, model)
# Basic inference
def print_translation(sentence, tokens, ground_truth):
print(f'{"Input:":15s}: {sentence}')
print(f'{"Prediction":15s}: {tokens}')
print(f'{"Ground truth":15s}: {ground_truth}')
sentence = '我們試試看!'
ground_truth = "Let's try it."
translated_text, translated_tokens, attention_weights = translator(tf.constant(sentence))
print_translation(sentence, translated_text, ground_truth)
# Attention visualization
def plot_attention_head(in_tokens, translated_tokens, attention):
    # The model doesn't generate [START] in its output, so skip that token
translated_tokens = translated_tokens[1:]
ax = plt.gca()
ax.matshow(attention)
ax.set_xticks(range(len(in_tokens)))
ax.set_yticks(range(len(translated_tokens)))
labels = [vocab_chinese[label] for label in in_tokens.numpy()]
ax.set_xticklabels(labels, rotation=90)
labels = [vocab_english[label] for label in translated_tokens.numpy()]
ax.set_yticklabels(labels)
plt.show()
sentence = '我們試試看!'
ground_truth = "Let's try it."
translated_text, translated_tokens, attention_weights = translator(tf.constant(sentence))
in_tokens = tf.concat([tf.constant(1)[tf.newaxis], chinese_tokenizer(tf.constant(sentence)), tf.constant(2)[tf.newaxis]], axis=-1)
attention = tf.squeeze(attention_weights, 0)
plot_attention_head(in_tokens, translated_tokens, attention[0])
Advantages of the Transformer:
Word embedding and positional encoding appear in two places: at the Input and at the Output (shifted right). Both follow the same procedure: the tokens first pass through an Embedding layer and then receive positional encoding, and both places use the same encoding scheme.
The positional encoding is defined as $PE_{(pos,2i)}=\sin(pos/10000^{2i/d_{model}})$ and $PE_{(pos,2i+1)}=\cos(pos/10000^{2i/d_{model}})$.
Why positional encoding is needed: the model contains no recurrent or convolutional layers, so it needs some way to recognize word order; otherwise it treats the input as an unordered set of tokens. For example, to the attention mechanism `how are you`, `how you are`, and `you how are` are indistinguishable: without positional encoding all three produce the same result, which is clearly not what we want. The sketch below demonstrates this permutation blindness.
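A minimal sketch (mine, not from the paper or the tutorial; the toy shapes and the permutation are made up) showing that self-attention without positional information is permutation-equivariant, i.e. reordering the tokens merely reorders the outputs:

```python
import tensorflow as tf

mha = tf.keras.layers.MultiHeadAttention(num_heads=2, key_dim=8)
x = tf.random.normal((1, 3, 8))     # stand-in embeddings for "how are you"
perm = [2, 0, 1]                    # reordered as "you how are"
x_perm = tf.gather(x, perm, axis=1)

y = mha(query=x, value=x, key=x)
y_perm = mha(query=x_perm, value=x_perm, key=x_perm)

# Permuting the input just permutes the output: no order information survives.
print(tf.reduce_max(tf.abs(tf.gather(y, perm, axis=1) - y_perm)).numpy())  # ~0
```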
Why this particular scheme? First look at the figure below, a visualization of the positional encoding described above.
The code that generates it:
def positional_encoding(length, depth):
depth = depth/2
positions = np.arange(length)[:, np.newaxis] # (seq, 1)
depths = np.arange(depth)[np.newaxis, :]/depth # (1, depth)
angle_rates = 1 / (10000**depths) # (1, depth)
angle_rads = positions * angle_rates # (pos, depth)
pos_encoding = np.concatenate(
[np.sin(angle_rads), np.cos(angle_rads)],
axis=-1)
return tf.cast(pos_encoding, dtype=tf.float32)
pos_encoding = positional_encoding(length=2048, depth=512)
plt.pcolormesh(pos_encoding.numpy().T, cmap='RdBu')
plt.ylabel('Depth')
plt.xlabel('Position')
plt.colorbar()
plt.show()
In the actual encoding the sine and cosine columns are interleaved, whereas this plot groups all the $\sin$ columns together and all the $\cos$ columns together; since the computation does not depend on the ordering of the depth dimensions within a position, this makes no practical difference.
The above visualizes the paper's positional encoding scheme. It helps to spell out what conditions a reasonable positional encoding should satisfy; see the CSDN post "Transformer的位置编码" for a list of such conditions.
Fixing $Position=1000$ and taking the dot product with every other $Position$, we find that the closer a position is to 1000, the larger the dot product; and because the trigonometric functions happen to satisfy all the conditions, they are a natural choice, as the sketch below shows.
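A minimal sketch of that dot-product check, reusing the `pos_encoding` computed above (length 2048, depth 512); normalizing first turns the dot products into cosine similarities:

```python
# Normalize each position's encoding, then compare every position to position 1000.
pos_norm = pos_encoding / tf.norm(pos_encoding, axis=1, keepdims=True)
dots = tf.matmul(pos_norm, pos_norm[1000][:, tf.newaxis])[:, 0]

plt.plot(dots)
plt.xlabel('Position')
plt.ylabel('Cosine similarity with position 1000')
plt.show()  # similarity peaks at position 1000 and falls off with distance
```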
Some explanations take a different route, which I don't find convincing. They argue that the encoding of `how` in `how are you` can be expressed as a linear function of the other positions' encodings, via the identities
$$\sin(\alpha+\beta)=\sin(\alpha)\cos(\beta)+\cos(\alpha)\sin(\beta)\qquad \cos(\alpha+\beta)=\cos(\alpha)\cos(\beta)-\sin(\alpha)\sin(\beta)$$
Notice that `Add & Norm` blocks appear throughout the model. What are they for?
The residual connection is $Add(x) = f(x) + x$.
Differentiating it gives an extra $+1$ compared with differentiating $f(x)$ alone, and it is precisely this $1$ that makes each learning step an update rather than a replacement; that is the benefit of residual connections, as the gradient sketch below illustrates.
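A tiny gradient-tape sketch of that `+1`, with `tf.tanh` standing in for an arbitrary `f` whose local derivative has nearly vanished (the choice of `f` and the input value are illustrative):

```python
x = tf.constant(3.0)
with tf.GradientTape(persistent=True) as tape:
    tape.watch(x)
    f = tf.tanh(x)   # f'(3) is tiny, ~0.0099: the gradient has almost vanished
    res = f + x      # residual branch: the derivative becomes f'(3) + 1

print(tape.gradient(f, x).numpy())    # ~0.0099
print(tape.gradient(res, x).numpy())  # ~1.0099 -- the +x keeps gradients flowing
```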
Normalization keeps the outputs at a consistent scale.
The figure also shows that `Multi-Head Attention` and `Add & Norm` appear together throughout the model.
How does the attention mechanism work?
The figure shows four things: `Query`, `Key`, `Value`, and `Attention`. `Key` and `Value` are drawn together because the output's shape is independent of the `Key`/`Value` sequence length and depends only on the `Query` sequence length. The text calls the `Key`/`Value` pair the context sequence, while the `Query` is still called the query sequence. Why set things up this way? Compare the `Multi-Head Attention` at the top right of the model with the one at the bottom left.
`Query`, `Key`, and `Value` can be explained with a Python dict: `Key`/`Value` are the dictionary's key-value pairs, and `Query` is the key we look up; matching the `Query` against the `Key`s retrieves the information we need. Here, though, the `Query` does not have to match a `Key` exactly; a fuzzy match is enough. The `Query` is fuzzily matched against every `Key`, each match is scored, better matches receive larger weights, and the `Value`s are then combined according to those weights to produce the final result. These match weights are exactly the attention weights, as in the toy example below.
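A toy example of this "soft dictionary lookup" (the keys, values, and query are made-up illustrative numbers, reusing the `numpy` import from above):

```python
keys   = np.array([[1., 0.], [0., 1.], [0.5, 0.5]])  # 3 keys of dimension d_k = 2
values = np.array([[10.], [20.], [30.]])             # the value paired with each key
query  = np.array([[1., 0.]])                        # most similar to the first key

scores  = query @ keys.T / np.sqrt(keys.shape[-1])   # scaled dot-product scores
weights = np.exp(scores) / np.exp(scores).sum(axis=-1, keepdims=True)  # softmax

print(weights.round(2))  # ~[[0.46 0.22 0.32]]: soft weights, not a hard one-hot match
print(weights @ values)  # a blend of all values, dominated by the best-matching key
```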
Multi-head attention splits the high-dimensional `Query`, `Key`, and `Value` vectors into several groups of lower-dimensional vectors, runs attention on each `Query_i`, `Key_i`, `Value_i` separately, and finally merges the results; a shape-only sketch follows.
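A shape-only sketch of the head split, using the hyperparameters this post sets later (`d_model=128`, `num_heads=8`); the batch and sequence sizes are arbitrary:

```python
batch, seq, d_model, num_heads = 2, 10, 128, 8
head_dim = d_model // num_heads                      # 16 dimensions per head

x = tf.random.normal((batch, seq, d_model))
heads = tf.reshape(x, (batch, seq, num_heads, head_dim))
heads = tf.transpose(heads, (0, 2, 1, 3))            # (batch, num_heads, seq, head_dim)
print(heads.shape)                                   # attention runs independently per head

merged = tf.reshape(tf.transpose(heads, (0, 2, 1, 3)), (batch, seq, d_model))
print(bool(tf.reduce_all(merged == x)))              # the split/merge round trip is lossless
```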
The model contains three `Multi-Head Attention` blocks, corresponding to three kinds of `Multi-Head Attention Layer`: the cross attention layer, the global self attention layer, and the causal self attention layer. The figure shows how each one differs; they are introduced one by one below.
The cross attention layer: the `Multi-Head Attention` at the top right of the model (in the decoder) is the most direct use of attention; it fully combines the encoder's information with the decoder's. The text calls this layer the cross attention layer; its context sequence comes from the `Encoder`.
Note that each query can see all of the `Key`/`Value` pairs, but queries cannot see each other; they are mutually independent.
The global self attention layer: the `Multi-Head Attention` at the bottom left of the model (in the encoder). This layer processes the context sequence and propagates information along its length, i.e., between the `Query` positions themselves.
There are many ways to propagate information between query positions; before the Transformer came along, bidirectional RNNs and CNNs were the usual approaches.
But why not use the RNN or CNN approach here? RNNs and CNNs have the following limitations:
The global self attention layer lets every sequence element access every other element directly with only a few operations, and all outputs can be computed in parallel, as in the figure below. Although the picture resembles a linear layer, and in essence it behaves like one, its ability to propagate information is much stronger than an ordinary linear layer's.
The causal self attention layer: this layer is similar to the global self attention layer, except that it needs a mask (`Masked Multi-Head Attention`). Keep in mind that the Transformer is an autoregressive model: it produces one output at a time and feeds it back as input to produce the next. Such models must ensure that each sequence element's output depends only on the preceding elements, so the attention weights have to be masked.
The paper's formula is
$$Attention(Q,K,V)=softmax\left(\frac{QK^T}{\sqrt{d_k}}\right)V$$
so $Q$ and $K$ are multiplied as matrices, and each element of the attention matrix is the dot product of a row of $Q$ with a row of $K$. The paper computes the full attention matrix and then puts a huge negative value at the masked positions before the softmax; one possible optimization is to compute only the valid positions of the attention matrix, which would be faster. A sketch of the masking step follows.
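A small sketch of the masking step: build a strictly upper-triangular mask, push those scores toward $-\infty$ before the softmax, and each row's weights vanish on future positions (the sequence length and scores are made up):

```python
seq = 4
scores = tf.random.normal((seq, seq))                          # raw QK^T / sqrt(d_k) scores
upper = 1.0 - tf.linalg.band_part(tf.ones((seq, seq)), -1, 0)  # 1s strictly above the diagonal
weights = tf.nn.softmax(scores + upper * -1e9, axis=-1)        # -1e9 acts as -infinity

print(tf.round(weights * 100).numpy())  # row i has zero weight on columns > i
```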
This layer behaves slightly differently in training and inference. As discussed above, during training we do not feed each newly generated output back in; we simply train on the ground truth (shifted) instead. This sacrifices a little stability but speeds up training and yields the loss at every position at once.
At inference time there is no ground truth, so we must feed each newly generated output back in as input. There are two classic autoregressive treatments of this: RNNs, and CNNs as in Fast Wavenet.
The defining property of this layer is that each sequence element's output depends only on the preceding elements; a quick numerical check follows.
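In place of the missing figure, here is a quick check of that property, reusing the `CausalSelfAttention` class defined earlier (the toy sizes are arbitrary): appending extra tokens must not change the outputs at earlier positions.

```python
csa = CausalSelfAttention(num_heads=2, key_dim=16)
emb = tf.random.normal((1, 5, 16))       # 5 token embeddings

out_full = csa(emb)                      # attend over all 5 positions, causally masked
out_prefix = csa(emb[:, :3, :])          # attend over the first 3 positions only

# Positions 0..2 never see positions 3..4, so the two results agree.
print(tf.reduce_max(tf.abs(out_full[:, :3, :] - out_prefix)).numpy())  # ~0
```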
Both the encoder and the decoder contain a `Feed Forward` network, as shown in the figure. It consists of two dense layers with a `relu` activation in between, plus a `dropout` layer; dimensionally it first expands from `d_model` up to `dff`, then projects back down from `dff` to `d_model`.
import tensorflow as tf
import keras_nlp
import matplotlib.pyplot as plt
import numpy as np
plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus']=False
The training procedure is shown in the figure. We need to prepare the source sequence and two target sequences, where the source sequence gets `[START]` and `[END]` tokens added at either end. One point worth highlighting is that the training procedure on the right does not depend on the model's own step-by-step outputs. This setup is called "teacher forcing": regardless of what the model outputs at each time step, it receives the ground-truth token of the next time step as input. It is a simple and effective way to train text-generation models, and it is efficient because the model need not be run sequentially; outputs at different sequence positions can be computed in parallel.
You might have expected the `input, output`, pairs to simply be the `Portuguese, English` sequences. Given the Portuguese sequence, the model would try to generate the English sequence.
It's possible to train a model that way. You'd need to write out the inference loop and pass the model's output back to the input. It's slower (time steps can't run in parallel), and a harder task to learn (the model can't get the end of a sentence right until it gets the beginning right), but it can give a more stable model because the model has to learn to correct its own errors during training.
In short, teacher forcing trades away some model stability to speed up training. The relevant code:
# Data preprocessing
def process_data(x):
res = tf.strings.split(x, '\t')
return res[1], res[3]
# Load the data
dataset = tf.data.TextLineDataset('ch-en.tsv')
dataset = dataset.map(process_data)
# Build the Chinese and English WordPiece vocabularies
vocab_chinese = keras_nlp.tokenizers.compute_word_piece_vocabulary(
dataset.map(lambda x, y: x),
vocabulary_size=20_000,
lowercase=True,
strip_accents=True,
split_on_cjk=True,
reserved_tokens=["[PAD]", "[START]", "[END]", "[MASK]", "[UNK]"],
)
vocab_english = keras_nlp.tokenizers.compute_word_piece_vocabulary(
dataset.map(lambda x, y: y),
vocabulary_size=20_000,
lowercase=True,
strip_accents=True,
split_on_cjk=True,
reserved_tokens=["[PAD]", "[START]", "[END]", "[MASK]", "[UNK]"],
)
# Build the tokenizers
chinese_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(vocabulary=vocab_chinese, oov_token="[UNK]")
english_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(vocabulary=vocab_english, oov_token="[UNK]")
# A second round of preprocessing
def process_data_(ch, en, maxtoken=128):
ch = chinese_tokenizer(ch)[:,:maxtoken]
en = english_tokenizer(tf.strings.lower(en))[:,:maxtoken]
    ch = tf.concat([tf.ones(shape=(64,1), dtype='int32'), ch, tf.ones(shape=(64,1), dtype='int32')*2], axis=-1).to_tensor()  # prepend [START] (id 1), append [END] (id 2)
    en = tf.concat([tf.ones(shape=(64,1), dtype='int32'), en, tf.ones(shape=(64,1), dtype='int32')*2], axis=-1)
en_inputs = en[:, :-1].to_tensor() # Drop the [END] tokens
en_labels = en[:, 1:].to_tensor() # Drop the [START] tokens
return (ch, en_inputs), en_labels
dataset = dataset.batch(64, drop_remainder=True).map(process_data_)  # drop_remainder keeps the hardcoded batch size of 64 valid
train_dataset = dataset.take(1000)
val_dataset = dataset.skip(1000).take(300)  # skip past the training batches so train and validation don't overlap
# Data preparation done; inspect one batch
# for (pt, en), en_labels in dataset.take(1):
# break
# print(pt.shape)
# print(en.shape)
# print(en_labels.shape)
# (64, 33)
# (64, 28)
# (64, 28)
Word embedding and positional encoding appear in two places, the Input and the Output (shifted right); both use the same positional encoding scheme.
The positional encoding function:
def positional_encoding(length, depth):
depth = depth/2
positions = np.arange(length)[:, np.newaxis] # (seq, 1)
depths = np.arange(depth)[np.newaxis, :]/depth # (1, depth)
angle_rates = 1 / (10000**depths) # (1, depth)
angle_rads = positions * angle_rates # (pos, depth)
pos_encoding = np.concatenate(
[np.sin(angle_rads), np.cos(angle_rads)],
axis=-1)
return tf.cast(pos_encoding, dtype=tf.float32)
The combined embedding and positional encoding layer:
class PositionEmbedding(tf.keras.layers.Layer):
def __init__(self, vocabulary_size, d_model):
super().__init__()
self.d_model = d_model
self.embedding = tf.keras.layers.Embedding(vocabulary_size, d_model, mask_zero=True)
self.pos_encoding = positional_encoding(length=2048, depth=d_model)
def compute_mask(self, *args, **kwargs):
return self.embedding.compute_mask(*args, **kwargs)
def call(self, x):
length = tf.shape(x)[1]
x = self.embedding(x)
x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
x = x + self.pos_encoding[tf.newaxis, :length, :]
return x
The model has three `Multi-Head Attention` blocks, corresponding to three kinds of `Multi-Head Attention Layer`: the cross attention layer, the global self attention layer, and the causal self attention layer, as shown in the figure.
First, define a `BaseAttention` base class:
class BaseAttention(tf.keras.layers.Layer):
def __init__(self, **kwargs):
super().__init__()
self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
self.layernorm = tf.keras.layers.LayerNormalization()
self.add = tf.keras.layers.Add()
The cross attention layer is defined as follows:
class CrossAttention(BaseAttention):
def call(self, x, context):
attn_output, attn_scores = self.mha(
query=x,
key=context,
value=context,
return_attention_scores=True)
# Cache the attention scores for plotting later.
self.last_attn_scores = attn_scores
x = self.add([x, attn_output])
x = self.layernorm(x)
return x
The global self attention layer is defined as follows:
class GlobalSelfAttention(BaseAttention):
def call(self, x):
attn_output = self.mha(
query=x,
value=x,
key=x)
x = self.add([x, attn_output])
x = self.layernorm(x)
return x
The causal self attention layer is defined as follows:
class CausalSelfAttention(BaseAttention):
def call(self, x):
attn_output = self.mha(
query=x,
value=x,
key=x,
use_causal_mask = True)
x = self.add([x, attn_output])
x = self.layernorm(x)
return x
The feed-forward network consists of two dense layers with a `relu` activation in between, plus a `dropout` layer; it expands from `d_model` up to `dff` and then back down from `dff` to `d_model`.
The code:
class FeedForward(tf.keras.layers.Layer):
def __init__(self, d_model, dff, dropout_rate=0.1):
super().__init__()
self.seq = tf.keras.Sequential([
tf.keras.layers.Dense(dff, activation='relu'),
tf.keras.layers.Dense(d_model),
tf.keras.layers.Dropout(dropout_rate)
])
self.add = tf.keras.layers.Add()
self.layer_norm = tf.keras.layers.LayerNormalization()
def call(self, x):
x = self.add([x, self.seq(x)])
x = self.layer_norm(x)
return x
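A quick shape check of the block (using the hyperparameters set later, `d_model=128` and `dff=512`; the batch and sequence sizes are arbitrary). The residual add requires the output shape to match the input shape:

```python
ffn = FeedForward(d_model=128, dff=512)
out = ffn(tf.random.normal((64, 50, 128)))  # (batch, seq, d_model)
print(out.shape)  # (64, 50, 128): expanded to 512 inside, projected back to 128 outside
```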
The model structure is shown below; in the paper the encoder is a stack of 6 encoder layers.
The encoder-layer code:
class EncoderLayer(tf.keras.layers.Layer):
def __init__(self, *, d_model, num_heads, dff, dropout=0.1):
super().__init__()
self.self_attention = GlobalSelfAttention(
num_heads = num_heads,
key_dim = d_model,
dropout = dropout
)
self.ffn = FeedForward(d_model, dff)
def call(self, x):
x = self.self_attention(x)
x = self.ffn(x)
return x
The encoder code:
class Encoder(tf.keras.layers.Layer):
    def __init__(self, *, vocabulary_size, d_model, num_heads, dff, num_layers=6, dropout=0.1):
super().__init__()
        # Expose sizes as attributes of the Encoder for easy inspection
self.d_model = d_model
self.num_layers = num_layers
self.pos_embedding = PositionEmbedding(vocabulary_size, d_model)
        self.encoder_layers = [EncoderLayer(d_model=d_model, num_heads=num_heads, dff=dff, dropout=dropout) for _ in range(num_layers)]
self.dropout = tf.keras.layers.Dropout(dropout)
def call(self, x):
x = self.pos_embedding(x)
x = self.dropout(x)
for encoder_layer in self.encoder_layers:
x = encoder_layer(x)
return x
The model structure is shown below; in the paper the decoder is a stack of 6 decoder layers.
The decoder-layer code:
class DecoderLayer(tf.keras.layers.Layer):
def __init__(self, *, d_model, num_heads, dff, dropout=0.1):
super().__init__()
self.causal_self_attention = CausalSelfAttention(num_heads=num_heads, key_dim=d_model, dropout=dropout)
        self.cross_attention = CrossAttention(num_heads=num_heads, key_dim=d_model, dropout=dropout)
self.ffn = FeedForward(d_model, dff)
def call(self, x, context):
x = self.causal_self_attention(x)
x = self.cross_attention(x, context)
        # Cache the last attention scores for plotting later
self.last_attn_scores = self.cross_attention.last_attn_scores
x = self.ffn(x)
return x
The decoder code:
class Decoder(tf.keras.layers.Layer):
def __init__(self, *, vocabulary_size, d_model, num_heads, dff, num_layers=6, dropout=0.1):
super(Decoder, self).__init__()
self.d_model = d_model
self.num_layers = num_layers
self.pos_embedding = PositionEmbedding(vocabulary_size=vocabulary_size, d_model=d_model)
self.decoder_layers = [DecoderLayer(d_model=d_model, num_heads=num_heads, dff=dff, dropout=dropout) for _ in range(num_layers)]
self.dropout = tf.keras.layers.Dropout(rate=dropout)
self.last_attn_scores = None
    def call(self, x, context):
        x = self.pos_embedding(x)
        x = self.dropout(x)
        for decoder_layer in self.decoder_layers:
            x = decoder_layer(x, context)
self.last_attn_scores = self.decoder_layers[-1].last_attn_scores
return x
With the encoder and decoder in place, we can now assemble the Transformer model; its overall structure is shown below.
The hyperparameters to set:
num_layers = 4
d_model = 128
dff = 512
num_heads = 8
dropout = 0.1
The Transformer model code:
class Transformer(tf.keras.Model):
def __init__(self, *, num_layers, d_model, num_heads, dff, input_vocabulary_size, target_vocabulary_size, dropout=0.1):
super().__init__()
self.encoder = Encoder(vocabulary_size=input_vocabulary_size, d_model=d_model, num_layers=num_layers, num_heads=num_heads, dff=dff)
self.decoder = Decoder(vocabulary_size=target_vocabulary_size, d_model=d_model, num_layers=num_layers, num_heads=num_heads, dff=dff)
self.final_layer = tf.keras.layers.Dense(target_vocabulary_size, activation='softmax')
def call(self, inputs):
context, x = inputs
context = self.encoder(context)
x = self.decoder(x, context)
logits = self.final_layer(x)
        # Not entirely clear to me:
try:
# Drop the keras mask, so it doesn't scale the losses/metrics.
# b/250038731
del logits._keras_mask
except AttributeError:
pass
return logits
Below is a visualization comparing Transformers of different depths with RNN+Attention models.
The model's parameter counts:
model.summary()
If you see `This model has not yet been built. Build the model first by calling build() or by calling the model on a batch of data.`, try building the model by passing a batch of data through it:
model = Transformer(num_layers=num_layers, d_model=d_model, num_heads=num_heads, dff=dff, input_vocabulary_size=chinese_tokenizer.vocabulary_size(), target_vocabulary_size=english_tokenizer.vocabulary_size(), dropout=dropout)
output = model((pt, en))
print(en.shape)
print(pt.shape)
print(output.shape)
Define a custom learning-rate scheduler following the paper's formula:
$$lrate = d_{model}^{-0.5}\cdot \min(step\_num^{-0.5},\ step\_num\cdot warmup\_steps^{-1.5})$$
The code:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
def __init__(self, d_model, warmup_steps=4000):
super().__init__()
self.d_model = d_model
self.d_model = tf.cast(self.d_model, tf.float32)
self.warmup_steps = warmup_steps
def __call__(self, step):
step = tf.cast(step, dtype=tf.float32)
arg1 = tf.math.rsqrt(step)
arg2 = step * (self.warmup_steps ** -1.5)
return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)
learning_rate = CustomSchedule(d_model)
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
The schedule curve looks as follows.
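The plot can be reproduced with a couple of lines (this mirrors the TensorFlow tutorial's snippet, reusing `learning_rate` and the `matplotlib` import from above):

```python
plt.plot(learning_rate(tf.range(40000, dtype=tf.float32)))
plt.ylabel('Learning rate')
plt.xlabel('Train step')
plt.show()  # linear warmup for 4000 steps, then ~1/sqrt(step) decay
```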
Define the loss and metrics:
def masked_loss(label, pred):
mask = label != 0
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(reduction='none')
loss = loss_object(label, pred)
mask = tf.cast(mask, dtype=loss.dtype)
loss *= mask
loss = tf.reduce_sum(loss)/tf.reduce_sum(mask)
return loss
def masked_accuracy(label, pred):
pred = tf.argmax(pred, axis=2)
label = tf.cast(label, pred.dtype)
match = label == pred
mask = label != 0
match = match & mask
match = tf.cast(match, dtype=tf.float32)
mask = tf.cast(mask, dtype=tf.float32)
return tf.reduce_sum(match)/tf.reduce_sum(mask)
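A small sanity check (with made-up labels and random predictions) showing that `[PAD]` positions, label id 0, are excluded from both the loss and the accuracy denominators:

```python
label = tf.constant([[5, 2, 0, 0]], dtype=tf.int64)           # two real tokens, two [PAD]s
pred = tf.nn.softmax(tf.random.normal((1, 4, 10)), axis=-1)   # fake per-token probabilities

print(masked_loss(label, pred).numpy())      # averaged over the 2 unmasked positions only
print(masked_accuracy(label, pred).numpy())  # one of 0.0 / 0.5 / 1.0: the denominator is 2, not 4
```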
Start training:
model.compile(loss=masked_loss, optimizer=optimizer, metrics=[masked_accuracy])
model.fit(train_dataset, epochs=10, validation_data=val_dataset)
As noted above, because the inference procedure differs from the training procedure, we need to write separate inference code:
MAX_TOKENS = 128
class Translator(tf.Module):
def __init__(self, tokenizers, transformer):
self.tokenizers = tokenizers
self.transformer = transformer
def __call__(self, sentence, max_length=MAX_TOKENS):
        # The sentence is Chinese, so tokenize it and add the [START] (1) and [END] (2) ids
assert isinstance(sentence, tf.Tensor)
if len(sentence.shape) == 0:
sentence = sentence[tf.newaxis]
sentence = self.tokenizers(sentence)
sentence = tf.concat([tf.ones(shape=[sentence.shape[0], 1], dtype='int32'), sentence, tf.ones(shape=[sentence.shape[0], 1], dtype='int32')*2], axis=-1).to_tensor()
encoder_input = sentence
        # As the output language is English, initialize the output with the English [START] token (id 1).
start = tf.constant(1, dtype='int64')[tf.newaxis]
end = tf.constant(2, dtype='int64')[tf.newaxis]
        # tf.TensorArray is similar to a Python list
output_array = tf.TensorArray(dtype=tf.int64, size=0, dynamic_size=True)
        # Write the start token at index 0
output_array = output_array.write(0, start)
for i in tf.range(max_length):
output = tf.transpose(output_array.stack())
predictions = self.transformer([encoder_input, output], training=False) # Shape `(batch_size, seq_len, vocab_size)`
            # Select the last token along the seq_len dimension
predictions = predictions[:, -1:, :] # Shape `(batch_size, 1, vocab_size)`.
predicted_id = tf.argmax(predictions, axis=-1)
            # Append `predicted_id` to output_array; it becomes part of the next input
output_array = output_array.write(i+1, predicted_id[0])
            # Stop once the end token is produced
if predicted_id == end:
break
output = tf.transpose(output_array.stack())
        # Run the model once more on the full output to recompute the final attention scores
self.transformer([encoder_input, output[:,:-1]], training=False)
attention_weights = self.transformer.decoder.last_attn_scores
lst = []
for item in output[0].numpy():
            lst.append(vocab_english[item])  # map token ids back to wordpieces via the vocabulary list
        translated_text = ' '.join(lst)
translated_tokens = output[0]
        return translated_text, translated_tokens, attention_weights
Inference test:
def print_translation(sentence, tokens, ground_truth):
print(f'{"Input:":15s}: {sentence}')
print(f'{"Prediction":15s}: {tokens}')
print(f'{"Ground truth":15s}: {ground_truth}')
translator = Translator(chinese_tokenizer, model)
sentence = '我們試試看!'
ground_truth = "Let's try it."
translated_text, translated_tokens, attention_weights = translator(tf.constant(sentence))
print_translation(sentence, translated_text, ground_truth)
# Input: : 我們試試看!
# Prediction : [START] let ' s try to see this . [END]
# Ground truth : Let's try it.
Define a function to plot one attention head:
def plot_attention_head(in_tokens, translated_tokens, attention):
    # The model doesn't generate [START] in its output, so skip that token
translated_tokens = translated_tokens[1:]
ax = plt.gca()
    ax.matshow(attention)  # `attention` is already a single head; see the call below
ax.set_xticks(range(len(in_tokens)))
ax.set_yticks(range(len(translated_tokens)))
labels = [vocab_chinese[label] for label in in_tokens.numpy()]
ax.set_xticklabels(labels, rotation=90)
labels = [vocab_english[label] for label in translated_tokens.numpy()]
ax.set_yticklabels(labels)
Test code:
sentence = '我們試試看!'
ground_truth = "Let's try it."
translated_text, translated_tokens, attention_weights = translator(tf.constant(sentence))
in_tokens = tf.concat([tf.constant(1)[tf.newaxis], chinese_tokenizer(tf.constant(sentence)), tf.constant(2)[tf.newaxis]], axis=-1)
attention = tf.squeeze(attention_weights, 0)
# Plot head 0 (attention[0]); there are 8 heads in total
plot_attention_head(in_tokens, translated_tokens, attention[0])
The result for head 0 is shown below; a sketch for viewing all heads at once follows.
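To look at all 8 heads rather than only head 0, a grid of subplots works. This is a hedged sketch along the lines of the TensorFlow tutorial's `plot_attention_weights`; the 2x4 layout assumes `num_heads=8`, and it omits the tick labels for brevity:

```python
def plot_attention_weights(attention_heads):
    # One panel per head; `attention_heads` has shape (num_heads, target_len, source_len).
    fig = plt.figure(figsize=(16, 8))
    for h, head in enumerate(attention_heads):
        ax = fig.add_subplot(2, 4, h + 1)
        ax.matshow(head)
        ax.set_title(f'Head {h + 1}')
    plt.tight_layout()
    plt.show()

plot_attention_weights(attention)
```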
The code is a bit long!