The Transformer consists of an encoder and a decoder. In the original architecture each stack has six layers: every encoder layer contains a multi-head self-attention sub-layer and a position-wise feed-forward network, while every decoder layer contains a feed-forward network plus two attention sub-layers, namely a masked self-attention layer and an encoder-decoder attention layer that attends over the encoder's final output. Below is a simple machine-translation example built on this architecture with TensorFlow/Keras. It trains on the spa-eng corpus of English-Spanish sentence pairs (the variables named france and fra_* in the script actually hold the Spanish side).
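At the heart of every attention sub-layer is scaled dot-product attention, softmax(QK^T / sqrt(d_k)) V. The snippet below is a minimal, self-contained sketch of that computation; the function name and shapes are chosen only for illustration, and the full script that follows implements the same idea inside its MultiHeadAttention class.

import tensorflow as tf

def scaled_dot_product_attention(q, k, v, mask=None):
    # q, k, v: [batch, heads, seq_len, depth]; mask uses 1.0 to mark positions to hide
    d_k = tf.cast(tf.shape(k)[-1], tf.float32)
    scores = tf.matmul(q, k, transpose_b=True) / tf.math.sqrt(d_k)
    if mask is not None:
        scores += mask * -1e9  # masked positions become ~ -inf before the softmax
    weights = tf.nn.softmax(scores, axis=-1)  # attention weights sum to 1 over the keys
    return tf.matmul(weights, v)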
import re
import string
import random

import numpy as np
import tensorflow as tf
import keras
from keras import Model
from keras import layers
from keras.layers import TextVectorization
# Each line of spa.txt holds an English sentence and its Spanish translation
# separated by a tab; only the first 30000 pairs are used here.
# (The lists are named "france"/"fra_*" in this script, but they hold the Spanish text.)
france = []
english = []
with open('./dataset/spa-eng/spa.txt', 'r', encoding='utf-8') as file:
    for idx, line in enumerate(file.readlines()):
        if idx >= 30000:
            break
        line = line.split('\t')
        english.append(line[0])
        france.append('[start] ' + line[1] + ' [end]')
# config
num_words = 15000
eng_text_length = 20
fra_text_length = 20
batch_size = 64
# English tokenizer (TextVectorization's default standardization lowercases the
# text and strips punctuation)
eng_tokenizer = TextVectorization(
    max_tokens=num_words, output_mode="int", output_sequence_length=eng_text_length)
eng_tokenizer.adapt(english)
# Spanish tokenizer: strip punctuation but keep the square brackets so the
# [start] and [end] markers survive standardization. The output length is
# fra_text_length + 1 so decoder input and output can be offset by one token.
strip_chars = string.punctuation + "¿"
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")

def custom_standardization(input_string):
    lowercase = tf.strings.lower(input_string)
    return tf.strings.regex_replace(lowercase, "[%s]" % re.escape(strip_chars), "")

fra_tokenizer = TextVectorization(
    max_tokens=num_words, output_sequence_length=fra_text_length + 1,
    standardize=custom_standardization)
fra_tokenizer.adapt(france)
# training data
encoder_input_data = eng_tokenizer(english)
decoder_input_data = fra_tokenizer(france)[:, :-1]
decoder_output_data = fra_tokenizer(france)[:, 1:]
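# Teacher forcing: for a target such as "[start] hola [end]", decoder_input_data
# drops the last token and decoder_output_data drops the first, so at every time
# step the decoder sees the tokens up to position t and learns to predict the
# token at position t + 1.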
# print(encoder_input_data[:10])
# print(decoder_output_data[:10])
# vocabulary lookups: word <-> integer id (by TextVectorization convention,
# index 0 is the padding token '' and index 1 is '[UNK]')
en_word_to_idx = {v: k for k, v in enumerate(eng_tokenizer.get_vocabulary())}
en_idx_to_word = {k: v for k, v in enumerate(eng_tokenizer.get_vocabulary())}
fra_word_to_idx = {v: k for k, v in enumerate(fra_tokenizer.get_vocabulary())}
fra_idx_to_word = {k: v for k, v in enumerate(fra_tokenizer.get_vocabulary())}
def positional_encoding(pos, d_model):
    '''
    :param pos: number of positions (sentence length); i indexes the d_model axis
    :param d_model: hidden size, i.e. num_units
    :return: positional encoding with shape=[1, position_num, d_model];
             the leading 1 broadcasts over batch_size
    '''
    def get_angles(position, i):
        # i plays the role of 2i / 2i+1 in the paper's formula
        # returns shape=[position_num, d_model]
        return position / np.power(10000., 2. * (i // 2.) / np.float32(d_model))
    angle_rates = get_angles(np.arange(pos)[:, np.newaxis],
                             np.arange(d_model)[np.newaxis, :])
    # even (2i) columns use sin, odd (2i+1) columns use cos; the two halves are
    # concatenated rather than interleaved, which is just a column permutation
    # of the paper's layout
    pe_sin = np.sin(angle_rates[:, 0::2])
    pe_cos = np.cos(angle_rates[:, 1::2])
    pos_encoding = np.concatenate([pe_sin, pe_cos], axis=-1)
    pos_encoding = tf.cast(pos_encoding[np.newaxis, ...], tf.float32)
    return pos_encoding
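# Quick sanity check (illustrative only): the leading dimension broadcasts over
# the batch, so for this script's settings the encoding has shape (1, 20, 256).
# print(positional_encoding(20, 256).shape)  # -> (1, 20, 256)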
# NOTE: the masks are essential. Without them training degenerates and the model
# tends to output nothing but [start]/[end].
def create_padding_mask(seq):
    # 1.0 marks padding positions (token id 0); shape=[batch, 1, 1, seq_len]
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return seq[:, np.newaxis, np.newaxis, :]

# look-ahead mask: position i may only attend to positions <= i
def create_look_ahead_mask(size):
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask  # shape=[seq_len, seq_len]
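# Illustration (not used by the model): for size 3 the look-ahead mask is
#   [[0., 1., 1.],
#    [0., 0., 1.],
#    [0., 0., 0.]]
# i.e. 1.0 marks the future positions that receive a large negative score
# before the softmax.
# print(create_look_ahead_mask(3))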
def create_mask(inputs, targets):
    # the encoder only needs a padding mask over the source sentence
    encoder_padding_mask = create_padding_mask(inputs)
    # decoder_padding_mask masks the encoder outputs in the second
    # (encoder-decoder) multi-head attention of each decoder layer
    decoder_padding_mask = create_padding_mask(inputs)
    # look_ahead_mask hides tokens that have not been predicted yet
    look_ahead_mask = create_look_ahead_mask(tf.shape(targets)[1])
    # padding mask over the decoder's own input
    decoder_targets_padding_mask = create_padding_mask(targets)
    # combined mask for the first (masked) multi-head attention of each decoder layer
    combined_mask = tf.maximum(decoder_targets_padding_mask, look_ahead_mask)
    return encoder_padding_mask, combined_mask, decoder_padding_mask
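# Rough shape check (illustrative only): for a toy batch of 2 source sentences of
# length 4 and 2 target sentences of length 5, the three masks come out as
# [2, 1, 1, 4], [2, 1, 5, 5] and [2, 1, 1, 4]; the [seq, seq] look-ahead mask
# broadcasts against the [batch, 1, 1, seq] padding mask.
# _enc_m, _comb_m, _dec_m = create_mask(tf.zeros((2, 4)), tf.zeros((2, 5)))
# print(_enc_m.shape, _comb_m.shape, _dec_m.shape)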
embedding_size = 256
n_head = 8
n_layer = 2

def splite_tensor(tensor):
    # split the embedding dimension into attention heads:
    # [batch, seq_len, embedding_size] -> [batch, n_head, seq_len, embedding_size // n_head]
    shape = tf.shape(tensor)
    tensor = tf.reshape(
        tensor, shape=[shape[0], -1, n_head, embedding_size // n_head])
    tensor = tf.transpose(tensor, perm=[0, 2, 1, 3])
    return tensor
class MultiHeadAttention(layers.Layer):
    def __init__(self):
        super(MultiHeadAttention, self).__init__()

    def build(self, input_shape):
        # Q/K/V projections (note: this script uses ReLU here, unlike the purely
        # linear projections in the original paper)
        self.dense_query = layers.Dense(
            units=embedding_size, activation='relu')
        self.dense_key = layers.Dense(units=embedding_size, activation='relu')
        self.dense_value = layers.Dense(
            units=embedding_size, activation='relu')
        self.layer_norm = layers.LayerNormalization()
        self.dropout = layers.Dropout(0.1)
        super(MultiHeadAttention, self).build(input_shape)

    def call(self, inputs):
        query, key, value, mask = inputs
        shape = tf.shape(query)
        query_dense = self.dense_query(query)
        key_dense = self.dense_key(key)
        value_dense = self.dense_value(value)
        # split into heads: [batch, n_head, seq_len, depth]
        query_dense = splite_tensor(query_dense)
        key_dense = splite_tensor(key_dense)
        value_dense = splite_tensor(value_dense)
        # scaled dot-product attention; masked positions get a large negative score
        attention = tf.matmul(query_dense, key_dense, transpose_b=True) / \
            tf.math.sqrt(tf.cast(embedding_size, tf.float32))
        attention += (mask * -1e9)
        attention = tf.nn.softmax(attention)
        attention = self.dropout(attention)
        attention = tf.matmul(attention, value_dense)
        # merge the heads back to [batch, seq_len, embedding_size]
        attention = tf.transpose(attention, [0, 2, 1, 3])
        attention = tf.reshape(attention, [shape[0], -1, embedding_size])
        # residual connection + layer normalization
        attention = self.layer_norm(attention + query)
        return attention
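# Minimal shape check (illustrative only): with a batch of 2 sequences of length 5
# and an all-zero mask, the layer keeps the [batch, seq_len, embedding_size] shape.
# _q = tf.random.uniform((2, 5, embedding_size))
# print(MultiHeadAttention()([_q, _q, _q, tf.zeros((2, 1, 1, 5))]).shape)  # (2, 5, 256)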
class EncoderLayer(layers.Layer):
    def __init__(self, n_head, emb_dim, dense_dim, dropout):
        super(EncoderLayer, self).__init__()
        self.attn = MultiHeadAttention()
        self.drop_attn = layers.Dropout(dropout)
        self.dense1 = layers.Dense(dense_dim, activation='relu')
        self.dense2 = layers.Dense(emb_dim)
        self.drop_dense = layers.Dropout(dropout)
        self.layer_norm_attn = layers.LayerNormalization()
        self.layer_norm_dense = layers.LayerNormalization()

    def call(self, inputs, training=None):
        encoder_inputs, mask = inputs
        # self-attention sub-layer with residual connection + layer norm
        att_out = self.attn([encoder_inputs, encoder_inputs, encoder_inputs, mask])
        att_out = self.drop_attn(att_out, training=training)
        att_out = self.layer_norm_attn(encoder_inputs + att_out)
        # position-wise feed-forward sub-layer with residual connection + layer norm
        dense = self.dense1(att_out)
        dense = self.dense2(dense)
        dense = self.drop_dense(dense, training=training)
        x = self.layer_norm_dense(att_out + dense)
        return x
class Encoder(layers.Layer):
    def __init__(self, vocab, emb_dim=512, dense_dim=2048, n_layers=6, n_head=8, dropout=0.1):
        super(Encoder, self).__init__()
        self.emb_dim = emb_dim
        self.emb = layers.Embedding(input_dim=vocab, output_dim=emb_dim)
        self.pos = positional_encoding(eng_text_length, emb_dim)
        self.encoder_layers = [EncoderLayer(
            emb_dim=emb_dim, n_head=n_head, dense_dim=dense_dim, dropout=dropout) for _ in range(n_layers)]
        self.dropout = layers.Dropout(dropout)

    def call(self, inputs, training=None):
        encoder_inputs, mask = inputs
        seq_len = encoder_inputs.shape[1]  # actual sentence length
        # word embedding, shape=[batch_size, seq_len, d_model]
        word_embedding = self.emb(encoder_inputs)
        word_embedding *= tf.math.sqrt(tf.cast(self.emb_dim, tf.float32))
        # add the pre-computed positional encoding
        emb = word_embedding + self.pos[:, :seq_len, :]
        x = self.dropout(emb, training=training)
        for encoder_layer in self.encoder_layers:
            x = encoder_layer([x, mask])
        return x
class DecoderLayer(layers.Layer):
    def __init__(self, n_head, emb_dim, dense_dim, dropout):
        super(DecoderLayer, self).__init__()
        self.attn1 = MultiHeadAttention()
        self.layer_norm_attn1 = layers.LayerNormalization()
        self.drop_attn1 = layers.Dropout(dropout)
        self.attn2 = MultiHeadAttention()
        self.layer_norm_attn2 = layers.LayerNormalization()
        self.drop_attn2 = layers.Dropout(dropout)
        self.dense1 = layers.Dense(dense_dim, activation='relu')
        self.dense2 = layers.Dense(emb_dim)
        self.drop_dense = layers.Dropout(dropout)
        self.layer_norm_dense = layers.LayerNormalization()

    def call(self, inputs, training=None):
        decoder_inputs, encoder_outputs, mask1, mask2 = inputs
        # masked self-attention over the target sequence
        # (mask1 = combined look-ahead + padding mask)
        att_out1 = self.attn1([decoder_inputs, decoder_inputs, decoder_inputs, mask1])
        att_out1 = self.drop_attn1(att_out1, training=training)
        att_out1 = self.layer_norm_attn1(decoder_inputs + att_out1)
        # encoder-decoder attention: queries come from the decoder, keys/values
        # from the encoder output (mask2 = source padding mask)
        att_out2 = self.attn2([att_out1, encoder_outputs, encoder_outputs, mask2])
        att_out2 = self.drop_attn2(att_out2, training=training)
        att_out2 = self.layer_norm_attn2(att_out1 + att_out2)
        # position-wise feed-forward sub-layer
        dense = self.dense1(att_out2)
        dense = self.dense2(dense)
        dense = self.drop_dense(dense, training=training)
        x = self.layer_norm_dense(att_out2 + dense)
        return x
class Decoder(layers.Layer):
    def __init__(self, vocab, n_head=8, n_layers=6, emb_dim=512, dense_dim=2048, dropout=0.1):
        super(Decoder, self).__init__()
        self.emb_dim = emb_dim
        self.emb = layers.Embedding(input_dim=vocab, output_dim=emb_dim)
        self.pos = positional_encoding(fra_text_length, emb_dim)
        self.decoder_layers = [DecoderLayer(
            n_head=n_head, emb_dim=emb_dim, dense_dim=dense_dim, dropout=dropout) for _ in range(n_layers)]
        self.dropout = layers.Dropout(dropout)

    def call(self, inputs, training=None):
        decoder_inputs, encoder_outputs, mask1, mask2 = inputs
        seq_len = decoder_inputs.shape[1]
        word_embedding = self.emb(decoder_inputs)
        word_embedding *= tf.math.sqrt(tf.cast(self.emb_dim, tf.float32))
        emb = word_embedding + self.pos[:, :seq_len, :]
        x = self.dropout(emb, training=training)
        for decoder_layer in self.decoder_layers:
            x = decoder_layer([x, encoder_outputs, mask1, mask2])
        return x
class Transformer(layers.Layer):
    def __init__(self):
        super(Transformer, self).__init__()
        # a scaled-down configuration (the original paper uses 6 layers,
        # d_model=512 and d_ff=2048)
        self.encoder = Encoder(vocab=num_words, n_head=8, n_layers=1,
                               emb_dim=256, dense_dim=512)
        self.decoder = Decoder(vocab=num_words, n_head=8, n_layers=1,
                               emb_dim=256, dense_dim=512)
        self.dropout = layers.Dropout(0.5)
        self.dense = layers.Dense(num_words, activation='softmax')

    def call(self, encoder_inputs, decoder_inputs):
        encoder_padding_mask, look_ahead_mask, decoder_padding_mask = create_mask(
            encoder_inputs, decoder_inputs)
        encoder_outputs = self.encoder([encoder_inputs, encoder_padding_mask])
        x = self.decoder([decoder_inputs, encoder_outputs,
                          look_ahead_mask, decoder_padding_mask])
        x = self.dropout(x)
        decoder_outputs = self.dense(x)
        return decoder_outputs
encoder_inputs = layers.Input((None,))
decoder_inputs = layers.Input((None,))
transformer = Transformer()
decoder_outputs = transformer(encoder_inputs, decoder_inputs)
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.summary()
model.compile(optimizer='rmsprop', metrics=['acc'],
loss="sparse_categorical_crossentropy")
model.fit([encoder_input_data, decoder_input_data], decoder_output_data,
validation_split=0.2, epochs=1, batch_size=batch_size)
model.save('trans')
model = keras.models.load_model('trans')
max_decoded_sentence_length = 10

def decode_sequence(input_sentence):
    # greedy decoding: feed the sentence decoded so far back into the decoder
    # and pick the most probable next token at each step
    tokenized_input_sentence = eng_tokenizer([input_sentence])
    decoded_sentence = "[start]"
    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = fra_tokenizer([decoded_sentence])[:, :-1]
        predictions = model(
            [tokenized_input_sentence, tokenized_target_sentence])
        sampled_token_index = np.argmax(predictions[0, i, :])
        sampled_token = fra_idx_to_word[sampled_token_index]
        decoded_sentence += " " + sampled_token
        if sampled_token == "[end]":
            break
    return decoded_sentence
test_eng_texts = english[:30]
for _ in range(30):
input_sentence = random.choice(test_eng_texts)
translated = decode_sequence(input_sentence)
print('src:',input_sentence)
print('trans:',translated)