[NLP Part 6] Implementing Machine Translation with the Transformer

The Transformer consists of an encoder and a decoder. In the original paper the encoder is a stack of six identical layers, each made up of a multi-head self-attention layer followed by a feed-forward network. The decoder is also a stack of six layers, each containing two attention layers plus a feed-forward network: a masked self-attention layer over the target sequence, and a cross-attention layer that attends to the encoder's final output. Below is a simple example of machine translation with a Transformer, implemented in TensorFlow/Keras.
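
At the core of every attention layer is scaled dot-product attention: queries are compared against keys, the scores are scaled, masked, and turned into weights that average the values. A minimal standalone sketch of the idea (just an illustration of what the MultiHeadAttention layer further below implements, not part of the training script):

import tensorflow as tf

def scaled_dot_product_attention(q, k, v, mask=None):
    # q, k, v: [batch, heads, seq_len, depth]; mask broadcasts against the score matrix
    scores = tf.matmul(q, k, transpose_b=True)                    # [batch, heads, q_len, k_len]
    scores /= tf.math.sqrt(tf.cast(tf.shape(k)[-1], tf.float32))  # scale by sqrt(d_k)
    if mask is not None:
        scores += mask * -1e9                                     # masked positions vanish after softmax
    weights = tf.nn.softmax(scores, axis=-1)                      # weights sum to 1 over the keys
    return tf.matmul(weights, v)                                  # weighted sum of the values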

Training

import re
import string
import random
import numpy as np
import tensorflow as tf
import keras
from keras import layers
from keras import Model
from keras.layers import TextVectorization
# The spa-eng corpus: tab-separated English / Spanish sentence pairs.
# (The target-language lists are named "france" in this example, but the data is Spanish.)
france = []
english = []
with open('./dataset/spa-eng/spa.txt', 'r', encoding='utf-8') as file:
    for idx, line in enumerate(file.readlines()):
        if idx >= 30000:          # keep only the first 30,000 pairs
            break
        line = line.split('\t')
        english.append(line[0])
        france.append('[start] ' + line[1] + ' [end]')   # wrap targets with start/end tokens
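
# Optional sanity check on the parsed pairs (prints one English sentence and its
# wrapped target-language counterpart):
# print(english[0], '->', france[0])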

# config
num_words = 15000
eng_text_length = 20
fra_text_length = 20
batch_size = 64
# English tokenizer (the default standardization lowercases and strips punctuation)
eng_tokenizer = TextVectorization(
    max_tokens=num_words, output_mode="int", output_sequence_length=eng_text_length)
eng_tokenizer.adapt(english)

# Characters to strip from the target text: all punctuation plus the inverted
# question mark, but keep "[" and "]" so the [start]/[end] tokens survive.
strip_chars = string.punctuation + "¿"
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")


def custom_standardization(input_string):
    lowercase = tf.strings.lower(input_string)
    return tf.strings.regex_replace(lowercase, "[%s]" % re.escape(strip_chars), "")
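
# Rough example of what the standardization does (hypothetical check): it lowercases
# the text and strips the listed characters while keeping the [start]/[end] brackets, e.g.
# custom_standardization(tf.constant("¿Hola, [start]?"))  # -> b'hola [start]'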


# Target-language tokenizer: one extra position so the sequence can be shifted by
# one token to form decoder inputs (teacher forcing) and decoder targets.
fra_tokenizer = TextVectorization(
    max_tokens=num_words, output_sequence_length=fra_text_length + 1, standardize=custom_standardization)
fra_tokenizer.adapt(france)


# Training tensors: the decoder input is the target sequence without its last
# token, and the decoder output is the same sequence shifted left by one.
encoder_input_data = eng_tokenizer(english)
decoder_input_data = fra_tokenizer(france)[:, :-1]
decoder_output_data = fra_tokenizer(france)[:, 1:]

# print(encoder_input_data[:10])
# print(decoder_output_data[:10])

# Token <-> index lookup tables built from the learned vocabularies.
en_word_to_idx = dict([(v, k)
                      for k, v in enumerate(eng_tokenizer.get_vocabulary())])
en_idx_to_word = dict([(k, v)
                      for k, v in enumerate(eng_tokenizer.get_vocabulary())])

fra_word_to_idx = dict([(v, k)
                       for k, v in enumerate(fra_tokenizer.get_vocabulary())])
fra_idx_to_word = dict([(k, v)
                       for k, v in enumerate(fra_tokenizer.get_vocabulary())])


def positional_encoding(pos, d_model):
    '''
    :param pos: number of positions in the sequence (i indexes the d_model dimension)
    :param d_model: hidden size of the model, i.e. the embedding dimension
    :return: positional encoding of shape [1, position_num, d_model]; the leading
             dimension of 1 broadcasts over batch_size
    '''
    def get_angles(position, i):
        # i plays the role of 2i / 2i+1 in the paper's formula
        # returns shape [position_num, d_model]
        return position / np.power(10000., 2. * (i // 2.) / np.float32(d_model))

    angle_rates = get_angles(np.arange(pos)[:, np.newaxis],
                             np.arange(d_model)[np.newaxis, :])
    # sin for the even dimensions, cos for the odd ones (concatenated rather than interleaved here)
    pe_sin = np.sin(angle_rates[:, 0::2])
    pe_cos = np.cos(angle_rates[:, 1::2])
    pos_encoding = np.concatenate([pe_sin, pe_cos], axis=-1)
    pos_encoding = tf.cast(pos_encoding[np.newaxis, ...], tf.float32)
    return pos_encoding
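
# Quick shape check (optional): the leading dimension of 1 broadcasts over the batch.
# positional_encoding(eng_text_length, 256).shape  # -> TensorShape([1, 20, 256])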

# NOTE: the masks are essential; without them the model tends to output nothing but [start]/[end].

def create_padding_mask(seq):
    # 1.0 where the token id is 0 (padding); shape [batch, 1, 1, seq_len] so it
    # broadcasts against the attention scores [batch, heads, q_len, k_len].
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return seq[:, np.newaxis, np.newaxis, :]

# look-ahead mask


def create_look_ahead_mask(size):
    # Upper-triangular mask: position i may only attend to positions <= i.
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask  # shape=[seq_len, seq_len]
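
# For size=3 the look-ahead mask is (1 marks positions that must be hidden):
# [[0., 1., 1.],
#  [0., 0., 1.],
#  [0., 0., 0.]]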


def create_mask(inputs, targets):
    # The encoder only needs a padding mask over the source sequence.
    encoder_padding_mask = create_padding_mask(inputs)
    # decoder_padding_mask hides source padding in the second (cross-) attention layer.
    decoder_padding_mask = create_padding_mask(inputs)
    # The look-ahead mask hides tokens that have not been predicted yet.
    look_ahead_mask = create_look_ahead_mask(tf.shape(targets)[1])
    # Padding mask over the decoder inputs themselves.
    decoder_targets_padding_mask = create_padding_mask(targets)
    # Combine both for the first (masked multi-head) attention layer of the decoder.
    combined_mask = tf.maximum(decoder_targets_padding_mask, look_ahead_mask)
    return encoder_padding_mask, combined_mask, decoder_padding_mask
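
# Shape check (assuming the config above, a batch of 64 and sequence length 20):
# encoder_padding_mask and decoder_padding_mask are [64, 1, 1, 20], while
# combined_mask broadcasts to [64, 1, 20, 20] through tf.maximum.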


# Model hyper-parameters (scaled down from the original paper for a quick demo).
embedding_size = 256
n_head = 8
n_layer = 2

def splite_tensor(tensor):
    # Split the last dimension into heads:
    # [batch, seq_len, emb] -> [batch, n_head, seq_len, emb // n_head]
    shape = tf.shape(tensor)
    tensor = tf.reshape(
        tensor, shape=[shape[0], -1, n_head, embedding_size//n_head])
    tensor = tf.transpose(tensor, perm=[0, 2, 1, 3])
    return tensor

class MultiHeadAttention(layers.Layer):
    def __init__(self):
        super(MultiHeadAttention, self).__init__()

    def build(self, input_shape):
        # Linear projections for queries, keys and values.
        self.dense_query = layers.Dense(
            units=embedding_size, activation='relu')
        self.dense_key = layers.Dense(units=embedding_size, activation='relu')
        self.dense_value = layers.Dense(
            units=embedding_size, activation='relu')

        self.dropout = layers.Dropout(0.1)
        self.layer_norm = layers.LayerNormalization()
        super(MultiHeadAttention, self).build(input_shape)

    def call(self, inputs):
        query, key, value, mask = inputs
        shape = tf.shape(query)

        query_dense = self.dense_query(query)
        key_dense = self.dense_key(key)
        value_dense = self.dense_value(value)

        query_dense = splite_tensor(query_dense)
        key_dense = splite_tensor(key_dense)
        value_dense = splite_tensor(value_dense)

        # Scaled dot-product attention over all heads at once
        # (scaled here by sqrt(embedding_size); the paper uses sqrt(d_k) per head).
        attention = tf.matmul(query_dense, key_dense, transpose_b=True) / \
            tf.math.sqrt(tf.cast(embedding_size, tf.float32))
        attention += (mask*-1e9)   # masked positions get ~zero weight after softmax
        attention = tf.nn.softmax(attention)
        attention = self.dropout(attention)
        attention = tf.matmul(attention, value_dense)
        attention = tf.transpose(attention, [0, 2, 1, 3])
        attention = tf.reshape(attention, [shape[0], -1, embedding_size])

        # Residual connection followed by layer normalization.
        attention = self.layer_norm((attention+query))
        return attention
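

# Quick (commented-out) shape check for the attention layer, assuming the config above:
# mha = MultiHeadAttention()
# x = tf.random.uniform((2, eng_text_length, embedding_size))
# pad_mask = tf.zeros((2, 1, 1, eng_text_length))
# mha([x, x, x, pad_mask]).shape  # -> TensorShape([2, 20, 256])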



class EncoderLayer(layers.Layer):
    def __init__(self, n_head, emb_dim, dense_dim, dropout):
        super(EncoderLayer, self).__init__()
        self.attn = MultiHeadAttention()
        self.drop_attn = layers.Dropout(dropout)
        self.dense1 = layers.Dense(dense_dim, activation='relu')
        self.dense2 = layers.Dense(emb_dim)
        self.drop_dense = layers.Dropout(dropout)
        self.layer_norm_attn = layers.LayerNormalization()
        self.layer_norm_dense = layers.LayerNormalization()

    def call(self, inputs, training=None):
        encoder_inputs, mask = inputs

        # Self-attention block with residual connection and layer normalization.
        att_out = self.attn([encoder_inputs, encoder_inputs, encoder_inputs, mask])
        att_out = self.drop_attn(att_out, training=training)
        att_out = self.layer_norm_attn(encoder_inputs + att_out)

        # Position-wise feed-forward block, again with residual + layer norm.
        dense = self.dense1(att_out)
        dense = self.dense2(dense)
        dense = self.drop_dense(dense, training=training)
        x = self.layer_norm_dense(att_out + dense)

        return x


class Encoder(layers.Layer):
    def __init__(self, vocab, emb_dim=512, dense_dim=2048, n_layers=6, n_head=8, dropout=0.1):
        super(Encoder, self).__init__()
        self.emb_dim = emb_dim

        self.emb = layers.Embedding(input_dim=vocab, output_dim=emb_dim)
        self.pos = positional_encoding(eng_text_length, emb_dim)
        # self.emb = PositionalEmbedding(eng_text_length, vocab, emb_dim)
        
        self.encoder_layers = [EncoderLayer(
            emb_dim=emb_dim, n_head=n_head, dense_dim=dense_dim, dropout=dropout) for _ in range(n_layers)]
        self.dropout = layers.Dropout(dropout)

    def call(self, inputs, training=None):
        encoder_inputs, mask = inputs
        seq_len = encoder_inputs.shape[1]  # actual sequence length

        # Token embedding scaled by sqrt(d_model), plus the positional encoding.
        # shape=[batch_size, seq_len, d_model]
        word_embedding = self.emb(encoder_inputs)
        word_embedding *= tf.math.sqrt(tf.cast(self.emb_dim, tf.float32))
        emb = word_embedding + self.pos[:, :seq_len, :]

        x = self.dropout(emb, training=training)

        for encoder_layer in self.encoder_layers:
            x = encoder_layer([x, mask])
        return x

    # def compute_mask(self, inputs, mask=None):
    #     return self.emb.compute_mask(inputs)

class DecoderLayer(layers.Layer):
    def __init__(self, n_head, emb_dim, dense_dim, dropout):
        super(DecoderLayer, self).__init__()
        self.attn1 = MultiHeadAttention()
        self.layer_norm_attn1 = layers.LayerNormalization()
        self.drop_attn1 = layers.Dropout(dropout)

        self.attn2 = MultiHeadAttention()
        self.layer_norm_attn2 = layers.LayerNormalization()
        self.drop_attn2 = layers.Dropout(dropout)

        self.dense1 = layers.Dense(dense_dim, activation='relu')
        self.dense2 = layers.Dense(emb_dim)
        self.drop_dense = layers.Dropout(dropout)
        self.layer_norm_dense = layers.LayerNormalization()

    def call(self, inputs, training=None):
        decoder_inputs, encoder_outputs, mask1, mask2 = inputs

        # Masked self-attention over the decoder inputs
        # (mask1 combines the look-ahead mask with the target padding mask).
        att_out1 = self.attn1([decoder_inputs, decoder_inputs, decoder_inputs, mask1])
        att_out1 = self.drop_attn1(att_out1, training=training)
        att_out1 = self.layer_norm_attn1(decoder_inputs + att_out1)

        # Cross-attention: queries from the decoder, keys/values from the encoder output
        # (mask2 is the padding mask over the source sequence).
        att_out2 = self.attn2([att_out1, encoder_outputs, encoder_outputs, mask2])
        att_out2 = self.drop_attn2(att_out2, training=training)
        att_out2 = self.layer_norm_attn2(att_out1 + att_out2)

        # Position-wise feed-forward block.
        dense = self.dense1(att_out2)
        dense = self.dense2(dense)
        dense = self.drop_dense(dense, training=training)
        x = self.layer_norm_dense(att_out2 + dense)

        return x


class Decoder(layers.Layer):

    def __init__(self, vocab, n_head=8, n_layers=6, emb_dim=512, dense_dim=2048, dropout=0.1):
        super(Decoder, self).__init__()
        self.emb_dim = emb_dim
        self.emb = layers.Embedding(input_dim=vocab, output_dim=emb_dim)
        self.pos = positional_encoding(fra_text_length, emb_dim)
        # self.emb = PositionalEmbedding(fra_text_length, vocab, emb_dim)
        
        self.decoder_layers = [DecoderLayer(
            n_head=n_head, emb_dim=emb_dim, dense_dim=dense_dim, dropout=dropout) for _ in range(n_layers)]
        self.dropout = layers.Dropout(dropout)

    def call(self, inputs, training=None):
        decoder_inputs, encoder_outputs, mask1, mask2 = inputs
        seq_len = decoder_inputs.shape[1]  # actual sequence length

        # Token embedding scaled by sqrt(d_model), plus the positional encoding.
        word_embedding = self.emb(decoder_inputs)
        word_embedding *= tf.math.sqrt(tf.cast(self.emb_dim, tf.float32))
        emb = word_embedding + self.pos[:, :seq_len, :]

        x = self.dropout(emb, training=training)

        for decoder_layer in self.decoder_layers:
            x = decoder_layer([x, encoder_outputs, mask1, mask2])

        return x

    # def compute_mask(self, inputs, mask=None):
    #     return self.emb.compute_mask(inputs)


class Transformer(layers.Layer):
    def __init__(self):
        super(Transformer, self).__init__()
        # Small configuration: a single encoder/decoder layer keeps the demo fast.
        self.encoder = Encoder(vocab=num_words, n_head=8, n_layers=1,
                               emb_dim=256, dense_dim=512)
        self.decoder = Decoder(vocab=num_words, n_head=8, n_layers=1,
                               emb_dim=256, dense_dim=512)
        self.dropout = layers.Dropout(0.5)
        self.dense = layers.Dense(num_words, activation='softmax')

    def call(self, encoder_inputs, decoder_inputs):
        encoder_padding_mask, look_ahead_mask, decoder_padding_mask = create_mask(
            encoder_inputs, decoder_inputs)

        encoder_outputs = self.encoder([encoder_inputs, encoder_padding_mask])
        x = self.decoder([decoder_inputs, encoder_outputs, look_ahead_mask, decoder_padding_mask])
        x = self.dropout(x)
        decoder_outputs = self.dense(x)
        return decoder_outputs

encoder_inputs = layers.Input((None,))
decoder_inputs = layers.Input((None,))

transformer = Transformer()
decoder_outputs = transformer(encoder_inputs, decoder_inputs)
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.summary()
model.compile(optimizer='rmsprop', metrics=['acc'],
              loss="sparse_categorical_crossentropy")
# epochs=1 is only a smoke test; expect to need many more epochs for usable translations.
model.fit([encoder_input_data, decoder_input_data], decoder_output_data,
          validation_split=0.2, epochs=1, batch_size=batch_size)

model.save('trans')

Inference

The saved model is reloaded and translations are produced by greedy decoding: starting from [start], the most probable next token is appended at each step until [end] is generated or the length limit is reached.

model = keras.models.load_model('trans')
max_decoded_sentence_length = 10   # generate at most 10 target tokens (<= fra_text_length)


def decode_sequence(input_sentence):
    # Greedy decoding: re-tokenize the partial translation at every step and
    # take the argmax over the vocabulary at the current position.
    tokenized_input_sentence = eng_tokenizer([input_sentence])
    decoded_sentence = "[start]"
    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = fra_tokenizer([decoded_sentence])[:, :-1]
        predictions = model(
            [tokenized_input_sentence, tokenized_target_sentence])

        sampled_token_index = np.argmax(predictions[0, i, :])
        sampled_token = fra_idx_to_word[sampled_token_index]
        decoded_sentence += " " + sampled_token

        if sampled_token == "[end]":
            break
    return decoded_sentence


test_eng_texts = english[:30]
for _ in range(30):
    input_sentence = random.choice(test_eng_texts)
    translated = decode_sequence(input_sentence)
    print('src:',input_sentence)
    print('trans:',translated)
