Huawei Innovation Lab 3

This is the source code I produced while working through the comprehensive lab manual; it is now hosted on this platform.

# Download the dataset
! wget https://huawei-ai-certification.obs.cn-north-4.myhuaweicloud.com/CHS/HCIP-AI%20EI%20Developer/V2.1/machine_translation/data.zip

# Unzip the dataset
! unzip data.zip

# Import the required libraries
import re
import os
import io
import time
import jieba
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

# Specify the data path
path_to_file = "data/cmn.txt"

# Define the preprocessing functions: lowercase, separate punctuation (English),
# segment with jieba (Chinese), and wrap every sentence in <start>/<end> markers
# so the model knows where a sentence begins and ends.
def preprocess_eng(w):
    w = w.lower().strip()
    # put a space around punctuation so it becomes a separate token
    w = re.sub(r"([?.!,])", r" \1 ", w)
    w = re.sub(r'[" "]+', " ", w)
    # keep only letters and basic punctuation
    w = re.sub(r"[^a-zA-Z?.!,]+", " ", w)
    w = w.rstrip().strip()
    w = '<start> ' + w + ' <end>'
    return w

def preprocess_chinese(w):
    w = w.lower().strip()
    # segment the sentence into words with jieba
    w = jieba.cut(w, cut_all=False, HMM=True)
    w = " ".join(list(w))
    w = '<start> ' + w + ' <end>'
    return w

# Test the preprocessing
en_sentence = "May I borrow this book?"
chn_sentence = "我可以借这本书吗?"
print(preprocess_eng(en_sentence))
print(preprocess_chinese(chn_sentence))
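
For reference, the English sentence should come out roughly as "<start> may i borrow this book ? <end>": lowercased, with the question mark split off as its own token and the sentence wrapped in the start/end markers. The Chinese sentence is segmented by jieba into space-separated words between the same markers; the exact segmentation depends on the jieba version and dictionary.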

# Read the data; each element has the form [English, Chinese]
def create_dataset(path, num_examples=None):
    lines = open(path, encoding='UTF-8').read().strip().split('\n')
    word_pairs = [[w for w in l.split('\t')] for l in lines[:num_examples]]
    word_pairs = [[preprocess_eng(w[0]), preprocess_chinese(w[1])]
                  for w in word_pairs]
    return word_pairs

word_pairs = create_dataset(path_to_file)
word_pairs[:20]

# Separate the Chinese and English sentences
en, chn = zip(*create_dataset(path_to_file))
print(en[-1])
print(chn[-1])
print(len(en), len(chn))

# Take the maximum text length in the corpus; all texts are padded to this
# common length so the model can train on fixed-size batches.
def max_length(tensor):
    return max(len(t) for t in tensor)

def tokenize(lang):
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    lang_tokenizer.fit_on_texts(lang)
    text_ids = lang_tokenizer.texts_to_sequences(lang)
    padded_text_ids = tf.keras.preprocessing.sequence.pad_sequences(text_ids, padding='post')
    return padded_text_ids, lang_tokenizer
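
The tokenize function assigns an integer id to every space-separated token (filters='' keeps the <start>/<end> markers and the punctuation tokens intact), and pad_sequences right-pads shorter sequences with 0. A minimal sanity check of my own (not part of the lab manual), run on two toy sentences:

# Hedged sanity check (my own addition): ids are assigned by word frequency,
# and the shorter sequence gets trailing zero padding.
demo_ids, demo_tokenizer = tokenize(["<start> hello world <end>", "<start> hello <end>"])
print(demo_tokenizer.word_index)   # token -> id mapping
print(demo_ids)                    # second row ends with a 0 pad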

# Use Chinese as the source language and English as the target language
def load_dataset(path, num_examples=None):
    targ_lang, inp_lang = zip(*create_dataset(path, num_examples))
    input_data, inp_lang_tokenizer = tokenize(inp_lang)
    target_data, targ_lang_tokenizer = tokenize(targ_lang)
    return input_data, target_data, inp_lang_tokenizer, targ_lang_tokenizer

# Limit the training-set size if desired (None means use the full dataset),
# build the tensors, and split off a 5% validation set
num_examples = None
input_data, target_data, inp_lang, targ_lang = load_dataset(path_to_file, num_examples)
max_length_targ, max_length_inp = max_length(target_data), max_length(input_data)
input_train, input_val, target_train, target_val = train_test_split(input_data, target_data, test_size=0.05)
print(len(input_train), len(target_train), len(input_val), len(target_val))

# Check the word-to-id mapping
def convert(lang, data):
    for t in data:
        if t != 0:
            print("%d ----> %s" % (t, lang.index_word[t]))

print("Input: source language (Chinese), word-to-id mapping")
convert(inp_lang, input_train[0])
print()
print("Output: target language (English), word-to-id mapping")
convert(targ_lang, target_train[0])

# Step 10: Convert to a tf.data.Dataset
BUFFER_SIZE = len(input_train)
BATCH_SIZE = 64
steps_per_epoch = len(input_train)//BATCH_SIZE
embedding_dim = 256
units = 1024
vocab_inp_size = len(inp_lang.word_index) + 1
vocab_tar_size = len(targ_lang.word_index) + 1
dataset = tf.data.Dataset.from_tensor_slices((input_train, target_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape


# Define the Encoder
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.enc_units,
                                       return_sequences=True,
                                       return_state=True,
                                       # recurrent_activation='sigmoid',
                                       recurrent_initializer='glorot_uniform')

    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state=hidden)
        return output, state

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))

encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)
print('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print('Encoder hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))
print(sample_output[-1, -1, :] == sample_hidden[-1, :])

# Define the Attention layer

class BahdanauAttention(tf.keras.Model):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        # query: decoder hidden state (batch, units); values: encoder outputs (batch, seq_len, units)
        hidden_with_time_axis = tf.expand_dims(query, 1)
        # score shape: (batch, seq_len, 1)
        score = self.V(tf.nn.tanh(self.W1(values) + self.W2(hidden_with_time_axis)))
        # the weights sum to 1 over the input sequence (axis=1)
        attention_weights = tf.nn.softmax(score, axis=1)
        # context vector: weighted sum of the encoder outputs, shape (batch, units)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, attention_weights

attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)
print("Attention 输出的维度: (batch size, units) {}".format(attention_result.shape))
print("Attention 权值参数的维度: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

# Define the Decoder

class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, hidden, enc_output):
        # attend over the encoder outputs using the current decoder hidden state
        context_vector, attention_weights = self.attention(hidden, enc_output)
        x = self.embedding(x)
        # concatenate the context vector with the embedded input token
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        output, state = self.gru(x)
        output = tf.reshape(output, (-1, output.shape[2]))
        x = self.fc(output)
        return x, state, attention_weights

decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)
sample_decoder_output, _, _ = decoder(tf.random.uniform((64, 1)), sample_hidden, sample_output)
print('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

# Define the optimizer and the loss
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    # mask out padded positions (token id 0) so they do not contribute to the loss
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_mean(loss_)
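
A quick way to convince yourself the padding mask works (my own addition, not from the lab manual): build a tiny batch in which the second label is the padding id 0 and compare the raw per-example losses with the masked result.

# Hedged sanity check (my own addition): the padded example contributes 0 to the loss
real_demo = tf.constant([5, 0])                      # second example is padding at this step
pred_demo = tf.random.normal((2, vocab_tar_size))    # fake logits
print(loss_object(real_demo, pred_demo).numpy())     # two per-example losses, both nonzero
print(loss_function(real_demo, pred_demo).numpy())   # mean after zeroing the padded example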

# Set the checkpoint save path
checkpoint_dir = 'checkpoints/chinese-eng'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer, encoder=encoder, decoder=decoder)

# Step 16: Train the model
@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0
    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)
        dec_hidden = enc_hidden
        # every target sentence starts with the <start> token
        dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)
        # teacher forcing: the ground-truth token at step t is fed as the decoder input for step t+1
        for t in range(1, targ.shape[1]):
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
            loss += loss_function(targ[:, t], predictions)
            dec_input = tf.expand_dims(targ[:, t], 1)
    batch_loss = (loss / int(targ.shape[1]))
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return batch_loss

EPOCHS = 2
for epoch in range(EPOCHS):
    start = time.time()
    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0
    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss
        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, batch, batch_loss.numpy()))
    # save a checkpoint every 2 epochs
    if (epoch + 1) % 2 == 0:
        checkpoint.save(file_prefix=checkpoint_prefix)
    print('Epoch {} Loss {:.4f}'.format(epoch + 1, total_loss / steps_per_epoch))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))


# Define the evaluation and visualization functions
def evaluate(sentence):
    attention_plot = np.zeros((max_length_targ, max_length_inp))
    sentence = preprocess_chinese(sentence)
    inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs], maxlen=max_length_inp, padding='post')
    inputs = tf.convert_to_tensor(inputs)
    result = ''
    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)
    for t in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_out)
        # keep the attention weights of this step for plotting later
        attention_weights = tf.reshape(attention_weights, (-1,))
        attention_plot[t] = attention_weights.numpy()
        # greedy decoding: pick the most likely next word
        predicted_id = tf.argmax(predictions[0]).numpy()
        result += targ_lang.index_word[predicted_id] + ' '
        # stop as soon as the <end> token is produced
        if targ_lang.index_word[predicted_id] == '<end>':
            return result, sentence, attention_plot
        # feed the predicted id back into the decoder as the next input
        dec_input = tf.expand_dims([predicted_id], 0)
    return result, sentence, attention_plot

def plot_attention(attention, sentence, predicted_sentence):
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(1, 1, 1)
    ax.matshow(attention, cmap='viridis')
    fontdict = {'fontsize': 14}
    ax.set_xticklabels([''] + sentence, fontdict=fontdict)
    ax.set_yticklabels([''] + predicted_sentence, fontdict=fontdict)
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))
    plt.show()

def translate(sentence):
    result, sentence, attention_plot = evaluate(sentence)
    print('Input: %s' % (sentence))
    print('Predicted translation: {}'.format(result))
    attention_plot = attention_plot[:len(result.split(' ')), :len(sentence.split(' '))]
    plot_attention(attention_plot, sentence.split(' '), result.split(' '))

# Load the model from the checkpoint and test it offline
checkpoint_dir = 'checkpoints/chinese-eng'
print(tf.train.latest_checkpoint(checkpoint_dir))
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))
translate('我有一只猫')
