This is the source code I produced while working through my lab handbook; it is now hosted on this platform.
# Download the data
! wget https://huawei-ai-certification.obs.cn-north-4.myhuaweicloud.com/CHS/HCIP-AI%20EI%20Developer/V2.1/machine_translation/data.zip
# Unzip the data
! unzip data.zip
# Import the required libraries
import re
import os
import io
import time
import jieba
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split
# Specify the data path
path_to_file = "data/cmn.txt"
# Define the preprocessing functions
def preprocess_eng(w):
    w = w.lower().strip()
    # insert a space between a word and the punctuation that follows it
    w = re.sub(r"([?.!,])", r" \1 ", w)
    w = re.sub(r'[" "]+', " ", w)
    # keep only letters and the punctuation marks ? . ! ,
    w = re.sub(r"[^a-zA-Z?.!,]+", " ", w)
    w = w.rstrip().strip()
    # add start and end tokens so the model knows where the sentence begins and ends
    w = '<start> ' + w + ' <end>'
    return w

def preprocess_chinese(w):
    w = w.lower().strip()
    # segment the Chinese sentence into words with jieba
    w = jieba.cut(w, cut_all=False, HMM=True)
    w = " ".join(list(w))
    w = '<start> ' + w + ' <end>'
    return w
# Test the preprocessing
en_sentence = "May I borrow this book?"
chn_sentence = "我可以借这本书吗?"
print(preprocess_eng(en_sentence))
print(preprocess_chinese(chn_sentence))
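# For reference: with the rules above, the English sentence should print as
#   <start> may i borrow this book ? <end>
# The Chinese output depends on how jieba segments the sentence, so the exact
# tokens may differ between environments.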
# Read the data; each element has the form [English, Chinese]
def create_dataset(path, num_examples=None):
    lines = open(path, encoding='UTF-8').read().strip().split('\n')
    word_pairs = [[w for w in l.split('\t')] for l in lines[:num_examples]]
    word_pairs = [[preprocess_eng(w[0]), preprocess_chinese(w[1])]
                  for w in word_pairs]
    return word_pairs
word_pairs = create_dataset(path_to_file)
word_pairs[:20]
# Separate English and Chinese
en, chn = zip(*create_dataset(path_to_file))
print(en[-1])
print(chn[-1])
print(len(en), len(chn))
# Get the maximum text length in the data; all texts are padded to this common length so the model can be trained
def max_length(tensor):
    return max(len(t) for t in tensor)

def tokenize(lang):
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    lang_tokenizer.fit_on_texts(lang)
    text_ids = lang_tokenizer.texts_to_sequences(lang)
    padded_text_ids = tf.keras.preprocessing.sequence.pad_sequences(text_ids, padding='post')
    return padded_text_ids, lang_tokenizer
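# A minimal illustration of what tokenize() returns (the ids below are only an
# assumption for this sketch; the real values depend on the fitted vocabulary):
#   tokenize(["<start> hi <end>", "<start> hi there <end>"])
#   -> (array([[1, 2, 3, 0],
#              [1, 2, 4, 3]]), <Tokenizer>)   # 'post' padding appends zeros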
# Use Chinese as the source language and English as the target language
def load_dataset(path, num_examples=None):
    targ_lang, inp_lang = zip(*create_dataset(path, num_examples))
    input_data, inp_lang_tokenizer = tokenize(inp_lang)
    target_data, targ_lang_tokenizer = tokenize(targ_lang)
    return input_data, target_data, inp_lang_tokenizer, targ_lang_tokenizer
# Limit on the number of examples to use (None = use the whole dataset)
num_examples = None
input_data, target_data, inp_lang, targ_lang = load_dataset(path_to_file, num_examples)
max_length_targ, max_length_inp = max_length(target_data), max_length(input_data)
input_train, input_val, target_train, target_val = train_test_split(input_data, target_data, test_size=0.05)
print(len(input_train), len(target_train), len(input_val), len(target_val))
# Inspect the mapping between words and ids
def convert(lang, data):
    for t in data:
        if t != 0:
            print("%d ----> %s" % (t, lang.index_word[t]))
print("输入:源语言:中文, 词和 id 的映射关系")
convert(inp_lang, input_train[0])
print()
print("输出:目标语言:英文, 词和 id 的映射关系")
convert(targ_lang, target_train[0])
# Step 10: convert to tf.data.Dataset
BUFFER_SIZE = len(input_train)
BATCH_SIZE = 64
steps_per_epoch = len(input_train)//BATCH_SIZE
embedding_dim = 256
units = 1024
vocab_inp_size = len(inp_lang.word_index) + 1
vocab_tar_size = len(targ_lang.word_index) + 1
dataset = tf.data.Dataset.from_tensor_slices((input_train, target_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape
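# The shapes here should be (BATCH_SIZE, max_length_inp) and (BATCH_SIZE, max_length_targ),
# i.e. 64 padded Chinese sequences and 64 padded English sequences.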
# Define the Encoder
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.enc_units,
                                       return_sequences=True,
                                       return_state=True,
                                       # recurrent_activation='sigmoid',
                                       recurrent_initializer='glorot_uniform')

    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state=hidden)
        return output, state

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)
print('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print('Encoder hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))
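# For a GRU with return_sequences=True and return_state=True, the returned state
# equals the output at the last time step, so the comparison below should print all True.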
print(sample_output[-1, -1, :] == sample_hidden[-1, :])
# Define the Attention layer
class BahdanauAttention(tf.keras.Model):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        hidden_with_time_axis = tf.expand_dims(query, 1)
        score = self.V(tf.nn.tanh(self.W1(values) + self.W2(hidden_with_time_axis)))
        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, attention_weights
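# What this layer computes (Bahdanau additive attention), in the notation of the code:
#   score_t  = V(tanh(W1(enc_output_t) + W2(dec_hidden)))   for each encoder step t
#   weights  = softmax(score) over the time axis
#   context  = sum_t weights_t * enc_output_t
# The context vector is later concatenated with the decoder input embedding.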
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)
print("Attention 输出的维度: (batch size, units) {}".format(attention_result.shape))
print("Attention 权值参数的维度: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))
# Define the Decoder
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, hidden, enc_output):
        context_vector, attention_weights = self.attention(hidden, enc_output)
        x = self.embedding(x)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        output, state = self.gru(x)
        output = tf.reshape(output, (-1, output.shape[2]))
        x = self.fc(output)
        return x, state, attention_weights
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)
sample_decoder_output, _, _ = decoder(tf.random.uniform((64, 1)), sample_hidden, sample_output)
print('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))
# Define the optimizer and the loss
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_mean(loss_)
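# Quick sanity check (not part of the original lab): padding positions (id 0) are
# masked out, so only real tokens contribute to the loss. The tensors below are
# made up purely for this illustration.
_dummy_real = tf.constant([4, 0])                     # one real token id, one padding id
_dummy_pred = tf.random.uniform((2, vocab_tar_size))  # fake logits for two examples
print(loss_function(_dummy_real, _dummy_pred))        # only the first example contributes, averaged over the batch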
# Set the checkpoint save path
checkpoint_dir = 'checkpoints/chinese-eng'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer, encoder=encoder, decoder=decoder)
# Step 16: train the model
@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0
    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)
        dec_hidden = enc_hidden
        dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)
        for t in range(1, targ.shape[1]):
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
            loss += loss_function(targ[:, t], predictions)
            dec_input = tf.expand_dims(targ[:, t], 1)
    batch_loss = (loss / int(targ.shape[1]))
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return batch_loss
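# Note on the loop above: this is teacher forcing. At every decoding step the
# ground-truth token targ[:, t] is fed in as the next decoder input instead of
# the model's own prediction, which stabilizes and speeds up training.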
EPOCHS = 2
for epoch in range(EPOCHS):
    start = time.time()
    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0
    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss
        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, batch, batch_loss.numpy()))
    if (epoch + 1) % 2 == 0:
        checkpoint.save(file_prefix=checkpoint_prefix)
    print('Epoch {} Loss {:.4f}'.format(epoch + 1, total_loss / steps_per_epoch))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))
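# With EPOCHS = 2 the condition (epoch + 1) % 2 == 0 fires exactly once, so a
# single checkpoint is written at the end of training; that is the checkpoint the
# offline test at the bottom of this script restores. Translations will likely
# still be rough after only two epochs; increase EPOCHS for better results.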
# Define the evaluation and visualization functions
def evaluate(sentence):
    attention_plot = np.zeros((max_length_targ, max_length_inp))
    sentence = preprocess_chinese(sentence)
    inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs], maxlen=max_length_inp, padding='post')
    inputs = tf.convert_to_tensor(inputs)
    result = ''
    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)
    for t in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_out)
        attention_weights = tf.reshape(attention_weights, (-1,))
        attention_plot[t] = attention_weights.numpy()
        predicted_id = tf.argmax(predictions[0]).numpy()
        result += targ_lang.index_word[predicted_id] + ' '
        if targ_lang.index_word[predicted_id] == '<end>':
            return result, sentence, attention_plot
        dec_input = tf.expand_dims([predicted_id], 0)
    return result, sentence, attention_plot
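# evaluate() decodes greedily: at each step the argmax token is appended to the
# result and fed back in as the next decoder input; decoding stops at '<end>' or
# after max_length_targ steps. The attention weights of every step are kept in
# attention_plot for the heat map below.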
def plot_attention(attention, sentence, predicted_sentence):
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(1, 1, 1)
    ax.matshow(attention, cmap='viridis')
    fontdict = {'fontsize': 14}
    ax.set_xticklabels([''] + sentence, fontdict=fontdict)
    ax.set_yticklabels([''] + predicted_sentence, fontdict=fontdict)
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))
    plt.show()
def translate(sentence):
    result, sentence, attention_plot = evaluate(sentence)
    print('Input: %s' % (sentence))
    print('Predicted translation: {}'.format(result))
    attention_plot = attention_plot[:len(result.split(' ')), :len(sentence.split(' '))]
    plot_attention(attention_plot, sentence.split(' '), result.split(' '))
# Load the saved model offline and test it
checkpoint_dir = 'checkpoints/chinese-eng'
print(tf.train.latest_checkpoint(checkpoint_dir))
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))
translate('我有一只猫')
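# Note: evaluate() looks tokens up with inp_lang.word_index[i], so a test sentence
# containing a word (after jieba segmentation) that never appeared in training will
# raise a KeyError; stick to vocabulary covered by cmn.txt when testing.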