前面已经分别写了文章来记录word2vec的原理与训练实践、seq2seq模型的原理与实践以及attention机制。此篇文章就是基于前面所学,开始着手训练文本摘要模型;当然这仅是一个比较普通的baseline,后面还会不断优化模型。
seq2seq模型首先利用上一节seq2seq实践中封装的encoder、decoder和attention,将它们集成到此模型中来;另外就是增加了一个训练技巧——teacher forcing。那么teacher forcing是什么意思呢?
seq2seq模型的输出为decoder解码出的一系列概率分布,因此采用何种方式进行解码,就显得尤为重要,如贪心解码(greedy search)、teacher forcing以及介于两种之间的beam search等。
贪心解码的思想是,预测 t 时刻输出的单词时,直接将t−1时刻的输出词汇表中概率最大的单词,作为t时刻的输入,因此可能导致如果前一个预测值就不准的话,后面一系列都不准的问题。
Teacher Forcing的方法是,预测 t时刻输出的单词时,直接将t−1时刻的实际单词,作为输入,因此可能带来的问题是,训练过程预测良好(因为有标签,即实际单词),但是测试过程极差(因为测试过程不会给对应的真实单词)。
实际应用中,往往采用介于这两种极端方式之间的方式。解码时可以使用beam search等,具体思路是预测 t 时刻输出的单词时,保留t−1时刻的输出词汇表中概率最大的前K个单词,以此带来更多的可能性(解决第一个方法的缺陷);而在训练过程中,则可以采用一定的概率P,来决定是否使用真实单词作为输入(即scheduled sampling,解决第二个方法的缺陷)。greedy search 和beam search后面我们也会一一介绍,下面是teacher forcing的具体实现。
import tensorflow as tf
from src.seq2seq_tf2.model_layers import Encoder, BahdanauAttention, Decoder
from src.utils.gpu_utils import config_gpu
from src.utils.params_utils import get_params
from src.utils.wv_loader import load_embedding_matrix, Vocab
class Seq2Seq(tf.keras.Model):
    """Sequence-to-sequence model decoded with teacher forcing.

    Bundles the project's ``Encoder``, ``BahdanauAttention`` and ``Decoder``
    layers. The encoder and the decoder are both handed the matrix returned
    by ``load_embedding_matrix()``.
    """

    def __init__(self, params, vocab):
        """Create the sub-layers from the hyper-parameters in ``params``.

        Args:
            params: Mapping providing ``"batch_size"``, ``"enc_units"``,
                ``"dec_units"`` and ``"attn_units"``.
            vocab: Vocabulary object; its ``START_DECODING_INDEX`` is read
                by ``teacher_decoder``.
        """
        super().__init__()
        self.embedding_matrix = load_embedding_matrix()
        self.params = params
        self.vocab = vocab

        self.batch_size = params["batch_size"]
        self.enc_units = params["enc_units"]
        self.dec_units = params["dec_units"]
        self.attn_units = params["attn_units"]

        self.encoder = Encoder(self.embedding_matrix, self.enc_units, self.batch_size)
        self.attention = BahdanauAttention(self.attn_units)
        self.decoder = Decoder(self.embedding_matrix, self.dec_units, self.batch_size)

    def teacher_decoder(self, dec_hidden, enc_output, dec_target):
        """Run the decoder over the target sequence with teacher forcing.

        Args:
            dec_hidden: Initial decoder hidden state.
            enc_output: Encoder outputs, passed to the decoder at every step.
            dec_target: Ground-truth target ids, indexed as
                ``dec_target[:, t]`` (assumes batch on axis 0 and time on
                axis 1 — TODO confirm against the batcher).

        Returns:
            A tuple ``(predictions, dec_hidden)``: the per-step decoder
            outputs stacked along axis 1 (one entry for each target position
            from 1 onwards) and the hidden state after the last step.
        """
        # The first decoder input is the start token, repeated batch_size times.
        start_ids = [self.vocab.START_DECODING_INDEX] * self.batch_size
        step_input = tf.expand_dims(start_ids, 1)

        step_outputs = []
        for step in range(1, dec_target.shape[1]):
            # One decoder step; the third return value is not used here.
            step_pred, dec_hidden, _ = self.decoder(step_input, dec_hidden, enc_output)
            step_outputs.append(step_pred)
            # Teacher forcing: the next input is the ground-truth token at
            # this position, not the model's own prediction.
            step_input = tf.expand_dims(dec_target[:, step], 1)

        return tf.stack(step_outputs, 1), dec_hidden
import tensorflow as tf
from src.seq2seq_tf2.seq2seq_model import Seq2Seq
from src.seq2seq_tf2.train_helper import train_model
from src.utils.gpu_utils import config_gpu
from src.utils.params_utils import get_params
from src.utils.wv_loader import Vocab
def train(params):
    """Build the vocabulary, model and checkpoint manager, then train.

    Args:
        params: Mutable mapping of settings. Reads ``"vocab_path"``,
            ``"vocab_size"`` and ``"checkpoint_dir"``; ``"vocab_size"`` is
            overwritten with ``vocab.count`` once the vocabulary is loaded.
    """
    # Device configuration.
    # NOTE(review): use_cpu=True presumably forces CPU execution — confirm
    # against config_gpu before running on a GPU machine.
    config_gpu(use_cpu=True)

    # Load the vocabulary and record its actual size in params.
    vocab = Vocab(params["vocab_path"], params["vocab_size"])
    params['vocab_size'] = vocab.count

    print("Building the model ...")
    seq2seq = Seq2Seq(params, vocab)

    # Checkpointing: at most 5 checkpoints are kept in checkpoint_dir.
    ckpt = tf.train.Checkpoint(Seq2Seq=seq2seq)
    ckpt_manager = tf.train.CheckpointManager(
        ckpt,
        params['checkpoint_dir'],
        max_to_keep=5,
    )

    train_model(seq2seq, vocab, params, ckpt_manager)
import tensorflow as tf
# from src.pgn_tf2.batcher import batcher
from src.seq2seq_tf2.seq2seq_batcher import train_batch_generator
import time
from functools import partial
def train_model(model, vocab, params, checkpoint_manager):
    """Train ``model`` for ``params['epochs']`` epochs.

    Each epoch prints a running loss every 50 batches, saves a checkpoint on
    every second epoch, evaluates on the validation dataset and prints the
    epoch summary and elapsed time.

    Args:
        model: Model exposing ``encoder.initialize_hidden_state()``; it is
            forwarded to ``train_step`` and ``evaluate``.
        vocab: Vocabulary providing ``word2id``, ``PAD_TOKEN`` and ``count``.
        params: Mutable mapping providing ``"epochs"``, ``"learning_rate"``,
            ``"batch_size"``, ``"max_enc_len"``, ``"max_dec_len"`` and
            ``"buffer_size"``; ``"vocab_size"`` is overwritten with
            ``vocab.count``.
        checkpoint_manager: Object whose ``save()`` writes a checkpoint and
            returns its path.
    """
    num_epochs = params['epochs']
    pad_index = vocab.word2id[vocab.PAD_TOKEN]
    # Record the vocabulary size.
    params['vocab_size'] = vocab.count

    optimizer = tf.keras.optimizers.Adam(name='Adam', learning_rate=params['learning_rate'])
    train_dataset, val_dataset, train_steps_per_epoch, val_steps_per_epoch = train_batch_generator(
        params['batch_size'], params['max_enc_len'], params['max_dec_len'], params['buffer_size']
    )

    # The padding index never changes, so one bound loss serves every call.
    masked_loss = partial(loss_function, pad_index=pad_index)

    for epoch_no in range(1, num_epochs + 1):
        epoch_start = time.time()
        enc_hidden = model.encoder.initialize_hidden_state()
        total_loss = 0.
        logged_loss = 0.

        epoch_batches = train_dataset.take(train_steps_per_epoch)
        for batch_no, (inputs, target) in enumerate(epoch_batches, start=1):
            total_loss += train_step(model, inputs, target, enc_hidden,
                                     loss_function=masked_loss,
                                     optimizer=optimizer)
            if batch_no % 50 != 0:
                continue
            # Mean loss over the 50 batches since the previous report.
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch_no,
                                                         batch_no,
                                                         (total_loss - logged_loss) / 50))
            logged_loss = total_loss

        # Save a checkpoint every 2 epochs.
        if epoch_no % 2 == 0:
            ckpt_save_path = checkpoint_manager.save()
            print('Saving checkpoint for epoch {} at {}'.format(epoch_no,
                                                                ckpt_save_path))

        valid_loss = evaluate(model, val_dataset, val_steps_per_epoch,
                              loss_func=masked_loss)
        print('Epoch {} Loss {:.4f}; val Loss {:.4f}'.format(
            epoch_no, total_loss / train_steps_per_epoch, valid_loss)
        )
        print('Time taken for 1 epoch {} sec\n'.format(time.time() - epoch_start))
# Loss function.
def loss_function(real, pred, pad_index):
    """Return the mean sparse cross-entropy over non-padding target tokens.

    Positions where ``real == pad_index`` contribute nothing to the loss.
    The sum of the remaining per-token losses is divided by the number of
    non-padding tokens rather than by the total number of positions, so the
    value (and the gradient scale) does not shrink as a batch contains more
    padding.

    Args:
        real: Integer target ids.
        pred: Model outputs for the same positions, with a trailing class
            axis. They are treated as logits (``from_logits=True``).
            NOTE(review): confirm the decoder does not already apply softmax.
        pad_index: Id of the padding token to exclude.

    Returns:
        A scalar tensor; ``0`` when every position is padding.
    """
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
    token_loss = loss_object(real, pred)

    # 1.0 for real tokens, 0.0 for padding, in the dtype of the loss.
    keep = tf.cast(tf.math.not_equal(real, pad_index), dtype=token_loss.dtype)

    # divide_no_nan yields 0 instead of NaN for a batch made only of padding.
    return tf.math.divide_no_nan(tf.reduce_sum(token_loss * keep), tf.reduce_sum(keep))
def train_step(model, enc_inp, dec_target, enc_hidden, loss_function=None, optimizer=None, mode='train'):
    """Run one forward pass and, in training mode, one optimizer update.

    Args:
        model: Model exposing ``encoder``, ``decoder``, ``attention`` and
            ``teacher_decoder``.
        enc_inp: Encoder input batch.
        dec_target: Target batch; position 0 is skipped when computing the
            loss (``dec_target[:, 1:]``).
        enc_hidden: Initial encoder hidden state.
        loss_function: Callable ``(real, pred) -> loss``.
        optimizer: Optimizer used to apply gradients; only needed when
            ``mode == 'train'``.
        mode: ``'train'`` to update the weights; any other value only
            computes the loss.

    Returns:
        The loss of this batch.
    """
    with tf.GradientTape() as tape:
        enc_output, enc_state = model.encoder(enc_inp, enc_hidden)
        # The decoder starts from the hidden state returned by the encoder
        # and is driven step by step with the ground-truth tokens.
        predictions, _ = model.teacher_decoder(enc_state, enc_output, dec_target)
        batch_loss = loss_function(dec_target[:, 1:], predictions)

    if mode != 'train':
        return batch_loss

    variables = (model.encoder.trainable_variables + model.decoder.trainable_variables
                 + model.attention.trainable_variables)
    grads = tape.gradient(batch_loss, variables)
    # Clip the global gradient norm to 1.0 before the update.
    clipped_grads, _ = tf.clip_by_global_norm(grads, 1.0)
    optimizer.apply_gradients(zip(clipped_grads, variables))
    return batch_loss
def evaluate(model, val_dataset, val_steps_per_epoch, loss_func):
    """Return the mean per-batch loss over the validation dataset.

    Args:
        model: Model forwarded to ``train_step``; its encoder supplies the
            initial hidden state.
        val_dataset: Dataset yielding ``(inputs, target)`` batches.
        val_steps_per_epoch: Number of batches to take, and the divisor of
            the summed loss.
        loss_func: Callable ``(real, pred) -> loss``.

    Returns:
        Summed batch loss divided by ``val_steps_per_epoch``.
    """
    print('Starting evaluate ...')
    enc_hidden = model.encoder.initialize_hidden_state()

    loss_sum = 0.
    for inputs, target in val_dataset.take(val_steps_per_epoch):
        # mode='val' makes train_step skip the gradient update.
        loss_sum += train_step(model, inputs, target, enc_hidden,
                               loss_function=loss_func, mode='val')

    return loss_sum / val_steps_per_epoch
from src.build_data.data_loader import load_dataset
import tensorflow as tf
from src.utils import config
from tqdm import tqdm
def train_batch_generator(batch_size, max_enc_len=200, max_dec_len=50, buffer_size=5, sample_sum=None):
    """Build shuffled, batched training and validation datasets.

    Training data is loaded from ``config.train_x_path`` /
    ``config.train_y_path``; validation data from ``config.test_x_path`` /
    ``config.test_y_path``.

    Args:
        batch_size: Examples per batch; incomplete final batches are dropped.
        max_enc_len: Forwarded to ``load_dataset`` for the inputs.
        max_dec_len: Forwarded to ``load_dataset`` for the targets.
        buffer_size: Number of batches to prefetch.
        sample_sum: If truthy, keep only this many leading training examples.

    Returns:
        ``(train_dataset, val_dataset, train_steps_per_epoch,
        val_steps_per_epoch)``.
    """
    # Load both splits.
    train_X, train_Y = load_dataset(config.train_x_path, config.train_y_path,
                                    max_enc_len, max_dec_len)
    val_X, val_Y = load_dataset(config.test_x_path, config.test_y_path,
                                max_enc_len, max_dec_len)

    if sample_sum:
        train_X, train_Y = train_X[:sample_sum], train_Y[:sample_sum]
    print(f'total {len(train_Y)} examples ...')

    def _shuffled(features, labels):
        # Shuffle over the whole split, reshuffling on every iteration.
        pairs = tf.data.Dataset.from_tensor_slices((features, labels))
        return pairs.shuffle(len(features), reshuffle_each_iteration=True)

    def _batched(dataset):
        return dataset.batch(batch_size, drop_remainder=True).prefetch(buffer_size)

    train_dataset = _shuffled(train_X, train_Y)
    val_dataset = _shuffled(val_X, val_Y)
    train_dataset = _batched(train_dataset)
    val_dataset = _batched(val_dataset)

    # Whole batches per split, matching drop_remainder=True.
    train_steps_per_epoch = len(train_X) // batch_size
    val_steps_per_epoch = len(val_X) // batch_size
    return train_dataset, val_dataset, train_steps_per_epoch, val_steps_per_epoch
def load_dataset(x_path, y_path, max_enc_len, max_dec_len, sample_sum=None):
    """Load the saved input and target arrays and truncate them.

    ``".npy"`` is appended to both paths before loading.

    Args:
        x_path: Path of the input array file, without the ``.npy`` suffix.
        y_path: Path of the target array file, without the ``.npy`` suffix.
        max_enc_len: Number of leading columns to keep from the inputs.
        max_dec_len: Number of leading columns to keep from the targets.
        sample_sum: If truthy, keep only this many leading rows; otherwise
            keep every row.

    Returns:
        ``(x, y)``, the truncated input and target arrays.
    """
    # A falsy sample_sum (None or 0) keeps all rows: slicing with [:None]
    # is the same as [:].
    row_limit = sample_sum if sample_sum else None

    inputs = np.load(x_path + ".npy")
    targets = np.load(y_path + ".npy")
    return inputs[:row_limit, :max_enc_len], targets[:row_limit, :max_dec_len]