We analyze and compare different language models on the task of generating new lyrics from Jay Chou's lyric data.
The training set consists of the lyrics from ten of Jay Chou's albums. It is used to train a language model, which is then used to generate new lyrics.
Read the data set with a with open block and replace line breaks with spaces. Both '\n' and '\r' need to be replaced:
with open('jaychou_lyrics.txt', encoding='utf-8') as f:
    # Read the whole file into a single string
    corpus_chars = f.read()
corpus_chars = corpus_chars.replace('\n', ' ').replace('\r', ' ')
print(corpus_chars)
Map each character to a consecutive integer index starting from 0.
First collect all distinct characters in the data set, then map them to indices to build the vocabulary, and finally convert every character in the training set into its index.
# List of all distinct characters: index -> character
idx_to_char = list(set(corpus_chars))
# Dictionary mapping each character to its index
char_to_idx = dict([(char, i) for i, char in enumerate(idx_to_char)])
vocab_size = len(char_to_idx)
# Convert every character in the data set to its index and print the first 20 characters with their indices
corpus_indices = [char_to_idx[char] for char in corpus_chars]
sample = corpus_indices[:20]
print('chars:', ''.join([idx_to_char[idx] for idx in sample]))
print('indices:', sample)
During training we need to read a random mini-batch of samples and labels at each step. A sample of this time-series data usually consists of consecutive characters.
Each sampled sequence is split into a sample sequence and a label sequence; each label is the next character of the corresponding sample character.
Random sampling draws one mini-batch from the data at a time; batch_size is the number of samples per mini-batch and num_steps is the number of time steps per sample.
With random sampling each sample is a segment cut from an arbitrary position of the original sequence, so two adjacent random mini-batches are generally not adjacent in the original sequence.
Therefore the hidden state has to be re-initialized before each random mini-batch, because the hidden state at the final time step of one mini-batch cannot be used to initialize the hidden state of the next mini-batch.
import random
from mxnet import nd

def data_iter_random(corpus_indices, batch_size, num_steps, ctx=None):
    # Subtract 1 because the label is the input shifted by one position
    num_examples = (len(corpus_indices) - 1) // num_steps
    epoch_size = num_examples // batch_size
    example_indices = list(range(num_examples))
    random.shuffle(example_indices)

    # Return the sequence of length num_steps starting at pos
    def _data(pos):
        return corpus_indices[pos: pos + num_steps]

    for i in range(epoch_size):
        # Read batch_size random samples each time
        i = i * batch_size
        batch_indices = example_indices[i: i + batch_size]
        X = [_data(j * num_steps) for j in batch_indices]
        Y = [_data(j * num_steps + 1) for j in batch_indices]
        yield nd.array(X, ctx), nd.array(Y, ctx)
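To see what the random sampler returns, here is a quick check (not from the original notes) on a hypothetical toy sequence of the integers 0 to 29; each label sequence Y should be its sample sequence X shifted by one position:
# Hypothetical sanity check on a toy sequence
my_seq = list(range(30))
for X, Y in data_iter_random(my_seq, batch_size=2, num_steps=6):
    print('X:', X, '\nY:', Y, '\n')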
The purpose of consecutive sampling is that the hidden state at the final time step of one mini-batch can be used to initialize the hidden state of the next mini-batch, so that the output of the next mini-batch also depends on the input of the current mini-batch, and so on through the whole epoch.
def data_iter_consecutive(corpus_indices, batch_size, num_steps, ctx=None):
    corpus_indices = nd.array(corpus_indices, ctx=ctx)
    data_len = len(corpus_indices)
    batch_len = data_len // batch_size
    # Reshape the corpus into batch_size rows; each row is then read consecutively
    indices = corpus_indices[0: batch_size * batch_len].reshape((batch_size, batch_len))
    epoch_size = (batch_len - 1) // num_steps
    for i in range(epoch_size):
        i = i * num_steps
        X = indices[:, i: i + num_steps]
        Y = indices[:, i + 1: i + num_steps + 1]
        yield X, Y
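The same kind of check (again on a hypothetical toy sequence of 0 to 29) shows that with consecutive sampling the mini-batches follow each other along the original sequence:
my_seq = list(range(30))
for X, Y in data_iter_consecutive(my_seq, batch_size=2, num_steps=6):
    print('X:', X, '\nY:', Y, '\n')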
Perplexity is used to evaluate how good a language model is. It is the value obtained by exponentiating the cross-entropy loss.
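As a minimal illustration (not part of the original notes): if the average cross-entropy loss per character is 2.5, the perplexity is exp(2.5) ≈ 12.2, i.e. the model is roughly as uncertain as choosing uniformly among about 12 characters; a model that predicted uniformly over the whole vocabulary would have perplexity equal to vocab_size.
import math
avg_cross_entropy = 2.5            # hypothetical average loss per character
perplexity = math.exp(avg_cross_entropy)
print(perplexity)                  # about 12.18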
Problem 1:
import math
import time
import zipfile

import mxnet as mx
from mxnet import autograd, gluon, init, nd
from mxnet.gluon import loss as gloss, nn, rnn

# Data preprocessing
def load_data_jay_lyrics():
    """Load the Jay Chou lyric data set (available in the Chinese book)."""
    with zipfile.ZipFile('jaychou_lyrics.txt.zip') as zin:
        with zin.open('jaychou_lyrics.txt') as f:
            corpus_chars = f.read().decode('utf-8')
    corpus_chars = corpus_chars.replace('\n', ' ').replace('\r', ' ')
    corpus_chars = corpus_chars[0:10000]
    idx_to_char = list(set(corpus_chars))
    char_to_idx = dict([(char, i) for i, char in enumerate(idx_to_char)])
    vocab_size = len(char_to_idx)
    corpus_indices = [char_to_idx[char] for char in corpus_chars]
    return corpus_indices, char_to_idx, idx_to_char, vocab_size
'''
idx_to_char: list mapping each index to its character;
char_to_idx: dictionary mapping each character (key) to its index (value);
corpus_indices: the corpus with every character replaced by its index.
'''
(corpus_indices, char_to_idx, idx_to_char, vocab_size) = load_data_jay_lyrics()
# Construct a recurrent neural network layer rnn_layer with a single hidden layer of 256 hidden units, and initialize its weights
num_hiddens = 256
rnn_layer = rnn.RNN(num_hiddens)
rnn_layer.initialize()
# Use begin_state to get the list of initial hidden states; it contains one element of shape (number of hidden layers, batch size, number of hidden units)
batch_size = 2
state = rnn_layer.begin_state(batch_size=batch_size)
print(state[0].shape) # (1, 2, 256)
num_steps = 35
X = nd.random.uniform(shape=(num_steps, batch_size, vocab_size))
Y, state_new = rnn_layer(X, state)
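As a quick shape check (added here for clarity): with the default layout of rnn.RNN, the output Y has shape (num_steps, batch_size, num_hiddens) and the returned state keeps the shape of the initial state:
print(Y.shape, len(state_new), state_new[0].shape)   # (35, 2, 256) 1 (1, 2, 256)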
# Define a complete recurrent neural network by subclassing Block: the input is first converted to one-hot vectors and fed into rnn_layer, then a fully connected output layer produces the output; the number of outputs equals the vocabulary size
class RNNModel(nn.Block):
    def __init__(self, rnn_layer, vocab_size, **kwargs):
        super(RNNModel, self).__init__(**kwargs)
        self.rnn = rnn_layer
        self.vocab_size = vocab_size
        self.dense = nn.Dense(vocab_size)

    def forward(self, inputs, state):
        # Transpose the input to (num_steps, batch_size) and take its one-hot representation
        X = nd.one_hot(inputs.T, self.vocab_size)
        Y, state = self.rnn(X, state)
        # The fully connected layer first reshapes Y to (num_steps * batch_size, num_hiddens)
        output = self.dense(Y.reshape((-1, Y.shape[-1])))
        return output, state

    def begin_state(self, *args, **kwargs):
        return self.rnn.begin_state(*args, **kwargs)
# Predict characters with the RNN
def predict_rnn_gluon(prefix, num_chars, model, vocab_size, ctx, idx_to_char,
                      char_to_idx):
    """Predict the next chars with a Gluon RNN model."""
    state = model.begin_state(batch_size=1, ctx=ctx)
    output = [char_to_idx[prefix[0]]]
    for t in range(num_chars + len(prefix) - 1):
        X = nd.array([output[-1]], ctx=ctx).reshape((1, 1))
        (Y, state) = model(X, state)
        if t < len(prefix) - 1:
            output.append(char_to_idx[prefix[t + 1]])
        else:
            output.append(int(Y.argmax(axis=1).asscalar()))
    return ''.join([idx_to_char[i] for i in output])
# Predict with a model whose weights are still random
ctx = mx.cpu()
model = RNNModel(rnn_layer, vocab_size)
model.initialize(force_reinit=True, ctx=ctx)
predict_rnn_gluon('分开', 10, model, vocab_size, ctx, idx_to_char, char_to_idx)
Implement the training function using consecutive sampling:
def grad_clipping(params, theta, ctx):
    """Clip the gradient."""
    if theta is not None:
        norm = nd.array([0], ctx)
        for param in params:
            norm += (param.grad ** 2).sum()
        norm = norm.sqrt().asscalar()
        if norm > theta:
            for param in params:
                param.grad[:] *= theta / norm
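A small sanity check of grad_clipping (a sketch with hypothetical toy parameters, not from the original notes): two parameters whose gradients have a combined L2 norm of 10 are scaled down so that the norm becomes theta = 5:
a = nd.array([3.0, 4.0]); a.attach_grad()
b = nd.array([0.0]);      b.attach_grad()
with autograd.record():
    l = (a * a).sum() + (b * b).sum()
l.backward()                          # a.grad = [6, 8], b.grad = [0], global norm = 10
grad_clipping([a, b], theta=5.0, ctx=mx.cpu())
print(a.grad)                         # scaled by 5/10 -> [3. 4.]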
def train_and_predict_rnn_gluon(model, num_hiddens, vocab_size, ctx,
                                corpus_indices, idx_to_char, char_to_idx,
                                num_epochs, num_steps, lr, clipping_theta,
                                batch_size, pred_period, pred_len, prefixes):
    """Train a Gluon RNN model and predict the next item in the sequence."""
    loss = gloss.SoftmaxCrossEntropyLoss()
    model.initialize(ctx=ctx, force_reinit=True, init=init.Normal(0.01))
    trainer = gluon.Trainer(model.collect_params(), 'sgd',
                            {'learning_rate': lr, 'momentum': 0, 'wd': 0})
    for epoch in range(num_epochs):
        l_sum, n, start = 0.0, 0, time.time()
        data_iter = data_iter_consecutive(
            corpus_indices, batch_size, num_steps, ctx)
        state = model.begin_state(batch_size=batch_size, ctx=ctx)
        for X, Y in data_iter:
            # Detach the state from the computation graph of the previous mini-batch
            for s in state:
                s.detach()
            with autograd.record():
                (output, state) = model(X, state)
                y = Y.T.reshape((-1,))
                l = loss(output, y).mean()
            l.backward()
            # Clip the gradients before the parameter update
            params = [p.data() for p in model.collect_params().values()]
            grad_clipping(params, clipping_theta, ctx)
            trainer.step(1)  # the loss has already been averaged, so step size is 1
            l_sum += l.asscalar() * y.size
            n += y.size
        if (epoch + 1) % pred_period == 0:
            print('epoch %d, perplexity %f, time %.2f sec' % (
                epoch + 1, math.exp(l_sum / n), time.time() - start))
            for prefix in prefixes:
                print(' -', predict_rnn_gluon(
                    prefix, pred_len, model, vocab_size, ctx, idx_to_char,
                    char_to_idx))
# Train the model with the following hyperparameters
num_epochs, batch_size, lr, clipping_theta = 250, 32, 1e2, 1e-2
pred_period, pred_len, prefixes = 50, 50, ['分开', '不分开']
train_and_predict_rnn_gluon(model, num_hiddens, vocab_size, ctx, corpus_indices,
                            idx_to_char, char_to_idx, num_epochs, num_steps, lr,
                            clipping_theta, batch_size, pred_period, pred_len, prefixes)
Problem 2:
Problem 3: