Data preparation covers: 1. a word vocabulary class; 2. a character vocabulary class; 3. a batcher class that produces training batches of word-ids; 4. a batcher class that produces training batches of char-ids; 5. dataset classes that supply the language-model input. (The snippets below assume import numpy as np and from typing import List.)
From a vocabulary file, the word vocabulary class builds the two-way mapping between words and indices, i.e. _id_to_word (a list) and _word_to_id (a dict). We also need to add the special tokens <S>, </S> and <UNK> (sentence start, sentence end and unknown word, respectively). The main code is as follows:
def __init__(self, filename, validate_file=False):
    '''
    filename = the vocabulary file. It is a flat text file with one
    (normalized) token per line. In addition, the file should also
    contain the special tokens <S>, </S>, <UNK> (case sensitive).
    '''
    self._id_to_word = []
    self._word_to_id = {}
    self._unk = -1
    self._bos = -1
    self._eos = -1
    with open(filename) as f:
        idx = 0
        for line in f:  # one token per line in the vocabulary file
            word_name = line.strip()
            if word_name == '<S>':
                self._bos = idx
            elif word_name == '</S>':
                self._eos = idx
            elif word_name == '<UNK>':
                self._unk = idx
            if word_name == '!!!MAXTERMID':
                continue
            self._id_to_word.append(word_name)
            self._word_to_id[word_name] = idx
            idx += 1
    # check to ensure file has special tokens
    if validate_file:
        if self._bos == -1 or self._eos == -1 or self._unk == -1:
            raise ValueError("Ensure the vocabulary file has "
                             "<S>, </S>, <UNK> tokens")
The class also provides two very handy methods: encode and decode. encode converts a sentence into a list of word-ids, adding the sentence-start and sentence-end tokens; it also has a reverse option, used for the backward direction of the bidirectional LSTM. decode converts a list of word-ids back into the corresponding words.
def encode(self, sentence, reverse=False, split=True):
    """Convert a sentence to a list of ids, with special tokens added.
    Sentence is a single string with tokens separated by whitespace.
    If reverse, then the sentence is assumed to be reversed, and
    this method will swap the BOS/EOS tokens appropriately.
    Converts a sentence into a sequence of ids, with support for
    reversing the sentence for the backward LM.
    """
    if split:
        word_ids = [
            self.word_to_id(cur_word) for cur_word in sentence.split()
        ]
    else:
        word_ids = [self.word_to_id(cur_word) for cur_word in sentence]
    if reverse:
        # for the reversed sentence, prepend </S> and append <S>
        return np.array([self.eos] + word_ids + [self.bos], dtype=np.int32)
    else:
        # prepend <S> and append </S> to every sentence
        return np.array([self.bos] + word_ids + [self.eos], dtype=np.int32)

def decode(self, cur_ids):
    """Convert a list of ids to a sentence, with space inserted.
    Converts a sequence of ids back into a sequence of words.
    """
    return ' '.join([self.id_to_word(cur_id) for cur_id in cur_ids])
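As a quick illustration, here is a minimal sketch of how this class can be used, assuming it is importable as bilm.data.Vocabulary (as in the bilm-tf repository); the vocab.txt file and the exact id values are only for illustration.

from bilm.data import Vocabulary

# a toy vocabulary file: one token per line, including the special tokens
with open('vocab.txt', 'w') as f:
    f.write('\n'.join(['<S>', '</S>', '<UNK>', 'the', 'first', 'sentence', '.']))

vocab = Vocabulary('vocab.txt', validate_file=True)

ids = vocab.encode('the first sentence .')
print(ids)                # [0 3 4 5 6 1]  i.e. <S> ... </S>
print(vocab.decode(ids))  # '<S> the first sentence . </S>'

# reversed encoding for the backward LM: </S> ... <S>
print(vocab.encode('the first sentence .', reverse=True))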
Note that this class is a subclass of the word vocabulary class Vocabulary above, which means the character class inherits all of Vocabulary's attributes and methods!
Each character's id is simply its UTF-8 byte value, which directly gives the mapping between ids and characters; since UTF-8 bytes are used, the number of possible ids in the char vocabulary is limited to 256. We also need 5 extra special characters: sentence start, sentence end, word start, word end and padding. The code that builds the character vocabulary's _word_char_ids from the vocabulary file is:
# convert a word into its char_ids array
def _convert_word_to_char_ids(self, word):
    code = np.zeros([self.max_word_length], dtype=np.int32)
    code[:] = self.pad_char
    # encode each character of the word as its utf-8 byte value and store
    # them in the array, e.g. for 'english':
    # e:101, n:110, g:103, l:108, i:105, s:115, h:104
    word_encoded = word.encode('utf-8', 'ignore')[:(self.max_word_length-2)]
    code[0] = self.bow_char  # add the begin-of-word / end-of-word markers
    for k, chr_id in enumerate(word_encoded, start=1):
        code[k] = chr_id
    code[k + 1] = self.eow_char
    return code
def __init__(self, filename, max_word_length, **kwargs):
    # call the parent class Vocabulary to build the word <-> id mappings
    super(UnicodeCharsVocabulary, self).__init__(filename, **kwargs)
    self._max_word_length = max_word_length  # max number of chars per word

    # char ids 0-255 come from utf-8 encoding bytes
    # assign 256-300 to special chars
    self.bos_char = 256  # <begin sentence>
    self.eos_char = 257  # <end sentence>
    self.bow_char = 258  # <begin word>
    self.eow_char = 259  # <end word>
    self.pad_char = 260  # <padding>

    num_words = len(self._id_to_word)  # vocabulary size, from the parent class
    # every word maps to an array of char_ids
    self._word_char_ids = np.zeros([num_words, max_word_length],
                                   dtype=np.int32)

    # the character representation of the begin/end of sentence characters
    # i.e. a char-level representation for the <S> and </S> tokens
    def _make_bos_eos(c):
        r = np.zeros([self.max_word_length], dtype=np.int32)
        r[:] = self.pad_char
        r[0] = self.bow_char  # begin of word
        r[1] = c
        r[2] = self.eow_char  # end of word
        return r
    self.bos_chars = _make_bos_eos(self.bos_char)  # char_ids for sentence start
    self.eos_chars = _make_bos_eos(self.eos_char)  # char_ids for sentence end

    for i, word in enumerate(self._id_to_word):  # compute char_ids for every word in _id_to_word
        self._word_char_ids[i] = self._convert_word_to_char_ids(word)

    self._word_char_ids[self.bos] = self.bos_chars  # treat <S> and </S> like ordinary words
    self._word_char_ids[self.eos] = self.eos_chars
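A small sketch of what this char-id encoding looks like, assuming UnicodeCharsVocabulary is importable from bilm.data and reusing the vocab.txt file from the earlier sketch (max_word_length=10 is chosen only to keep the printout short):

from bilm.data import UnicodeCharsVocabulary

char_vocab = UnicodeCharsVocabulary('vocab.txt', max_word_length=10)

# 258 (<begin word>), the utf-8 bytes of 'english', 259 (<end word>),
# then 260 (<padding>) up to max_word_length
print(char_vocab.word_to_char_ids('english'))
# [258 101 110 103 108 105 115 104 259 260]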
With the two functions above we obtain the char-id sequence (char-ids) for every word, including the char-id representations of the sentence-start and sentence-end tokens.
The class also converts a whole sentence into the corresponding char-ids array: it first looks up the _word_char_ids table to get each word's char_ids, then stacks them for the sentence, returning a 2D array. The implementation is:
# return the char_ids array for a word
def word_to_char_ids(self, word):
    if word in self._word_to_id:
        return self._word_char_ids[self._word_to_id[word]]
    else:
        return self._convert_word_to_char_ids(word)

def encode_chars(self, sentence, reverse=False, split=True):
    '''
    Encode the sentence as a white space delimited string of tokens.
    Encodes a whole sentence into char ids.
    '''
    if split:  # the sentence still needs to be split into tokens
        chars_ids = [self.word_to_char_ids(cur_word)
                     for cur_word in sentence.split()]
    else:
        chars_ids = [self.word_to_char_ids(cur_word)
                     for cur_word in sentence]
    if reverse:
        # for the reversed sentence, prepend </S> and append <S>
        return np.vstack([self.eos_chars] + chars_ids + [self.bos_chars])
    else:
        # prepend <S> and append </S> to every sentence
        return np.vstack([self.bos_chars] + chars_ids + [self.eos_chars])
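Continuing the char_vocab sketch above, encode_chars stacks the per-word arrays into one 2D array per sentence:

chars = char_vocab.encode_chars('the first sentence .')
print(chars.shape)  # (6, 10): <S>, 4 tokens, </S>, each row max_word_length wide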
The next class (TokenBatcher in bilm-tf) converts a batch of sentences into their word-ids representation. The main code is:
def batch_sentences(self, sentences: List[List[str]]):
    '''
    Batch the sentences as character ids
    (note: despite the copied docstring, this method produces word ids)
    Each sentence is a list of tokens without <s> or </s>, e.g.
    [['The', 'first', 'sentence', '.'], ['Second', '.']]
    '''
    n_sentences = len(sentences)
    max_length = max(len(sentence) for sentence in sentences) + 2
    # word_ids is 2D: [batch_size, max_len]
    X_ids = np.zeros((n_sentences, max_length), dtype=np.int64)

    for k, sent in enumerate(sentences):
        length = len(sent) + 2
        ids_without_mask = self._lm_vocab.encode(sent, split=False)
        # add one so that 0 is the mask value
        X_ids[k, :length] = ids_without_mask + 1
    return X_ids
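A sketch of using this batcher, assuming the bilm-tf export from bilm import TokenBatcher and the vocab.txt written in the earlier sketch; the shapes are illustrative.

from bilm import TokenBatcher

batcher = TokenBatcher('vocab.txt')
X_ids = batcher.batch_sentences([['The', 'first', 'sentence', '.'], ['Second', '.']])
print(X_ids.shape)  # (2, 6): batch_size x (longest sentence + 2 for <S>/</S>)
print(X_ids[1])     # shorter sentences are padded with 0, the mask value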
Similar to the above, except that here (the Batcher class) the batch of sentences is converted into its char-ids representation, giving a 3D array. The main code is:
def batch_sentences(self, sentences: List[List[str]]):
    '''
    Batch the sentences as character ids
    Each sentence is a list of tokens without <s> or </s>, e.g.
    [['The', 'first', 'sentence', '.'], ['Second', '.']]
    '''
    n_sentences = len(sentences)  # number of sentences
    # longest sentence, plus 2 for the sentence-start and sentence-end tokens
    max_length = max(len(sentence) for sentence in sentences) + 2

    # 3D array: for every sentence, the char_ids array of every token
    X_char_ids = np.zeros(
        (n_sentences, max_length, self._max_token_length),
        dtype=np.int64
    )

    # iterate over the sentences
    for k, sent in enumerate(sentences):
        length = len(sent) + 2
        # get the char_ids array for this sentence
        char_ids_without_mask = self._lm_vocab.encode_chars(
            sent, split=False)
        # add one so that 0 is the mask value; unfilled positions stay 0
        X_char_ids[k, :length, :] = char_ids_without_mask + 1
    return X_char_ids
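And the character-level counterpart, assuming from bilm import Batcher works as in bilm-tf; 50 is the per-token character limit commonly used with ELMo.

from bilm import Batcher

char_batcher = Batcher('vocab.txt', 50)
X_char_ids = char_batcher.batch_sentences(
    [['The', 'first', 'sentence', '.'], ['Second', '.']])
print(X_char_ids.shape)  # (2, 6, 50): batch x (max tokens + 2) x max chars per token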
Next, a function is defined that produces batches of training data. Each call reads one batch from the input; each row of the batch holds a window of text together with its word-ids representation, its char-ids representation and its targets (the next word to be predicted at each position). The function takes a generator that yields one sentence at a time (its word-ids and char-ids), so a batch is built simply by repeatedly pulling sentences from this generator. The code is as follows:
def _get_batch(generator, batch_size, num_steps, max_word_length):
    """Read batches of input."""
    cur_stream = [None] * batch_size  # one sentence stream per batch row
    no_more_data = False
    while True:
        inputs = np.zeros([batch_size, num_steps], np.int32)  # word_ids of the batch
        if max_word_length is not None:
            # char_ids of every word in every row of the batch
            char_inputs = np.zeros([batch_size, num_steps, max_word_length],
                                   np.int32)
        else:
            char_inputs = None
        # the LM is trained to predict the next word, so the targets are the
        # inputs shifted right by one word
        targets = np.zeros([batch_size, num_steps], np.int32)

        for i in range(batch_size):  # fill one row of the batch at a time
            cur_pos = 0  # position within the num_steps window
            while cur_pos < num_steps:
                # when the current stream is (nearly) exhausted, pull the next
                # sentence from the generator
                if cur_stream[i] is None or len(cur_stream[i][0]) <= 1:
                    try:
                        cur_stream[i] = list(next(generator))  # one sentence per call
                    except StopIteration:
                        # No more data, exhaust current streams and quit
                        no_more_data = True
                        break

                # cur_stream[i][0] holds the word_ids and cur_stream[i][1] the
                # char_ids; num_steps is the window size, so each pass below
                # copies at most one window of data
                how_many = min(len(cur_stream[i][0]) - 1, num_steps - cur_pos)
                next_pos = cur_pos + how_many

                inputs[i, cur_pos:next_pos] = cur_stream[i][0][:how_many]
                if max_word_length is not None:
                    char_inputs[i, cur_pos:next_pos] = cur_stream[i][1][
                                                                    :how_many]
                targets[i, cur_pos:next_pos] = cur_stream[i][0][1:how_many+1]  # the next word is the target

                cur_pos = next_pos

                # advance the stream past the portion just consumed
                cur_stream[i][0] = cur_stream[i][0][how_many:]
                if max_word_length is not None:
                    cur_stream[i][1] = cur_stream[i][1][how_many:]

        if no_more_data:
            # There is no more data. Note: this will not return data
            # for the incomplete batch
            break

        X = {'token_ids': inputs, 'tokens_characters': char_inputs,
             'next_token_id': targets}

        yield X
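To make the windowing concrete, here is a toy sketch that drives _get_batch with a hand-written generator; like LMDataset.get_sentence() it yields (word_ids, char_ids) tuples, with char_ids set to None since max_word_length=None here. The id values are made up.

import numpy as np

def toy_sentences():
    # two fake sentences, already encoded as <S> ... </S> word ids
    for ids in ([0, 5, 6, 7, 1], [0, 8, 9, 1]):
        yield (np.array(ids, dtype=np.int32), None)

batches = _get_batch(toy_sentences(), batch_size=1, num_steps=4,
                     max_word_length=None)
X = next(batches)
print(X['token_ids'])      # [[0 5 6 7]]  one window of word ids
print(X['next_token_id'])  # [[5 6 7 1]]  the same window shifted by one word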
The dataset class supplies the input data for language-model training. It picks a file at random from the list of data files (the data is spread across many shard files, not just one), loads that whole shard into memory, exposes a sentence generator, and then calls the _get_batch() function defined above to produce one batch at a time. The core implementation is:
def get_sentence(self):
    """A generator that yields one sentence at a time."""
    while True:
        if self._i == self._nids:
            # current shard exhausted, load a new random shard
            self._ids = self._load_random_shard()
        ret = self._ids[self._i]  # one sentence's (word_ids, char_ids)
        self._i += 1
        yield ret

def iter_batches(self, batch_size, num_steps):
    """An iterator that yields one batch of data at a time."""
    for X in _get_batch(self.get_sentence(), batch_size, num_steps,
                        self.max_word_length):
        # token_ids = (batch_size, num_steps)
        # char_inputs = (batch_size, num_steps, 50) of character ids
        # targets = word ID of next word (batch_size, num_steps)
        yield X
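Putting it together, a sketch of driving LMDataset; the file pattern data/train/*.txt is a hypothetical path to training shards (one sentence per line), and char_vocab is a UnicodeCharsVocabulary as above.

from bilm.data import LMDataset

dataset = LMDataset('data/train/*.txt', char_vocab, shuffle_on_load=True)
for X in dataset.iter_batches(batch_size=128, num_steps=20):
    print(X['token_ids'].shape)          # (128, 20)
    print(X['tokens_characters'].shape)  # (128, 20, max_word_length)
    print(X['next_token_id'].shape)      # (128, 20)
    break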
The above only provides input for an ordinary (forward) language model. To build the bidirectional LSTM we also need to reverse the data to obtain the input for the backward LSTM, which is what the BidirectionalLMDataset class provides. Its core code is:
def __init__(self, filepattern, vocab, test=False, shuffle_on_load=False):
    '''
    bidirectional version of LMDataset
    The forward LSTM reads the data as-is; for the backward LSTM the
    data simply needs to be reversed.
    '''
    self._data_forward = LMDataset(  # forward dataset
        filepattern, vocab, reverse=False, test=test,
        shuffle_on_load=shuffle_on_load)
    self._data_reverse = LMDataset(  # backward (reversed) dataset
        filepattern, vocab, reverse=True, test=test,
        shuffle_on_load=shuffle_on_load)

def iter_batches(self, batch_size, num_steps):
    """Merge the forward and reversed batches into a single dict."""
    max_word_length = self._data_forward.max_word_length

    for X, Xr in zip(
        _get_batch(self._data_forward.get_sentence(), batch_size,
                   num_steps, max_word_length),
        _get_batch(self._data_reverse.get_sentence(), batch_size,
                   num_steps, max_word_length)
    ):
        # copy the reversed batch into X, producing token_ids_reverse,
        # tokens_characters_reverse, next_token_id_reverse
        for k, v in Xr.items():
            X[k + '_reverse'] = v

        yield X
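A final sketch of the bidirectional dataset under the same assumptions as the LMDataset sketch; each batch now carries both the forward arrays and their '_reverse' counterparts.

from bilm.data import BidirectionalLMDataset

bidata = BidirectionalLMDataset('data/train/*.txt', char_vocab)
for X in bidata.iter_batches(batch_size=128, num_steps=20):
    print(sorted(X.keys()))
    # ['next_token_id', 'next_token_id_reverse', 'token_ids',
    #  'token_ids_reverse', 'tokens_characters', 'tokens_characters_reverse']
    break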