The model code consists of three main parts: 1. building the word embedding; 2. preparing the word_char embedding; 3. the language model itself (a bidirectional LSTM).
Note: in the ELMo language model, both the word embedding and the word_char embedding are trained and updated jointly with the rest of the model.
For the plain word embedding we only need to define a trainable variable of the same dimensionality for every word in the vocabulary, run embedding_lookup on the input word ids, and feed the result into the language model, so the embedding table is trained together with it. The corresponding code is shown below, followed by a brief usage sketch:
def _build_word_embeddings(self):
    n_tokens_vocab = self.options['n_tokens_vocab']
    batch_size = self.options['batch_size']
    unroll_steps = self.options['unroll_steps']

    # LSTM options
    projection_dim = self.options['lstm']['projection_dim']

    # the input token_ids and word embeddings
    # placeholder for the token ids of each sentence; the trainable embedding table is created below
    self.token_ids = tf.placeholder(DTYPE_INT,
                                    shape=(batch_size, unroll_steps),
                                    name='token_ids')
    # the word embeddings
    # look up the word ids in the embedding table to obtain the word vectors
    with tf.device("/cpu:0"):
        self.embedding_weights = tf.get_variable(
            "embedding", [n_tokens_vocab, projection_dim],
            dtype=DTYPE,
        )
        self.embedding = tf.nn.embedding_lookup(self.embedding_weights,
                                                self.token_ids)

    # if a bidirectional LM then make placeholders for reverse
    # model and embeddings
    # (the backward direction receives each sentence with its token order reversed)
    if self.bidirectional:
        self.token_ids_reverse = tf.placeholder(DTYPE_INT,
                                                shape=(batch_size, unroll_steps),
                                                name='token_ids_reverse')
        with tf.device("/cpu:0"):
            self.embedding_reverse = tf.nn.embedding_lookup(
                self.embedding_weights, self.token_ids_reverse)
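As a quick illustration of how these placeholders are consumed, here is a minimal feed-and-run sketch. It is not part of bilm-tf; the model variable, the batch size of 2, the sequence length of 3 and the int64 dtype are assumptions made only for this example.

import numpy as np
import tensorflow as tf

# `model` is assumed to be an instance of the language model class whose graph
# was built with batch_size=2 and unroll_steps=3 (hypothetical values).
feed_dict = {
    model.token_ids: np.array([[12, 7, 309],
                               [5, 88, 2]], dtype=np.int64),   # (batch_size, unroll_steps)
}
if model.bidirectional:
    # the same sentences with the token order reversed
    feed_dict[model.token_ids_reverse] = np.array([[309, 7, 12],
                                                   [2, 88, 5]], dtype=np.int64)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    emb = sess.run(model.embedding, feed_dict=feed_dict)
    print(emb.shape)   # (2, 3, projection_dim)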
Building the word_char embedding is considerably more involved: the word representation is obtained by running a CNN over character embeddings. The network is illustrated in the two figures below.
The first figure depicts training on a single sentence; in practice a whole batch of sentences is processed in parallel. A single sentence enters as a tensor of shape [n_token, max_char, char_dim], i.e. sentence length, maximum number of characters per word, and character embedding dimension. It is convolved with a kernel of shape [1, n_width, char_dim], that is, height 1, width n_width, and char_dim input channels, which slides over n_width characters at a time within each word (row by row). Each kernel produces a feature map of shape [n_token, max_char-n_width+1]; max-pooling over each row of that feature map then leaves a single value per token, so one kernel contributes a vector of length n_token. With m = n_filters kernels in total, concatenating the pooled outputs gives a [n_token, m] representation.
The second figure shows what happens after every word has its m = n_filters pooled features: they are passed through a multi-layer highway network for feature refinement, and finally through a projection layer that reduces the dimensionality from m to p = proj_dim. Both the highway layers and the projection layer are optional. A standalone shape sketch follows before the full code.
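The sizes in the sketch below are made up for illustration and the snippet is not the bilm-tf code; it only traces one convolution branch from character embeddings to pooled features, to make the shape bookkeeping above concrete.

import tensorflow as tf

# made-up sizes, for shape tracing only
n_tokens, max_chars, char_dim = 20, 50, 16   # sentence length, chars per token, char embedding dim
width, num = 3, 64                           # one [width, number] entry from the 'filters' list

# one sentence treated as a batch of size 1: (1, n_tokens, max_chars, char_dim)
chars = tf.random_uniform([1, n_tokens, max_chars, char_dim])

# kernel of height 1, width `width`, char_dim input channels and `num` output channels
w = tf.random_normal([1, width, char_dim, num])

conv = tf.nn.conv2d(chars, w, strides=[1, 1, 1, 1], padding='VALID')
# -> (1, n_tokens, max_chars - width + 1, num): one feature-map row per token

pooled = tf.nn.max_pool(conv, [1, 1, max_chars - width + 1, 1], [1, 1, 1, 1], 'VALID')
# -> (1, n_tokens, 1, num): maximum over the character positions of each token

features = tf.squeeze(pooled, axis=[2])
# -> (1, n_tokens, num): `num` features per token from this single filter width

Repeating this for every [width, number] entry and concatenating the results along the last axis gives a (1, n_tokens, n_filters) tensor, where n_filters = 32+32+64+128+256+512+512 = 1536 for the filter list used below.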
The full implementation is shown below:
def _build_word_char_embeddings(self):
    '''
    options contains key 'char_cnn': {
        'n_characters': 60,  # total number of characters in the character vocabulary
        # includes the start / end characters
        'max_characters_per_token': 17,  # maximum number of characters per token
        'filters': [  # list of convolution filters as [width, number]
            [1, 32],  # e.g. 32 filters of width n_width=1
            [2, 32],
            [3, 64],
            [4, 128],
            [5, 256],
            [6, 512],
            [7, 512]
        ],
        'activation': 'tanh',
        # for the character embedding: dimensionality of each character vector
        'embedding': {'dim': 16}
        # for highway layers
        # if omitted, then no highway layers
        'n_highway': 2,
    }
    '''
    batch_size = self.options['batch_size']
    unroll_steps = self.options['unroll_steps']              # number of time steps per sentence, i.e. n_token
    projection_dim = self.options['lstm']['projection_dim']  # final output dimensionality (projection layer size)

    cnn_options = self.options['char_cnn']
    filters = cnn_options['filters']                     # list of [width, number] filter specs
    n_filters = sum(f[1] for f in filters)               # total number of CNN features per token
    max_chars = cnn_options['max_characters_per_token']  # maximum number of characters per token
    char_embed_dim = cnn_options['embedding']['dim']
    n_chars = cnn_options['n_characters']                # size of the character vocabulary

    if cnn_options['activation'] == 'tanh':
        activation = tf.nn.tanh
    elif cnn_options['activation'] == 'relu':
        activation = tf.nn.relu

    # the input character ids
    self.tokens_characters = tf.placeholder(DTYPE_INT,
                                            shape=(batch_size, unroll_steps, max_chars),
                                            name='tokens_characters')
    # the character embeddings
    with tf.device("/cpu:0"):
        self.embedding_weights = tf.get_variable(        # embedding table with one row per character
            "char_embed", [n_chars, char_embed_dim],
            dtype=DTYPE,
            initializer=tf.random_uniform_initializer(-1.0, 1.0)
        )
        # shape (batch_size, unroll_steps, max_chars, embed_dim)
        self.char_embedding = tf.nn.embedding_lookup(self.embedding_weights,
                                                     self.tokens_characters)

        if self.bidirectional:
            self.tokens_characters_reverse = tf.placeholder(DTYPE_INT,  # reversed inputs for the backward LSTM
                                                            shape=(batch_size, unroll_steps, max_chars),
                                                            name='tokens_characters_reverse')
            self.char_embedding_reverse = tf.nn.embedding_lookup(
                self.embedding_weights, self.tokens_characters_reverse)
    # the convolutions
    # define one convolution + max-pool branch per filter width
    def make_convolutions(inp, reuse):
        with tf.variable_scope('CNN', reuse=reuse) as scope:
            convolutions = []
            for i, (width, num) in enumerate(filters):   # one branch per filter width
                if cnn_options['activation'] == 'relu':
                    # He initialization for ReLU activation
                    # with char embeddings init between -1 and 1
                    #w_init = tf.random_normal_initializer(
                    #    mean=0.0,
                    #    stddev=np.sqrt(2.0 / (width * char_embed_dim))
                    #)

                    # Kim et al 2015, +/- 0.05
                    w_init = tf.random_uniform_initializer(
                        minval=-0.05, maxval=0.05)
                elif cnn_options['activation'] == 'tanh':
                    # glorot init
                    w_init = tf.random_normal_initializer(
                        mean=0.0,
                        stddev=np.sqrt(1.0 / (width * char_embed_dim))
                    )
                w = tf.get_variable(   # effectively a 1-D convolution over the characters
                    "W_cnn_%s" % i,
                    [1, width, char_embed_dim, num],   # height, width, in_channels, out_channels
                    initializer=w_init,
                    dtype=DTYPE)
                b = tf.get_variable(   # one bias per output channel
                    "b_cnn_%s" % i, [num], dtype=DTYPE,
                    initializer=tf.constant_initializer(0.0))

                conv = tf.nn.conv2d(   # slide over the characters of each token, left to right
                    inp, w,
                    strides=[1, 1, 1, 1],
                    padding="VALID") + b
                # now max pool
                # max-pool over the remaining character positions of each token
                conv = tf.nn.max_pool(
                    conv, [1, 1, max_chars - width + 1, 1],
                    [1, 1, 1, 1], 'VALID')

                # activation
                conv = activation(conv)
                conv = tf.squeeze(conv, squeeze_dims=[2])

                convolutions.append(conv)

        # concatenate the outputs of all filters; this is possible because
        # max-pooling has collapsed the character dimension of every branch
        return tf.concat(convolutions, 2)

    # for first model, this is False, for others it's True
    reuse = tf.get_variable_scope().reuse
    embedding = make_convolutions(self.char_embedding, reuse)

    self.token_embedding_layers = [embedding]

    if self.bidirectional:
        # re-use the CNN weights from forward pass
        embedding_reverse = make_convolutions(
            self.char_embedding_reverse, True)
    # for highway and projection layers:
    # projecting only makes sense when n_filters > projection_dim
    #   reshape from (batch_size, n_tokens, dim) to (-1, dim)
    n_highway = cnn_options.get('n_highway')
    use_highway = n_highway is not None and n_highway > 0
    use_proj = n_filters != projection_dim   # a projection layer is only needed when the two dims differ

    if use_highway or use_proj:
        embedding = tf.reshape(embedding, [-1, n_filters])
        if self.bidirectional:
            embedding_reverse = tf.reshape(embedding_reverse,
                                           [-1, n_filters])

    # set up weights for projection
    if use_proj:
        assert n_filters > projection_dim
        with tf.variable_scope('CNN_proj') as scope:
            W_proj_cnn = tf.get_variable(
                "W_proj", [n_filters, projection_dim],
                initializer=tf.random_normal_initializer(
                    mean=0.0, stddev=np.sqrt(1.0 / n_filters)),
                dtype=DTYPE)
            b_proj_cnn = tf.get_variable(
                "b_proj", [projection_dim],
                initializer=tf.constant_initializer(0.0),
                dtype=DTYPE)

    # apply highways layers
    # highway layer: y = g * relu(W_tr x + b_tr) + (1 - g) * x, with gate g = sigmoid(W_carry x + b_carry)
    def high(x, ww_carry, bb_carry, ww_tr, bb_tr):
        carry_gate = tf.nn.sigmoid(tf.matmul(x, ww_carry) + bb_carry)
        transform_gate = tf.nn.relu(tf.matmul(x, ww_tr) + bb_tr)
        return carry_gate * transform_gate + (1.0 - carry_gate) * x
    if use_highway:   # apply the highway layers before the projection
        highway_dim = n_filters

        for i in range(n_highway):
            with tf.variable_scope('CNN_high_%s' % i) as scope:
                W_carry = tf.get_variable(   # all created with get_variable so they are trainable model weights
                    'W_carry', [highway_dim, highway_dim],
                    # glorot init
                    initializer=tf.random_normal_initializer(
                        mean=0.0, stddev=np.sqrt(1.0 / highway_dim)),
                    dtype=DTYPE)
                b_carry = tf.get_variable(
                    'b_carry', [highway_dim],
                    initializer=tf.constant_initializer(-2.0),
                    dtype=DTYPE)
                W_transform = tf.get_variable(
                    'W_transform', [highway_dim, highway_dim],
                    initializer=tf.random_normal_initializer(
                        mean=0.0, stddev=np.sqrt(1.0 / highway_dim)),
                    dtype=DTYPE)
                b_transform = tf.get_variable(
                    'b_transform', [highway_dim],
                    initializer=tf.constant_initializer(0.0),
                    dtype=DTYPE)

            embedding = high(embedding, W_carry, b_carry,
                             W_transform, b_transform)
            if self.bidirectional:
                embedding_reverse = high(embedding_reverse,
                                         W_carry, b_carry,
                                         W_transform, b_transform)
            self.token_embedding_layers.append(
                tf.reshape(embedding,
                           [batch_size, unroll_steps, highway_dim])
            )
    # finally project down to projection dim if needed
    if use_proj:
        embedding = tf.matmul(embedding, W_proj_cnn) + b_proj_cnn
        if self.bidirectional:
            embedding_reverse = tf.matmul(embedding_reverse, W_proj_cnn) \
                + b_proj_cnn
        self.token_embedding_layers.append(
            tf.reshape(embedding,
                       [batch_size, unroll_steps, projection_dim])
        )

    # reshape back to (batch_size, tokens, dim)
    if use_highway or use_proj:
        shp = [batch_size, unroll_steps, projection_dim]
        embedding = tf.reshape(embedding, shp)
        if self.bidirectional:
            embedding_reverse = tf.reshape(embedding_reverse, shp)

    # at last assign attributes for remainder of the model
    self.embedding = embedding   # the final token embeddings fed into the language model
    if self.bidirectional:
        self.embedding_reverse = embedding_reverse
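For context, here is a hedged sketch of how the two builder methods above are typically selected when the graph is assembled. The method name _build_embeddings and the dispatch condition are assumptions for illustration; the exact wiring in bilm-tf may differ.

def _build_embeddings(self):
    # assumption: character-aware embeddings are used whenever the options
    # contain a 'char_cnn' section, otherwise plain word embeddings
    if 'char_cnn' in self.options:
        self._build_word_char_embeddings()   # char CNN + highway + projection
    else:
        self._build_word_embeddings()        # one trainable vector per vocabulary word

Either way, the builder ends by assigning self.embedding (and self.embedding_reverse in the bidirectional case) with shape (batch_size, unroll_steps, projection_dim), which is exactly what the bidirectional LSTM language model consumes next.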