ELMo Code Practice

Building the word embedding
  In the ELMo language model, both the word embedding and the word_char embedding are trained and updated jointly with the model itself!
  For the word embedding, it is enough to define one variable of the same dimensionality for every word in the vocabulary, run an embedding_lookup over the input word ids, and feed the result into the language model so that it is trained along with it. The corresponding code is as follows:

def _build_word_embeddings(self):
    n_tokens_vocab = self.options['n_tokens_vocab']
    batch_size = self.options['batch_size']
    unroll_steps = self.options['unroll_steps']

    # LSTM options
    projection_dim = self.options['lstm']['projection_dim']

    # the input token_ids and word embeddings
    # token ids for each sentence; the embedding table is defined below
    self.token_ids = tf.placeholder(DTYPE_INT,
                           shape=(batch_size, unroll_steps),
                           name='token_ids')
    # the word embeddings
    # map the sentence's word ids to their embedding vectors
    with tf.device("/cpu:0"):
        self.embedding_weights = tf.get_variable(
            "embedding", [n_tokens_vocab, projection_dim],
            dtype=DTYPE,
        )
        self.embedding = tf.nn.embedding_lookup(self.embedding_weights,
                                            self.token_ids)

    # if a bidirectional LM, also make placeholders for the reverse
    # model and embeddings
    if self.bidirectional:
        self.token_ids_reverse = tf.placeholder(DTYPE_INT,
                           shape=(batch_size, unroll_steps),
                           name='token_ids_reverse')
        with tf.device("/cpu:0"):
            self.embedding_reverse = tf.nn.embedding_lookup(
                self.embedding_weights, self.token_ids_reverse)
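
As a quick standalone check of what the embedding_lookup above produces, here is a minimal sketch with made-up sizes (illustrative only, not the real ELMo configuration or the actual placeholder-based training path):

import numpy as np
import tensorflow as tf

# Hypothetical sizes, purely for illustration.
n_tokens_vocab, projection_dim = 1000, 8
batch_size, unroll_steps = 2, 5

embedding_weights = tf.random.uniform([n_tokens_vocab, projection_dim])
token_ids = np.random.randint(0, n_tokens_vocab, size=(batch_size, unroll_steps))

# One projection_dim-sized vector is gathered for every token position.
embedding = tf.nn.embedding_lookup(embedding_weights, token_ids)
print(embedding.shape)  # (2, 5, 8)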

def _build_word_char_embeddings(self):
'''
options contains key 'char_cnn': {

'n_characters': 60,                                                              # total number of characters in the character vocabulary

# includes the start / end characters
'max_characters_per_token': 17,                                                  # maximum number of characters per token

'filters': [                                                                     # list of convolution filters to use
    [1, 32],                                                                     # 32 filters of width 1
    [2, 32],
    [3, 64],
    [4, 128],
    [5, 256],
    [6, 512],
    [7, 512]
],
'activation': 'tanh',

# for the character embedding (dimensionality of each character's vector)
'embedding': {'dim': 16}

# for highway layers
# if omitted, then no highway layers
'n_highway': 2,
}
'''
batch_size = self.options['batch_size']
unroll_steps = self.options['unroll_steps']                                      # number of unrolled time steps (tokens per sequence)
projection_dim = self.options['lstm']['projection_dim']                          # final output dimensionality (projection layer size)

cnn_options = self.options['char_cnn']
filters = cnn_options['filters']                                                 # list of [width, number] convolution filters
n_filters = sum(f[1] for f in filters)                                           # total number of filters: 32+32+64+128+256+512+512 = 1536
max_chars = cnn_options['max_characters_per_token']                              # maximum number of characters per token
char_embed_dim = cnn_options['embedding']['dim']
n_chars = cnn_options['n_characters']                                            # number of characters in the character vocabulary
if cnn_options['activation'] == 'tanh':
    activation = tf.nn.tanh
elif cnn_options['activation'] == 'relu':
    activation = tf.nn.relu

# the input character ids 
self.tokens_characters = tf.placeholder(DTYPE_INT,                               # character-id inputs
                           shape=(batch_size, unroll_steps, max_chars),
                           name='tokens_characters')
# the character embeddings
with tf.device("/cpu:0"):
    self.embedding_weights = tf.get_variable(                                    # embedding table with one row per character
            "char_embed", [n_chars, char_embed_dim],
            dtype=DTYPE,
            initializer=tf.random_uniform_initializer(-1.0, 1.0)
    )
    # shape (batch_size, unroll_steps, max_chars, embed_dim)
    self.char_embedding = tf.nn.embedding_lookup(self.embedding_weights,
                                            self.tokens_characters)

    if self.bidirectional:
        self.tokens_characters_reverse = tf.placeholder(DTYPE_INT,               # reversed inputs for the backward direction
                           shape=(batch_size, unroll_steps, max_chars),
                           name='tokens_characters_reverse')
        self.char_embedding_reverse = tf.nn.embedding_lookup(
            self.embedding_weights, self.tokens_characters_reverse)


# the convolutions
# define the convolution operations over each token's characters
def make_convolutions(inp, reuse):
    with tf.variable_scope('CNN', reuse=reuse) as scope:
        convolutions = []
        for i, (width, num) in enumerate(filters):                               # one convolution per filter width
            if cnn_options['activation'] == 'relu':
                # He initialization for ReLU activation
                # with char embeddings init between -1 and 1
                #w_init = tf.random_normal_initializer(
                #    mean=0.0,
                #    stddev=np.sqrt(2.0 / (width * char_embed_dim))
                #)

                # Kim et al 2015, +/- 0.05
                w_init = tf.random_uniform_initializer(
                    minval=-0.05, maxval=0.05)
            elif cnn_options['activation'] == 'tanh':
                # glorot init
                w_init = tf.random_normal_initializer(
                    mean=0.0,
                    stddev=np.sqrt(1.0 / (width * char_embed_dim))
                )
            w = tf.get_variable(                                                 # effectively a 1-D convolution over the character positions
                "W_cnn_%s" % i,
                [1, width, char_embed_dim, num],                                 # height, width, in_channels, out_channels
                initializer=w_init,
                dtype=DTYPE)
            b = tf.get_variable(                                                 # one bias per output channel
                "b_cnn_%s" % i, [num], dtype=DTYPE,
                initializer=tf.constant_initializer(0.0))

            conv = tf.nn.conv2d(                                                 # slide the filter across the characters, left to right
                    inp, w,
                    strides=[1, 1, 1, 1],
                    padding="VALID") + b
            # now max pool over all character positions of each token
            conv = tf.nn.max_pool(
                    conv, [1, 1, max_chars-width+1, 1],             
                    [1, 1, 1, 1], 'VALID')

            # activation
            conv = activation(conv)
            conv = tf.squeeze(conv, squeeze_dims=[2])

            convolutions.append(conv)

    return tf.concat(convolutions, 2)                                            # concatenate all filter outputs; possible because max pooling reduced the character axis to 1 for every width
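
# [Illustrative note, not in the original code] Shape walk-through for one
# filter, using the docstring's sizes (max_chars=17, char_embed_dim=16) and
# the [3, 64] entry of 'filters':
#   char_embedding : (batch_size, unroll_steps, 17, 16)
#   after conv2d   : (batch_size, unroll_steps, 17 - 3 + 1, 64) = (.., .., 15, 64)
#   after max_pool : (batch_size, unroll_steps, 1, 64)
#   after squeeze  : (batch_size, unroll_steps, 64)
# Concatenating all seven filter outputs along the last axis then gives
# (batch_size, unroll_steps, 32+32+64+128+256+512+512) = (.., .., 1536).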

# for first model, this is False, for others it's True
reuse = tf.get_variable_scope().reuse
embedding = make_convolutions(self.char_embedding, reuse)

self.token_embedding_layers = [embedding]

if self.bidirectional:
    # re-use the CNN weights from forward pass
    embedding_reverse = make_convolutions(
        self.char_embedding_reverse, True)

# for highway and projection layers:
# the projection only makes sense when n_filters > projection_dim
#   reshape from (batch_size, n_tokens, dim) to
#   (batch_size * n_tokens, dim)
n_highway = cnn_options.get('n_highway')
use_highway = n_highway is not None and n_highway > 0
use_proj = n_filters != projection_dim                                           # a projection is only needed when the two dimensions differ

if use_highway or use_proj:
    embedding = tf.reshape(embedding, [-1, n_filters])
    if self.bidirectional:
        embedding_reverse = tf.reshape(embedding_reverse,
            [-1, n_filters])

# set up weights for projection
if use_proj:
    assert n_filters > projection_dim
    with tf.variable_scope('CNN_proj') as scope:
            W_proj_cnn = tf.get_variable(
                "W_proj", [n_filters, projection_dim],
                initializer=tf.random_normal_initializer(
                    mean=0.0, stddev=np.sqrt(1.0 / n_filters)),
                dtype=DTYPE)
            b_proj_cnn = tf.get_variable(
                "b_proj", [projection_dim],
                initializer=tf.constant_initializer(0.0),
                dtype=DTYPE)

# apply highway layers
# (see the standard highway network formulation)
def high(x, ww_carry, bb_carry, ww_tr, bb_tr):
    carry_gate = tf.nn.sigmoid(tf.matmul(x, ww_carry) + bb_carry)
    transform_gate = tf.nn.relu(tf.matmul(x, ww_tr) + bb_tr)
    return carry_gate * transform_gate + (1.0 - carry_gate) * x
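
# [Illustrative note, not in the original code] high() computes
#     g = sigmoid(x @ ww_carry + bb_carry)
#     y = g * relu(x @ ww_tr + bb_tr) + (1 - g) * x
# so, despite its name, carry_gate weights the transformed signal. The -2.0
# initializer for b_carry below keeps g small at the start of training, so
# each highway layer initially behaves close to an identity mapping.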

if use_highway:                                                                  # apply the highway layers first
    highway_dim = n_filters

    for i in range(n_highway):
        with tf.variable_scope('CNN_high_%s' % i) as scope:
            W_carry = tf.get_variable(                                           # all variables here are created via get_variable
                'W_carry', [highway_dim, highway_dim],
                # glorot init
                initializer=tf.random_normal_initializer(
                    mean=0.0, stddev=np.sqrt(1.0 / highway_dim)),
                dtype=DTYPE)
            b_carry = tf.get_variable(
                'b_carry', [highway_dim],
                initializer=tf.constant_initializer(-2.0),
                dtype=DTYPE)
            W_transform = tf.get_variable(
                'W_transform', [highway_dim, highway_dim],
                initializer=tf.random_normal_initializer(
                    mean=0.0, stddev=np.sqrt(1.0 / highway_dim)),
                dtype=DTYPE)
            b_transform = tf.get_variable(
                'b_transform', [highway_dim],
                initializer=tf.constant_initializer(0.0),
                dtype=DTYPE)

        embedding = high(embedding, W_carry, b_carry,
                         W_transform, b_transform)
        if self.bidirectional:
            embedding_reverse = high(embedding_reverse,
                                     W_carry, b_carry,
                                     W_transform, b_transform)
        self.token_embedding_layers.append(
            tf.reshape(embedding, 
                [batch_size, unroll_steps, highway_dim])
        )

# finally project down to projection_dim if needed
if use_proj:                                                                     
    embedding = tf.matmul(embedding, W_proj_cnn) + b_proj_cnn
    if self.bidirectional:
        embedding_reverse = tf.matmul(embedding_reverse, W_proj_cnn) \
            + b_proj_cnn
    self.token_embedding_layers.append(
        tf.reshape(embedding,
                [batch_size, unroll_steps, projection_dim])
    )

# reshape back to (batch_size, tokens, dim)
if use_highway or use_proj:
    shp = [batch_size, unroll_steps, projection_dim]
    embedding = tf.reshape(embedding, shp)
    if self.bidirectional:
        embedding_reverse = tf.reshape(embedding_reverse, shp)

# at last assign attributes for remainder of the model
self.embedding = embedding                                                       # the final token embeddings used by the rest of the model
if self.bidirectional:
    self.embedding_reverse = embedding_reverse
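
For reference, a hypothetical options dict driving _build_word_char_embeddings could look like the following; the char_cnn values are copied from the docstring above, while batch_size, unroll_steps and projection_dim are assumptions made purely for illustration:

options = {
    'batch_size': 128,                     # assumed, not given in this snippet
    'unroll_steps': 20,                    # assumed, not given in this snippet
    'lstm': {'projection_dim': 512},       # assumed, not given in this snippet
    'char_cnn': {
        'n_characters': 60,
        'max_characters_per_token': 17,
        'filters': [[1, 32], [2, 32], [3, 64], [4, 128],
                    [5, 256], [6, 512], [7, 512]],
        'activation': 'tanh',
        'embedding': {'dim': 16},
        'n_highway': 2,
    },
}

With these settings the char CNN yields n_filters = 1536 features per token, the two highway layers keep that dimensionality, and the final projection maps it down to projection_dim = 512 before the embeddings are handed to the biLSTM.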

Summary
ELMo performs remarkably well on many downstream NLP tasks, and I think this is tied to its focus on producing better word-level embeddings. Some of the other methods introduced earlier take a different route: Quick-Thoughts likewise uses a language model as its sentence encoder, and InferSent uses a biLSTM as its encoder. Compared with ELMo, they are far more ambitious: they offer sentence-embedding solutions for downstream NLP tasks, where you plug in their pretrained encoder directly and the final prediction is nothing more than a classifier with a softmax on top.
By comparison, ELMo is much simpler: it only offers a word-level solution, using its pretrained biLM to produce word embeddings that are then fed to the downstream model. That model is usually a sequence model whose effectiveness has already been validated on the corresponding NLP task. Now reinforced by word embeddings that capture syntax, semantics, and context, it is no surprise the results get better. Not to mention that ELMo also carefully fine-tunes for another round on the task corpus, which further ensures adaptation to the new domain.
