A core part of text classification is obtaining an accurate semantic representation of the text. In my earlier text-classification work I simply ran an LSTM or GRU over the input to get its word-level representations. While reading papers and GitHub projects, you will notice that top-conference papers often apply an attention mechanism when building the semantic vector of a text. Below I walk through several ways attention is used to obtain text representations for classification (this post will keep being updated as I read more papers).
def _attention(self, H):
    """
    Use an attention mechanism to obtain the sentence representation.
    """
    # Number of hidden units in the last LSTM layer
    hiddenSize = config.model.hiddenSizes[-1]

    # Initialize a trainable weight vector
    W = tf.Variable(tf.random_normal([hiddenSize], stddev=0.1))

    # Apply a non-linear activation to the Bi-LSTM outputs
    M = tf.tanh(H)

    # Multiply M by W. M is [batch_size, time_step, hidden_size]; reshape it to
    # [batch_size * time_step, hidden_size] before the matmul.
    # newM is [batch_size * time_step, 1]: each time step is reduced to a scalar score.
    newM = tf.matmul(tf.reshape(M, [-1, hiddenSize]), tf.reshape(W, [-1, 1]))

    # Reshape newM back to [batch_size, time_step]
    restoreM = tf.reshape(newM, [-1, config.sequenceLength])

    # Normalize the scores with softmax: [batch_size, time_step]
    self.alpha = tf.nn.softmax(restoreM)

    # Use alpha to take a weighted sum over H, directly as a matrix product
    r = tf.matmul(tf.transpose(H, [0, 2, 1]),
                  tf.reshape(self.alpha, [-1, config.sequenceLength, 1]))

    # Squeeze the 3-D tensor down to 2-D: sequeezeR = [batch_size, hidden_size]
    sequeezeR = tf.squeeze(r)

    sentenceRepren = tf.tanh(sequeezeR)

    # Optionally apply dropout to the attention output
    output = tf.nn.dropout(sentenceRepren, self.dropoutKeepProb)

    return output
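To make the shape bookkeeping above concrete, here is a minimal NumPy sketch of the same computation (sizes and names are illustrative, not from the original repo): the tanh-transformed Bi-LSTM outputs are scored against a weight vector, the scores are softmax-normalized into alpha, and alpha is used to take a weighted sum over the time steps.

import numpy as np

# Illustrative sizes; H stands in for the Bi-LSTM outputs.
batch_size, time_step, hidden_size = 2, 5, 8
H = np.random.randn(batch_size, time_step, hidden_size)
W = np.random.randn(hidden_size)                      # the trainable score vector

M = np.tanh(H)
scores = (M.reshape(-1, hidden_size) @ W).reshape(batch_size, time_step)
scores -= scores.max(axis=1, keepdims=True)           # for numerical stability
alpha = np.exp(scores) / np.exp(scores).sum(axis=1, keepdims=True)   # (batch_size, time_step)

r = np.einsum('bth,bt->bh', H, alpha)                 # weighted sum -> (batch_size, hidden_size)
sentence_repr = np.tanh(r)
print(alpha.shape, sentence_repr.shape)               # (2, 5) (2, 8)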
# Dot-product attention
import torch
import torch.nn.functional as F


def get_attention(q, k, v, attn_mask=None):
    """
    :param q, k: (batch, seq_len, d_k)
    :param v: (batch, seq_len, d_v)
    :param attn_mask: (batch, seq_len, seq_len), True at positions to be masked out
    :return: output (batch, seq_len, d_v), attn (batch, seq_len, seq_len)
    """
    # Similarity between every query and every key
    attn = torch.matmul(q, k.transpose(1, 2))
    if attn_mask is not None:
        # A large negative value at masked positions drives their softmax weight to ~0
        attn.data.masked_fill_(attn_mask, -1e10)
    attn = F.softmax(attn, dim=-1)
    # Weighted sum of the values
    output = torch.matmul(attn, v)
    return output, attn
# Get mask for attention
def get_attn_pad_mask(seq_q, seq_k):
    assert seq_q.dim() == 2 and seq_k.dim() == 2
    # Assumes the padding index Const.PAD is 0: the outer product of the id sequences
    # is 0 exactly where either the query or the key token is padding.
    pad_attn_mask = torch.matmul(seq_q.unsqueeze(2).float(), seq_k.unsqueeze(1).float())
    pad_attn_mask = pad_attn_mask.eq(Const.PAD)  # b_size x len_q x len_k
    return pad_attn_mask.to(seq_k.device)
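A minimal usage sketch for the two PyTorch functions above (it relies on get_attention as defined above and assumes the padding index is 0; the pad mask is built inline with the same outer-product trick as get_attn_pad_mask, and all tensors and sizes are illustrative):

import torch

batch_size, seq_len, d_model = 2, 6, 16
token_ids = torch.tensor([[5, 3, 9, 0, 0, 0],
                          [7, 2, 4, 8, 1, 0]])        # 0 marks padding
x = torch.randn(batch_size, seq_len, d_model)          # token representations

# Same logic as get_attn_pad_mask with PAD == 0: True where either position is padding
pad_mask = torch.matmul(token_ids.unsqueeze(2).float(),
                        token_ids.unsqueeze(1).float()).eq(0)

context, attn = get_attention(x, x, x, attn_mask=pad_mask)   # self-attention
print(context.shape, attn.shape)   # torch.Size([2, 6, 16]) torch.Size([2, 6, 6])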
def luong_attention(batch_size, target, condition, target_encoder_length, hidden_dim):
    # target: [batch, max_seq, hidden_dim]; condition: [batch, hidden_dim]
    batch_seq_embed_target = tf.reshape(target, [batch_size, target_encoder_length, hidden_dim])
    batch_embed_given = condition
    batch_seq_embed_given = tf.reshape(batch_embed_given, [batch_size, hidden_dim, 1])
    # Dot-product similarity between every encoder step and the condition vector
    dot = tf.matmul(batch_seq_embed_target, batch_seq_embed_given)  # [batch, max_seq, 1]
    norm_dot = tf.nn.softmax(dot, axis=1)  # normalize over the time-step axis
    # Weighted sum using the normalized similarities
    target_mul_norm = tf.multiply(batch_seq_embed_target, norm_dot)
    weighted_sum = tf.reduce_sum(target_mul_norm, axis=1)  # [batch, hidden_dim]
    return weighted_sum
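For comparison with the Bi-LSTM attention above, here is a NumPy sketch of what luong_attention computes: the scores come from a dot product with a given condition vector rather than from a learned weight vector (shapes and names are illustrative):

import numpy as np

batch_size, max_seq, hidden_dim = 2, 4, 8
target = np.random.randn(batch_size, max_seq, hidden_dim)     # encoder outputs
condition = np.random.randn(batch_size, hidden_dim)           # conditioning vector

scores = np.einsum('bth,bh->bt', target, condition)           # dot-product similarity
scores -= scores.max(axis=1, keepdims=True)                   # for numerical stability
weights = np.exp(scores) / np.exp(scores).sum(axis=1, keepdims=True)
weighted_sum = np.einsum('bth,bt->bh', target, weights)       # (batch, hidden_dim)
print(weighted_sum.shape)                                      # (2, 8)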
The Transformer obtains the semantic representation of a text without using any LSTM or GRU structure at all. Let's go through the code to see how the Transformer's attention mechanism is used for text classification.
The encoder's main function: encode
def encode(self, xs, training=True):
    '''
    Returns
    memory: encoder outputs. (N, T1, d_model)
    '''
    with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
        x, seqlens, sents1 = xs

        # embedding
        enc = tf.nn.embedding_lookup(self.embeddings, x)  # (N, T1, d_model)
        enc *= self.hp.d_model**0.5  # scale

        enc += positional_encoding(enc, self.hp.maxlen1)
        enc = tf.layers.dropout(enc, self.hp.dropout_rate, training=training)

        ## Blocks
        for i in range(self.hp.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # self-attention
                enc = multihead_attention(queries=enc,
                                          keys=enc,
                                          values=enc,
                                          num_heads=self.hp.num_heads,
                                          dropout_rate=self.hp.dropout_rate,
                                          training=training,
                                          causality=False)
                # feed forward
                enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])
    memory = enc
    return memory, sents1
multihead_attention
def multihead_attention(queries, keys, values,
                        num_heads=8,
                        dropout_rate=0,
                        training=True,
                        causality=False,
                        scope="multihead_attention"):
    '''Applies multihead attention. See 3.2.2
    queries: A 3d tensor with shape of [N, T_q, d_model].
    keys: A 3d tensor with shape of [N, T_k, d_model].
    values: A 3d tensor with shape of [N, T_k, d_model].
    num_heads: An int. Number of heads.
    dropout_rate: A floating point number.
    training: Boolean. Controller of mechanism for dropout.
    causality: Boolean. If true, units that reference the future are masked.
    scope: Optional scope for `variable_scope`.

    Returns
      A 3d tensor with shape of (N, T_q, C)
    '''
    d_model = queries.get_shape().as_list()[-1]
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        # Linear projections
        Q = tf.layers.dense(queries, d_model, use_bias=False)  # (N, T_q, d_model)
        K = tf.layers.dense(keys, d_model, use_bias=False)  # (N, T_k, d_model)
        V = tf.layers.dense(values, d_model, use_bias=False)  # (N, T_k, d_model)

        # Split and concat
        Q_ = tf.concat(tf.split(Q, num_heads, axis=2), axis=0)  # (h*N, T_q, d_model/h)
        K_ = tf.concat(tf.split(K, num_heads, axis=2), axis=0)  # (h*N, T_k, d_model/h)
        V_ = tf.concat(tf.split(V, num_heads, axis=2), axis=0)  # (h*N, T_k, d_model/h)

        # Attention
        outputs = scaled_dot_product_attention(Q_, K_, V_, causality, dropout_rate, training)

        # Restore shape
        outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2)  # (N, T_q, d_model)

        # Residual connection
        outputs += queries

        # Normalize
        outputs = ln(outputs)

    return outputs
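The only non-obvious step above is the "split and concat" bookkeeping that packs the heads into the batch dimension. A small NumPy sketch with made-up sizes shows that it maps (N, T, d_model) to (h*N, T, d_model/h), and that the inverse transform restores the original tensor:

import numpy as np

N, T, d_model, h = 2, 5, 16, 4
Q = np.random.randn(N, T, d_model)

# tf.concat(tf.split(Q, h, axis=2), axis=0): split the feature dim into h heads
# and stack the heads along the batch dim -> (h*N, T, d_model/h)
Q_ = np.concatenate(np.split(Q, h, axis=2), axis=0)
print(Q_.shape)                    # (8, 5, 4)

# The inverse restores (N, T, d_model) after attention
restored = np.concatenate(np.split(Q_, h, axis=0), axis=2)
print(np.allclose(restored, Q))    # True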
scaled_dot_product_attention
def scaled_dot_product_attention(Q, K, V,
                                 causality=False, dropout_rate=0.,
                                 training=True,
                                 scope="scaled_dot_product_attention"):
    '''See 3.2.1.
    Q: Packed queries. 3d tensor. [N, T_q, d_k].
    K: Packed keys. 3d tensor. [N, T_k, d_k].
    V: Packed values. 3d tensor. [N, T_k, d_v].
    causality: If True, applies masking for future blinding
    dropout_rate: A floating point number of [0, 1].
    training: boolean for controlling dropout
    scope: Optional scope for `variable_scope`.
    '''
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        d_k = Q.get_shape().as_list()[-1]

        # dot product
        outputs = tf.matmul(Q, tf.transpose(K, [0, 2, 1]))  # (N, T_q, T_k)

        # scale
        outputs /= d_k ** 0.5

        # key masking
        outputs = mask(outputs, Q, K, type="key")

        # causality or future blinding masking
        if causality:
            outputs = mask(outputs, type="future")

        # softmax
        outputs = tf.nn.softmax(outputs)
        attention = tf.transpose(outputs, [0, 2, 1])
        tf.summary.image("attention", tf.expand_dims(attention[:1], -1))

        # query masking
        outputs = mask(outputs, Q, K, type="query")

        # dropout
        outputs = tf.layers.dropout(outputs, rate=dropout_rate, training=training)

        # weighted sum (context vectors)
        outputs = tf.matmul(outputs, V)  # (N, T_q, d_v)

    return outputs
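Masks aside, the function above computes exactly the scaled dot-product attention of Section 3.2.1 of "Attention Is All You Need":

Attention(Q, K, V) = softmax(Q K^T / sqrt(d_k)) V

The division by sqrt(d_k) keeps the dot products from growing with the key dimension, which would otherwise push the softmax into regions with extremely small gradients.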
mask
def mask(inputs, queries=None, keys=None, type=None):
    """Masks paddings on keys or queries to inputs
    inputs: 3d tensor. (N, T_q, T_k)
    queries: 3d tensor. (N, T_q, d)
    keys: 3d tensor. (N, T_k, d)

    e.g.,
    >> queries = tf.constant([[[1.],
                               [2.],
                               [0.]]], tf.float32)  # (1, 3, 1)
    >> keys = tf.constant([[[4.],
                            [0.]]], tf.float32)  # (1, 2, 1)
    >> inputs = tf.constant([[[4., 0.],
                              [8., 0.],
                              [0., 0.]]], tf.float32)
    >> mask(inputs, queries, keys, "key")
    array([[[ 4.0000000e+00, -4.2949673e+09],
            [ 8.0000000e+00, -4.2949673e+09],
            [ 0.0000000e+00, -4.2949673e+09]]], dtype=float32)
    >> inputs = tf.constant([[[1., 0.],
                              [1., 0.],
                              [1., 0.]]], tf.float32)
    >> mask(inputs, queries, keys, "query")
    array([[[1., 0.],
            [1., 0.],
            [0., 0.]]], dtype=float32)
    """
    padding_num = -2 ** 32 + 1
    if type in ("k", "key", "keys"):
        # Generate masks
        masks = tf.sign(tf.reduce_sum(tf.abs(keys), axis=-1))  # (N, T_k)
        masks = tf.expand_dims(masks, 1)  # (N, 1, T_k)
        masks = tf.tile(masks, [1, tf.shape(queries)[1], 1])  # (N, T_q, T_k)

        # Apply masks to inputs
        paddings = tf.ones_like(inputs) * padding_num
        outputs = tf.where(tf.equal(masks, 0), paddings, inputs)  # (N, T_q, T_k)
    elif type in ("q", "query", "queries"):
        # Generate masks
        masks = tf.sign(tf.reduce_sum(tf.abs(queries), axis=-1))  # (N, T_q)
        masks = tf.expand_dims(masks, -1)  # (N, T_q, 1)
        masks = tf.tile(masks, [1, 1, tf.shape(keys)[1]])  # (N, T_q, T_k)

        # Apply masks to inputs
        outputs = inputs * masks
    elif type in ("f", "future", "right"):
        diag_vals = tf.ones_like(inputs[0, :, :])  # (T_q, T_k)
        tril = tf.linalg.LinearOperatorLowerTriangular(diag_vals).to_dense()  # (T_q, T_k)
        masks = tf.tile(tf.expand_dims(tril, 0), [tf.shape(inputs)[0], 1, 1])  # (N, T_q, T_k)

        paddings = tf.ones_like(masks) * padding_num
        outputs = tf.where(tf.equal(masks, 0), paddings, inputs)
    else:
        print("Check if you entered type correctly!")

    return outputs
The mask function makes sure that padded (all-zero) positions do not take part in later computation. When the keys are masked, a very large negative value is written into the padded key columns before the softmax, so after the softmax their attention weights become essentially 0; when the queries are masked, the padded query rows are simply multiplied by 0 after the softmax.
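A tiny numeric check of the key-masking part (the numbers are made up): adding a huge negative constant to a padded key's score makes its softmax weight collapse to essentially zero.

import numpy as np

scores = np.array([2.0, 1.0, -2 ** 32 + 1])   # last score belongs to a padded key
weights = np.exp(scores - scores.max())
weights /= weights.sum()
print(weights)                                 # ~[0.731, 0.269, 0.0]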
The attention mechanisms commonly seen today fall into two classes:
Class 1: attention built on top of a recurrent network, which either applies weighted pooling (to be covered in a later update) or multiplies the outputs with their transpose, eventually producing a tensor of shape [batch_size, time_steps] or [batch_size, time_steps, time_steps] (see the short sketch after this list).
Class 2: Transformer-style models that use no recurrent network at all; the attention mechanism alone already achieves very good results.
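A quick sketch of the shape difference between the two classes, with made-up sizes (H stands in for the encoder outputs):

import numpy as np

batch_size, time_steps, hidden_size = 2, 5, 8
H = np.random.randn(batch_size, time_steps, hidden_size)

# Class 1 (RNN-based): one attention weight per time step
alpha = np.random.rand(batch_size, time_steps)
alpha /= alpha.sum(axis=1, keepdims=True)

# Class 2 (self-attention): one weight per (query step, key step) pair
attn = np.matmul(H, H.transpose(0, 2, 1))
print(alpha.shape, attn.shape)   # (2, 5) (2, 5, 5)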
I will keep adding to this summary.
Related links: Transformer code
Transformer for text classification