Related
The spatial attention described in SCA-CNN: Spatial and Channel-wise Attention in Convolutional Networks for Image Captioning.
What is spatial attention?
For the image captioning task, write the tensor obtained after feature extraction from the image as $V$, with dimensions $H \times W \times C$ (height, width, channel). We reshape it into $m = H \times W$ vectors of dimension $C$, and obtain the weight of each spatial location through the following formulas:

$$a = \tanh\left((W_s V + b_s) \oplus W_{hs} h_t\right)$$
$$\alpha = \mathrm{softmax}\left(W_i^{\top} a + b_i\right)$$

where $\oplus$ denotes adding a vector to every column of a matrix. So to implement attention we first need to obtain $\alpha$. For the variables above:
- $W_s$ has shape $(k, C)$
- $V$ has shape $(C, m)$, with $m = H \times W$
- $b_s$ has shape $(k,)$
- $W_{hs}$ has shape $(k, d)$
- $h_t$ has shape $(d,)$
- $a$ has shape $(k, m)$
- $W_i$ has shape $(k,)$
- $b_i$ is a scalar (dimension 1)
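To make the shapes concrete, here is a minimal numpy sketch of the two formulas for a single image (the sizes and all values are made-up placeholders, not taken from the paper):

import numpy as np

np.random.seed(0)
C, H, W, k, d = 3, 4, 4, 6, 5
m = H * W
V = np.random.rand(C, m)      # flattened feature map, one column per spatial location
ht = np.random.rand(d)        # decoder hidden state
Ws = np.random.rand(k, C)
bs = np.random.rand(k)
Whs = np.random.rand(k, d)
Wi = np.random.rand(k)
bi = 0.1

a = np.tanh(Ws.dot(V) + bs[:, None] + Whs.dot(ht)[:, None])  # k, m
scores = Wi.dot(a) + bi                                      # m,
alpha = np.exp(scores) / np.exp(scores).sum()                # softmax over the m locations
print alpha.shape, alpha.sum()                               # (16,) ~1.0

The TensorFlow implementation below computes the same thing, batched.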
import tensorflow as tf
import numpy as np

"""
Implement spatial attention: given inputs, return the attended tensor.
"""

def spatial_attention(inputs, hidden_states, k):
    """Implement the spatial attention mechanism.

    Given a tensor named inputs whose shape is [batch_size, height, width, channel],
    return the attention weights and the attended tensor.

    Arguments:
        inputs: a tensor holding the CNN feature, [batch_size, height, width, channel]
        hidden_states: the hidden states after a decoder step, [batch_size, embed_size]
        k: a scalar, the size of the shared attention space
    Returns:
        spatial_attention: the attention weights, [batch_size, height, width]
        output: inputs weighted by the attention, [batch_size, height, width, channel]
    """
    inputs_shape = map(lambda x: x.value, inputs.shape)
    original_inputs = tf.identity(inputs, name="identity")
    batch_size, height, width, channel = inputs_shape
    d = hidden_states.shape[-1].value  # the embedding_size of hidden_states
    with tf.variable_scope("spatial_attention_variables"):
        Ws = tf.get_variable("Ws", shape=(k, channel), initializer=tf.random_uniform_initializer(-1, 1))
        bs = tf.get_variable("bs", shape=(k,), initializer=tf.random_uniform_initializer(-0.1, 0.1))
        Whs = tf.get_variable("Whs", shape=(k, d), initializer=tf.random_uniform_initializer(-1, 1))
        Wi = tf.get_variable("Wi", shape=(k,), initializer=tf.random_uniform_initializer(-1, 1))
        bi = tf.get_variable("bi", shape=(), initializer=tf.random_uniform_initializer(-0.1, 0.1))
    # flatten the spatial grid: m = height * width locations of channel-sized vectors
    inputs = tf.reshape(inputs, shape=(-1, channel))  # batch_size * m, channel
    # visual part: Ws V + bs
    visual_com = tf.matmul(inputs, Ws, transpose_b=True)  # batch_size * m, k
    visual_com = tf.reshape(visual_com, shape=(batch_size, -1, k))  # batch_size, m, k
    visual_com = visual_com + bs
    # textual part: Whs ht, broadcast over the m spatial locations
    textual_com = tf.matmul(hidden_states, Whs, transpose_b=True)  # batch_size, k
    textual_com = tf.expand_dims(textual_com, 1)  # batch_size, 1, k
    a = tf.nn.tanh(visual_com + textual_com)  # batch_size, m, k
    a_shape = map(lambda x: x.value, a.shape)
    a = tf.reshape(a, shape=(-1, a_shape[-1]))  # batch_size * m, k
    spatial_attention = tf.matmul(a, tf.expand_dims(Wi, -1)) + bi  # batch_size * m, 1
    spatial_attention = tf.reshape(spatial_attention, shape=a_shape[:-1])  # batch_size, m
    spatial_attention = tf.nn.softmax(spatial_attention)  # softmax defaults to axis -1
    spatial_attention = tf.reshape(spatial_attention, (batch_size, height, width))
    output = tf.multiply(original_inputs, tf.expand_dims(spatial_attention, -1))
    return spatial_attention, output

if __name__ == '__main__':
    """Construct some data by hand to check correctness."""
    inputs = tf.constant(np.asarray(range(1, 2 * 4 * 4 * 3 + 1), dtype=np.float32).reshape((2, 4, 4, 3)))
    hidden_states = tf.constant(np.asarray(range(97, 97 + 2 * 5), dtype=np.float32).reshape((2, 5)))
    k = 6
    attention, output = spatial_attention(inputs, hidden_states, k)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    session = tf.Session(config=config)
    for variable in tf.trainable_variables():
        print variable.name, variable.dtype
    initializer = tf.global_variables_initializer()
    session.run(initializer)
    for o in session.run([attention, output]):
        print o
I haven't yet verified that the code above actually works correctly; here is its output:
[[[0.0625 0.0625 0.0625 0.0625 ]
[0.0625 0.0625 0.0625 0.0625 ]
[0.0625 0.0625 0.0625 0.0625 ]
[0.0625 0.0625 0.0625 0.0625 ]]
[[0.01733495 0.01733495 0.01733495 0.01733495]
[0.01733495 0.017335 0.01733827 0.01753132]
[0.02850685 0.11002292 0.12027094 0.12046136]
[0.12046459 0.12046465 0.12046465 0.12046465]]]
[[[[ 0.0625 0.125 0.1875 ]
[ 0.25 0.3125 0.375 ]
[ 0.4375 0.5 0.5625 ]
[ 0.625 0.6875 0.75 ]]
[[ 0.8125 0.875 0.9375 ]
[ 1. 1.0625 1.125 ]
[ 1.1875 1.25 1.3125 ]
[ 1.375 1.4375 1.5 ]]
[[ 1.5625 1.625 1.6875 ]
[ 1.75 1.8125 1.875 ]
[ 1.9375 2. 2.0625 ]
[ 2.125 2.1875 2.25 ]]
[[ 2.3125 2.375 2.4375 ]
[ 2.5 2.5625 2.625 ]
[ 2.6875 2.75 2.8125 ]
[ 2.875 2.9375 3. ]]]
[[[ 0.8494123 0.86674726 0.8840822 ]
[ 0.90141714 0.91875213 0.9360871 ]
[ 0.953422 0.97075695 0.9880919 ]
[ 1.0054269 1.0227618 1.0400968 ]]
[[ 1.0574317 1.0747666 1.0921016 ]
[ 1.1094398 1.1267748 1.1441098 ]
[ 1.1616641 1.1790024 1.1963407 ]
[ 1.2271923 1.2447237 1.262255 ]]
[[ 2.081 2.1095068 2.1380136 ]
[ 8.361742 8.4717655 8.581788 ]
[ 9.501404 9.6216755 9.741946 ]
[ 9.877831 9.998293 10.118754 ]]
[[10.2394905 10.359955 10.480419 ]
[10.600889 10.7213545 10.841819 ]
[10.962283 11.082748 11.203213 ]
[11.323677 11.444142 11.564607 ]]]]
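A quick sanity check on the attention values above: the 16 weights of each map should sum to 1. The first map is exactly uniform (0.0625 = 1/16), presumably because the large test inputs saturate the tanh so every location gets the same score; the second map also sums to 1:

import numpy as np

second_map = np.asarray([
    [0.01733495, 0.01733495, 0.01733495, 0.01733495],
    [0.01733495, 0.017335,   0.01733827, 0.01753132],
    [0.02850685, 0.11002292, 0.12027094, 0.12046136],
    [0.12046459, 0.12046465, 0.12046465, 0.12046465]])
print second_map.sum()  # ~1.0, as the output of a softmax should be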
A closer look at the functions used above
- tf.identity
>>> inputs = tf.constant(np.asarray([[1, 2], [3, 4]], dtype=np.float32))
>>> inputs
<tf.Tensor 'Const:0' shape=(2, 2) dtype=float32>
>>> input_backup = tf.identity(inputs, name="identity")
>>> session.run([inputs, input_backup])
[array([[1., 2.],
[3., 4.]], dtype=float32), array([[1., 2.],
[3., 4.]], dtype=float32)]
- tf.matmul
>>> ta = tf.constant(np.asarray(range(1, 9), dtype=np.float32).reshape((2, 2, 2)))
>>> tb = tf.constant(np.asarray([[1, 2], [3, 4]], dtype=np.float32))
>>> session.run([ta, tb])
[array([[[1., 2.],
[3., 4.]],
[[5., 6.],
[7., 8.]]], dtype=float32), array([[1., 2.],
[3., 4.]], dtype=float32)]
>>> wrong_matmul = tf.matmul(ta, tb)
Traceback (most recent call last):
File "", line 1, in
File "/home/jack/venv/local/lib/python2.7/site-packages/tensorflow/python/ops/math_ops.py", line 2018, in matmul
a, b, transpose_a=transpose_a, transpose_b=transpose_b, name=name)
File "/home/jack/venv/local/lib/python2.7/site-packages/tensorflow/python/ops/gen_math_ops.py", line 4456, in mat_mul
name=name)
File "/home/jack/venv/local/lib/python2.7/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
op_def=op_def)
File "/home/jack/venv/local/lib/python2.7/site-packages/tensorflow/python/util/deprecation.py", line 454, in new_func
return func(*args, **kwargs)
File "/home/jack/venv/local/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 3155, in create_op
op_def=op_def)
File "/home/jack/venv/local/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 1731, in __init__
control_input_ops)
File "/home/jack/venv/local/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 1579, in _create_c_op
raise ValueError(str(e))
ValueError: Shape must be rank 2 but is rank 3 for 'MatMul_2' (op: 'MatMul') with input shapes: [2,2,2], [2,2].
The correct approach is:
>>> ta_shape = map(lambda x: x.value, ta.shape)
>>> ta_shape
[2, 2, 2]
>>> ta = tf.reshape(ta, shape=(-1, ta_shape[-1]))
>>> ta
<tf.Tensor 'Reshape:0' shape=(4, 2) dtype=float32>
>>> correct_matmul = tf.matmul(ta, tb)
>>> session.run([ta, tb, correct_matmul])
[array([[1., 2.],
[3., 4.],
[5., 6.],
[7., 8.]], dtype=float32),
array([[1., 2.],
[3., 4.]], dtype=float32),
array([[ 7., 10.],
[15., 22.],
[23., 34.],
[31., 46.]], dtype=float32)]
>>> correct_matmul = tf.reshape(correct_matmul, shape=ta_shape)
>>> session.run(correct_matmul)
array([[[ 7., 10.],
[15., 22.]],
[[23., 34.],
[31., 46.]]], dtype=float32)
It seems that tf.matmul can only be applied to two tensors of the same rank.
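That said, tf.matmul does support batched multiplication when both operands are rank 3, so another option is to tile tb up to rank 3 instead of flattening ta (a sketch using the same values as above):

import numpy as np
import tensorflow as tf

ta = tf.constant(np.asarray(range(1, 9), dtype=np.float32).reshape((2, 2, 2)))
tb = tf.constant(np.asarray([[1, 2], [3, 4]], dtype=np.float32))
# tile tb so both operands have rank 3, then let matmul run per batch element
batched = tf.matmul(ta, tf.tile(tf.expand_dims(tb, 0), [2, 1, 1]))
session = tf.Session()
print session.run(batched)  # same result as correct_matmul above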
- tf.nn.tanh
Computes hyperbolic tangent of x element-wise.
>>> for i in range(-10, 11):
... session.run(tf.nn.tanh(float(i)))
...
-1.0
-1.0
-0.99999976
-0.99999833
-0.99998784
-0.99990916
-0.9993292
-0.9950547
-0.9640276
-0.7615942
0.0
0.7615942
0.9640276
0.9950547
0.9993292
0.99990916
0.99998784
0.99999833
0.99999976
1.0
1.0
>>> session.run(tf.nn.tanh(np.asarray(range(-10, 10), dtype=np.float32).reshape(2, 2, 5)))
array([[[-1. , -1. , -0.99999976, -0.99999833,
-0.99998784],
[-0.99990916, -0.9993292 , -0.9950547 , -0.9640276 ,
-0.7615942 ]],
[[ 0. , 0.7615942 , 0.9640276 , 0.9950547 ,
0.9993292 ],
[ 0.99990916, 0.99998784, 0.99999833, 0.99999976,
1. ]]], dtype=float32)
- +
Addition between tensors of different shapes (broadcasting):
>>> a = tf.constant([[1, 2], [3, 4]])
>>> b = tf.constant([5, 6])
>>> a + b
<tf.Tensor 'add:0' shape=(2, 2) dtype=int32>
>>> c = a + b
>>> session.run([a, b, c])
[array([[1, 2],
[3, 4]], dtype=int32), array([5, 6], dtype=int32), array([[ 6, 8],
[ 8, 10]], dtype=int32)]
>>> a = tf.constant(np.asarray(range(24), dtype=np.int32).reshape((2, 3, 4)))
>>> b = tf.constant([[[1, 1, 1, 1]], [[-1, -1, -1, -1]]])
>>> c = a + b
>>> session.run([c, a, b])
[array([[[ 1, 2, 3, 4],
[ 5, 6, 7, 8],
[ 9, 10, 11, 12]],
[[11, 12, 13, 14],
[15, 16, 17, 18],
[19, 20, 21, 22]]], dtype=int32),
array([[[ 0, 1, 2, 3],
[ 4, 5, 6, 7],
[ 8, 9, 10, 11]],
[[12, 13, 14, 15],
[16, 17, 18, 19],
[20, 21, 22, 23]]], dtype=int32),
array([[[ 1, 1, 1, 1]],
[[-1, -1, -1, -1]]], dtype=int32)]
Here a has shape (2, 3, 4) and b has shape (2, 1, 4); the operation is carried out along the last dimension, and b's size-1 second dimension is repeated n times (here 3) to match a.
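This matches numpy's broadcasting rule: a dimension of size 1 is stretched to the other operand's size. A small numpy check with the same shapes:

import numpy as np

a = np.arange(24).reshape(2, 3, 4)
b = np.asarray([[[1, 1, 1, 1]], [[-1, -1, -1, -1]]])  # shape (2, 1, 4)
# b's size-1 middle dimension is repeated 3 times to match a
print (a + b == a + np.repeat(b, 3, axis=1)).all()    # True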
- tf.nn.softmax (the default axis is -1)
>>> import math
>>> def loge(x):
...     return math.log(x, math.e)
...
>>> a = tf.constant([[loge(10), loge(5), loge(5)], [loge(20), loge(3), loge(7)]])
>>> a
<tf.Tensor 'Const:0' shape=(2, 3) dtype=float32>
>>> b = tf.softmax(a)
Traceback (most recent call last):
File "", line 1, in
AttributeError: 'module' object has no attribute 'softmax'
>>> b = tf.nn.softmax(a)
>>> c = tf.nn.softmax(a, axis=0)
>>> session.run([a, b, c])
[array([[2.3025851, 1.609438 , 1.609438 ],
[2.9957323, 1.0986123, 1.9459101]], dtype=float32),
array([[0.5 , 0.25 , 0.25 ],
[0.6666667 , 0.09999999, 0.23333332]], dtype=float32),
array([[0.33333334, 0.62500006, 0.4166667 ],
[0.6666667 , 0.375 , 0.5833333 ]], dtype=float32)]
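The loge values were chosen so that the softmax comes out in exact fractions: softmax exponentiates its inputs, and exp(loge(x)) = x, so each row of b is just the raw numbers divided by their row sum. A quick numpy check of the first row:

import numpy as np

row = np.asarray([10., 5., 5.])
print row / row.sum()  # [0.5 0.25 0.25], matching the first row of b above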
- tf.multiply
>>> a = tf.constant([[1, 2, 3], [4, 5, 6]])
>>> a
<tf.Tensor 'Const:0' shape=(2, 3) dtype=int32>
>>> b = tf.constant([-1, 1, -1])
>>> b
<tf.Tensor 'Const_1:0' shape=(3,) dtype=int32>
>>> c = tf.multiply(a, b)
>>> session.run(c)
array([[-1, 2, -3],
[-4, 5, -6]], dtype=int32)
The b above is broadcast along the last dimension of a for the product; multiplication seems to follow the same broadcasting rules as addition.
>>> b = tf.constant([[-1], [1]])
>>> b
<tf.Tensor 'Const_2:0' shape=(2, 1) dtype=int32>
>>> d = tf.multiply(a, b)
>>> session.run(d)
array([[-1, -2, -3],
[ 4, 5, 6]], dtype=int32)
When b's rank is smaller than a's rank, you can think of it as calling b = np.expand_dims(b, 0) n times to raise b's rank before the element-wise product is taken.
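A numpy check of that mental model, reusing the rank-1 b from the first multiply example:

import numpy as np

a = np.asarray([[1, 2, 3], [4, 5, 6]])
b = np.asarray([-1, 1, -1])
# raising b's rank by hand gives the same result as implicit broadcasting
print (a * b == a * np.expand_dims(b, 0)).all()  # True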