def dynamic_rnn(cell, inputs, sequence_length=None, initial_state=None,
                dtype=None, parallel_iterations=None, swap_memory=False,
                time_major=False, scope=None):
  """Creates a recurrent neural network specified by RNNCell `cell`.

  Performs fully dynamic unrolling of `inputs`.

  Example:

  ```python
  # create a BasicRNNCell
  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)

  # 'outputs' is a tensor of shape [batch_size, max_time, cell_state_size]

  # defining initial state
  initial_state = rnn_cell.zero_state(batch_size, dtype=tf.float32)

  # 'state' is a tensor of shape [batch_size, cell_state_size]
  outputs, state = tf.nn.dynamic_rnn(rnn_cell, input_data,
                                     initial_state=initial_state,
                                     dtype=tf.float32)
  ```

  ```python
  # create 2 LSTMCells
  rnn_layers = [tf.nn.rnn_cell.LSTMCell(size) for size in [128, 256]]

  # create a RNN cell composed sequentially of a number of RNNCells
  multi_rnn_cell = tf.nn.rnn_cell.MultiRNNCell(rnn_layers)

  # 'outputs' is a tensor of shape [batch_size, max_time, 256]
  # 'state' is a N-tuple where N is the number of LSTMCells containing a
  # tf.contrib.rnn.LSTMStateTuple for each cell
  outputs, state = tf.nn.dynamic_rnn(cell=multi_rnn_cell,
                                     inputs=data,
                                     dtype=tf.float32)
  ```

  Args:
    cell: An instance of RNNCell.
    inputs: The RNN inputs.
      If `time_major == False` (default), this must be a `Tensor` of shape:
      `[batch_size, max_time, ...]`, or a nested tuple of such elements.
      If `time_major == True`, this must be a `Tensor` of shape:
      `[max_time, batch_size, ...]`, or a nested tuple of such elements.
      This may also be a (possibly nested) tuple of Tensors satisfying this
      property. The first two dimensions must match across all the inputs,
      but otherwise the ranks and other shape components may differ.
      In this case, input to `cell` at each time-step will replicate the
      structure of these tuples, except for the time dimension (from which
      the time is taken).
      The input to `cell` at each time step will be a `Tensor` or (possibly
      nested) tuple of Tensors each with dimensions `[batch_size, ...]`.
    sequence_length: (optional) An int32/int64 vector sized `[batch_size]`.
      Used to copy-through state and zero-out outputs when past a batch
      element's sequence length. So it's more for correctness than
      performance.
    initial_state: (optional) An initial state for the RNN.
      If `cell.state_size` is an integer, this must be a `Tensor` of
      appropriate type and shape `[batch_size, cell.state_size]`.
      If `cell.state_size` is a tuple, this should be a tuple of tensors
      having shapes `[batch_size, s] for s in cell.state_size`.
    dtype: (optional) The data type for the initial state and expected
      output. Required if initial_state is not provided or RNN state has a
      heterogeneous dtype.
    parallel_iterations: (Default: 32). The number of iterations to run in
      parallel. Those operations which do not have any temporal dependency
      and can be run in parallel, will be. This parameter trades off time
      for space. Values >> 1 use more memory but take less time, while
      smaller values use less memory but computations take longer.
    swap_memory: Transparently swap the tensors produced in forward
      inference but needed for back prop from GPU to CPU. This allows
      training RNNs which would typically not fit on a single GPU, with
      very minimal (or no) performance penalty.
    time_major: The shape format of the `inputs` and `outputs` Tensors.
      If true, these `Tensors` must be shaped `[max_time, batch_size, depth]`.
      If false, these `Tensors` must be shaped `[batch_size, max_time, depth]`.
      Using `time_major = True` is a bit more efficient because it avoids
      transposes at the beginning and end of the RNN calculation. However,
      most TensorFlow data is batch-major, so by default this function
      accepts input and emits output in batch-major form.
    scope: VariableScope for the created subgraph; defaults to "rnn".

  Returns:
    A pair (outputs, state) where:

    outputs: The RNN output `Tensor`.
      If time_major == False (default), this will be a `Tensor` shaped:
      `[batch_size, max_time, cell.output_size]`.
      If time_major == True, this will be a `Tensor` shaped:
      `[max_time, batch_size, cell.output_size]`.
      Note, if `cell.output_size` is a (possibly nested) tuple of integers
      or `TensorShape` objects, then `outputs` will be a tuple having the
      same structure as `cell.output_size`, containing Tensors having
      shapes corresponding to the shape data in `cell.output_size`.
    state: The final state. If `cell.state_size` is an int, this will be
      shaped `[batch_size, cell.state_size]`. If it is a `TensorShape`,
      this will be shaped `[batch_size] + cell.state_size`. If it is a
      (possibly nested) tuple of ints or `TensorShape`, this will be a
      tuple having the corresponding shapes. If cells are `LSTMCells`,
      `state` will be a tuple containing a `LSTMStateTuple` for each cell.

  Raises:
    TypeError: If `cell` is not an instance of RNNCell.
    ValueError: If inputs is None or an empty list.
  """
  rnn_cell_impl.assert_like_rnncell("cell", cell)

  with vs.variable_scope(scope or "rnn") as varscope:
    # Create a new scope in which the caching device is either
    # determined by the parent scope, or is set to place the cached
    # Variable using the same placement as for the rest of the RNN.
    if not context.executing_eagerly():
      if varscope.caching_device is None:
        varscope.set_caching_device(lambda op: op.device)

    # By default, time_major==False and inputs are batch-major: shaped
    #   [batch, time, depth]
    # For internal calculations, we transpose to [time, batch, depth]
    flat_input = nest.flatten(inputs)

    if not time_major:
      # (B,T,D) => (T,B,D)
      flat_input = [ops.convert_to_tensor(input_) for input_ in flat_input]
      flat_input = tuple(_transpose_batch_time(input_) for input_ in flat_input)

    parallel_iterations = parallel_iterations or 32
    if sequence_length is not None:
      sequence_length = math_ops.to_int32(sequence_length)
      if sequence_length.get_shape().ndims not in (None, 1):
        raise ValueError(
            "sequence_length must be a vector of length batch_size, "
            "but saw shape: %s" % sequence_length.get_shape())
      sequence_length = array_ops.identity(  # Just to find it in the graph.
          sequence_length, name="sequence_length")

    batch_size = _best_effort_input_batch_size(flat_input)

    if initial_state is not None:
      state = initial_state
    else:
      if not dtype:
        raise ValueError(
            "If there is no initial_state, you must give a dtype.")
      state = cell.zero_state(batch_size, dtype)
The above is the documentation and source that ship with TensorFlow itself.
When running an RNN over variable-length text, dynamic_rnn can skip the computation over the padded part and so save work. Suppose we have two texts, one of length 10 and one of length 5: the second has to be filled out with 0-padding, giving an input of shape (2, 10, dim), where dim is the word-embedding dimension. The call to dynamic_rnn looks like this:
outputs, last_states = tf.nn.dynamic_rnn(
    cell=cell, dtype=tf.float32, sequence_length=x_lengths, inputs=x)
Here cell is the RNN cell, for example tf.contrib.rnn.BasicLSTMCell, x is the 0-padded data, and x_lengths holds the length of each text. When the second text is processed, only the first 5 steps are actually computed and the rest are skipped: the corresponding outputs are set to 0 and the cell state is held at its value from step 5.
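To make the setup concrete, here is one way the padded batch and the length vector could be built. This is only a sketch; the names texts, dim, x, and x_lengths are placeholders of my own, not anything prescribed by the API:

import numpy as np

dim = 8                                                        # word-embedding dimension (arbitrary here)
texts = [np.random.randn(10, dim), np.random.randn(5, dim)]    # two texts of length 10 and 5

max_len = max(len(t) for t in texts)                           # 10
x = np.zeros((len(texts), max_len, dim), dtype=np.float32)     # shape (2, 10, dim)
x_lengths = np.array([len(t) for t in texts], dtype=np.int32)  # [10, 5]

for i, t in enumerate(texts):
    x[i, :len(t)] = t                                          # real steps first, 0-padding after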
dynamic_rnn returns two values: the output of every step, and the final state. Now the question: for the second text, what I actually want is the output of step 5; the last output is a meaningless 0. I know of three ways to get it.
The first I saw in someone else's code (linked here). The author wrote a custom indexing operation; the code is fairly convoluted.
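I won't reproduce that code here, but the idea behind such an indexing op can be sketched with tf.gather_nd: for each example, pick the output at time step length - 1. This is my own sketch rather than the linked author's code, and it assumes batch-major outputs plus a length vector like x_lengths above:

import tensorflow as tf

def last_relevant_output(outputs, lengths):
    # outputs: [batch_size, max_time, hidden_size]; lengths: [batch_size]
    batch_size = tf.shape(outputs)[0]
    # Pair every example index i with its last valid time step lengths[i] - 1.
    indices = tf.stack([tf.range(batch_size),
                        tf.cast(lengths, tf.int32) - 1], axis=1)
    return tf.gather_nd(outputs, indices)    # [batch_size, hidden_size]

# e.g. last = last_relevant_output(outputs, x_lengths)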
The second is to build a mask with a 1 at the position of the last valid step and 0 everywhere else. For the second text the mask is [0, 0, 0, 0, 1, 0, 0, 0, 0, 0]; multiplying this mask with outputs (broadcast over the hidden dimension) and summing over the time dimension yields exactly the output of step 5.
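A minimal sketch of this mask trick, assuming the same batch-major outputs and x_lengths as above (the helper name is mine):

import tensorflow as tf

def last_output_via_mask(outputs, lengths):
    # outputs: [batch_size, max_time, hidden_size]; lengths: [batch_size]
    max_time = tf.shape(outputs)[1]
    # mask[i, t] is 1.0 exactly when t == lengths[i] - 1 and 0.0 elsewhere,
    # e.g. [0, 0, 0, 0, 1, 0, 0, 0, 0, 0] for the length-5 text above.
    mask = tf.one_hot(tf.cast(lengths, tf.int32) - 1, depth=max_time,
                      dtype=outputs.dtype)                    # [batch_size, max_time]
    # Broadcast over the hidden dimension, then sum out the time dimension.
    return tf.reduce_sum(outputs * tf.expand_dims(mask, -1), axis=1)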
The third is the simplest and follows from the definition of an RNN: the RNN's output at each step is just the h part of its state, so the h in last_states is exactly the output we want. In other words, we can simply use last_states.h as the final output of the RNN.
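A minimal sketch of this third approach, reusing cell, x, and x_lengths from the call shown earlier. It assumes a single LSTM cell with state_is_tuple=True, which is why last_states has an .h field:

outputs, last_states = tf.nn.dynamic_rnn(
    cell=cell, dtype=tf.float32, sequence_length=x_lengths, inputs=x)

final_output = last_states.h   # [batch_size, hidden_size]; already frozen at each
                               # example's last valid step thanks to sequence_length

For a MultiRNNCell the final state is a per-layer tuple, so the top layer's output would be last_states[-1].h; for a GRUCell or BasicRNNCell the state is a single tensor that is itself the last output, so no .h is needed.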
Let's walk through TensorFlow's dynamic_rnn with a small example. Suppose the RNN input is of shape [2, 3, 4], where 2 is batch_size, 3 is the maximum text length (usually called num_steps or seq_length), and 4 is embedding_size. Assume the second text has a length of only 2, with the remaining step filled by 0-padding. dynamic_rnn returns two values, outputs and last_states: outputs has shape [batch_size, max_time, hidden_size] and contains the hidden-state output of every time step, while last_states is a (c, h) tuple whose members both have shape [batch, hidden_size], which is [2, 2] here since the cell below uses num_units=2.
So far nothing unusual, but dynamic_rnn has a parameter sequence_length that specifies the valid length of each example. In the example above we set sequence_length to [3, 2], meaning the first example has an effective length of 3 and the second an effective length of 2. When this parameter is passed, TensorFlow stops computing the second example after step 2: its final state simply carries the step-2 state through to step 3, and its entries in outputs beyond step 2 are set to zero.
Here is the code and its output:
#coding:utf-8
import tensorflow as tf
import numpy as np

# Generate a batch of 2 examples, sentence length 3, embedding size 4
X = np.random.randn(2, 3, 4)
# The second example has an effective length of 2
X[1, 2:] = 0
X_lengths = [3, 2]
print(X)

# cell = tf.contrib.rnn.BasicLSTMCell(num_units=64, state_is_tuple=True)
cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=2, state_is_tuple=True)

outputs, last_states = tf.nn.dynamic_rnn(
    cell=cell, dtype=tf.float64, sequence_length=X_lengths, inputs=X)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    o = sess.run(outputs)
    s = sess.run(last_states)
    print('output\n', o)
    print('last_o\n', o[:, -1, :])   # take the last time step from outputs
    print('--------------------')
    print('s\n', s)
    print('s.c\n', s.c)   # the LSTM cell state (memory), not what we need here
    print('s.h\n', s.h)   # s.h is the output at the last valid time step
This produces the following output:
First, note that the last row of the second sentence is all zeros, which confirms that its length is 2 and that the trailing zeros are padding.

The LSTM then produces the two results, outputs and last_states.

In outputs, the final step of the second sentence comes out as all zeros, yet s.h in the state still holds its last valid output, and it matches the last non-zero row of outputs (compare the matching numbers below). So if all you need after dynamic_rnn is the output of the last valid step, simply use s.h.
[[[ 0.25807128 2.12146955 -2.05515763 -0.85231539]
[ 1.45649279 1.14585909 0.65417413 0.24676652]
[-0.61412936 -1.032752 -0.03762097 0.07469644]]
[[ 0.32473887 1.62013486 -1.16800708 0.82672849]
[ 0.99570385 -0.80009619 -0.46867865 0.90598246]
[ 0. 0. 0. 0. ]]]
('output\n', array([[[ 0.20033333, 0.08891689],
[ 0.08907282, 0.03732482],
[ 0.13778725, 0.04982935]],
[[ 0.08782086, -0.04113857],
[ 0.18728541, 0.02615149],
[ 0. , 0. ]]]))
('last_o\n', array([[0.13778725, 0.04982935],
[0. , 0. ]]))
--------------------
('s\n', LSTMStateTuple(c=array([[0.19543619, 0.082119 ],
[0.47740047, 0.04340056]]), h=array([[0.13778725, 0.04982935],
[0.18728541, 0.02615149]])))
('s.c\n', array([[0.19543619, 0.082119 ],
[0.47740047, 0.04340056]]))
('s.h\n', array([[0.13778725, 0.04982935],
[0.18728541, 0.02615149]]))