TL;DR: the fix was to replace the original LSTM with CudnnLSTM.

I noticed that some of the models I ran had fairly low GPU utilization. After taking the model apart and running the pieces separately, I pinned the low utilization on the LSTM.
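(Taking the model apart is not the only way to find this; a chrome trace also shows where the time goes. A minimal TF1 profiling sketch, with sess, train_op, and feed_dict standing in for your own session, op, and feeds:)

import tensorflow as tf
from tensorflow.python.client import timeline

run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
run_metadata = tf.RunMetadata()
# train_op and feed_dict are placeholders for your own graph.
sess.run(train_op, feed_dict=feed_dict,
         options=run_options, run_metadata=run_metadata)
# Write a chrome://tracing-compatible trace with per-op time on each device.
with open('timeline.json', 'w') as f:
    f.write(timeline.Timeline(run_metadata.step_stats)
            .generate_chrome_trace_format())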
(Screenshot: GPU utilization with the old LSTM)

After the switch, utilization is close to 100%:

(Screenshot: GPU utilization with CudnnLSTM)

The original code is in the appendix below; after checking a number of references, I don't think it was written incorrectly as such. Some more searching turned up CuDNNLSTM as the way out; my rewritten code is in the appendix as well.
Fail to find the dnn implementation

If you hit

tensorflow.python.framework.errors_impl.UnknownError: Fail to find the dnn implementation.
[[node cudnn_lstm_1/CudnnRNNCanonicalToParams (defined at lstm.py:596) ]]

check (nvidia-smi will tell you) whether someone else is already running on the GPU. If so, switch to an idle GPU; sharing fails here even when the card still has free memory.
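To move to another card, set CUDA_VISIBLE_DEVICES before TensorFlow initializes CUDA. A minimal sketch; the index '1' is just a stand-in for whichever card nvidia-smi shows as idle:

import os

# '1' is a placeholder GPU index; pick an idle one from nvidia-smi.
# This must happen before TensorFlow creates its CUDA context.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

import tensorflow as tf  # imported after setting the env var, on purpose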
The rewritten code:

# my_rnn is assumed to be an alias for tf.contrib.cudnn_rnn,
# e.g. from tensorflow.contrib import cudnn_rnn as my_rnn
bilstm = my_rnn.CudnnLSTM(
    num_layers=1, num_units=parent_hidden_size // 2,
    direction='bidirectional',
    dropout=0.3,
    dtype=tf.float32
)
# (data transformation omitted)
bilstm.build(inputsPath.get_shape())
# [time_len, batch_size, input_size] -> [time_len, batch_size, num_dirs * num_units]
my_rnn_outputs, _ = bilstm(inputsPath, training=is_training)
# keep only the last time step
root_path_output = my_rnn_outputs[-1, :, :]
# (other operations omitted)
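The omitted data transformation is mostly about getting the inputs time-major, since CudnnLSTM consumes [time_len, batch_size, input_size]. A minimal sketch, assuming the embeddings arrive batch-major (embedding_inputs is a name I'm assuming here):

# Hypothetical: embedding_inputs is [batch_size, time_len, input_size];
# CudnnLSTM wants time-major [time_len, batch_size, input_size].
inputsPath = tf.transpose(embedding_inputs, [1, 0, 2])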
The original code:

def encode_par_path(embedding_inputs, parent_hidden_size, rnn_layers=1, keep_prob=0.7, bi_lstm=True):
    with tf.variable_scope('path_encoder') as encoder_scope:

        def build_cell(hidden_size):
            def get_single_cell(hidden_size, keep_prob):
                cell = tf.nn.rnn_cell.BasicLSTMCell(hidden_size)
                if keep_prob < 1:
                    cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=keep_prob)
                return cell

            cell = tf.nn.rnn_cell.MultiRNNCell(
                [get_single_cell(hidden_size, keep_prob) for _ in range(rnn_layers)])
            return cell

        if not bi_lstm:
            encoder_cell = build_cell(parent_hidden_size)
            encoder_outputs, encoder_final_state = tf.nn.dynamic_rnn(
                encoder_cell, embedding_inputs,
                # sequence_length=self.par_seq_len,
                dtype=tf.float32, scope=encoder_scope)
            return encoder_outputs, encoder_final_state
        else:
            # integer division, so the fw/bw outputs concat back to parent_hidden_size
            encoder_cell = build_cell(parent_hidden_size // 2)
            bw_encoder_cell = build_cell(parent_hidden_size // 2)
            encoder_outputs, (fw_state, bw_state) = tf.nn.bidirectional_dynamic_rnn(
                encoder_cell, bw_encoder_cell,
                embedding_inputs,
                # sequence_length=self.par_seq_len,
                dtype=tf.float32, scope=encoder_scope)
            state = []
            for i in range(rnn_layers):
                fs = fw_state[i]
                bs = bw_state[i]
                encoder_final_state_c = tf.concat((fs.c, bs.c), 1)
                encoder_final_state_h = tf.concat((fs.h, bs.h), 1)
                encoder_final_state = tf.nn.rnn_cell.LSTMStateTuple(
                    c=encoder_final_state_c,
                    h=encoder_final_state_h)
                state.append(encoder_final_state)
            encoder_final_state = tuple(state)
            encoder_outputs = tf.concat([encoder_outputs[0], encoder_outputs[1]], -1)
            return encoder_outputs, encoder_final_state


# Path2root: calls encode_par_path once per time step in a Python loop
root_path = []
with tf.variable_scope("RNN"):
    for time_step in range(num_steps):
        if time_step > 0:
            tf.get_variable_scope().reuse_variables()
        path_output, path_state = encode_par_path(
            inputsPath[:, time_step, :, :], parent_hidden_size)  # [bz, parent_len, hidden]
        root_path.append(path_output[:, -1, :])  # [bz, hidden] per step
root_path_output = tf.stack(axis=0, values=root_path)  # [seq_len, bz, hidden]
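One caveat: CudnnLSTM itself only runs on a GPU. For CPU inference, the checkpoint it writes can be restored into tf.contrib.cudnn_rnn.CudnnCompatibleLSTMCell. The following example builds and saves on GPU, then restores on CPU: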
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import tensorflow as tf

shape = [2, 2, 2]  # [time_len, batch_size, input_size]
n_cell_dim = 2


def init_vars(sess):
    sess.run(tf.global_variables_initializer())


def train_graph():
    # Build a CudnnLSTM on the GPU and save a checkpoint.
    with tf.Graph().as_default(), tf.device('/gpu:0'):
        with tf.Session() as sess:
            is_training = True
            inputs = tf.random_uniform(shape, dtype=tf.float32)
            lstm = tf.contrib.cudnn_rnn.CudnnLSTM(
                num_layers=1,
                num_units=n_cell_dim,
                direction='bidirectional',
                dtype=tf.float32)
            lstm.build(inputs.get_shape())
            outputs, output_states = lstm(inputs, training=is_training)
            with tf.device('/cpu:0'):
                saver = tf.train.Saver()
            init_vars(sess)
            saver.save(sess, '/tmp/model')


def inf_graph():
    # Restore the same weights on the CPU with CudnnCompatibleLSTMCell.
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        with tf.Session() as sess:
            single_cell = lambda: tf.contrib.cudnn_rnn.CudnnCompatibleLSTMCell(
                n_cell_dim, reuse=tf.get_variable_scope().reuse)
            inputs = tf.random_uniform(shape, dtype=tf.float32)
            lstm_fw_cell = [single_cell() for _ in range(1)]
            lstm_bw_cell = [single_cell() for _ in range(1)]
            (outputs, output_state_fw,
             output_state_bw) = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(
                 lstm_fw_cell,
                 lstm_bw_cell,
                 inputs,
                 dtype=tf.float32,
                 time_major=True)
            saver = tf.train.Saver()
            saver.restore(sess, '/tmp/model')
            print(sess.run(outputs))


def main(unused_argv):
    train_graph()
    inf_graph()


if __name__ == '__main__':
    tf.app.run(main)
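As I understand it, this works because CudnnLSTM saves its parameters in a canonical layout that CudnnCompatibleLSTMCell knows how to consume, so the restore in inf_graph needs no variable renaming.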
An example on MNIST:
import os

import numpy as np
import tensorflow as tf
from tqdm import tqdm
from tensorflow.examples.tutorials.mnist import input_data

os.environ['CUDA_VISIBLE_DEVICES'] = "0"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

mnist = input_data.read_data_sets('/tmp/data', one_hot=True)

element_size = 28
time_steps = 28
num_classes = 10
batch_size = 64
hidden_layer_size = 128
LOG_DIR = 'culstm'

_inputs = tf.placeholder(tf.float32, shape=[batch_size, time_steps, element_size], name='inputs')
y = tf.placeholder(tf.float32, shape=[None, num_classes], name='labels')

with tf.name_scope('rnn'):
    # CudnnLSTM expects time-major input: [time_steps, batch_size, element_size].
    rnn_input = tf.transpose(_inputs, [1, 0, 2])
    is_training = True
    lstm = tf.contrib.cudnn_rnn.CudnnLSTM(
        num_layers=1, num_units=hidden_layer_size,
        # UnknownError (see above for traceback): CUDNN_STATUS_EXECUTION_FAILED
        # dropout=0.3,
        dtype=tf.float32
    )
    lstm.build(rnn_input.get_shape())
    # [time_len, batch_size, input_size] -> [time_len, batch_size, num_dirs * num_units]
    outputs, _ = lstm(rnn_input, training=is_training)
    output = outputs[-1]  # last time step: [batch_size, num_units]

with tf.name_scope('fc'):
    w = tf.Variable(tf.truncated_normal([hidden_layer_size, num_classes], mean=0, stddev=0.01), dtype=tf.float32)
    b = tf.Variable(tf.truncated_normal([num_classes], mean=0, stddev=0.01), dtype=tf.float32)
    y_pred = tf.matmul(output, w) + b

loss = tf.nn.softmax_cross_entropy_with_logits(logits=y_pred, labels=y)
loss = tf.reduce_mean(loss)
optimizer = tf.train.RMSPropOptimizer(0.001, 0.9)
train = optimizer.minimize(loss)

correct_prediction = tf.equal(tf.argmax(y_pred, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

test_data = mnist.test.images[:batch_size].reshape(-1, time_steps, element_size)
test_label = mnist.test.labels[:batch_size]

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    pbar = tqdm(range(10000))
    for i in pbar:
        batch_x, batch_y = mnist.train.next_batch(batch_size)
        batch_x = batch_x.reshape(-1, time_steps, element_size)
        _, loss_np, accuracy_np = sess.run([train, loss, accuracy],
                                           feed_dict={_inputs: batch_x, y: batch_y})
        if i % 100 == 99:
            test_loss_np, test_accuracy_np = sess.run(
                [loss, accuracy], feed_dict={_inputs: test_data, y: test_label})
            pbar.set_description(
                f'Train loss: {loss_np:.4f}, '
                f'accuracy {accuracy_np:.4f} '
                f'Test loss: {test_loss_np:.4f}, '
                f'accuracy {test_accuracy_np:.4f}')