While running the trained model on the test set, I found that the code raised no errors but simply hung at session.run(). It took a long time to figure out that the problem was the input threads not being started.
TensorFlow's data-loading threads had not been launched, so the data flow graph had nothing to compute and the whole program just sat there.
The deeper reason is that TensorFlow's computation and data reading are asynchronous: the intended pattern is that the main thread trains the model while separate reader threads load data asynchronously. TensorFlow maintains a queue in memory, and the reader threads asynchronously push examples from disk into that queue. Because training and reading are decoupled, TensorFlow cannot raise an error when no data is currently available; more data might still arrive in the queue later, so the session simply keeps waiting.
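To make the producer/consumer mechanism concrete, here is a minimal sketch written against the TF 1.x queue API; the FIFOQueue capacity and the random-value enqueue op are just placeholders standing in for a real disk reader. Without the start_queue_runners call, the dequeue below blocks forever, which is exactly the hang described above.

# -*- coding: utf-8 -*-
# Minimal sketch (TF 1.x queue API): a QueueRunner fills a FIFOQueue from
# background threads while the main thread dequeues batches for training.
import tensorflow as tf

queue = tf.FIFOQueue(capacity=32, dtypes=[tf.float32], shapes=[[]])
enqueue_op = queue.enqueue(tf.random_normal([]))    # placeholder for "read one sample from disk"
qr = tf.train.QueueRunner(queue, [enqueue_op] * 2)  # two enqueue threads
tf.train.add_queue_runner(qr)
batch = queue.dequeue_many(4)                       # what a training step would consume

with tf.Session() as sess:
    coord = tf.train.Coordinator()
    # Without this call the enqueue threads never run and dequeue_many blocks.
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    print sess.run(batch)
    coord.request_stop()
    coord.join(threads)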
The following borrowed example illustrates the problem.
Inside the Session, the data-reading threads are never started, so sess.run(train_input.input_data) has no data to fetch and the program hangs.
# -*- coding: utf-8 -*-
import numpy as np
import tensorflow as tf
from tensorflow.models.rnn.ptb import reader

class PTBInput(object):
    """The input data."""
    def __init__(self, config, data, name=None):
        self.batch_size = batch_size = config.batch_size
        self.num_steps = num_steps = config.num_steps
        # -1 because the targets are the inputs shifted one step ahead,
        # so one extra word is needed beyond the last input position
        self.epoch_size = ((len(data) // batch_size) - 1) // num_steps
        self.input_data, self.targets = reader.ptb_producer(
            data, batch_size, num_steps, name=name)
class SmallConfig(object):
    """Small config."""
    init_scale = 0.1
    learning_rate = 1.0
    max_grad_norm = 5
    num_layers = 2
    num_steps = 20
    hidden_size = 200
    max_epoch = 4
    max_max_epoch = 13
    keep_prob = 1.0
    lr_decay = 0.5
    batch_size = 20
    vocab_size = 10000
if __name__ == '__main__':
    config = SmallConfig()
    data_path = '/home/jdlu/jdluTensor/data/simple-examples/data'
    raw_data = reader.ptb_raw_data(data_path)
    train_data, valid_data, test_data, _ = raw_data
    train_input = PTBInput(config=config, data=train_data, name="TrainInput")
    print "end--------------------------------"

    # Wrong: with a bare Session the queue-runner threads are never started,
    # so no data can be read, the data flow graph cannot run, and the whole
    # program hangs here.
    with tf.Session() as sess:
        for step in range(1):
            print sess.run(train_input.input_data)
There are two ways to fix it:
# -*- coding: utf-8 -*-
import numpy as np
import tensorflow as tf
from tensorflow.models.rnn.ptb import reader

class PTBInput(object):
    """The input data."""
    def __init__(self, config, data, name=None):
        self.batch_size = batch_size = config.batch_size
        self.num_steps = num_steps = config.num_steps
        # -1 because the targets are the inputs shifted one step ahead,
        # so one extra word is needed beyond the last input position
        self.epoch_size = ((len(data) // batch_size) - 1) // num_steps
        self.input_data, self.targets = reader.ptb_producer(
            data, batch_size, num_steps, name=name)
class SmallConfig(object):
    """Small config."""
    init_scale = 0.1
    learning_rate = 1.0
    max_grad_norm = 5
    num_layers = 2
    num_steps = 20
    hidden_size = 200
    max_epoch = 4
    max_max_epoch = 13
    keep_prob = 1.0
    lr_decay = 0.5
    batch_size = 20
    vocab_size = 10000
if __name__ == '__main__':
    config = SmallConfig()
    data_path = '/home/jdlu/jdluTensor/data/simple-examples/data'
    raw_data = reader.ptb_raw_data(data_path)
    train_data, valid_data, test_data, _ = raw_data
    train_input = PTBInput(config=config, data=train_data, name="TrainInput")
    print "end--------------------------------"

    # Right, method two: use tf.train.Supervisor(); managed_session()
    # starts the queue-runner threads automatically.
    #sv = tf.train.Supervisor()
    #with sv.managed_session() as sess:
    #    for step in range(1):
    #        print sess.run(train_input.input_data)

    # Right, method one: start the input enqueue threads explicitly.
    # Create a session for running operations in the Graph.
    sess = tf.Session()

    # Start input enqueue threads.
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    # Run training steps or whatever.
    try:
        for step in range(2):
            print sess.run(train_input.input_data)
    except Exception as e:
        # Report exceptions to the coordinator.
        coord.request_stop(e)

    # Terminate as usual. It is innocuous to request stop twice.
    coord.request_stop()
    coord.join(threads)
    sess.close()
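For reference, method two from the commented block above can also be used on its own. tf.train.Supervisor creates and starts the queue-runner threads inside managed_session(), so no explicit Coordinator or start_queue_runners call is needed; this snippet simply reuses the train_input built in the script above.

    # Method two in isolation: Supervisor.managed_session() starts the
    # queue-runner threads itself, so the dequeue no longer blocks.
    sv = tf.train.Supervisor()
    with sv.managed_session() as sess:
        for step in range(1):
            print sess.run(train_input.input_data)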