# Operation naming: every graph node gets a unique name.
c_0 = tf.constant(0, name="c")  # => operation named "c"

# Already-used names will be "uniquified" with a numeric suffix.
c_1 = tf.constant(2, name="c")  # => operation named "c_1"

# Name scopes add a prefix to all operations created in the same context.
with tf.name_scope("outer"):
    c_2 = tf.constant(2, name="c")  # => operation named "outer/c"

    # Name scopes nest like paths in a hierarchical file system.
    with tf.name_scope("inner"):
        c_3 = tf.constant(3, name="c")  # => operation named "outer/inner/c"

    # Exiting a name scope context will return to the previous prefix.
    c_4 = tf.constant(4, name="c")  # => operation named "outer/c_1"
PS: 返回的Tensor的名字为OP_NAME:i
OP_NAME:产生该Tensor的Operation的名字
i:Operation的第i个输出
# Explicit device placement: pin ops to a device with tf.device.
with tf.device("/device:CPU:0"):
    # Operations created in this context will be pinned to the CPU.
    img = tf.decode_jpeg(tf.read_file("img.jpg"))

with tf.device("/device:GPU:0"):
    # Operations created in this context will be pinned to the GPU.
    # NOTE(review): `weights` is assumed to be defined elsewhere in the notes.
    result = tf.matmul(weights, img)
- 在分布式计算中,常把计算资源分成两个部分,参数服务器(Parameter Server)和工作节点(Worker):参数服务器节点用来存储参数,工作节点部分用来做算法的训练。对应地也把机器学习算法分成两个部分,参数和训练:参数部分即模型本身,有一致性的要求,参数服务器也可以是一个集群;训练部分自然是并行的,不然无法体现分布式机器学习的优势。因为参数服务器的存在,每个计算节点在拿到新的batch数据之后,都要从参数服务器上取下最新的参数,然后计算梯度,再将梯度更新回参数服务器。具体会再写一篇来讲。
# Manual parameter-server placement: variables live on PS tasks,
# compute ops live on the worker.
with tf.device("/job:ps/task:0"):
    weights_1 = tf.Variable(tf.truncated_normal([784, 100]))
    biases_1 = tf.Variable(tf.zeros([100]))  # fixed: tf.zeroes -> tf.zeros

with tf.device("/job:ps/task:1"):
    weights_2 = tf.Variable(tf.truncated_normal([100, 10]))
    biases_2 = tf.Variable(tf.zeros([10]))  # fixed: tf.zeroes -> tf.zeros

with tf.device("/job:worker"):
    layer_1 = tf.matmul(train_batch, weights_1) + biases_1
    # Fixed: the second layer must consume layer_1 ([batch, 100]); feeding
    # train_batch ([batch, 784]) into weights_2 ([100, 10]) is shape-incompatible.
    layer_2 = tf.matmul(layer_1, weights_2) + biases_2
PS: tf.train.replica_device_setter API可以自动为算子(Operation)分配计算资源,即哪些运算分给PS,哪些运算分给Worker:
# tf.train.replica_device_setter assigns devices automatically:
# variables go round-robin onto the PS tasks, everything else onto the worker.
with tf.device(tf.train.replica_device_setter(ps_tasks=3)):
    # Parameters are stored on the PS tasks (round-robin over ps_tasks=3).
    w_0 = tf.Variable(...)  # placed on "/job:ps/task:0"
    b_0 = tf.Variable(...)  # placed on "/job:ps/task:1"
    w_1 = tf.Variable(...)  # placed on "/job:ps/task:2"
    b_1 = tf.Variable(...)  # placed on "/job:ps/task:0"

    # All other operations are computed on the worker.
    input_data = tf.placeholder(tf.float32)  # placed on "/job:worker"
    layer_0 = tf.matmul(input_data, w_0) + b_0  # placed on "/job:worker"
    layer_1 = tf.matmul(layer_0, w_1) + b_1  # placed on "/job:worker"
# Use the default in-process session.
with tf.Session() as sess:
    ...  # run graph computations here

# Create a session connected to a remote master over gRPC.
with tf.Session("grpc://example.org:2222"):
    ...  # run graph computations here
x = tf.constant([[37.0, -23.0], [1.0, 4.0]])
w = tf.Variable(tf.random_uniform([2, 2]))
y = tf.matmul(x, w)
output = tf.nn.softmax(y)
init_op = w.initializer

with tf.Session() as sess:
    # Passing an Operation runs the subgraph that operation depends on.
    sess.run(init_op)
    # Passing a Tensor computes the subgraph needed to produce its value.
    print(sess.run(output))
    # Fetching several tensors at once shares the computation:
    # y is computed only once here.
    y_val, output_val = sess.run([y, output])
当然,tf.Session.run()也可以接受一个dict of feeds对涉及占位符(placeholder)的子图进行计算:
x = tf.placeholder(tf.float32, shape=[3])
y = tf.square(x)
with tf.Session() as sess:
    # Feeding a value changes the result that is returned when you evaluate `y`.
    # Fixed: both print calls were missing their closing parenthesis.
    print(sess.run(y, {x: [1.0, 2.0, 3.0]}))  # => "[1.0, 4.0, 9.0]"
    print(sess.run(y, {x: [0.0, 0.0, 5.0]}))  # => "[0.0, 0.0, 25.0]"