By Serpah
import os
import errno
import tensorflow as tf
import horovod.tensorflow as hvd
import numpy as np

from tensorflow import keras

layers = tf.layers

tf.logging.set_verbosity(tf.logging.INFO)


def conv_model(feature, target, mode):
    """2-layer convolution model."""
    # Convert the target to a one-hot tensor of shape (batch_size, 10) and
    # with a on-value of 1 for each one-hot vector of length 10.
    target = tf.one_hot(tf.cast(target, tf.int32), 10, 1, 0)

    # Reshape feature to 4d tensor with 2nd and 3rd dimensions being
    # image width and height final dimension being the number of color channels.
    feature = tf.reshape(feature, [-1, 28, 28, 1])

    # First conv layer will compute 32 features for each 5x5 patch
    with tf.variable_scope('conv_layer1'):
        h_conv1 = layers.conv2d(feature, 32, kernel_size=[5, 5],
                                activation=tf.nn.relu, padding="SAME")
        h_pool1 = tf.nn.max_pool(
            h_conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

    # Second conv layer will compute 64 features for each 5x5 patch.
    with tf.variable_scope('conv_layer2'):
        h_conv2 = layers.conv2d(h_pool1, 64, kernel_size=[5, 5],
                                activation=tf.nn.relu, padding="SAME")
        h_pool2 = tf.nn.max_pool(
            h_conv2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
        # reshape tensor into a batch of vectors
        h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])

    # Densely connected layer with 1024 neurons.
    h_fc1 = layers.dropout(
        layers.dense(h_pool2_flat, 1024, activation=tf.nn.relu),
        rate=0.5, training=mode == tf.estimator.ModeKeys.TRAIN)

    # Compute logits (1 per class) and compute loss.
    logits = layers.dense(h_fc1, 10, activation=None)
    loss = tf.losses.softmax_cross_entropy(target, logits)

    return tf.argmax(logits, 1), loss


def train_input_generator(x_train, y_train, batch_size=64):
    assert len(x_train) == len(y_train)
    while True:
        p = np.random.permutation(len(x_train))
        x_train, y_train = x_train[p], y_train[p]
        index = 0
        while index <= len(x_train) - batch_size:
            yield x_train[index:index + batch_size], \
                  y_train[index:index + batch_size],
            index += batch_size


def main(_):
    # Horovod: initialize Horovod.
    hvd.init()

    # Keras automatically creates a cache directory in ~/.keras/datasets for
    # storing the downloaded MNIST data. This creates a race
    # condition among the workers that share the same filesystem. If the
    # directory already exists by the time this worker gets around to creating
    # it, ignore the resulting exception and continue.
    cache_dir = os.path.join(os.path.expanduser('~'), '.keras', 'datasets')
    if not os.path.exists(cache_dir):
        try:
            os.mkdir(cache_dir)
        except OSError as e:
            if e.errno == errno.EEXIST and os.path.isdir(cache_dir):
                pass
            else:
                raise

    # Download and load MNIST dataset.
    (x_train, y_train), (x_test, y_test) = \
        keras.datasets.mnist.load_data('MNIST-data-%d' % hvd.rank())

    # The shape of downloaded data is (-1, 28, 28), hence we need to reshape it
    # into (-1, 784) to feed into our network. Also, need to normalize the
    # features between 0 and 1.
    x_train = np.reshape(x_train, (-1, 784)) / 255.0
    x_test = np.reshape(x_test, (-1, 784)) / 255.0

    # Build model...
    with tf.name_scope('input'):
        image = tf.placeholder(tf.float32, [None, 784], name='image')
        label = tf.placeholder(tf.float32, [None], name='label')
    predict, loss = conv_model(image, label, tf.estimator.ModeKeys.TRAIN)

    # Horovod: adjust learning rate based on number of GPUs.
    opt = tf.train.AdamOptimizer(0.001 * hvd.size())

    # Horovod: add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)

    global_step = tf.train.get_or_create_global_step()
    train_op = opt.minimize(loss, global_step=global_step)

    hooks = [
        # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states
        # from rank 0 to all other processes. This is necessary to ensure consistent
        # initialization of all workers when training is started with random weights
        # or restored from a checkpoint.
        hvd.BroadcastGlobalVariablesHook(0),

        # Horovod: adjust number of steps based on number of GPUs.
        tf.train.StopAtStepHook(last_step=20000 // hvd.size()),

        tf.train.LoggingTensorHook(tensors={'step': global_step, 'loss': loss},
                                   every_n_iter=10),
    ]

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None
    training_batch_generator = train_input_generator(x_train,
                                                     y_train, batch_size=100)
    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when done
    # or an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks,
                                           config=config) as mon_sess:
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            image_, label_ = next(training_batch_generator)
            mon_sess.run(train_op, feed_dict={image: image_, label: label_})


if __name__ == "__main__":
    tf.app.run()
layers = tf.layers
tf.layers
is an API that provides higher-level wrappers for deep learning, mainly covering basic building blocks such as fully connected and convolutional layers.

Method | Meaning |
---|---|
Input(…) | Instantiates an input Tensor that serves as the input to a neural network. |
average_pooling1d(…) | 1-D average pooling layer |
average_pooling2d(…) | 2-D average pooling layer |
average_pooling3d(…) | 3-D average pooling layer |
batch_normalization(…) | Batch normalization layer |
conv1d(…) | 1-D convolution layer |
conv2d(…) | 2-D convolution layer |
conv2d_transpose(…) | 2-D transposed (de-)convolution layer |
conv3d(…) | 3-D convolution layer |
conv3d_transpose(…) | 3-D transposed (de-)convolution layer |
dense(…) | Fully connected layer |
dropout(…) | Dropout layer |
flatten(…) | Flatten layer, i.e. flattens a Tensor into a batch of vectors |
max_pooling1d(…) | 1-D max pooling layer |
max_pooling2d(…) | 2-D max pooling layer |
max_pooling3d(…) | 3-D max pooling layer |
separable_conv2d(…) | 2-D depthwise separable convolution layer |
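As a quick illustration, here is a minimal sketch of stacking a few of these wrappers (the layer sizes and shapes are illustrative, not the article's model):

import tensorflow as tf

# Illustrative only: a tiny stack built from tf.layers wrappers (TF 1.x graph mode).
x = tf.placeholder(tf.float32, [None, 28, 28, 1])
h = tf.layers.conv2d(x, filters=16, kernel_size=[3, 3], padding='SAME', activation=tf.nn.relu)
h = tf.layers.max_pooling2d(h, pool_size=[2, 2], strides=2)
h = tf.layers.flatten(h)
logits = tf.layers.dense(h, 10)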
tf.logging.set_verbosity(tf.logging.INFO)
Sets the logging output level to INFO.
TensorFlow uses five levels of log messages. In ascending order of severity they are DEBUG, INFO, WARN, ERROR and FATAL.
When logging is configured at one of these levels, TensorFlow outputs all log messages at that level and at every more severe level.
TensorFlow is configured at the WARN level by default, but when tracking model training you will want to lower the threshold to INFO, which provides additional feedback about the operations in progress.
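A tiny sketch (the logged values are illustrative):

import tensorflow as tf

tf.logging.set_verbosity(tf.logging.INFO)        # show INFO, WARN, ERROR and FATAL
tf.logging.info('step %d, loss %.4f', 10, 0.25)  # now visible; DEBUG would still be hidden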
os.path
os.path.join joins path components into a single path.
os.path.exists checks whether a path exists.
os.mkdir creates a directory with the given numeric permission mode; the default mode is 0777 (octal). (Note that mkdir lives in the os module itself, not in os.path.)
os.path.isdir checks whether a path is a directory.
pass
A placeholder statement; it performs no operation.
raise
When a program hits an error, an exception is raised automatically; Python also lets you raise exceptions yourself with the raise statement (a bare raise inside an except block re-raises the current exception).
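Taken together, these are the ingredients of the "create the directory unless it already exists" pattern used above; a minimal standalone sketch of the same idea:

import os
import errno

cache_dir = os.path.join(os.path.expanduser('~'), '.keras', 'datasets')
if not os.path.exists(cache_dir):
    try:
        os.mkdir(cache_dir)  # may fail if another process created it first
    except OSError as e:
        if e.errno == errno.EEXIST and os.path.isdir(cache_dir):
            pass             # directory already exists: nothing more to do
        else:
            raise            # re-raise any other error unchanged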
keras.datasets.mnist.load_data
Loads the MNIST dataset bundled with Keras.
np.reshape
Changes the shape of an array without changing its data. Note that reshape returns a view of the original data whenever possible, so if the original data is modified, the contents of the new array change as well.
A dimension given as -1 is inferred from the remaining dimensions.
x_train = np.reshape(x_train, (-1, 784)) / 255.0
This reshapes x_train into an array with 784 elements per row and normalizes the values into the range 0 to 1.
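A small sketch of the -1 placeholder and the view behaviour (array contents are illustrative):

import numpy as np

a = np.arange(12)            # shape (12,)
b = np.reshape(a, (-1, 4))   # -1 is inferred as 3, so b has shape (3, 4)
a[0] = 99                    # reshape returned a view, so b sees the change
print(b[0, 0])               # 99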
tf.name_scope
All objects and operations defined inside the scope get the scope name prepended to their name attribute, which makes it easy to tell which region an object belongs to.
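A quick sketch of the prefixing (TF 1.x graph mode; the printed name is what this snippet produces):

import tensorflow as tf

with tf.name_scope('input'):
    image = tf.placeholder(tf.float32, [None, 784], name='image')
print(image.op.name)   # input/image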
tf.estimator.ModeKeys
Standard names for model modes.
The following standard keys are defined:
TRAIN: training mode.
EVAL: evaluation mode.
PREDICT: inference mode.
tf.train.AdamOptimizer()
The Adam optimization algorithm: a first-order, gradient-based optimizer that adapts each parameter's learning rate using estimates of the first and second moments of the gradients.
tf.train.get_or_create_global_step()
Returns the global step tensor, creating it first if necessary.
optimizer.minimize()
minimize performs two operations internally: (1) computing the gradients with respect to the variables, and (2) applying those gradients to update the variables' values.
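A sketch of those two steps written out explicitly (the variable and loss are illustrative):

import tensorflow as tf

w = tf.Variable(1.0)
loss = tf.square(w - 3.0)
opt = tf.train.AdamOptimizer(0.001)

grads_and_vars = opt.compute_gradients(loss)    # (1) compute the gradients
train_op = opt.apply_gradients(grads_and_vars)  # (2) apply them to the variables
# train_op is equivalent to opt.minimize(loss)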
MonitoredTrainingSession()
A session wrapper that monitors training.
The tf.train.StopAtStepHook hook defines the last training step, after which the parameter servers and worker servers are shut down.
The tf.train.LoggingTensorHook hook prints the given tensors every N steps or every N seconds, logging them at the INFO level.
config=tf.ConfigProto()
Used when creating a session to configure its parameters.
config.gpu_options.allow_growth # True means GPU memory is allocated on demand rather than all at once
config.gpu_options.visible_device_list # specifies the list of visible GPUs
numpy.random.permutation
Reshuffles an array (i.e. randomly permutes the order of its elements).
The difference from shuffle: shuffle operates on the original array in place, changing its order and returning nothing, whereas permutation does not touch the original array and instead returns a new, shuffled copy.
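A short sketch of the difference (the shuffled order is of course random):

import numpy as np

a = np.arange(5)
b = np.random.permutation(a)   # new shuffled copy; a is unchanged
np.random.shuffle(a)           # shuffles a in place and returns None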
next
Returns the next item from an iterator.
tf.cast
tf.cast() converts a TensorFlow tensor from one data type to another.
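A minimal sketch (values are illustrative):

import tensorflow as tf

x = tf.constant([1.8, 2.2], dtype=tf.float32)
y = tf.cast(x, tf.int32)   # float32 -> int32, truncating to [1, 2]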
tf.one_hot
One-hot encoding maps each value of a discrete feature to a point in Euclidean space, so each distinct value corresponds to its own point.
Encoding discrete features this way makes distance calculations between feature values more meaningful.
For example:
tf.one_hot([0, 1, 2], 3, on_value=7, off_value=4)
# [[7 4 4]
#  [4 7 4]
#  [4 4 7]]
model.evaluate
Evaluates the trained model: given data and labels, it returns the loss and the compiled metrics (e.g. accuracy).
model.predict
Runs prediction: given test data, it returns the model's outputs.
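A small sketch, assuming model is a tf.keras model compiled with metrics=['accuracy'] and that x_test/y_test are prepared as above (neither call appears in the Horovod example itself):

# Assumes `model` is a compiled tf.keras model with metrics=['accuracy'].
loss, accuracy = model.evaluate(x_test, y_test, batch_size=100)  # loss plus the compiled metric
predictions = model.predict(x_test)                              # model outputs for each test sample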
from __future__ import absolute_import, division, print_function

import argparse
import os
import numpy as np
import timeit

import tensorflow as tf
import horovod.tensorflow as hvd
from tensorflow.keras import applications

# Benchmark settings
parser = argparse.ArgumentParser(description='TensorFlow Synthetic Benchmark',
                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--fp16-allreduce', action='store_true', default=False,
                    help='use fp16 compression during allreduce')
parser.add_argument('--model', type=str, default='ResNet50',
                    help='model to benchmark')
parser.add_argument('--batch-size', type=int, default=32,
                    help='input batch size')
parser.add_argument('--num-warmup-batches', type=int, default=10,
                    help='number of warm-up batches that don\'t count towards benchmark')
parser.add_argument('--num-batches-per-iter', type=int, default=10,
                    help='number of batches per benchmark iteration')
parser.add_argument('--num-iters', type=int, default=10,
                    help='number of benchmark iterations')
parser.add_argument('--eager', action='store_true', default=False,
                    help='enables eager execution')
parser.add_argument('--no-cuda', action='store_true', default=False,
                    help='disables CUDA training')

args = parser.parse_args()
args.cuda = not args.no_cuda

hvd.init()

# Horovod: pin GPU to be used to process local rank (one GPU per process)
config = tf.ConfigProto()
if args.cuda:
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
else:
    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
    config.gpu_options.allow_growth = False
    config.gpu_options.visible_device_list = ''

if args.eager:
    tf.enable_eager_execution(config)

# Set up standard model.
model = getattr(applications, args.model)(weights=None)
opt = tf.train.GradientDescentOptimizer(0.01)

# Horovod: (optional) compression algorithm.
compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none

# Horovod: wrap optimizer with DistributedOptimizer.
opt = hvd.DistributedOptimizer(opt, compression=compression)

init = tf.global_variables_initializer()
bcast_op = hvd.broadcast_global_variables(0)

data = tf.random_uniform([args.batch_size, 224, 224, 3])
target = tf.random_uniform([args.batch_size, 1], minval=0, maxval=999, dtype=tf.int64)


def loss_function():
    probs = model(data, training=True)
    return tf.losses.sparse_softmax_cross_entropy(target, probs)


def log(s, nl=True):
    if hvd.rank() != 0:
        return
    print(s, end='\n' if nl else '')


log('Model: %s' % args.model)
log('Batch size: %d' % args.batch_size)
device = 'GPU' if args.cuda else 'CPU'
log('Number of %ss: %d' % (device, hvd.size()))


def run(benchmark_step):
    # Warm-up
    log('Running warmup...')
    timeit.timeit(benchmark_step, number=args.num_warmup_batches)

    # Benchmark
    log('Running benchmark...')
    img_secs = []
    for x in range(args.num_iters):
        time = timeit.timeit(benchmark_step, number=args.num_batches_per_iter)
        img_sec = args.batch_size * args.num_batches_per_iter / time
        log('Iter #%d: %.1f img/sec per %s' % (x, img_sec, device))
        img_secs.append(img_sec)

    # Results
    img_sec_mean = np.mean(img_secs)
    img_sec_conf = 1.96 * np.std(img_secs)
    log('Img/sec per %s: %.1f +-%.1f' % (device, img_sec_mean, img_sec_conf))
    log('Total img/sec on %d %s(s): %.1f +-%.1f' %
        (hvd.size(), device, hvd.size() * img_sec_mean, hvd.size() * img_sec_conf))


if tf.executing_eagerly():
    with tf.device(device):
        run(lambda: opt.minimize(loss_function, var_list=model.trainable_variables))
else:
    with tf.Session(config=config) as session:
        init.run()
        bcast_op.run()

        loss = loss_function()
        train_opt = opt.minimize(loss)
        run(lambda: session.run(train_opt))
from __future__ import
This import brings features of newer Python versions into the current version.
absolute_import: absolute package imports
division: true (precise) division
print_function: the Python 3 print() function
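A small sketch of what the division import changes under Python 2 (the other two imports affect package imports and print in the same spirit):

from __future__ import absolute_import, division, print_function

print(3 / 2)    # 1.5 even under Python 2, thanks to the division import
print(3 // 2)   # 1, floor division is always //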
import argparse
The argparse module is Python's standard module for handling command-line arguments.
argparse.ArgumentParser creates a parser object (an argument parser).
parser.add_argument adds an argument definition.
parser.parse_args parses the command line and returns the arguments.
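A minimal sketch of the same pattern (the flags and the hard-coded argument list are illustrative):

import argparse

parser = argparse.ArgumentParser(description='Example parser')
parser.add_argument('--batch-size', type=int, default=32, help='input batch size')
parser.add_argument('--eager', action='store_true', default=False, help='enable eager mode')
args = parser.parse_args(['--batch-size', '64', '--eager'])
print(args.batch_size, args.eager)   # 64 True  (dashes become underscores)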
import timeit
The timeit module is Python's timing tool.
timeit.timeit takes the expression or function to execute as its first argument; the number argument gives how many times to run it.
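A small sketch (the timed function is a stand-in for one benchmark step):

import timeit

def step():
    sum(range(1000))   # stand-in for one training/benchmark step

elapsed = timeit.timeit(step, number=10)   # total seconds for 10 calls
print('%.6f seconds' % elapsed)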
from tensorflow.keras import applications
The tensorflow.keras.applications module contains a number of pre-trained model classes.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
Specifies which GPUs may be used; device indices start at 0 and multiple devices are separated by commas. "-1" means no GPU is used.
tensorflow eager
Eager execution lets TensorFlow run operations immediately and return concrete values instead of building a graph first.
tf.enable_eager_execution enables eager execution.
tf.executing_eagerly() reports whether eager execution is enabled.
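A minimal sketch in TF 1.x (eager must be enabled before any other TensorFlow ops are created):

import tensorflow as tf

tf.enable_eager_execution()       # must be called before building any ops
print(tf.executing_eagerly())     # True
x = tf.constant([[1.0, 2.0]])
print(x.numpy())                  # concrete values are available immediately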
getattr returns the named attribute of an object or module.
In the source, model = getattr(applications, args.model)(weights=None)
looks up the attribute of the applications module whose name is args.model (ResNet50 by default) and then calls it with the given arguments.
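A short sketch of the same lookup (the model name here is just an example):

from tensorflow.keras import applications

model_name = 'ResNet50'
model_cls = getattr(applications, model_name)   # same object as applications.ResNet50
model = model_cls(weights=None)                 # instantiate with random weights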
hvd.broadcast_global_variables(0)
When MonitoredTrainingSession is not used, this function can be called after variable initialization to broadcast the initial values from rank 0 to all other processes.
tf.random_uniform
Creates a tensor filled with values drawn from a uniform distribution.
numpy.std()
Computes the standard deviation.
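A small sketch of how the benchmark combines np.mean and np.std into its "+-" figure (numbers are illustrative):

import numpy as np

img_secs = [410.0, 395.0, 402.0]        # per-iteration throughputs
img_sec_mean = np.mean(img_secs)
img_sec_conf = 1.96 * np.std(img_secs)  # ~95% interval if throughputs are roughly normal
print('%.1f +-%.1f img/sec' % (img_sec_mean, img_sec_conf))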
--fp16-allreduce
This flag converts the gradients to fp16 (two-byte floats) before they are exchanged, reducing the amount of data sent during the allreduce.
In other words, it enables gradient compression.