A Walkthrough of the Horovod Example Source Code

By Serpah

01 | tensorflow_mnist.py

import os
import errno
import tensorflow as tf
import horovod.tensorflow as hvd
import numpy as np

from tensorflow import keras

layers = tf.layers

tf.logging.set_verbosity(tf.logging.INFO)


def conv_model(feature, target, mode):
    """2-layer convolution model."""
    # Convert the target to a one-hot tensor of shape (batch_size, 10) and
    # with a on-value of 1 for each one-hot vector of length 10.
    target = tf.one_hot(tf.cast(target, tf.int32), 10, 1, 0)

    # Reshape feature to 4d tensor with 2nd and 3rd dimensions being
    # image width and height final dimension being the number of color channels.
    feature = tf.reshape(feature, [-1, 28, 28, 1])

    # First conv layer will compute 32 features for each 5x5 patch
    with tf.variable_scope('conv_layer1'):
        h_conv1 = layers.conv2d(feature, 32, kernel_size=[5, 5],
                                activation=tf.nn.relu, padding="SAME")
        h_pool1 = tf.nn.max_pool(
            h_conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

    # Second conv layer will compute 64 features for each 5x5 patch.
    with tf.variable_scope('conv_layer2'):
        h_conv2 = layers.conv2d(h_pool1, 64, kernel_size=[5, 5],
                                activation=tf.nn.relu, padding="SAME")
        h_pool2 = tf.nn.max_pool(
            h_conv2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
        # reshape tensor into a batch of vectors
        h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])

    # Densely connected layer with 1024 neurons.
    h_fc1 = layers.dropout(
        layers.dense(h_pool2_flat, 1024, activation=tf.nn.relu),
        rate=0.5, training=mode == tf.estimator.ModeKeys.TRAIN)

    # Compute logits (1 per class) and compute loss.
    logits = layers.dense(h_fc1, 10, activation=None)
    loss = tf.losses.softmax_cross_entropy(target, logits)

    return tf.argmax(logits, 1), loss


def train_input_generator(x_train, y_train, batch_size=64):
    assert len(x_train) == len(y_train)
    while True:
        p = np.random.permutation(len(x_train))
        x_train, y_train = x_train[p], y_train[p]
        index = 0
        while index <= len(x_train) - batch_size:
            yield x_train[index:index + batch_size], \
                  y_train[index:index + batch_size],
            index += batch_size


def main(_):
    # Horovod: initialize Horovod.
    hvd.init()

    # Keras automatically creates a cache directory in ~/.keras/datasets for
    # storing the downloaded MNIST data. This creates a race
    # condition among the workers that share the same filesystem. If the
    # directory already exists by the time this worker gets around to creating
    # it, ignore the resulting exception and continue.
    cache_dir = os.path.join(os.path.expanduser('~'), '.keras', 'datasets')
    if not os.path.exists(cache_dir):
        try:
            os.mkdir(cache_dir)
        except OSError as e:
            if e.errno == errno.EEXIST and os.path.isdir(cache_dir):
                pass
            else:
                raise

    # Download and load MNIST dataset.
    (x_train, y_train), (x_test, y_test) = \
        keras.datasets.mnist.load_data('MNIST-data-%d' % hvd.rank())

    # The shape of downloaded data is (-1, 28, 28), hence we need to reshape it
    # into (-1, 784) to feed into our network. Also, need to normalize the
    # features between 0 and 1.
    x_train = np.reshape(x_train, (-1, 784)) / 255.0
    x_test = np.reshape(x_test, (-1, 784)) / 255.0

    # Build model...
    with tf.name_scope('input'):
        image = tf.placeholder(tf.float32, [None, 784], name='image')
        label = tf.placeholder(tf.float32, [None], name='label')
    predict, loss = conv_model(image, label, tf.estimator.ModeKeys.TRAIN)

    # Horovod: adjust learning rate based on number of GPUs.
    opt = tf.train.AdamOptimizer(0.001 * hvd.size())

    # Horovod: add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)

    global_step = tf.train.get_or_create_global_step()
    train_op = opt.minimize(loss, global_step=global_step)

    hooks = [
        # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states
        # from rank 0 to all other processes. This is necessary to ensure consistent
        # initialization of all workers when training is started with random weights
        # or restored from a checkpoint.
        hvd.BroadcastGlobalVariablesHook(0),

        # Horovod: adjust number of steps based on number of GPUs.
        tf.train.StopAtStepHook(last_step=20000 // hvd.size()),

        tf.train.LoggingTensorHook(tensors={'step': global_step, 'loss': loss},
                                   every_n_iter=10),
    ]

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None
    training_batch_generator = train_input_generator(x_train,
                                                     y_train, batch_size=100)
    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when done
    # or an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks,
                                           config=config) as mon_sess:
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            image_, label_ = next(training_batch_generator)
            mon_sess.run(train_op, feed_dict={image: image_, label: label_})


if __name__ == "__main__":
    tf.app.run()

  1. layers = tf.layers
    tf.layers is a higher-level API that wraps common deep-learning building blocks such as fully connected and convolutional layers.
    It provides the following methods (a short usage sketch follows the table):

Method                     Meaning
Input(…)                   Instantiates an input Tensor used as the input of a network.
average_pooling1d(…)       1-D average pooling layer
average_pooling2d(…)       2-D average pooling layer
average_pooling3d(…)       3-D average pooling layer
batch_normalization(…)     Batch normalization layer
conv1d(…)                  1-D convolution layer
conv2d(…)                  2-D convolution layer
conv2d_transpose(…)        2-D transposed convolution (deconvolution) layer
conv3d(…)                  3-D convolution layer
conv3d_transpose(…)        3-D transposed convolution (deconvolution) layer
dense(…)                   Fully connected layer
dropout(…)                 Dropout layer
flatten(…)                 Flatten layer, i.e. flattens a Tensor into a batch of vectors
max_pooling1d(…)           1-D max pooling layer
max_pooling2d(…)           2-D max pooling layer
max_pooling3d(…)           3-D max pooling layer
separable_conv2d(…)        2-D depthwise separable convolution layer
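    As a hedged sketch (not taken from the example above), a small stack built only from tf.layers calls might look like this; the tensor names are illustrative:

import tensorflow as tf

# Hypothetical input: a batch of 28x28 grayscale images.
images = tf.placeholder(tf.float32, [None, 28, 28, 1], name='images')

x = tf.layers.conv2d(images, filters=16, kernel_size=[3, 3],
                     activation=tf.nn.relu, padding='SAME')
x = tf.layers.max_pooling2d(x, pool_size=[2, 2], strides=2)
x = tf.layers.flatten(x)            # flatten into a batch of vectors
logits = tf.layers.dense(x, 10)     # fully connected output layer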
  2. tf.logging.set_verbosity(tf.logging.INFO)
    Sets the logging output level to INFO.
    TensorFlow uses five levels of log messages. In ascending order of severity they are DEBUG, INFO, WARN, ERROR, and FATAL.
    When logging is configured at one of these levels, TensorFlow outputs all messages at that level and at any more severe level.
    TensorFlow defaults to the WARN level, but when tracking model training it is useful to lower the level to INFO, which provides additional feedback about the operations in progress.

  3. os.path and os.mkdir
    os.path.join joins path components.
    os.path.exists checks whether a path exists.
    os.mkdir creates a directory with the given numeric permission mode (default 0777, octal).
    os.path.isdir checks whether a path is a directory.
    The example wraps os.mkdir in a try/except so that a concurrent worker creating the same cache directory does not crash the program; a simpler alternative is sketched below.
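    A minimal alternative sketch (not from the example), assuming Python 3, that avoids the race condition with a single call:

import os

cache_dir = os.path.join(os.path.expanduser('~'), '.keras', 'datasets')
# exist_ok=True makes the call a no-op if another worker created the
# directory first, so no EEXIST handling is needed.
os.makedirs(cache_dir, exist_ok=True)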

  4. pass
    A statement placeholder that performs no operation.

  5. raise
    Python raises exceptions automatically when errors occur; the raise statement also lets a program raise (or re-raise) an exception explicitly, as the example does when directory creation fails for any reason other than EEXIST.

  6. keras.datasets.mnist.load_data
    Keras's built-in loader for the MNIST dataset.

  7. np.reshape
    Changes the shape of an array without changing its data. Note that the result references the same data where possible, so modifying the original data also changes the new array.
    A dimension of -1 means "infer this dimension from the other dimensions".
    x_train = np.reshape(x_train, (-1, 784)) / 255.0 reshapes x_train into rows of 784 elements and normalizes the values to the range 0 to 1 (a small sketch follows).
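    A minimal sketch of the -1 convention on a toy array rather than the MNIST data:

import numpy as np

a = np.arange(12)              # shape (12,)
b = np.reshape(a, (-1, 4))     # -1 is inferred as 3, so b has shape (3, 4)
print(b.shape)                 # (3, 4)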

  8. tf.name_scope
    All objects and operations defined inside the scope get the scope name prepended to their "name" attribute, which makes it easy to tell which region an object belongs to (see the sketch below).
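    A minimal sketch of the name prefix; the placeholder mirrors the one in the example:

import tensorflow as tf

with tf.name_scope('input'):
    image = tf.placeholder(tf.float32, [None, 784], name='image')

print(image.name)   # prints "input/image:0" -- the scope name is prepended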

  9. tf.estimator.ModeKeys
    Standard names for Estimator modes.
    The following standard keys are defined:
    TRAIN: training mode.
    EVAL: evaluation mode.
    PREDICT: inference mode.

  10. tf.train.AdamOptimizer()
    The Adam optimization algorithm: an optimizer that searches for a good optimum and incorporates a second-moment correction of the gradient. The example scales the learning rate by hvd.size(), i.e. by the number of workers.

  11. tf.train.get_or_create_global_step()
    Returns the global-step tensor, creating it if necessary.

  12. optimizer.minimize()
    minimize() performs two operations internally: (1) compute the gradients of the loss with respect to the variables, and (2) apply those gradients to update the variables (see the sketch below).
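    A minimal sketch of the equivalent two-step form, assuming opt, loss and global_step are defined as in the example above:

# Equivalent to train_op = opt.minimize(loss, global_step=global_step):
grads_and_vars = opt.compute_gradients(loss)             # (1) compute gradients
train_op = opt.apply_gradients(grads_and_vars,           # (2) apply the update
                               global_step=global_step)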

  13. MonitoredTrainingSession()
    A session that monitors training; it handles initialization, restoring from and saving to checkpoints, and closing when training finishes or an error occurs.
    The tf.train.StopAtStepHook hook defines the last step of training, after which the parameter servers and worker servers are shut down.
    The tf.train.LoggingTensorHook hook prints the given tensors every N steps or N seconds, logging them at the INFO level.

  14. config = tf.ConfigProto()
    Used when creating a session to configure its parameters.
    config.gpu_options.allow_growth: True allows GPU memory to be allocated on demand.
    config.gpu_options.visible_device_list: specifies the list of visible GPU devices.

  15. numpy.random.permutation
    Reshuffles an array (i.e. randomly permutes the order of its elements).
    The difference from shuffle: shuffle operates on the original array in place, changing its order and returning nothing, while permutation does not modify the original array but returns a new, shuffled copy (see the sketch below).
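    A minimal sketch of the difference, plus the index-permutation trick train_input_generator uses to shuffle x_train and y_train consistently:

import numpy as np

a = np.array([1, 2, 3, 4])
b = np.random.permutation(a)   # new shuffled array; a is unchanged
np.random.shuffle(a)           # shuffles a in place; returns None

# Shuffling two arrays with the same permutation, as in train_input_generator:
x = np.arange(6).reshape(3, 2)
y = np.array([0, 1, 2])
p = np.random.permutation(len(x))
x, y = x[p], y[p]              # rows of x and y stay aligned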

  16. next
    Returns the next item from an iterator; the example uses it to pull the next batch from the training-batch generator.

  17. tf.cast
    tf.cast() converts a TensorFlow tensor from one data type to another (see the sketch below).
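    A minimal sketch with illustrative values:

import tensorflow as tf

x = tf.constant([1.8, 2.2], dtype=tf.float32)
y = tf.cast(x, tf.int32)   # values are truncated to [1, 2]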

  18. tf.one_hot
    One-hot encoding maps the values of a discrete feature to points in Euclidean space, where each value corresponds to one point.
    Encoding discrete features this way makes distance computations between features more reasonable.
    For example:

tf.one_hot([0, 1, 2], 3, on_value=7, off_value=4)
# [[7 4 4]
#  [4 7 4]
#  [4 4 7]]

  19. model.evaluate
    Evaluates a trained model: takes data and labels as input and outputs the loss and accuracy.

  20. model.predict
    Predicts on test data: takes test inputs and outputs the predictions (a small Keras sketch follows).
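    model.evaluate and model.predict belong to the Keras Model API rather than to the estimator-style script above; a hedged sketch, assuming a compiled tf.keras model named model and MNIST-style test arrays:

# Assumes `model` is a compiled tf.keras model and x_test / y_test are
# arrays like the ones loaded earlier; names and metrics are illustrative.
loss, accuracy = model.evaluate(x_test, y_test, batch_size=128)
predictions = model.predict(x_test[:10])   # class scores for 10 samples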

02 | tensorflow_synthetic_benchmark.py

from __future__ import absolute_import, division, print_function

import argparse
import os
import numpy as np
import timeit

import tensorflow as tf
import horovod.tensorflow as hvd
from tensorflow.keras import applications

# Benchmark settings
parser = argparse.ArgumentParser(description='TensorFlow Synthetic Benchmark',
                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--fp16-allreduce', action='store_true', default=False,
                    help='use fp16 compression during allreduce')

parser.add_argument('--model', type=str, default='ResNet50',
                    help='model to benchmark')
parser.add_argument('--batch-size', type=int, default=32,
                    help='input batch size')

parser.add_argument('--num-warmup-batches', type=int, default=10,
                    help='number of warm-up batches that don\'t count towards benchmark')
parser.add_argument('--num-batches-per-iter', type=int, default=10,
                    help='number of batches per benchmark iteration')
parser.add_argument('--num-iters', type=int, default=10,
                    help='number of benchmark iterations')

parser.add_argument('--eager', action='store_true', default=False,
                    help='enables eager execution')
parser.add_argument('--no-cuda', action='store_true', default=False,
                    help='disables CUDA training')

args = parser.parse_args()
args.cuda = not args.no_cuda

hvd.init()

# Horovod: pin GPU to be used to process local rank (one GPU per process)
config = tf.ConfigProto()
if args.cuda:
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
else:
    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
    config.gpu_options.allow_growth = False
    config.gpu_options.visible_device_list = ''

if args.eager:
    tf.enable_eager_execution(config)

# Set up standard model.
model = getattr(applications, args.model)(weights=None)

opt = tf.train.GradientDescentOptimizer(0.01)

# Horovod: (optional) compression algorithm.
compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none

# Horovod: wrap optimizer with DistributedOptimizer.
opt = hvd.DistributedOptimizer(opt, compression=compression)

init = tf.global_variables_initializer()
bcast_op = hvd.broadcast_global_variables(0)

data = tf.random_uniform([args.batch_size, 224, 224, 3])
target = tf.random_uniform([args.batch_size, 1], minval=0, maxval=999, dtype=tf.int64)


def loss_function():
    probs = model(data, training=True)
    return tf.losses.sparse_softmax_cross_entropy(target, probs)


def log(s, nl=True):
    if hvd.rank() != 0:
        return
    print(s, end='\n' if nl else '')


log('Model: %s' % args.model)
log('Batch size: %d' % args.batch_size)
device = 'GPU' if args.cuda else 'CPU'
log('Number of %ss: %d' % (device, hvd.size()))


def run(benchmark_step):
    # Warm-up
    log('Running warmup...')
    timeit.timeit(benchmark_step, number=args.num_warmup_batches)

    # Benchmark
    log('Running benchmark...')
    img_secs = []
    for x in range(args.num_iters):
        time = timeit.timeit(benchmark_step, number=args.num_batches_per_iter)
        img_sec = args.batch_size * args.num_batches_per_iter / time
        log('Iter #%d: %.1f img/sec per %s' % (x, img_sec, device))
        img_secs.append(img_sec)

    # Results
    img_sec_mean = np.mean(img_secs)
    img_sec_conf = 1.96 * np.std(img_secs)
    log('Img/sec per %s: %.1f +-%.1f' % (device, img_sec_mean, img_sec_conf))
    log('Total img/sec on %d %s(s): %.1f +-%.1f' %
        (hvd.size(), device, hvd.size() * img_sec_mean, hvd.size() * img_sec_conf))


if tf.executing_eagerly():
    with tf.device(device):
        run(lambda: opt.minimize(loss_function, var_list=model.trainable_variables))
else:
    with tf.Session(config=config) as session:
        init.run()
        bcast_op.run()

        loss = loss_function()
        train_opt = opt.minimize(loss)
        run(lambda: session.run(train_opt))

  1. from __future__ import
    Imports features of a newer Python version into the current version.
    absolute_import: absolute package imports
    division: true (precise) division
    print_function: the Python 3 print function

  2. import argparse
    argparse is the Python module for handling command-line arguments.
    argparse.ArgumentParser creates a parser object (argument parser).
    parser.add_argument adds an argument.
    parser.parse_args returns the parsed arguments (see the sketch below).
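    A minimal sketch of the pattern; the two flags shown mirror arguments from the benchmark script:

import argparse

parser = argparse.ArgumentParser(description='toy example')
parser.add_argument('--batch-size', type=int, default=32,
                    help='input batch size')
parser.add_argument('--eager', action='store_true', default=False,
                    help='enables eager execution')

args = parser.parse_args()
print(args.batch_size, args.eager)   # dashes become underscores in attribute names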

  3. import timeit
    timeit is Python's timing module.
    timeit.timeit takes a callable or expression as its first argument and an execution count in the number parameter (see the sketch below).
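    A minimal sketch of how a step is timed; benchmark_step here is a stand-in for the lambda the script passes to run():

import timeit

def benchmark_step():
    sum(i * i for i in range(10000))   # stand-in for one training step

# Total seconds to run benchmark_step 10 times, as in the benchmark loop.
elapsed = timeit.timeit(benchmark_step, number=10)
print('%.4f s' % elapsed)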

  4. from tensorflow.keras import applications
    The tensorflow.keras.applications module contains a number of pre-trained model classes (ResNet50 and others).

  5. os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
    Specifies which GPUs to use; device indices start from 0 and multiple devices are separated by commas. "-1" means no GPU is used.

  6. TensorFlow eager execution
    Eager execution is the dynamic-graph mode that lets TensorFlow evaluate operations immediately and return concrete values.
    tf.enable_eager_execution enables eager mode.
    tf.executing_eagerly() checks whether eager mode is enabled.

  7. getattr returns the named attribute of a module or object
    In the source, model = getattr(applications, args.model)(weights=None) looks up the attribute named by args.model in the applications module (the default argument is ResNet50) and calls it with the arguments that follow (see the sketch below).
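    A minimal sketch of the same lookup, with the model name written out explicitly:

from tensorflow.keras import applications

model_name = 'ResNet50'                          # e.g. the value of args.model
model_cls = getattr(applications, model_name)    # same as applications.ResNet50
model = model_cls(weights=None)                  # build the model with random weights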

  8. hvd.broadcast_global_variables(0)
    When MonitoredTrainingSession is not used, this op can be run after variable initialization to broadcast the initial values from rank 0 to all other processes.

  9. tf.random_uniform
    Creates a tensor of uniformly distributed random values; the benchmark uses it to build synthetic images and labels.

  10. numpy.std()
    Computes the standard deviation; the benchmark reports 1.96 times the standard deviation as an approximate 95% interval around the mean throughput (see the sketch below).
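    A minimal sketch of the reporting computation, using made-up throughput numbers:

import numpy as np

img_secs = [310.0, 305.5, 312.4]          # illustrative per-iteration throughputs
img_sec_mean = np.mean(img_secs)
img_sec_conf = 1.96 * np.std(img_secs)    # approximate 95% interval half-width
print('Img/sec: %.1f +-%.1f' % (img_sec_mean, img_sec_conf))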

  11. --fp16-allreduce
    This flag converts gradients to fp16 (two-byte floats) before the allreduce, i.e. applies gradient compression, to reduce the amount of data transferred between workers.
