0 安装horovod所需要的g++版本
5). 调整g++版本
6). 验证版本
1. 安装NCCL
方法2: 下载nccl_2.4.8-1+cuda10.0_x86_64.txz(如下链接,需要登录NVIDIA账号),解压后移动到/usr/local/下:
2. 安装Openmpi
3. 安装horovod
安装时可能出现g++版本过高的问题,因此需要进行版本切换。经验证,Ubuntu 18.04 上 gcc/g++ 5 版本可用(但在docker中是 4.8 版本可用)。
sudo gedit /etc/apt/sources.list
deb http://dk.archive.ubuntu.com/ubuntu/ xenial main
deb http://dk.archive.ubuntu.com/ubuntu/ xenial universe
sudo apt update
sudo apt-get install gcc-4.9
sudo apt-get install g++-4.9
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.9 20
sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.9 20
此时,在终端输入gcc --version会发现默认版本仍未改变,需要手动切换为4.9:
> sudo update-alternatives --config gcc
There are 2 choices for the alternative gcc (providing /usr/bin/gcc).
Selection Path Priority Status
0 /usr/bin/gcc-5 50 auto mode
* 1 /usr/bin/gcc-4.9 20 manual mode
2 /usr/bin/gcc-5 50 manual mode
Press <enter> to keep the current choice[*], or type selection number: 1
> sudo update-alternatives --config g++
There are 2 choices for the alternative g++ (providing /usr/bin/g++).
Selection Path Priority Status
* 0 /usr/bin/g++-5 50 auto mode
1 /usr/bin/g++-4.9 20 manual mode
2 /usr/bin/g++-5 50 manual mode
Press <enter> to keep the current choice[*], or type selection number: 1
update-alternatives: using /usr/bin/g++-4.9 to provide /usr/bin/g++ (g++) in manual mode
gcc -v
g++ -v
#当切换使用了其他版本的gcc时,请务必保持g++的版本和gcc版本的一致性, 否则用cmake配置出来的项目遇到c++代码还是会用之前版本的gcc
# 删除选项操作
sudo update-alternatives --remove gcc /usr/bin/gcc-4.9
先下载库文件 https://developer.nvidia.com/nccl/nccl-legacy-downloads
# option 1: local
sudo dpkg -i nccl-repo-ubuntu1804-2.4.8-ga-cuda10.0_1-1_amd64.deb
# option 2: network
sudo dpkg -i nvidia-machine-learning-repo-ubuntu1804_1.0.0-1_amd64.deb
sudo apt-get update
sudo apt install libnccl2=2.4.8-1+cuda10.0 libnccl-dev=2.4.8-1+cuda10.0
tar xvf nccl_2.4.8-1+cuda10.0_x86_64.txz
mv nccl_2.4.8-1+cuda10.0_x86_64 /usr/local/nccl_2.4.8
export LD_LIBRARY_PATH=/usr/local/nccl_2.4.8/lib:$LD_LIBRARY_PATH
参考: https://www.open-mpi.org/faq/?category=building#easy-build
$ gunzip -c openmpi-4.0.2.tar.gz | tar xf -
$ cd openmpi-4.0.2
$ ./configure --prefix=/usr/local
<...lots of output...>
$ make all install
mpiexec --version
mpirun --version
参考 https://github.com/horovod/horovod/blob/master/docs/install.rst
> ldconfig /usr/local/cuda/targets/x86_64-linux/lib/stubs
> HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL HOROVOD_WITH_TENSORFLOW=1 HOROVOD_WITH_PYTORCH=1 pip install --no-cache-dir horovod -i https://mirrors.aliyun.com/pypi/simple --trusted-host mirrors.aliyun.com
> ldconfig
import horovod.torch as hvd
import horovod.tensorflow as hvd
(ref: https://zhuanlan.zhihu.com/p/78303865 )
https://github.com/horovod/horovod/tree/master/examples --- tensorflow_mnist.py
import os
import errno
import tensorflow as tf
import horovod.tensorflow as hvd
import numpy as np
import argparse
from tensorflow import keras
layers = tf.layers

# Training settings: the only command-line switch selects the Adasum
# reduction algorithm instead of plain gradient averaging.
parser = argparse.ArgumentParser(description='Tensorflow MNIST Example')
parser.add_argument(
    '--use-adasum',
    action='store_true',
    default=False,
    help='use adasum algorithm to do reduction',
)
args = parser.parse_args()
def conv_model(feature, target, mode):
    """Build the graph of a 2-layer convolutional MNIST classifier.

    Args:
        feature: Input tensor; it is reshaped to ``[-1, 28, 28, 1]``.
        target: Class-index tensor; it is cast to int32 and one-hot
            encoded with depth 10.
        mode: A ``tf.estimator.ModeKeys`` value. Dropout is only active
            when it equals ``tf.estimator.ModeKeys.TRAIN``.

    Returns:
        A ``(predictions, loss)`` tuple: the argmax of the logits along
        axis 1, and the softmax cross-entropy loss tensor.
    """
    # Convert the target to a one-hot tensor of shape (batch_size, 10) and
    # with a on-value of 1 for each one-hot vector of length 10.
    target = tf.one_hot(tf.cast(target, tf.int32), 10, 1, 0)

    # Reshape feature to 4d tensor with 2nd and 3rd dimensions being
    # image width and height final dimension being the number of color
    # channels.
    feature = tf.reshape(feature, [-1, 28, 28, 1])

    # First conv layer will compute 32 features for each 5x5 patch.
    with tf.variable_scope('conv_layer1'):
        h_conv1 = layers.conv2d(feature, 32, kernel_size=[5, 5],
                                activation=tf.nn.relu, padding="SAME")
        h_pool1 = tf.nn.max_pool(
            h_conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

    # Second conv layer will compute 64 features for each 5x5 patch.
    with tf.variable_scope('conv_layer2'):
        h_conv2 = layers.conv2d(h_pool1, 64, kernel_size=[5, 5],
                                activation=tf.nn.relu, padding="SAME")
        h_pool2 = tf.nn.max_pool(
            h_conv2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
        # Reshape tensor into a batch of vectors (two stride-2 poolings
        # reduce 28x28 to 7x7, with 64 channels).
        h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])

    # Densely connected layer with 1024 neurons.
    h_fc1 = layers.dropout(
        layers.dense(h_pool2_flat, 1024, activation=tf.nn.relu),
        rate=0.5, training=mode == tf.estimator.ModeKeys.TRAIN)

    # Compute logits (1 per class) and compute loss.
    logits = layers.dense(h_fc1, 10, activation=None)
    loss = tf.losses.softmax_cross_entropy(target, logits)

    return tf.argmax(logits, 1), loss
def train_input_generator(x_train, y_train, batch_size=64):
    """Yield shuffled ``(x_batch, y_batch)`` mini-batches forever.

    At the start of every pass both arrays are re-indexed with the same
    random permutation, so each sample stays paired with its label.
    Trailing samples that do not fill a complete batch are dropped for
    that pass.

    Args:
        x_train: Indexable array of samples (must support fancy indexing
            with a permutation array, e.g. a NumPy array).
        y_train: Array of labels with the same length as ``x_train``.
        batch_size: Number of samples per yielded batch.

    Yields:
        A 2-tuple ``(x_batch, y_batch)``, each of length ``batch_size``.

    Raises:
        ValueError: If ``batch_size`` is not positive or is larger than
            the number of samples. Without this check the generator
            would loop forever without ever producing a full batch.
    """
    assert len(x_train) == len(y_train)
    num_samples = len(x_train)
    if not 0 < batch_size <= num_samples:
        raise ValueError(
            'batch_size must be in [1, %d], got %r' % (num_samples, batch_size))

    while True:
        # One permutation for both arrays keeps samples and labels aligned.
        p = np.random.permutation(num_samples)
        x_train, y_train = x_train[p], y_train[p]
        # Visit only the start offsets that leave room for a full batch.
        for index in range(0, num_samples - batch_size + 1, batch_size):
            yield (x_train[index:index + batch_size],
                   y_train[index:index + batch_size])
def main(_):
    """Train the MNIST convolutional model with Horovod data parallelism.

    Initializes Horovod, loads MNIST through Keras, builds the graph from
    ``conv_model``, wraps the Adam optimizer in
    ``hvd.DistributedOptimizer`` and runs the training loop inside a
    ``tf.train.MonitoredTrainingSession`` until the stop hook fires.

    Args:
        _: Unused positional argument (the entry point is invoked with
            the remaining command-line arguments).
    """
    # Horovod: initialize Horovod.
    hvd.init()

    # Keras automatically creates a cache directory in ~/.keras/datasets for
    # storing the downloaded MNIST data. This creates a race
    # condition among the workers that share the same filesystem. If the
    # directory already exists by the time this worker gets around to creating
    # it, ignore the resulting exception and continue.
    cache_dir = os.path.join(os.path.expanduser('~'), '.keras', 'datasets')
    if not os.path.exists(cache_dir):
        try:
            os.mkdir(cache_dir)
        except OSError as e:
            if e.errno == errno.EEXIST and os.path.isdir(cache_dir):
                # Another worker created the directory first; that is fine.
                pass
            else:
                raise

    # Download and load MNIST dataset. Each rank uses its own file name so
    # workers do not write to the same download target.
    (x_train, y_train), (x_test, y_test) = \
        keras.datasets.mnist.load_data('MNIST-data-%d' % hvd.rank())

    # The shape of downloaded data is (-1, 28, 28), hence we need to reshape it
    # into (-1, 784) to feed into our network. Also, need to normalize the
    # features between 0 and 1.
    x_train = np.reshape(x_train, (-1, 784)) / 255.0
    x_test = np.reshape(x_test, (-1, 784)) / 255.0

    # Build model...
    with tf.name_scope('input'):
        image = tf.placeholder(tf.float32, [None, 784], name='image')
        label = tf.placeholder(tf.float32, [None], name='label')
    predict, loss = conv_model(image, label, tf.estimator.ModeKeys.TRAIN)

    lr_scaler = hvd.size()
    # By default, Adasum doesn't need scaling when increasing batch size.
    # If used with NCCL, scale lr by local_size.
    if args.use_adasum:
        lr_scaler = hvd.local_size() if hvd.nccl_built() else 1

    # Horovod: adjust learning rate based on lr_scaler.
    opt = tf.train.AdamOptimizer(0.001 * lr_scaler)

    # Horovod: add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(
        opt, op=hvd.Adasum if args.use_adasum else hvd.Average)

    global_step = tf.train.get_or_create_global_step()
    train_op = opt.minimize(loss, global_step=global_step)

    hooks = [
        # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable
        # states from rank 0 to all other processes. This is necessary to
        # ensure consistent initialization of all workers when training is
        # started with random weights or restored from a checkpoint.
        hvd.BroadcastGlobalVariablesHook(0),

        # Horovod: adjust number of steps based on number of GPUs.
        tf.train.StopAtStepHook(last_step=20000 // hvd.size()),

        tf.train.LoggingTensorHook(tensors={'step': global_step, 'loss': loss},
                                   every_n_iter=10),
    ]

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None
    training_batch_generator = train_input_generator(x_train,
                                                     y_train, batch_size=100)
    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when done
    # or an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks,
                                           config=config) as mon_sess:
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            image_, label_ = next(training_batch_generator)
            mon_sess.run(train_op, feed_dict={image: image_, label: label_})
if __name__ == "__main__":
    # tf.app.run() invokes main() with the remaining command-line arguments.
    tf.app.run()
代码说明:
hvd.init(): 初始化 Horovod,启动相关线程和MPI线程。
config.gpu_options.visible_device_list = str(hvd.local_rank()): 为不同的进程分配不同的GPU。
opt = tf.train.AdagradOptimizer(0.01 * hvd.size()): 按进程总数成比例放大学习率。
opt = hvd.DistributedOptimizer(opt): 把常规TensorFlow Optimizer通过Horovod包起来,进而使用 ring-allreduce 来得到平均梯度。
hvd.BroadcastGlobalVariablesHook(0): 把rank 0的初始变量广播给其他所有进程,保证各进程初始化一致。
当 hvd.rank() != 0 时令 checkpoint_dir=None: 设置只有设备0需要保存模型参数。
运行方式: https://github.com/horovod/horovod#usage
** 运行在一台机器的4个GPUs上(根据自己电脑设备指定IP):
mpirun -np 4 -H ip-1:4 -bind-to none -map-by slot -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH -x PATH -mca pml ob1 -mca btl ^openib python tensorflow_mnist.py
** 运行在四台机器上,每台机器4个GPUs(共16个进程,根据自己电脑设备指定IP):
mpirun -np 16 -H ip-1:4,ip-2:4,ip-3:4,ip-4:4 -bind-to none -map-by slot -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH -x PATH -mca pml ob1 -mca btl ^openib python tensorflow_mnist.py