使用 TensorFlow 和 Horovod 实现 BERT 多 GPU 训练

一、run_classifier.py文件

#change 1:引入horovod
import horovod.tensorflow as hvd
...
def get_train_examples(self, data_dir):
    """Return the training examples read from ``train_bert.txt`` in *data_dir*."""
    train_path = os.path.join(data_dir, "train_bert.txt")
    # change 2: training data is built by _create_examples_train
    train_examples = self._create_examples_train(train_path, "train")
    return train_examples
	#...
def get_labels(self):
    """Return the list of class labels, as strings."""
    label_list = ["0", "1"]
    return label_list
#change 3 新增训练数据创建代码
def _create_examples_train(self, file_path, set_type):
    """Create this Horovod process's share of the training examples.

    The file is read as UTF-8, one example per line, with tab-separated
    columns: column 0 is the label and column 1 is the text. Lines are dealt
    round-robin across processes: line ``i`` is kept only when
    ``i % hvd.size() == hvd.rank()``, so per-process shards differ in size by
    at most one example.

    Args:
        file_path: Path of the tab-separated training file.
        set_type: Prefix used for each example's guid (``"<set_type>-<i>"``,
            where ``i`` is the line index in the whole file).

    Returns:
        A list of ``InputExample`` objects for the lines assigned to this
        process, in file order.

    Raises:
        IndexError: If a kept line has fewer than two tab-separated columns.
    """
    examples = []
    # Loop-invariant for the lifetime of the process; query Horovod once.
    num_workers = hvd.size()
    worker_rank = hvd.rank()
    # The context manager closes the file handle even if a line fails to
    # parse; the bare codecs.open() in the loop header never closed it.
    with codecs.open(file_path, "r", "utf-8") as reader:
        for (i, line) in enumerate(reader):
            if i % num_workers != worker_rank:
                continue  # this line belongs to another process
            guid = "%s-%d" % (set_type, i)
            split_line = line.strip('\r\n').split('\t')
            text_a = tokenization.convert_to_unicode(split_line[1])
            label = split_line[0]
            examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
    return examples
def _create_examples(self, file_path, set_type):
	...
def main(_):
  """Entry point; only the Horovod-related changes are shown, other code is elided as ``...``."""
  # change 4: initialize Horovod
  hvd.init()
  # change 5: rank 0 keeps the configured output_dir; every other rank uses a
  # sub-directory of it named after its own rank
  FLAGS.output_dir = FLAGS.output_dir if hvd.rank() == 0 else os.path.join(FLAGS.output_dir, str(hvd.rank()))
  ...
  # change 6: one process per GPU -- the only device made visible to this
  # process is the one whose index equals its Horovod local rank
  config = tf.ConfigProto()
  config.gpu_options.allow_growth = True
  config.gpu_options.visible_device_list = str(hvd.local_rank())
  
  run_config = tf.contrib.tpu.RunConfig(
      ...,
      # change 7: hand the session config built above to the run config
      session_config=config)
...
if FLAGS.do_train:
    # change 8: the training TFRecord file is placed under FLAGS.output_dir
    train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
    file_based_convert_examples_to_features(
    ...
    # change 9: make sure every process starts from the same initial weights
    # (the hook is constructed with root rank 0)
    hooks = [hvd.BroadcastGlobalVariablesHook(0)]
    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps, hooks=hooks)
# change 10: run evaluation only in the process whose Horovod rank is 0
if FLAGS.do_eval and hvd.rank() == 0:
    ...  # evaluation code elided
# change 11: run prediction only in the process whose Horovod rank is 0
if FLAGS.do_predict and hvd.rank() == 0:
    ...  # prediction code elided

二、optimization.py文件

#change 1:引入horovod
import horovod.tensorflow as hvd
#change 2
def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu):
    """Build the training op; only the Horovod-related changes are shown, other code is elided as ``...``."""
	...   
    # Change 3 (optional): multiply the learning rate by hvd.size()
    optimizer = AdamWeightDecayOptimizer(
        learning_rate=learning_rate * hvd.size(),
        ...)

    # Change 4: wrap the optimizer in Horovod's distributed optimizer
    optimizer = hvd.DistributedOptimizer(optimizer)
    # Change 5: compute the gradients through the distributed optimizer
    grads_and_vars=optimizer.compute_gradients(loss, tvars)
    
    # Change 6: split the (gradient, variable) pairs returned by the
    # distributed optimizer, then clip the gradients by global norm (1.0).
    # Note that tvars is rebound to the variables from those pairs.
    grads = [grad for grad,var in grads_and_vars]
    tvars = [var for grad,var in grads_and_vars]
    (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
  
    # Change 7: apply the clipped gradients
    train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=global_step)

参考文献:BERT Multi-GPU implementation using TensorFlow and Horovod with code

你可能感兴趣的:(深度学习)