SegDecNet Multi-GPU Synchronous Data-Parallel Training: Code Change Notes

Paper: Segmentation-Based Deep-Learning Approach for Surface-Defect Detection
Environment: Python 3.5, CUDA 10, cuDNN 7.6, TensorFlow 1.13
Reference: https://blog.csdn.net/jinxiaonian11/article/details/94316567
Why these notes:

  1. SegDecNet is a two-stage network
  2. To get familiar with TensorFlow's synchronous data-parallel training


How synchronous data parallelism works

Every GPU holds a replica of the model, and all replicas share the same variables. Each GPU runs the forward and backward pass on its own batch, producing loss_1, loss_2, …, loss_num_gpus; the per-tower gradients are then gathered on the CPU, averaged, and applied as a single parameter update.
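Before the full two-stage model below, here is a minimal runnable sketch of the pattern with a toy one-layer model (num_gpus, batch_size and the dense layer are illustrative; average_gradients is a standalone version of the method defined in the next section):

import tensorflow as tf

num_gpus, batch_size = 2, 4
x = tf.placeholder(tf.float32, [num_gpus * batch_size, 10])
y = tf.placeholder(tf.float32, [num_gpus * batch_size, 1])
opt = tf.train.GradientDescentOptimizer(0.01)

tower_grads = []
with tf.variable_scope(tf.get_variable_scope()):
    for i in range(num_gpus):
        with tf.device('/gpu:{}'.format(i)):
            # Each tower sees only its own slice of the global batch
            _x = x[i * batch_size:(i + 1) * batch_size]
            _y = y[i * batch_size:(i + 1) * batch_size]
            pred = tf.layers.dense(_x, 1, name='fc')  # weights shared via reuse
            loss = tf.losses.mean_squared_error(_y, pred)
            tower_grads.append(opt.compute_gradients(loss))
            tf.get_variable_scope().reuse_variables()

grads = average_gradients(tower_grads)  # mean over towers, see below
train_op = opt.apply_gradients(grads)   # one synchronized update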

Code

def average_gradients(self, tower_grads):
    # tower_grads[i] holds GPU i's list of (gradient, variable) tuples
    average_grads = []
    # zip(*tower_grads) regroups the tuples so that each iteration sees
    # the same variable's gradient from every tower
    for grad_and_vars in zip(*tower_grads):
        grads = []
        for g, _ in grad_and_vars:
            expanded_g = tf.expand_dims(g, 0)  # add a leading tower dimension
            grads.append(expanded_g)
        grad = tf.concat(grads, 0)
        grad = tf.reduce_mean(grad, 0)  # mean over towers
        v = grad_and_vars[0][1]  # the variable is shared, take tower 0's handle
        average_grads.append((grad, v))
    return average_grads

def build_model(self):
    def SegmentNet(input, scope, is_training, reuse=None):
        with tf.variable_scope(scope, reuse=reuse):
            with slim.arg_scope([slim.conv2d],
                                padding='SAME',
                                activation_fn=tf.nn.relu,
                                normalizer_fn=slim.batch_norm):
                net = slim.conv2d(input, 32, [5, 5], scope='conv1')
                net = slim.conv2d(net, 32, [5, 5], scope='conv2')
                net = slim.max_pool2d(net, [2, 2], [2, 2], scope='pool1')

                net = slim.conv2d(net, 64, [5, 5], scope='conv3')
                net = slim.conv2d(net, 64, [5, 5], scope='conv4')
                net = slim.conv2d(net, 64, [5, 5], scope='conv5')
                net = slim.max_pool2d(net, [2, 2], [2, 2], scope='pool2')

                net = slim.conv2d(net, 64, [5, 5], scope='conv6')
                net = slim.conv2d(net, 64, [5, 5], scope='conv7')
                net = slim.conv2d(net, 64, [5, 5], scope='conv8')
                net = slim.conv2d(net, 64, [5, 5], scope='conv9')
                net = slim.max_pool2d(net, [2, 2], [2, 2], scope='pool3')

                net = slim.conv2d(net, 1024, [15, 15], scope='conv10')
                features = net
                # 1x1 conv gives the per-pixel logits; sigmoid turns them into the mask
                net = slim.conv2d(net, 1, [1, 1], activation_fn=None, scope='conv11')
                logits_pixel = net
                mask = tf.sigmoid(net)
        return features, logits_pixel, mask

    def DecisionNet(feature, mask, scope, is_training, num_classes=2, reuse=None):
        with tf.variable_scope(scope, reuse=reuse):
            with slim.arg_scope([slim.conv2d],
                                padding='SAME',
                                activation_fn=tf.nn.relu,
                                normalizer_fn=slim.batch_norm):
                # The decision stage takes the segmentation features plus the mask
                net = tf.concat([feature, mask], axis=3)
                net = slim.max_pool2d(net, [2, 2], [2, 2], scope='pool1')
                net = slim.conv2d(net, 8, [5, 5], scope='conv1')
                net = slim.max_pool2d(net, [2, 2], [2, 2], scope='pool2')
                net = slim.conv2d(net, 16, [5, 5], scope='conv2')
                net = slim.max_pool2d(net, [2, 2], [2, 2], scope='pool3')
                net = slim.conv2d(net, 32, [5, 5], scope='conv3')
                # Global average/max pooling over the spatial dimensions
                # (tf.reduce_* replaces the private math_ops import)
                vector1 = tf.reduce_mean(net, [1, 2], name='pool4', keepdims=True)
                vector2 = tf.reduce_max(net, [1, 2], name='pool5', keepdims=True)
                vector3 = tf.reduce_mean(mask, [1, 2], name='pool6', keepdims=True)
                vector4 = tf.reduce_max(mask, [1, 2], name='pool7', keepdims=True)
                vector = tf.concat([vector1, vector2, vector3, vector4], axis=3)
                vector = tf.squeeze(vector, axis=[1, 2])
                logits = slim.fully_connected(vector, num_classes, activation_fn=None)
                output = tf.argmax(logits, axis=1, name='output')
                return logits, output
    # Build the multi-GPU graph (one tower per GPU, driven by self.__num_gpus)
    tower_grads = []
    tower_grads_seg = []
    tower_grads_dec = []
    # Global inputs for the whole network (all towers' batches concatenated)
    Image = tf.placeholder(tf.float32, shape=(None, IMAGE_SIZE[0], IMAGE_SIZE[1], 1), name='Image')
    # // 8: the pixel labels are at 1/8 resolution, and placeholder shapes must be ints
    PixelLabel = tf.placeholder(tf.float32, shape=(None, IMAGE_SIZE[0] // 8, IMAGE_SIZE[1] // 8, 1), name='PixelLabel')
    Label = tf.placeholder(tf.int32, shape=(None,), name='Label')
    opt = tf.train.GradientDescentOptimizer(self.__learn_rate)

    # Running totals for monitoring; initialized once, before the GPU loop,
    # so they accumulate over all towers instead of being reset per tower
    loss_pixel_global = 0
    loss_class_global = 0
    loss_total_global = 0
    mask_lst = []
    logits_pixel_lst = []

    with tf.variable_scope(tf.get_variable_scope()):
        # At test time the graph is built for a single GPU
        itera = self.__num_gpus if self.__mode == 'training' else 1
        for i in range(itera):
            # Build tower i on GPU i; variables themselves live on the CPU
            with tf.device(utils.assign_to_device('/gpu:{}'.format(i), ps_device='/cpu:0')):
                # Slice out tower i's sub-batch
                _Image = Image[i * self.__batch_size:(i + 1) * self.__batch_size]
                _PixelLabel = PixelLabel[i * self.__batch_size:(i + 1) * self.__batch_size]
                _Label = Label[i * self.__batch_size:(i + 1) * self.__batch_size]
                features, logits_pixel, mask = SegmentNet(_Image, 'segment', self.is_training)
                logits_class, output_class = DecisionNet(features, mask, 'decision', self.is_training)

                logits_pixel = tf.reshape(logits_pixel, [self.__batch_size, -1])
                PixelLabel_reshape = tf.reshape(_PixelLabel, [self.__batch_size, -1])

                mask_lst.append(mask)
                logits_pixel_lst.append(logits_pixel)
                # Losses of the two stages
                loss_pixel = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits_pixel, labels=PixelLabel_reshape))
                loss_class = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits_class, labels=_Label))
                loss_total = loss_pixel + loss_class
                loss_pixel_global += loss_pixel
                loss_class_global += loss_class
                loss_total_global += loss_total

                # Collect the variable lists for each stage
                train_var_list = tf.trainable_variables()
                train_segment_var_list = [v for v in tf.trainable_variables() if 'segment' in v.name]
                train_decision_var_list = [v for v in tf.trainable_variables() if 'decision' in v.name]

                # Compute this tower's gradients
                grads_pixel = opt.compute_gradients(loss_pixel, train_segment_var_list)
                grads_class = opt.compute_gradients(loss_class, train_decision_var_list)
                grads_total = opt.compute_gradients(loss_total, train_var_list)
                
                # Record the gradients computed on each GPU
                tower_grads.append(grads_total)
                tower_grads_dec.append(grads_class)
                tower_grads_seg.append(grads_pixel)
                # Share variables across towers
                tf.get_variable_scope().reuse_variables()
    # Average the gradients over all towers
    grads = self.average_gradients(tower_grads)
    grads_seg = self.average_gradients(tower_grads_seg)
    grads_dec = self.average_gradients(tower_grads_dec)
    # Apply the averaged gradients
    train_op = opt.apply_gradients(grads)
    train_op_seg = opt.apply_gradients(grads_seg)
    train_op_dec = opt.apply_gradients(grads_dec)

    init_op = tf.global_variables_initializer()
    # The following come from the last tower built (tower index -1)
    self.Image = Image
    self.PixelLabel = PixelLabel
    self.Label = Label
    self.features = features
    self.logits_class = logits_class
    self.output_class = output_class
    # The following accumulate over all towers, i.e. num_gpus * batch_size samples
    self.loss_pixel = loss_pixel_global
    self.loss_class = loss_class_global
    self.loss_total = loss_total_global
    self.logits_pixel = logits_pixel_lst
    self.mask = mask_lst

    self.init_op = init_op

    self.train_op = train_op
    self.train_op_seg = train_op_seg
    self.train_op_dec = train_op_dec
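Since each tower slices its own sub-batch out of the placeholders, every feed must contain num_gpus * batch_size samples. A hypothetical training step (model, images, pixel_labels and labels stand in for your own objects):

config = tf.ConfigProto(allow_soft_placement=True)  # CPU fallback for ops without a GPU kernel
with tf.Session(config=config) as sess:
    sess.run(model.init_op)
    _, loss = sess.run(
        [model.train_op, model.loss_total],
        feed_dict={model.Image: images,             # (num_gpus*batch_size, H, W, 1)
                   model.PixelLabel: pixel_labels,  # (num_gpus*batch_size, H//8, W//8, 1)
                   model.Label: labels})            # (num_gpus*batch_size,)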

The assign_to_device function in utils.py

PS_OPS = ['Variable', 'VariableV2', 'AutoReloadVariable']

def assign_to_device(device, ps_device='/cpu:0'):
    # Returns a device function for tf.device(): variable ops are placed on
    # the parameter-server device (CPU), all other ops on the given device.
    def _assign(op):
        node_def = op if isinstance(op, tf.NodeDef) else op.node_def
        if node_def.op in PS_OPS:
            return ps_device
        else:
            return device
    return _assign
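tf.device accepts not only a device string but also a function that maps each op to a device, which is exactly how build_model uses this helper. A small illustration:

with tf.device(assign_to_device('/gpu:0', ps_device='/cpu:0')):
    w = tf.get_variable('w', shape=[3, 3])  # a 'VariableV2' op -> placed on /cpu:0
    y = tf.matmul(w, w)                     # a 'MatMul' op     -> placed on /gpu:0

This keeps a single copy of each variable on the CPU while the computation runs on the GPUs.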

opt.compute_gradients and opt.apply_gradients

The former computes the gradients, the latter applies them to the corresponding variables; used together they do exactly what optimizer.minimize does, as its (simplified) implementation shows:

def minimize(self, loss, global_step=None, var_list=None, name=None):
    # compute_gradients returns [(gradient, variable), (gradient, variable), ...]
    grads_and_vars = self.compute_gradients(loss, var_list=var_list)
    vars_with_grad = [v for g, v in grads_and_vars if g is not None]
    if not vars_with_grad:
        raise ValueError(...)
    return self.apply_gradients(grads_and_vars, global_step=global_step, name=name)
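The point of splitting is that anything can be done to the (gradient, variable) pairs between the two calls; multi-GPU averaging is one case, gradient clipping is another (a hypothetical single-GPU sketch, assuming loss and opt already exist):

grads_and_vars = opt.compute_gradients(loss)
clipped = [(tf.clip_by_norm(g, 5.0), v)
           for g, v in grads_and_vars if g is not None]
train_op = opt.apply_gradients(clipped)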
		

The average_gradients method

tower_grads is a list whose length equals the number of GPUs.
tower_grads[i] holds all of GPU i's variables with their gradients: a list of tuples of the form (tf.Tensor (gradient for …), tf.Variable(…)), i.e. each variable paired with the gradient computed for it on that GPU. Hence the name "gradient tower".
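Concretely, for 2 GPUs and a model with variables w and b, the structure looks like this; zip(*tower_grads) in average_gradients regroups it per variable:

# tower_grads = [
#     [(grad_w_gpu0, w), (grad_b_gpu0, b)],   # tower 0
#     [(grad_w_gpu1, w), (grad_b_gpu1, b)],   # tower 1
# ]
# zip(*tower_grads) yields one group per variable:
#     ((grad_w_gpu0, w), (grad_w_gpu1, w)) -> averaged into (mean_grad_w, w)
#     ((grad_b_gpu0, b), (grad_b_gpu1, b)) -> averaged into (mean_grad_b, b)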

Notes:

  1. Other parts of the original code also need corresponding small changes.
  2. The member variables set at the end of build_model(), whatever scope they come from, exist only for monitoring (I use them to visualize the running losses and the intermediate masks).

In short, the change relative to the original code is: wrap the tower construction in a loop, split minimize into compute_gradients and apply_gradients, average the gradients once all num_gpus * batch_size samples have been processed, and finally update the parameters with apply_gradients.
