mindspore出现memory is not enough!

在训练过程中,进行到更新参数这一步时,出现memory is not enough,我使用的是24G的RTX 3090显卡来训练,能帮忙看一下是我代码的问题吗?还是纯粹的内存问题?

输出日志

[WARNING] PRE_ACT(14099,7f5cfb754700,python3):2021-12-13-11:08:39.551.431 [mindspore/ccsrc/backend/optimizer/mem_reuse/mem_dynamic_allocator.cc:150] CalMemBlockAllocSize] Memory not enough: current free memory size[103809024] is smaller than required size[4213571584].
[ERROR] RUNTIME_FRAMEWORK(14099,7f5cfb754700,python3):2021-12-13-11:08:39.551.455 [mindspore/ccsrc/runtime/framework/actor/memory_manager_actor.cc:182] SetOpContextMemoryAllocFail] Device(id:0) memory isn't enough and alloc failed, kernel name: Gradients/Default/network-compute_loss_pi/ac-ActorCritic/pi-GaussianActor/conv2-Conv2d/gradConv2D/Conv2DBackpropInput-op731, alloc size: 4213571584B.
[EXCEPTION] VM(14099,7f5dfee71740,python3):2021-12-13-11:08:39.551.515 [mindspore/ccsrc/vm/backend.cc:835] RunGraph] The actor runs failed, actor name: kernel_graph_82
[ERROR] RUNTIME_FRAMEWORK(14099,7f5cfaf53700,python3):2021-12-13-11:08:39.551.570 [mindspore/ccsrc/runtime/framework/actor/abstract_actor.cc:53] EraseInput] Erase input data failed: Gradients/Default/network-compute_loss_pi/ac-ActorCritic/pi-GaussianActor/conv2-Conv2d/gradConv2D/Conv2DBackpropFilter-op730, sequential_num: -820833344
Traceback (most recent call last):
  File "/home/carserver1/anaconda3/envs/mindspore_py39/lib/python3.9/runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/home/carserver1/anaconda3/envs/mindspore_py39/lib/python3.9/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/home/carserver1/mindspore_ws/src/drl_nav/drl/algos/ppo/pppo.py", line 374, in <module>
    train(env, ppo)
  File "/home/carserver1/mindspore_ws/src/drl_nav/drl/algos/ppo/pppo.py", line 173, in train
    flag=ppo.update()
  File "/home/carserver1/mindspore_ws/src/drl_nav/drl/algos/ppo/pppo_mindspore.py", line 268, in update
    trainOneStepCellForPi(inputs)
  File "/home/carserver1/anaconda3/envs/mindspore_py39/lib/python3.9/site-packages/mindspore/nn/cell.py", line 404, in __call__
    out = self.compile_and_run(*inputs)
  File "/home/carserver1/anaconda3/envs/mindspore_py39/lib/python3.9/site-packages/mindspore/nn/cell.py", line 698, in compile_and_run
    return _cell_graph_executor(self, *new_inputs, phase=self.phase)
  File "/home/carserver1/anaconda3/envs/mindspore_py39/lib/python3.9/site-packages/mindspore/common/api.py", line 627, in __call__
    return self.run(obj, *args, phase=phase)
  File "/home/carserver1/anaconda3/envs/mindspore_py39/lib/python3.9/site-packages/mindspore/common/api.py", line 655, in run
    return self._exec_pip(obj, *args, phase=phase_real)
  File "/home/carserver1/anaconda3/envs/mindspore_py39/lib/python3.9/site-packages/mindspore/common/api.py", line 78, in wrapper
    results = fn(*arg, **kwargs)
  File "/home/carserver1/anaconda3/envs/mindspore_py39/lib/python3.9/site-packages/mindspore/common/api.py", line 638, in _exec_pip
    return self._graph_executor(args_list, phase)
RuntimeError: mindspore/ccsrc/vm/backend.cc:835 RunGraph] The actor runs failed, actor name: kernel_graph_82

# In file /home/carserver1/anaconda3/envs/mindspore_py39/lib/python3.9/site-packages/mindspore/ops/_grad/grad_math_ops.py(81)
    num_selected = reshape(reduce_sum(indicators, axis), output_shape_kept_dims) + min_num
                           ^
复制

部分实现代码

## 损失网络
class compute_loss_pi(nn.Cell):
    """PPO clipped-surrogate policy loss.

    Wraps an actor-critic so that ``construct`` returns the scalar policy
    loss for one batch of transitions, suitable for ``nn.TrainOneStepCell``.
    """

    def __init__(self, actor_critic, clip_ratio=0.2):
        super(compute_loss_pi, self).__init__()
        self.ac = actor_critic          # actor-critic container; self.ac.pi is the policy net
        self.clip_ratio = clip_ratio    # PPO clipping epsilon

    def construct(self, data):
        # data keys: 'state', 'image', 'act', 'adv', 'logp' — float32 Tensors
        # (shapes come from the replay buffers; not visible here — confirm against caller)
        state, image, act, adv, logp_old = data['state'], data['image'], data['act'], data['adv'], data['logp']

        # Policy loss
        pi, logp = self.ac.pi(state, image, act)
        ratio = mnp.exp(logp - logp_old)  # importance ratio pi_new(a|s) / pi_old(a|s)
        clip_adv = mnp.clip(ratio, 1 - self.clip_ratio, 1 + self.clip_ratio) * adv
        # BUG FIX: mnp.min(a, b) is a *reduction* whose second positional
        # argument is `axis`, so passing clip_adv there is wrong — and its
        # gradient goes through the reduce-min path (reshape/reduce_sum in
        # grad_math_ops.py) seen in the crash log. The PPO objective needs
        # the element-wise two-tensor minimum, i.e. mnp.minimum.
        loss_pi = -(mnp.minimum(ratio * adv, clip_adv)).mean()
        return loss_pi

    def backbone_network(self):
        # Expose the policy sub-network for train-wrapper construction.
        return self.ac.pi

class compute_loss_v(nn.Cell):
    """Value-function loss: mean squared error between V(state, image) and the return."""

    def __init__(self, actor_critic):
        super(compute_loss_v, self).__init__()
        self.ac = actor_critic  # actor-critic container; self.ac.v is the value net

    def construct(self, data):
        # data keys: 'state', 'image', 'ret' — float32 Tensors
        state, image, ret = data['state'], data['image'], data['ret']
        value_pred = self.ac.v(state, image)
        td_error = value_pred - ret
        return (td_error ** 2).mean()

    def backbone_network(self):
        # Expose the value sub-network for train-wrapper construction.
        return self.ac.v

## train(one epoch)
# Fragment from inside PPO.update() (see traceback: pppo_mindspore.py, line 268).
# Indentation is mangled by the paste; code left byte-identical below.
data = dict(state=state_bufs, image=img_bufs, act=act_bufs, ret=ret_bufs, adv=adv_bufs, logp=logp_bufs)
        # Convert every buffer to a float32 Tensor in one dict.
        inputs = {k: Tensor(v, dtype=mindspore.dtype.float32) for k, v in list(data.items())}# NOTE: the `device` argument was removed here
        # NOTE(review): both TrainOneStepCells are rebuilt on EVERY update() call.
        # Each new cell instance triggers a fresh graph compile (the log shows
        # kernel_graph_82), and the old graphs/device memory accumulate — a likely
        # contributor to the "memory not enough" failure. Consider constructing
        # these cells once (e.g. in __init__) and reusing them.
        trainOneStepCellForPi = nn.TrainOneStepCell(self.compute_loss_pi, self.pi_optimizer)
        trainOneStepCellForV = nn.TrainOneStepCell(self.compute_loss_v, self.vf_optimizer)
        # NOTE(review): this guard calls set_train() only on the FIRST update();
        # since the cells above are recreated every call, later cells never get
        # set_train() — presumably unintended. Verify against the full update().
        if not self.only_once:
            trainOneStepCellForPi.set_train()
            trainOneStepCellForV.set_train()
            self.only_once = True
        # Train policy with multiple steps of gradient descent
        for i in range(self.train_pi_iters):
            trainOneStepCellForPi(inputs)
            pi_info = self.cal_pi_info(inputs)
            # Early-stop policy updates once the approximate KL divergence
            # exceeds 1.5x the target (standard SpinningUp PPO heuristic).
            kl = np.average(pi_info['kl'])
            if kl > 1.5 * self.target_kl:
                self.logger.log('Early stopping at step %d due to reaching max kl.' % i)
                break

        # Value function learning
        for i in range(self.train_v_iters):
            trainOneStepCellForV(inputs)

*******************************************************************************************************************

 export GLOG_v=1 再执行下,需要根据详细日志分析

你可能感兴趣的:(python,深度学习,开发语言)