During training, when it reaches the parameter-update step I get "memory is not enough". I am training on a 24 GB RTX 3090. Could you help me check whether this is a problem in my code, or purely a memory issue?
[WARNING] PRE_ACT(14099,7f5cfb754700,python3):2021-12-13-11:08:39.551.431 [mindspore/ccsrc/backend/optimizer/mem_reuse/mem_dynamic_allocator.cc:150] CalMemBlockAllocSize] Memory not enough: current free memory size[103809024] is smaller than required size[4213571584].
[ERROR] RUNTIME_FRAMEWORK(14099,7f5cfb754700,python3):2021-12-13-11:08:39.551.455 [mindspore/ccsrc/runtime/framework/actor/memory_manager_actor.cc:182] SetOpContextMemoryAllocFail] Device(id:0) memory isn't enough and alloc failed, kernel name: Gradients/Default/network-compute_loss_pi/ac-ActorCritic/pi-GaussianActor/conv2-Conv2d/gradConv2D/Conv2DBackpropInput-op731, alloc size: 4213571584B.
[EXCEPTION] VM(14099,7f5dfee71740,python3):2021-12-13-11:08:39.551.515 [mindspore/ccsrc/vm/backend.cc:835] RunGraph] The actor runs failed, actor name: kernel_graph_82
[ERROR] RUNTIME_FRAMEWORK(14099,7f5cfaf53700,python3):2021-12-13-11:08:39.551.570 [mindspore/ccsrc/runtime/framework/actor/abstract_actor.cc:53] EraseInput] Erase input data failed: Gradients/Default/network-compute_loss_pi/ac-ActorCritic/pi-GaussianActor/conv2-Conv2d/gradConv2D/Conv2DBackpropFilter-op730, sequential_num: -820833344
Traceback (most recent call last):
File "/home/carserver1/anaconda3/envs/mindspore_py39/lib/python3.9/runpy.py", line 197, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/home/carserver1/anaconda3/envs/mindspore_py39/lib/python3.9/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/home/carserver1/mindspore_ws/src/drl_nav/drl/algos/ppo/pppo.py", line 374, in
train(env, ppo)
File "/home/carserver1/mindspore_ws/src/drl_nav/drl/algos/ppo/pppo.py", line 173, in train
flag=ppo.update()
File "/home/carserver1/mindspore_ws/src/drl_nav/drl/algos/ppo/pppo_mindspore.py", line 268, in update
trainOneStepCellForPi(inputs)
File "/home/carserver1/anaconda3/envs/mindspore_py39/lib/python3.9/site-packages/mindspore/nn/cell.py", line 404, in __call__
out = self.compile_and_run(*inputs)
File "/home/carserver1/anaconda3/envs/mindspore_py39/lib/python3.9/site-packages/mindspore/nn/cell.py", line 698, in compile_and_run
return _cell_graph_executor(self, *new_inputs, phase=self.phase)
File "/home/carserver1/anaconda3/envs/mindspore_py39/lib/python3.9/site-packages/mindspore/common/api.py", line 627, in __call__
return self.run(obj, *args, phase=phase)
File "/home/carserver1/anaconda3/envs/mindspore_py39/lib/python3.9/site-packages/mindspore/common/api.py", line 655, in run
return self._exec_pip(obj, *args, phase=phase_real)
File "/home/carserver1/anaconda3/envs/mindspore_py39/lib/python3.9/site-packages/mindspore/common/api.py", line 78, in wrapper
results = fn(*arg, **kwargs)
File "/home/carserver1/anaconda3/envs/mindspore_py39/lib/python3.9/site-packages/mindspore/common/api.py", line 638, in _exec_pip
return self._graph_executor(args_list, phase)
RuntimeError: mindspore/ccsrc/vm/backend.cc:835 RunGraph] The actor runs failed, actor name: kernel_graph_82
# In file /home/carserver1/anaconda3/envs/mindspore_py39/lib/python3.9/site-packages/mindspore/ops/_grad/grad_math_ops.py(81)
num_selected = reshape(reduce_sum(indicators, axis), output_shape_kept_dims) + min_num
^
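For scale, the failing allocation reported for the Conv2DBackpropInput kernel is 4213571584 B ≈ 3.9 GiB in a single block, while the allocator reports only 103809024 B ≈ 99 MB free at that moment.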
## Loss networks
import mindspore.nn as nn
import mindspore.numpy as mnp

class compute_loss_pi(nn.Cell):
    def __init__(self, actor_critic, clip_ratio=0.2):
        super(compute_loss_pi, self).__init__()
        self.ac = actor_critic
        self.clip_ratio = clip_ratio

    def construct(self, data):
        state, image, act, adv, logp_old = data['state'], data['image'], data['act'], data['adv'], data['logp']
        # Policy loss
        pi, logp = self.ac.pi(state, image, act)
        ratio = mnp.exp(logp - logp_old)
        clip_adv = mnp.clip(ratio, 1 - self.clip_ratio, 1 + self.clip_ratio) * adv  # clamp the ratio
        loss_pi = -(mnp.min(ratio * adv, clip_adv)).mean()
        return loss_pi

    def backbone_network(self):
        return self.ac.pi

class compute_loss_v(nn.Cell):
    def __init__(self, actor_critic):
        super(compute_loss_v, self).__init__()
        self.ac = actor_critic

    def construct(self, data):
        state, image, ret = data['state'], data['image'], data['ret']
        return ((self.ac.v(state, image) - ret) ** 2).mean()

    def backbone_network(self):
        return self.ac.v
## train (one epoch)
data = dict(state=state_bufs, image=img_bufs, act=act_bufs, ret=ret_bufs, adv=adv_bufs, logp=logp_bufs)
inputs = {k: Tensor(v, dtype=mindspore.dtype.float32) for k, v in list(data.items())}  # removed the device argument here
trainOneStepCellForPi = nn.TrainOneStepCell(self.compute_loss_pi, self.pi_optimizer)
trainOneStepCellForV = nn.TrainOneStepCell(self.compute_loss_v, self.vf_optimizer)
if not self.only_once:
    trainOneStepCellForPi.set_train()
    trainOneStepCellForV.set_train()
    self.only_once = True
# Train the policy with multiple steps of gradient descent
for i in range(self.train_pi_iters):
    trainOneStepCellForPi(inputs)
    pi_info = self.cal_pi_info(inputs)
    kl = np.average(pi_info['kl'])
    if kl > 1.5 * self.target_kl:
        self.logger.log('Early stopping at step %d due to reaching max kl.' % i)
        break
# Value function learning
for i in range(self.train_v_iters):
    trainOneStepCellForV(inputs)
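Since the failing allocation belongs to the conv2 gradient inside compute_loss_pi, and the whole rollout buffer is passed to TrainOneStepCell as one batch, here is a sketch of how the update could be fed in fixed-size minibatches to test whether the ~3.9 GiB allocation scales with batch size. This is not the original code and I have not run it; minibatch_iter and mb_size are made-up names, and data is assumed to be the same dict of numpy buffers built above.

import mindspore
from mindspore import Tensor

def minibatch_iter(data, mb_size):
    # Yield fixed-size minibatches (remainder dropped) so graph mode only compiles one input shape.
    n = len(data['act'])
    for start in range(0, n - mb_size + 1, mb_size):
        yield {k: Tensor(v[start:start + mb_size], dtype=mindspore.dtype.float32)
               for k, v in data.items()}

# for i in range(self.train_pi_iters):
#     for mb in minibatch_iter(data, mb_size=64):
#         trainOneStepCellForPi(mb)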
*******************************************************************************************************************
Please run it again with export GLOG_v=1; this needs to be analyzed from the detailed logs.
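For reference, GLOG_v sets the MindSpore log level (0=DEBUG, 1=INFO, 2=WARNING which is the default, 3=ERROR) and should be in the environment before MindSpore is loaded. Besides the shell export, setting it at the very top of the entry script before the first import of mindspore should also work; a minimal sketch:

import os
os.environ['GLOG_v'] = '1'  # must be set before the first `import mindspore`
import mindspore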