在 MindSpore 中对一个 128*112*112*48 的 Tensor 执行 resize 或 ravel 时,报错 TimeoutError 和 RuntimeError: Response is empty

程序代码片段

    def construct(self, x):
        """Add two branches derived from ``x`` onto a residual copy of ``x``.

        This is the reproduction snippet from the bug report: the ``resize``
        call near the end is the statement reported to fail when the network
        is compiled. The excerpt stops without a ``return``; the remainder of
        the method is not shown.

        Args:
            x: Input tensor. ``self.g(x)`` is unpacked into four dimensions
                ``(b, c, h, w)``, so its output must be 4-D.
        """
        # Computed from the raw input; used below as the left operand of self.bmm.
        att = self.DotKernel(x)
        residual = x
        g = self.g(x)
        # Unpacking into four names requires self.g(x) to be 4-D.
        b, c, h, w = g.shape
        # Collapse the last two axes, then swap axes 1 and 2
        # (presumably (b, c, h*w) -> (b, h*w, c) -- TODO confirm view semantics).
        g = self.transpose(g.view(b, c, -1), (0, 2, 1))
        # Swap the axes back, flatten, and view as (b, c, h, w) again.
        x_1 = msnp.ravel(self.transpose(g, (0, 2, 1))).view(b, c, h, w)
        x_1 = self.w_1(x_1)
        # First branch plus the untouched input.
        out = x_1 + residual
        
        # Second branch: self.bmm applied to att and g
        # (assumes self.bmm is a batched matmul -- confirm), then axes 1 and 2 swapped.
        x_2 = self.bmm(att, g)
        x_2 = self.transpose(x_2, (0, 2, 1))
        # Statement reported as the failure point.
        x_2 = x_2.resize(b, c, h, w)
        out = out + x_2

将以上代码中 x_2 的 resize 操作换成如下写法:

# Variant of the failing step: flatten the transposed tensor with ravel,
# then view it as (b, c, h, w). Reported to fail with the same errors.
x_2 = self.transpose(x_2, (0, 2, 1))
x_2 = msnp.ravel(x_2)
x_2 = x_2.view(b, c, h, w)

换成上述写法后也一样报错,具体报错信息如下:

{'enable_modelarts': 'Whether training on modelarts default: False', 'data_url': 'Url for modelarts', 'train_url': 'Url for modelarts', 'data_path': 'The location of input data', 'output_pah': 'The location of the output file', 'device_target': 'device id of GPU or Ascend. (Default: None)', 'enable_profiling': 'Whether enable profiling while training default: False', 'is_distributed': 'distributed training', 'resume': 'resume training with existed checkpoint', 'model_size': 'shuffleNetV1 model size choices 2.0x, 1.5x, 1.0x, 0.5x', 'device_id': 'device id', 'file_name': 'output file name', 'file_format': 'file format choices [AIR MINDIR ONNX]'}
{'amp_level': 'O0',
 'batch_size': 128,
 'checkpoint_url': '',
 'ckpt_path': '',
 'config_path': '/root/shufflenetv1/src/model_utils/../../gpu_default_config.yaml',
 'data_path': '/cache/data',
 'data_url': '',
 'decay_method': 'cosine',
 'device_id': 0,
 'device_target': 'GPU',
 'enable_modelarts': False,
 'enable_profiling': False,
 'epoch_size': 1,
 'eval_dataset_path': '',
 'f': 'extra argument on jupyter',
 'file_format': 'ONNX',
 'file_name': 'shufflenetv1',
 'is_distributed': False,
 'is_transfer': True,
 'keep_checkpoint_max': 5,
 'label_smooth_factor': 0.2,
 'load_path': '/cache/checkpoint_path',
 'loss_scale': 1024,
 'lr_end': 0.0,
 'lr_init': 0.0,
 'lr_max': 0.5,
 'model_size': '2.0x',
 'momentum': 0.9,
 'num_classes': 20,
 'onnx_dataset_path': '/root/shufflenetv1/Arch28_split/val/',
 'onnx_path': 'shufflenetv1.onnx',
 'output_path': '/cache/train',
 'resume': '/root/shufflenetv1/ckpt/shufflenetv1.ckpt',
 'save_checkpoint': True,
 'save_checkpoint_epochs': 5,
 'save_ckpt_path': '/root/autodl-tmp/ckpt/',
 'train_dataset_path': '/root/shufflenetv1/Arch28_split/train/',
 'train_url': '',
 'warmup_epochs': 4,
 'weight_decay': 4e-05}
Please check the above information for the configurations
[WARNING] ME(97662:140323767718080,MainProcess):2022-09-24-16:45:36.861.238 [mindspore/train/serialization.py:712] For 'load_param_into_net', 11 parameters in the 'net' are not loaded, because they are not in the 'parameter_dict', please check whether the network structure is consistent when training and loading checkpoint.
[WARNING] ME(97662:140323767718080,MainProcess):2022-09-24-16:45:36.861.423 [mindspore/train/serialization.py:714] sln.t.weight is not loaded.
[WARNING] ME(97662:140323767718080,MainProcess):2022-09-24-16:45:36.861.484 [mindspore/train/serialization.py:714] sln.p.weight is not loaded.
[WARNING] ME(97662:140323767718080,MainProcess):2022-09-24-16:45:36.861.536 [mindspore/train/serialization.py:714] sln.g.weight is not loaded.
[WARNING] ME(97662:140323767718080,MainProcess):2022-09-24-16:45:36.861.585 [mindspore/train/serialization.py:714] sln.bn.moving_mean is not loaded.
[WARNING] ME(97662:140323767718080,MainProcess):2022-09-24-16:45:36.861.644 [mindspore/train/serialization.py:714] sln.bn.moving_variance is not loaded.
[WARNING] ME(97662:140323767718080,MainProcess):2022-09-24-16:45:36.861.692 [mindspore/train/serialization.py:714] sln.bn.gamma is not loaded.
[WARNING] ME(97662:140323767718080,MainProcess):2022-09-24-16:45:36.861.739 [mindspore/train/serialization.py:714] sln.bn.beta is not loaded.
[WARNING] ME(97662:140323767718080,MainProcess):2022-09-24-16:45:36.861.785 [mindspore/train/serialization.py:714] sln.w_1.weight is not loaded.
[WARNING] ME(97662:140323767718080,MainProcess):2022-09-24-16:45:36.861.832 [mindspore/train/serialization.py:714] sln.w_2.weight is not loaded.
[WARNING] ME(97662:140323767718080,MainProcess):2022-09-24-16:45:36.861.879 [mindspore/train/serialization.py:714] classifier.weight is not loaded.
[WARNING] ME(97662:140323767718080,MainProcess):2022-09-24-16:45:36.861.925 [mindspore/train/serialization.py:714] classifier.bias is not loaded.
[WARNING] ME(97662:140323767718080,MainProcess):2022-09-24-16:45:36.996.728 [mindspore/train/model.py:1077] For ValAccMonitor callback, {'end', 'epoch_end'} methods may not be supported in later version, Use methods prefixed with 'on_train' or 'on_eval' instead when using customized callbacks.
Traceback (most recent call last):
  File "/root/miniconda3/envs/shuffle/lib/python3.7/site-packages/mindspore/_extends/remote/kernel_build_server_akg.py", line 55, in 
    messager.run()
  File "/root/miniconda3/envs/shuffle/lib/python3.7/site-packages/mindspore/_extends/remote/kernel_build_server.py", line 106, in run
    self.loop()
  File "/root/miniconda3/envs/shuffle/lib/python3.7/site-packages/mindspore/_extends/remote/kernel_build_server.py", line 103, in loop
    self.handle()
  File "/root/miniconda3/envs/shuffle/lib/python3.7/site-packages/mindspore/_extends/remote/kernel_build_server_akg.py", line 39, in handle
    self.akg_builder.handle(self, arg)
  File "/root/miniconda3/envs/shuffle/lib/python3.7/site-packages/mindspore/_extends/remote/kernel_build_server.py", line 168, in handle
    res = self.compile()
  File "/root/miniconda3/envs/shuffle/lib/python3.7/site-packages/mindspore/_extends/remote/kernel_build_server.py", line 145, in compile
    return self.akg_processor.compile(self.attrs)
  File "/root/miniconda3/envs/shuffle/lib/python3.7/site-packages/mindspore/_extends/parallel_compile/akg_compiler/akg_process.py", line 133, in compile
    res.get(timeout=self.wait_time)
  File "/root/miniconda3/envs/shuffle/lib/python3.7/multiprocessing/pool.py", line 653, in get
    raise TimeoutError
multiprocessing.context.TimeoutError
model size is  2.0x
============== Starting Training ==============
Traceback (most recent call last):
  File "train.py", line 149, in 
    train()
  File "/root/shufflenetv1/src/model_utils/moxing_adapter.py", line 113, in wrapped_func
    run_func(*args, **kwargs)
  File "train.py", line 137, in train
    model.train(config.epoch_size, train_dataset, callbacks=cb, dataset_sink_mode=False)
  File "/root/miniconda3/envs/shuffle/lib/python3.7/site-packages/mindspore/train/model.py", line 1049, in train
    initial_epoch=initial_epoch)
  File "/root/miniconda3/envs/shuffle/lib/python3.7/site-packages/mindspore/train/model.py", line 98, in wrapper
    func(self, *args, **kwargs)
  File "/root/miniconda3/envs/shuffle/lib/python3.7/site-packages/mindspore/train/model.py", line 616, in _train
    self._train_process(epoch, train_dataset, list_callback, cb_params, initial_epoch, valid_infos)
  File "/root/miniconda3/envs/shuffle/lib/python3.7/site-packages/mindspore/train/model.py", line 907, in _train_process
    outputs = self._train_network(*next_element)
  File "/root/miniconda3/envs/shuffle/lib/python3.7/site-packages/mindspore/nn/cell.py", line 578, in __call__
    out = self.compile_and_run(*args)
  File "/root/miniconda3/envs/shuffle/lib/python3.7/site-packages/mindspore/nn/cell.py", line 965, in compile_and_run
    self.compile(*inputs)
  File "/root/miniconda3/envs/shuffle/lib/python3.7/site-packages/mindspore/nn/cell.py", line 938, in compile
    jit_config_dict=self._jit_config_dict)
  File "/root/miniconda3/envs/shuffle/lib/python3.7/site-packages/mindspore/common/api.py", line 1137, in compile
    result = self._graph_executor.compile(obj, args_list, phase, self._use_vm_mode())
RuntimeError: Response is empty

----------------------------------------------------
- C++ Call Stack: (For framework developers)
----------------------------------------------------
mindspore/ccsrc/backend/common/session/kernel_build_client.h:110 Response

不知道是不是和算子融合有关系,但关闭算子融合之后再试,还是报同样的错;又或者是数据太大了?

****************************************************解答*****************************************************

从报错信息看,这里并不是 ValueError,而是 AKG 算子编译阶段的 TimeoutError,以及随后的 RuntimeError: Response is empty;很可能是 tensor 的 size 过大造成的。您可能需要降低 batch_size(如果它是 x_2 的某个维度的话),或者,ops.InplaceAdd 可能有用。

你可能感兴趣的:(python,tensorflow,开发语言)