Traceback (most recent call last):
  File "run_RACE.py", line 243, in <module>
    main(args.get_args(data_dir,gpu_ids), bert_vocab_file)
  File "run_RACE.py", line 226, in main
    config.gradient_accumulation_steps, config.max_grad_norm, device, scheduler, label_list, config.output_dir)
  File "run_RACE.py", line 52, in train
    logits = model(input_ids, segment_ids, input_mask)
  File "/home/amax/base2/venv/lib/python3.6/site-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/amax/base2/venv/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 152, in forward
    outputs = self.parallel_apply(replicas, inputs, kwargs)
  File "/home/amax/base2/venv/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 162, in parallel_apply
    return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
  File "/home/amax/base2/venv/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py", line 85, in parallel_apply
    output.reraise()
  File "/home/amax/base2/venv/lib/python3.6/site-packages/torch/_utils.py", line 394, in reraise
    raise self.exc_type(msg)
RuntimeError: Caught RuntimeError in replica 0 on device 2.
Original Traceback (most recent call last):
  File "/home/amax/base2/venv/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py", line 60, in _worker
    output = module(*input, **kwargs)
  File "/home/amax/base2/venv/lib/python3.6/site-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/amax/base2/BertOrigin/BertOrigin.py", line 58, in forward
    flat_input_ids, flat_token_type_ids, flat_attention_mask, encoder_hidden_states=False)
  File "/home/amax/base2/venv/lib/python3.6/site-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/amax/base2/venv/lib/python3.6/site-packages/transformers/modeling_bert.py", line 841, in forward
    return_dict=return_dict,
  File "/home/amax/base2/venv/lib/python3.6/site-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/amax/base2/venv/lib/python3.6/site-packages/transformers/modeling_bert.py", line 482, in forward
    output_attentions,
  File "/home/amax/base2/venv/lib/python3.6/site-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/amax/base2/venv/lib/python3.6/site-packages/transformers/modeling_bert.py", line 402, in forward
    output_attentions=output_attentions,
  File "/home/amax/base2/venv/lib/python3.6/site-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/amax/base2/venv/lib/python3.6/site-packages/transformers/modeling_bert.py", line 339, in forward
    output_attentions,
  File "/home/amax/base2/venv/lib/python3.6/site-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/amax/base2/venv/lib/python3.6/site-packages/transformers/modeling_bert.py", line 275, in forward
    context_layer = torch.matmul(attention_probs, value_layer)
RuntimeError: CUDA out of memory. Tried to allocate 30.00 MiB (GPU 2; 15.90 GiB total capacity; 15.08 GiB already allocated; 11.69 MiB free; 15.18 GiB reserved in total by PyTorch)
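The failed allocation itself is tiny (30.00 MiB): GPU 2 was already essentially full when the attention matmul ran, so the fix comes down to lowering per-GPU memory. Besides spreading the batch over more GPUs (the route taken below), the per-step batch can be shrunk and the effective batch size kept via gradient accumulation, which run_RACE.py already passes around as config.gradient_accumulation_steps. A minimal sketch of that pattern, with a hypothetical helper whose model/loader/optimizer/criterion arguments are placeholders (this is not the project's actual train function):

import torch
import torch.nn as nn

def train_with_accumulation(model, loader, optimizer, criterion,
                            accumulation_steps, max_grad_norm):
    # Several small forward/backward passes share one optimizer step, so peak
    # GPU memory per step stays low while the effective batch size becomes
    # per_step_batch * accumulation_steps.
    model.train()
    optimizer.zero_grad()
    for step, (input_ids, segment_ids, input_mask, labels) in enumerate(loader):
        logits = model(input_ids, segment_ids, input_mask)
        loss = criterion(logits, labels)
        (loss / accumulation_steps).backward()  # scale so accumulated gradients average
        if (step + 1) % accumulation_steps == 0:
            nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            optimizer.zero_grad()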
import torch
import torch.nn as nn

def get_device(gpu_id):
    # Use the requested GPU if CUDA is available, otherwise fall back to the CPU
    device = torch.device("cuda:" + str(gpu_id)
                          if torch.cuda.is_available() else "cpu")
    return device

# https://github.com/songyingxin/MRC-Pytorch/blob/master/run_RACE.py
def main(cfg_gpu_ids):
    # e.g. "2,3,5" -> [2, 3, 5]
    gpu_ids = [int(device_id) for device_id in cfg_gpu_ids.split(',')]
    device = get_device(gpu_ids[0])  # first listed GPU is DataParallel's primary device
    model = BertClassification()     # model class defined in BertOrigin/BertOrigin.py
    model.to(device)
    # Replicate the model across all listed GPUs; each replica gets a slice of the batch
    model = nn.DataParallel(model, device_ids=gpu_ids)

if __name__ == "__main__":
    gpu_ids = "2,3,5"
    main(gpu_ids)
The code itself is not the problem: the GPUs in use simply did not have enough memory. Running on three GPUs is sufficient.
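nn.DataParallel splits each input batch across device_ids, so every extra GPU in the list lowers the per-GPU activation memory, which is why three cards are enough here. As a quick sanity check before training, the headroom on each candidate GPU can be printed (a minimal sketch; the indices are just the ones used above, and memory held by other processes will only show up in nvidia-smi, not here):

import torch

# Report total capacity and the memory this process has already allocated on each GPU
for gpu_id in [2, 3, 5]:
    props = torch.cuda.get_device_properties(gpu_id)
    allocated = torch.cuda.memory_allocated(gpu_id)
    print("GPU %d: %.2f GiB total, %.2f GiB allocated by this process"
          % (gpu_id, props.total_memory / 2**30, allocated / 2**30))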