任务背景:在使用YOLOv5的过程中,使用DDP模式时,对其相关操作记录如下
nvidia-smi
怎么看呢?具体参数含义如下:
可以看到,我的服务器上目前空闲的GPU编号为0,1,2,3,4,5,6
# device 设备选择
# parser.add_argument('--device', default='', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
python train.py --device 0,1,2,3,4,5,6 --data coco128.yaml --weights yolov5s.pt --img 640
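注意事项:
1. 使用多块GPU时,batch size必须是GPU数量的整数倍(见下方select_device中对batch_size的检查)。例如上面使用7块GPU时,需要显式指定 --batch-size 为14、28等7的倍数,否则select_device中的断言会失败。
2. 直接用 python train.py --device 0,1,... 启动时,走的是DataParallel(DP)模式;要使用DDP模式,需要通过 python -m torch.distributed.run --nproc_per_node <GPU数量> train.py ... 的方式启动。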
可以查看它的实现函数:
train.py 692行
torch_utils.py 52行
def select_device(device='', batch_size=0, newline=True):
    """Select the compute device (CPU or CUDA GPUs) and log the selection.

    Args:
        device: Requested device, e.g. ``'cpu'``, ``'0'``, ``'0,1,2,3'`` or
            ``'cuda:0'``. An empty string means "use CUDA if it is
            available, otherwise fall back to the CPU".
        batch_size: Batch size to validate. When more than one GPU is
            selected and this value is positive, it must be a multiple of
            the GPU count.
        newline: When false, trailing whitespace (including the final
            newline) is stripped from the logged summary.

    Returns:
        ``torch.device('cuda:0')`` when CUDA is used, otherwise
        ``torch.device('cpu')``.

    Raises:
        AssertionError: If CUDA devices were requested but are not
            available, or if ``batch_size`` is not a multiple of the
            number of selected GPUs.
    """
    s = ''  # summary string that is logged at the end
    device = str(device).strip().lower().replace('cuda:', '')  # to string, 'cuda:0' to '0'
    cpu = device == 'cpu'
    if cpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'  # force torch.cuda.is_available() = False
    elif device:  # non-cpu device requested
        # The environment variable must be set before the is_available() check.
        os.environ['CUDA_VISIBLE_DEVICES'] = device
        # Count comma-separated device ids rather than characters, so that
        # multi-digit ids such as '10,11' are counted as 2 devices, not 4.
        requested = len(device.split(','))
        assert torch.cuda.is_available() and torch.cuda.device_count() >= requested, \
            f"Invalid CUDA '--device {device}' requested, use '--device cpu' or pass valid CUDA device(s)"

    cuda = not cpu and torch.cuda.is_available()
    if cuda:
        devices = device.split(',') if device else '0'  # i.e. ['0', '1', '6', '7'], or '0' by default
        n = len(devices)  # device count
        if n > 1 and batch_size > 0:  # with several GPUs the batch size must be a multiple of the GPU count
            assert batch_size % n == 0, f'batch-size {batch_size} not multiple of GPU count {n}'
        for i, d in enumerate(devices):
            # Properties are queried by position ``i``; the label ``d`` is the id the user passed in.
            p = torch.cuda.get_device_properties(i)
            s += f"CUDA:{d} ({p.name}, {p.total_memory / (1 << 20):.0f}MiB)\n"  # bytes to MiB
    else:
        s += 'CPU\n'

    if not newline:
        s = s.rstrip()
    # Log the selected device(s); on Windows non-ASCII characters are dropped from the message.
    LOGGER.info(s.encode().decode('ascii', 'ignore') if platform.system() == 'Windows' else s)  # emoji-safe
    return torch.device('cuda:0' if cuda else 'cpu')