PyTorch中的常见报错总结
NO.1
num_samples=0
检查dataset中的路径,路径不对时读取不到数据。
# ========================== 1 num_samples=0
# Demo 1: "num_samples=0" -- the dataset finds no samples under data_dir.
# Set flag to 1 to run this demo.
flag = 0
# flag = 1
if flag:
    # Alternative relative path kept for comparison (one directory level fewer):
    # train_dir = os.path.join("..", "data", "rmb_split", "train")
    train_dir = os.path.join("..", "..", "data", "rmb_split", "train")
    train_data = RMBDataset(data_dir=train_dir)

    # Build the DataLoader.
    train_loader = DataLoader(dataset=train_data, batch_size=16, shuffle=True)
NO.2
TypeError: pic should be PIL Image or ndarray. Got <class 'torch.Tensor'>
检查transform中是否存在两次ToTensor()方法
# ========================== 2
# TypeError: pic should be PIL Image or ndarray. Got <class 'torch.Tensor'>
# Demo 2: ToTensor() applied twice in one transform pipeline.
# Set flag to 1 to run this demo.
flag = 0
# flag = 1
if flag:
    train_transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.FiveCrop(200),
        # Convert every crop to a tensor, then stack the crops into one tensor.
        transforms.Lambda(
            lambda crops: torch.stack(
                [transforms.ToTensor()(crop) for crop in crops]
            )
        ),
        # The crops are already tensors at this point; enabling a further
        # ToTensor() below would hand it a tensor instead of a PIL Image/ndarray.
        # transforms.ToTensor(),
        # transforms.ToTensor(),
    ])

    train_dir = os.path.join("..", "..", "data", "rmb_split", "train")
    train_data = RMBDataset(data_dir=train_dir, transform=train_transform)
    train_loader = DataLoader(dataset=train_data, batch_size=16, shuffle=True)

    # Fetch a single batch to exercise the transform pipeline.
    data, label = next(iter(train_loader))
NO.3
RuntimeError: invalid argument 0: Sizes of tensors must match except in dimension 0. Got 93 and 89 in dimension 1 at /Users/soumith/code/builder/wheel/pytorch-src/aten/src/TH/generic/THTensorMath.cpp:3616
检查__getitem__函数中的操作,确保每个样本返回的张量尺寸一致
# ========================== 3
# RuntimeError: invalid argument 0: Sizes of tensors must match except in dimension 0
# Demo 3: samples returned by __getitem__ must share one shape so that the
# DataLoader can stack them into a batch. Set flag to 1 to run this demo.
flag = 0
# flag = 1
if flag:

    class FooDataset(Dataset):
        """Synthetic dataset of all-zero square images with random labels."""

        def __init__(self, num_data, data_dir=None, transform=None):
            self.foo = data_dir
            self.transform = transform
            self.num_data = num_data

        def __getitem__(self, item):
            # randint's upper bound is exclusive, so (63, 64) always yields 63
            # and every sample gets the same spatial size.
            size = torch.randint(63, 64, size=(1,)).item()
            fake_data = torch.zeros((3, size, size))
            fake_label = torch.randint(0, 10, size=(1,))
            return fake_data, fake_label

        def __len__(self):
            return self.num_data

    foo_dataset = FooDataset(num_data=10)
    foo_dataloader = DataLoader(dataset=foo_dataset, batch_size=4)

    # Collating one batch is the step that fails when sample sizes differ.
    data, label = next(iter(foo_dataloader))
NO.4
conv:
RuntimeError: Given groups=1, weight of size 6 1 5 5, expected input[16, 3, 32, 32] to have 1 channels, but got 3 channels instead
linear:
RuntimeError: size mismatch, m1: [16 x 576], m2: [400 x 120] at ../aten/src/TH/generic/THTensorMath.cpp:752
# ========================== 4
# Given groups=1, weight of size 6 3 5 5, expected input[16, 1, 32, 32] to have 3 channels, but got 1 channels instead
# RuntimeError: size mismatch, m1: [16 x 576], m2: [400 x 120] at ../aten/src/TH/generic/THTensorMath.cpp:752
# Demo 4: input channel count / image size must match what the network's
# conv and linear layers expect. Set flag to 1 to run this demo.
flag = 0
# flag = 1
if flag:

    class FooDataset(Dataset):
        """Synthetic dataset yielding all-zero tensors of a configurable shape."""

        def __init__(self, num_data, shape, data_dir=None, transform=None):
            self.foo = data_dir
            self.transform = transform
            self.num_data = num_data
            self.shape = shape

        def __getitem__(self, item):
            fake_data = torch.zeros(self.shape)
            fake_label = torch.randint(0, 10, size=(1,))

            if self.transform is not None:
                fake_data = self.transform(fake_data)

            return fake_data, fake_label

        def __len__(self):
            return self.num_data

    # ============================ step 1/5 data ============================
    # The trailing values are the alternatives to try for each setting.
    channel = 3  # 1 3
    img_size = 32  # 36 32
    train_data = FooDataset(num_data=32, shape=(channel, img_size, img_size))
    train_loader = DataLoader(dataset=train_data, batch_size=16, shuffle=True)

    # ============================ step 2/5 model ============================
    net = LeNet(classes=2)

    # ============================ step 3/5 loss function ============================
    criterion = nn.CrossEntropyLoss()

    # ============================ step 4/5 optimizer ============================
    optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9)  # choose the optimizer
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10)

    # ============================ step 5/5 training ============================
    # A single forward pass is enough to surface a shape mismatch.
    data, label = next(iter(train_loader))
    outputs = net(data)
NO.5
AttributeError: 'DataParallel' object has no attribute 'linear'
并行运算时,模型被DataParallel包装,原有的各网络层都被放到属性 module 之下,因此需要通过 net.module.linear 调用,
即在网络层前加入 module.
# ========================== 5
# AttributeError: 'DataParallel' object has no attribute 'linear'
# Demo 5: after wrapping a model in nn.DataParallel its layers are reached
# through the wrapper's ``module`` attribute. Set flag to 1 to run this demo.
flag = 0
# flag = 1
if flag:

    class FooNet(nn.Module):
        """Minimal network used only to inspect sub-module names."""

        def __init__(self):
            super(FooNet, self).__init__()
            self.linear = nn.Linear(3, 3, bias=True)
            self.conv1 = nn.Conv2d(3, 6, 5)
            self.pool1 = nn.MaxPool2d(5)

        def forward(self, x):
            # Placeholder: the forward pass is never used by this demo.
            return 1234567890

    net = FooNet()

    # Layer names before wrapping.
    for layer_name, layer in net.named_modules():
        print(layer_name)

    net = nn.DataParallel(net)

    # Layer names after wrapping.
    for layer_name, layer in net.named_modules():
        print(layer_name)

    # Access the layer through the wrapper's ``module`` attribute.
    print(net.module.linear)
NO.6
RuntimeError: Attempting to deserialize object on a CUDA device but torch.cuda.is_available() is False.
If you are running on a CPU-only machine, please use torch.load with map_location=torch.device('cpu') to map your storages to the CPU.
GPU训练的模型保存后,在无GPU的设备上无法直接加载
设置map_location="cpu"
# ========================== 6
# RuntimeError: Attempting to deserialize object on a CUDA device but torch.cuda.is_available() is False.
# If you are running on a CPU-only machine, please use torch.load with map_location=torch.device('cpu')
# to map your storages to the CPU.
# Demo 6: loading a checkpoint saved on a GPU machine on a CPU-only machine.
# Set flag to 1 to run this demo.
flag = 0
# flag = 1
if flag:
    path_state_dict = "./model_in_multi_gpu.pkl"

    # Loading without map_location is the call that raises the error:
    # state_dict_load = torch.load(path_state_dict)

    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    state_dict_load = torch.load(path_state_dict, map_location="cpu")
NO.7
保存的网络模型在当前python脚本中没有定义
提前定义该类
# ========================== 7
# AttributeError: Can't get attribute 'FooNet2' on <module '__main__' from '...'>
# Demo 7: loading a fully pickled model requires its class to be defined in
# the loading script. This demo is currently enabled (flag = 1).
# flag = 0
flag = 1
if flag:
    path_net = os.path.join(BASE_DIR, "foo_net.pkl")

    # save
    class FooNet2(nn.Module):
        """Minimal network whose class definition the pickled file refers to."""

        def __init__(self):
            super(FooNet2, self).__init__()
            self.linear = nn.Linear(3, 3, bias=True)

        def forward(self, x):
            # Placeholder: the forward pass is never used by this demo.
            return 1234567890

    # Uncomment to (re)create the pickled model file before loading it.
    # net = FooNet2()
    # torch.save(net, path_net)

    # load
    # NOTE: torch.load unpickles the file; only load files you trust.
    # NOTE(review): on PyTorch >= 2.6 torch.load defaults to weights_only=True,
    # so loading a whole pickled module needs weights_only=False -- confirm
    # against the PyTorch version in use.
    net_load = torch.load(path_net)
NO.8
RuntimeError: Assertion `cur_target >= 0 && cur_target < n_classes' failed. at ../aten/src/THNN/generic/ClassNLLCriterion.c:94
不满足 cur_target < n_classes,通常是因为标签从1开始而不是从0开始
修改label,从0开始
# ========================== 8
# RuntimeError: Assertion `cur_target >= 0 && cur_target < n_classes' failed.
# Demo 8: class labels passed to CrossEntropyLoss must lie in [0, n_classes).
# Set flag to 1 to run this demo.
flag = 0
# flag = 1
if flag:
    # Three samples with two class scores each, so valid labels are 0 and 1.
    inputs = torch.tensor([[1, 2], [1, 3], [1, 3]], dtype=torch.float)
    target = torch.tensor([0, 0, 1], dtype=torch.long)

    criterion = nn.CrossEntropyLoss()
    loss = criterion(inputs, target)
NO.9
RuntimeError: expected device cuda:0 and dtype Long but got device cpu and dtype Long
RuntimeError: Expected object of backend CPU but got backend CUDA for argument #2 'weight'
需计算的两个数据不在同一个设备上
采用to函数将数据迁移到同一个设备上
# ========================== 9.1
# RuntimeError: expected device cuda:0 and dtype Long but got device cpu and dtype Long
# Demo 9.1: both operands of an operation must live on the same device.
# Set flag to 1 to run this demo.
flag = 0
# flag = 1
if flag:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    x = torch.tensor([1])
    w = torch.tensor([2]).to(device)

    # Multiplying before x is moved mixes devices when ``device`` is CUDA:
    # y = w * x

    # Tensor.to returns the moved tensor, so the result must be reassigned.
    x = x.to(device)
    y = w * x
# ========================== 9.2
# RuntimeError: Expected object of backend CPU but got backend CUDA for argument #2 'weight'
# Demo 9.2: model inputs must be moved to the model's device.
# This demo is currently enabled (flag = 1).
# flag = 0
flag = 1
if flag:
    # Define the device here: the only other assignment in view sits inside
    # demo 9.1's disabled block, so relying on it would raise NameError.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def data_loader(num_data):
        """Yield ``num_data`` random (image, label) pairs."""
        for _ in range(num_data):
            img_ = torch.randn(1, 3, 224, 224)
            label_ = torch.randint(0, 10, size=(1,))
            yield img_, label_

    resnet18 = models.resnet18()
    resnet18.to(device)

    for inputs, labels in data_loader(2):
        # Discarding the result of Tensor.to leaves the inputs where they were:
        # inputs.to(device)
        # labels.to(device)
        # outputs = resnet18(inputs)

        # Reassign so the moved tensors are the ones passed to the model.
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = resnet18(inputs)
        print("outputs device:{}".format(outputs.device))
NO.10
DataLoader worker (pid 27) is killed by signal: Killed. Details are lost due to multiprocessing. Rerunning with num_workers=0 may give better error trace.
内存不够(不是GPU显存,是内存)
申请更大内存
NO.11
RuntimeError: reduce failed to synchronize: device-side assert triggered
使用BCE损失函数的时候,input必须在0-1之间,该报错是由于模型最后没有加sigmoid激活函数导致的。
NO.12
NO.13
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 1: invalid start byte
python2保存,python3加载,会报错
NO.14
NO.15
cuDNN error: CUDNN_STATUS_INTERNAL_ERROR
NO.16
CUDA out of memory. Tried to allocate 46.00 MiB (GPU 0; 2.00 GiB total capacity; 54.79 MiB already allocated; 39.30 MiB free; 74.00 MiB reserved in total by PyTorch)
import os
# Expose only the listed GPU ids to CUDA for this process; set this before
# any CUDA work starts, since the variable is read when CUDA initialises.
os.environ["CUDA_VISIBLE_DEVICES"] = "2, 3"