问题描述:
单主机四 GPU,利用 torch.nn.DataParallel 进行多 GPU 训练;GPU 的显存占用正常,但利用率很低,很长一段时间停留在 0%。
问题原因:
读取每个 batch 的图像时,CPU 端读取耗时较长(batch_size = 1024;将 DataLoader 的 num_workers 设为 4、8、16、32 改善都不大,且已设置 pin_memory=True)。最初的 dataset 写法如下,每次取样都要从磁盘读取图像,IO 耗时显然较长。于是改为先将图像全部读入内存(在硬件支持的前提下),之后每次取样只做变量赋值。
class myDataset(Dataset):
    """Map-style dataset that reads one grayscale image from disk per item.

    Every non-blank line of the label file is parsed as
    ``<image_path> v,v,... [v,v,...] ...``: the first whitespace-separated
    field is the image path and each following field is one comma-separated
    row of floats.
    """

    def __init__(self, lable_path, rootDir="test1"):
        """Parse the label file into ``self.truth`` (``{image_path: rows}``).

        Args:
            lable_path: Path of the UTF-8 text label file.
            rootDir: Stored as ``self.root``; it is not used to resolve
                image paths, which are taken verbatim from the label file.
        """
        super(myDataset, self).__init__()
        truth = {}
        # `with` guarantees the handle is closed even if parsing raises.
        with open(lable_path, 'r', encoding='utf-8') as f:
            for line in f:
                # split() copes with the trailing separator / newline without
                # blindly chopping the last character of the line.
                fields = line.split()
                if not fields:
                    continue  # blank line: nothing to register
                # Empty pieces (e.g. from a trailing comma) are ignored.
                truth[fields[0]] = [
                    [float(v) for v in field.split(',') if v]
                    for field in fields[1:]
                ]
        self.truth = truth
        # Cached once so __getitem__ does not rebuild the key list per sample.
        self._paths = list(truth)
        self.root = rootDir
        # Builtin `int` replaces the removed `np.int` alias (same dtype).
        self.indexRead = np.zeros(len(self.truth), int)

    def __len__(self):
        """Return the number of distinct image paths in the label file."""
        return len(self.truth)

    def __getitem__(self, index):
        """Load the image registered at position ``index`` and its label rows.

        Returns:
            ``(img, bboxes)`` where ``img`` is the grayscale image with a
            trailing channel axis added and ``bboxes`` is a float array built
            from the label rows of that image.

        Raises:
            ValueError: If the image cannot be read.
        """
        # Honour `index`: the sampler (e.g. DataLoader(shuffle=True)) already
        # randomises the order; drawing a random path here instead would
        # repeat some samples and skip others within an epoch.
        img_path = self._paths[index]
        # Builtin `float` replaces the removed `np.float` alias (float64).
        bboxes = np.array(self.truth[img_path], dtype=float)
        img = cv2.imread(img_path, 0)  # flag 0: read as single-channel grayscale
        if img is None:
            print('read--- %s ---error' % img_path)
            # Fail with a clear message instead of an unrelated axis error
            # from np.expand_dims(None, ...).
            raise ValueError('cannot read image: %s' % img_path)
        img = np.expand_dims(img, axis=2)  # append a trailing channel axis
        return img, bboxes
#调用方法,直接DataLoader:
# valData = myDataset("val.txt","val")
# valloader = DataLoader(valData, batch_size=val_batchsize, shuffle=True,
# num_workers=32, pin_memory=True, drop_last=True, collate_fn=collate)
改善后的dataset:
class myDataset(Dataset):
    """In-memory dataset: every image is decoded once, in ``__init__``.

    Every non-blank line of the label file is parsed as
    ``<image_path> v,v,... [v,v,...] ...``. Images that cannot be read are
    reported and skipped together with their labels, so ``imgsList`` and
    ``labelList`` always have the same length and share indices.
    """

    def __init__(self, lable_path, dir='t'):
        """Read all images and labels listed in the label file.

        Args:
            lable_path: Path of the UTF-8 text label file.
            dir: Unused; kept only so existing callers keep working.
        """
        super(myDataset, self).__init__()
        self.imgsList = []
        self.labelList = []
        # `with` guarantees the handle is closed even if parsing raises.
        with open(lable_path, 'r', encoding='utf-8') as f:
            for line in f:
                # split() copes with the trailing separator / newline without
                # blindly chopping the last character of the line.
                fields = line.split()
                if not fields:
                    continue  # blank line
                imgPath = fields[0]
                img = cv2.imread(imgPath, 0)  # flag 0: single-channel grayscale
                if img is None:
                    print('read--- %s ---error' % imgPath)
                    continue  # skip the labels too, keeping both lists aligned
                # Empty pieces (e.g. from a trailing comma) are ignored.
                rows = [
                    [float(v) for v in field.split(',') if v]
                    for field in fields[1:]
                ]
                # One label entry per image: a single field stays a 1-D array
                # (as before); several fields are stacked into one 2-D array
                # instead of being appended separately, which used to shift
                # every later label relative to its image.
                # Builtin `float` replaces the removed `np.float` alias.
                label = np.array(rows[0] if len(rows) == 1 else rows, dtype=float)
                # (H, W) -> (1, H, W): leading channel axis.
                self.imgsList.append(np.expand_dims(img, axis=0))
                self.labelList.append(label)

    def __len__(self):
        """Return the number of images held in memory."""
        return len(self.imgsList)

    def __getitem__(self, index):
        """Return the ``(image, label)`` pair stored at ``index``."""
        return self.imgsList[index], self.labelList[index]
def allDataset(label_path):
    """Load the whole dataset described by ``label_path`` as one collated batch.

    A ``myDataset`` is built from the label file and drained through a
    ``DataLoader`` whose batch size equals the dataset length, so the single
    batch it yields contains every sample (in shuffled order).

    Args:
        label_path: Path of the label file passed to ``myDataset``.

    Returns:
        ``(images, labels)`` as produced by ``collate`` for the full batch.
        NOTE(review): ``collate`` is defined outside this block; both results
        must expose ``.size()`` -- presumably they are tensors, confirm.

    Raises:
        ValueError: If the dataset contains no samples.
    """
    dataset = myDataset(label_path)
    lenth = len(dataset)
    if lenth == 0:
        # DataLoader rejects batch_size=0 with a less helpful message.
        raise ValueError('no samples could be loaded from %s' % label_path)
    train_loader = DataLoader(dataset, batch_size=lenth, shuffle=True,
                              num_workers=8, pin_memory=False, collate_fn=collate)
    # Builtin next(): DataLoader iterators no longer provide a .next() method.
    images, labels = next(iter(train_loader))
    print(images.size(), labels.size())
    return images, labels
调用方法,先制作dataset,再DataLoader:
from torch.utils.data import TensorDataset
# Load every training sample into memory once, as two batched objects.
trainIn, trainOut = allDataset("train30w.txt")
# Wrap them so each index yields one (input, target) pair.
trainData = TensorDataset(trainIn, trainOut)
# Batches are now drawn from the in-memory data.
trainloader = DataLoader(
    dataset=trainData,
    batch_size=train_batchsize,
    shuffle=True,
    num_workers=8,
    pin_memory=True,
    drop_last=True,
)
误差函数问题,开始采用:
# Mean relative error measured against |y|; any element with y == 0 divides by zero.
Error = torch.mean(torch.abs(x - y)/torch.abs(y))
但当 |y| ≫ |x| 时,|x - y|/|y| ≈ 1,即 Error ≈ 1;不能反映真实误差。
修改:
def listErrors(x, y):
    """Return the element-wise relative error ``|x - y| / min(|x|, |y|)``.

    Both inputs are flattened first. The result is a new 1-D tensor; neither
    input is modified (writing results through a flattened view of ``x``
    would overwrite the caller's tensor).

    Args:
        x: Tensor of predictions.
        y: Tensor of targets, broadcast-compatible with ``x`` once flattened.

    Returns:
        1-D tensor of relative errors. Elements where ``min(|x|, |y|)`` is
        zero come out as ``inf`` (or ``nan`` when the difference is zero too).
        Integer inputs yield a floating-point result because ``/`` is true
        division.
    """
    # reshape(-1) also accepts non-contiguous inputs, unlike view(-1).
    x = x.reshape(-1)
    y = y.reshape(-1)
    absD = torch.abs(x - y)
    # Dividing by the smaller magnitude keeps the error large when one value
    # dwarfs the other, instead of saturating near 1.
    smaller = torch.min(torch.abs(x), torch.abs(y))
    return absD / smaller
# Average the per-element relative errors returned by listErrors into one scalar.
Error = torch.mean(listErrors(x,y))