PyTorch training: unstable GPU utilization

Problem description:
Multi-GPU training on a single host with four GPUs via torch.nn.DataParallel. GPU memory usage looks normal, but GPU utilization is very low and sits at 0% for long stretches.

Problem cause:
The CPU spends too long reading the images for each batch (batch_size = 1024; setting the DataLoader to 4, 8, 16 or 32 workers barely helps, and pin_memory=True). The original Dataset is shown below; the per-sample disk I/O clearly dominates each read. The fix was to read all of the images into memory up front (hardware permitting…) and then assemble each batch from those in-memory arrays.

import cv2
import numpy as np
from torch.utils.data import Dataset, DataLoader


class myDataset(Dataset):
    def __init__(self, label_path, rootDir="test1"):
        super(myDataset, self).__init__()

        # dict: image path -> list of boxes, each box a list of floats
        truth = {}
        with open(label_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.rstrip()
                data = line.split(" ")
                truth[data[0]] = []
                for i in data[1:]:
                    truth[data[0]].append([float(j) for j in i.split(',')])

        self.truth = truth
        self.root = rootDir
        self.keys = list(self.truth.keys())            # fixed key order, built once
        self.indexRead = np.zeros(len(self.truth), dtype=int)  # (unused)

    def __len__(self):
        return len(self.keys)

    def __getitem__(self, index):
        # DataLoader(shuffle=True) already randomizes the order of indices
        img_path = self.keys[index]
        bboxes = np.array(self.truth[img_path], dtype=np.float32)

        img = cv2.imread(img_path, 0)                  # grayscale; the disk I/O happens here, per sample
        if img is None:
            print('read--- %s ---error' % img_path)

        img = np.expand_dims(img, axis=2)              # append a channel axis: (H, W) -> (H, W, 1)
        return img, bboxes

# Usage: wrap it directly in a DataLoader (collate is a user-defined collate_fn, defined elsewhere):
# valData = myDataset("val.txt", "val")
# valloader = DataLoader(valData, batch_size=val_batchsize, shuffle=True,
#                        num_workers=32, pin_memory=True, drop_last=True, collate_fn=collate)
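Before rewriting the Dataset, it is worth confirming that the stalls really come from data loading rather than the model. A minimal sketch that measures how long each call to the DataLoader blocks (valloader is the loader built above; the numbers are only a rough indication):

import time

def profile_loader(loader, n_batches=20):
    # measure how long each next() on the DataLoader blocks waiting for the workers
    it = iter(loader)
    t_data = 0.0
    t_prev = time.time()
    for _ in range(n_batches):
        imgs, boxes = next(it)          # blocks until a full batch of images is decoded
        t_data += time.time() - t_prev
        # the forward/backward pass would run here; call torch.cuda.synchronize()
        # before reading the clock again if you also want to time the GPU step
        t_prev = time.time()
    print('average data wait per batch: %.3f s' % (t_data / n_batches))

# profile_loader(valloader)   # with the disk-reading Dataset this wait dominates each step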

The improved Dataset:

class myDataset(Dataset):
    def __init__(self, label_path, rootDir='t'):
        super(myDataset, self).__init__()

        # read every image into memory once, so __getitem__ does no disk I/O
        self.imgsList = []
        self.labelList = []
        with open(label_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.rstrip()
                data = line.split(" ")
                imgPath = data[0]
                img = cv2.imread(imgPath, 0)               # grayscale
                if img is None:
                    print('read--- %s ---error' % imgPath)
                else:
                    img = np.expand_dims(img, axis=2)      # (H, W) -> (H, W, 1)
                    img = img.transpose(2, 0, 1)           # -> (1, H, W), channel first
                    self.imgsList.append(img)

                    # if a line carries several boxes, only the last one is kept,
                    # so every label ends up with the same fixed size
                    for i in data[1:]:
                        boxes = np.array([float(j) for j in i.split(',')], dtype=np.float32)
                    self.labelList.append(boxes)

    def __len__(self):
        return len(self.imgsList)

    def __getitem__(self, index):
        img, boxes = self.imgsList[index], self.labelList[index]
        return img, boxes


def allDataset(label_path):
    # pull the whole dataset through the loader once, as a single giant batch
    dataset = myDataset(label_path)
    length = len(dataset)
    train_loader = DataLoader(dataset, batch_size=length, shuffle=True,
                              num_workers=8, pin_memory=False, collate_fn=collate)
    images, labels = next(iter(train_loader))              # collate is the user-defined collate_fn
    print(images.size(), labels.size())
    return images, labels
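Reading everything into RAM only works if the whole set fits, hence the "hardware permitting" caveat above. A rough back-of-the-envelope check (the 416×416 image size is an assumption for illustration; "train30w.txt" suggests roughly 300k samples):

n_images = 300_000        # roughly what "train30w.txt" implies
h, w = 416, 416           # assumed image size -- substitute the real one
bytes_per_img = h * w     # grayscale uint8, one channel, one byte per pixel
print('preloaded images need about %.1f GB of RAM'
      % (n_images * bytes_per_img / 1024**3))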

How to call it: build the in-memory tensors first, then wrap them in a DataLoader:

    from torch.utils.data import TensorDataset

    trainIn, trainOut = allDataset("train30w.txt")
    trainData = TensorDataset(trainIn, trainOut)
    trainloader = DataLoader(trainData, batch_size=train_batchsize, shuffle=True,
                             num_workers=8, pin_memory=True, drop_last=True)
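With pin_memory=True the host-to-device copies can also be issued asynchronously. A minimal sketch of the per-batch transfer under DataParallel, assuming the loader yields uint8 image tensors as built above (net, criterion and optimizer are placeholders, not from the original code):

import torch

device = torch.device('cuda')
# net = torch.nn.DataParallel(net).to(device)        # single host, four GPUs

for imgs, boxes in trainloader:
    # non_blocking=True only overlaps the copy because pin_memory=True was set on the loader
    imgs = imgs.to(device, non_blocking=True).float().div(255.0)
    boxes = boxes.to(device, non_blocking=True).float()
    # out = net(imgs)
    # loss = criterion(out, boxes)
    # optimizer.zero_grad(); loss.backward(); optimizer.step()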


9.12 update:

Error-metric problem. Initially the following was used:

Error = torch.mean(torch.abs(x - y)/torch.abs(y))

But when |y| >> |x| the value saturates near 1 (e.g. x = 0.001 and y = 100 gives roughly 1.0, no matter how small x is), so it does not reflect the true error.
Revised version:

import torch

def listErrors(x, y):
    # element-wise symmetric relative error: |x - y| / min(|x|, |y|)
    x = x.view(-1)
    y = y.view(-1)
    absD = torch.abs(x - y)
    absX = torch.abs(x)
    absY = torch.abs(y)
    errlist = torch.zeros_like(absD)   # fresh output tensor (reusing x here would overwrite the predictions)
    for i in range(x.size(0)):
        errlist[i] = absD[i].div(min(absX[i], absY[i]))
    return errlist

Error = torch.mean(listErrors(x, y))
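With batch_size = 1024 the Python loop above is slow; the same metric can be computed in one shot with element-wise tensor ops. A vectorized sketch (the eps guard against division by zero is an addition, not part of the original):

import torch

def listErrorsVec(x, y, eps=1e-12):
    # |x - y| / min(|x|, |y|), element-wise; same values as listErrors above
    x = x.view(-1)
    y = y.view(-1)
    denom = torch.min(torch.abs(x), torch.abs(y)).clamp_min(eps)
    return torch.abs(x - y) / denom

# Error = torch.mean(listErrorsVec(x, y))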
