DataLoader RuntimeError Ran out of memory

错误

Traceback (most recent call last):
  File "train.py", line 137, in <module>
    train(model, device,criterion, trainLoader, optimizer, epoch,losses)

  File "train.py", line 33, in train
    for batchIdx, (data, target) in enumerate(trainLoader):

  File "C:\Users\user_name\AppData\Local\Continuum\anaconda3\lib\site-packages\torch\utils\data\dataloader.py", line 501, in __iter__
__mp_main__
    return _DataLoaderIter(self)

  File "C:\Users\user_name\AppData\Local\Continuum\anaconda3\lib\site-packages\torch\utils\data\dataloader.py", line 289, in __init__
    w.start()

  File "C:\Users\user_name\AppData\Local\Continuum\anaconda3\lib\multiprocessing\process.py", line 105, in start
    self._popen = self._Popen(self)

  File "C:\Users\user_name\AppData\Local\Continuum\anaconda3\lib\multiprocessing\context.py", line 223, in _Popen
    return _default_context.get_context().Process._Popen(process_obj)
  File "C:\Users\user_name\AppData\Local\Continuum\anaconda3\lib\multiprocessing\context.py", line 322, in _Popen
    return Popen(process_obj)
  File "C:\Users\user_name\AppData\Local\Continuum\anaconda3\lib\multiprocessing\popen_spawn_win32.py", line 65, in __init__
    reduction.dump(process_obj, to_child)
  File "C:\Users\user_name\AppData\Local\Continuum\anaconda3\lib\multiprocessing\reduction.py", line 60, in dump
    ForkingPickler(file, protocol).dump(obj)

OSError: [Errno 22] Invalid argument
C:\Users\user_name\AppData\Local\Continuum\anaconda3\lib\site-packages\h5py\__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "C:\Users\user_name\AppData\Local\Continuum\anaconda3\lib\multiprocessing\spawn.py", line 105, in spawn_main
    exitcode = _main(fd)
  File "C:\Users\user_name\AppData\Local\Continuum\anaconda3\lib\multiprocessing\spawn.py", line 115, in _main
    self = reduction.pickle.load(from_parent)

EOFError: Ran out of memory

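RuntimeError: 
        An attempt has been made to start a new process before the
        current process has finished its bootstrapping phase.

        This probably means that you are not using fork to start your
        child processes and you have forgotten to use the proper idiom
        in the main module:

            if __name__ == '__main__':
                freeze_support()
                ...

        The "freeze_support()" line can be omitted if the program
        is not going to be frozen to produce an executable.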

问题代码段

# Failing version: a DataLoader with worker processes (num_workers=8) is
# created and iterated at module top level, with no
# `if __name__ == '__main__':` guard around it.
train_loader = Data.DataLoader(dataset=train_data, batch_size=1, shuffle=True, num_workers=8)

# Iterating the loader is what starts the worker processes.
for batch_idx, content in enumerate(train_loader):
    print("正确进入for循环")
    input, label = content
    batch_num = input.size(0)
    input = Variable(input.cuda())
    feat_pool, feat = net(input, input, test_mode[1])

#  出错
》》》bug....
》》》不能进入for循环

原因

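在 Windows 上,Python 的 multiprocessing 以 spawn 方式创建子进程:每个子进程启动时都会重新导入主脚本。如果创建 DataLoader 工作进程(num_workers>0)的代码直接写在模块顶层,子进程导入脚本时会再次执行这段代码并试图继续创建进程,于是报出上面的 RuntimeError。因此,启动多进程的代码必须放在 if __name__ == '__main__': 保护块之内。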



解决办法

第一种:保留 num_workers>0 以加速数据加载,并把创建 DataLoader 和遍历它的代码放到 if __name__ == '__main__': 之下

if __name__ == '__main__':
    # Everything that creates or iterates the multi-worker loader lives under
    # the main guard, so a spawned worker that re-imports this script does not
    # run it again.
    train_loader = Data.DataLoader(dataset=train_data, batch_size=1, shuffle=True, num_workers=8)
    for batch_idx, content in enumerate(train_loader):
        print("正确进入for循环")
        input, label = content
        batch_num = input.size(0)
        input = Variable(input.cuda())
        feat_pool, feat = net(input, input, test_mode[1])


》》》correct....
》》》正确进入for循环

第二种:将 num_workers 设置为 0 ,即程序只在一个进程下运行

# With num_workers=0 no worker processes are started; batches are loaded in
# the main process, so no main guard is needed.
train_loader = Data.DataLoader(dataset=train_data, batch_size=1, shuffle=True, num_workers=0)

for batch_idx, content in enumerate(train_loader):
    print("正确进入for循环")
    input, label = content
    batch_num = input.size(0)
    input = Variable(input.cuda())
    feat_pool, feat = net(input, input, test_mode[1])

》》》correct....
》》》正确进入for循环

第三种:设置多进程使用声明

if __name__ == '__main__':
    # freeze_support() belongs directly after the main guard. It only matters
    # when the script is frozen into a Windows executable and does not replace
    # the guard itself, so the loader code stays inside the guard as well.
    torch.multiprocessing.freeze_support()

    train_loader = Data.DataLoader(dataset=train_data, batch_size=1, shuffle=True, num_workers=8)

    for batch_idx, content in enumerate(train_loader):
        print("正确进入for循环")
        input, label = content
        batch_num = input.size(0)
        input = Variable(input.cuda())
        feat_pool, feat = net(input, input, test_mode[1])

》》》correct....
》》》正确进入for循环
if __name__ == '__main__':
    # Process creation must sit under the main guard for the same reason as
    # the multi-worker DataLoader: spawned children re-import this script.
    train_loader = Data.DataLoader(dataset=train_data, batch_size=1, shuffle=True, num_workers=8)

    # NOTE(review): `target` is expected to be a callable, and a DataLoader
    # instance does not appear to be one, so this child process presumably
    # fails without doing useful work. The DataLoader already manages its own
    # workers via num_workers -- confirm whether this extra Process is needed.
    p = multiprocessing.Process(target=train_loader)
    p.start()
    p.join()

    for batch_idx, content in enumerate(train_loader):
        print("正确进入for循环")
        input, label = content
        batch_num = input.size(0)
        input = Variable(input.cuda())
        feat_pool, feat = net(input, input, test_mode[1])



》》》correct....
》》》正确进入for循环


例子

# 错误的样例

import time
import torch
import torch.utils.data as Data

# Deliberately unguarded: this is the failing variant. The multi-worker loader
# below is created and iterated at module top level.

# 100000 samples with 32 features each. (Passing the tuple (100000, 32) to
# torch.FloatTensor would build a 2-element tensor holding those two values.)
train_dataset = torch.rand(100000, 32)

batch_size = 32

# Baseline: default num_workers, i.e. loading happens in the main process.
train_loader = Data.DataLoader(dataset=train_dataset,
                               batch_size=batch_size, shuffle=True)
# Same data, loaded through 8 worker processes.
train_loader2 = Data.DataLoader(dataset=train_dataset,
                                batch_size=batch_size, shuffle=True, num_workers=8)

# Time 200 full passes over the single-process loader.
start = time.time()
for _ in range(200):
    for x in train_loader:
        pass
end = time.time()
print(end - start)

# Time 200 full passes over the multi-worker loader.
start = time.time()
for _ in range(200):
    for x in train_loader2:
        pass
end = time.time()
print(end - start)

正确的样例

import time
import torch
import torch.utils.data as Data


# Everything that builds or iterates a DataLoader sits under the main guard,
# so importing this script (as spawned workers do) runs none of it.
if __name__ == '__main__':

    # 100000 samples with 32 features each. (Passing the tuple (100000, 32)
    # to torch.FloatTensor would build a 2-element tensor holding those two
    # values.)
    train_dataset = torch.rand(100000, 32)

    batch_size = 32

    # Baseline: default num_workers, i.e. loading happens in the main process.
    train_loader = Data.DataLoader(dataset=train_dataset,
                                   batch_size=batch_size, shuffle=True)
    # Same data, loaded through 8 worker processes.
    train_loader2 = Data.DataLoader(dataset=train_dataset,
                                    batch_size=batch_size, shuffle=True, num_workers=8)

    # Time 200 full passes over the single-process loader.
    start = time.time()
    for _ in range(200):
        for x in train_loader:
            pass
    end = time.time()
    print(end - start)

    # Time 200 full passes over the multi-worker loader.
    start = time.time()
    for _ in range(200):
        for x in train_loader2:
            pass
    end = time.time()
    print(end - start)

 

 

你可能感兴趣的:(bug)