Running the following commands on machine 138:
conda activate pytorch0.2.0
python
import torch
x=torch.empty(5, 3)
This raises the error shown in the title.
Analysis: "module has no attribute 'empty'" means the torch module has no attribute named empty; in other words, PyTorch 0.2.0 does not provide torch.empty.
Verification: the same code runs successfully under PyTorch 1.0 on machine 139.
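If upgrading is not an option, a minimal workaround sketch: on old PyTorch releases the legacy constructor torch.Tensor(rows, cols) also returns an uninitialized tensor, which is what torch.empty does in later versions.

import torch

# legacy equivalent of torch.empty(5, 3): allocates a 5x3 FloatTensor
# without initializing its memory
x = torch.Tensor(5, 3)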
train_loader = DataLoader(train_dset,batch_size=32,shuffle=False,num_workers=3)
for batch_idx, (d, la) in enumerate(train_loader):
The error occurs while the DataLoader is loading data. PyTorch shares tensors between worker processes by opening files, and the number of open files is limited (view the limit with ulimit -a); once the number of tensors to share exceeds the open-files limit, this error is raised.
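For reference, the same limit can be read from inside Python via the standard resource module, which is handy if the training script itself should warn about a low limit (a small sketch):

import resource

# soft and hard limits on open file descriptors for this process;
# the soft value matches the "open files" row of ulimit -a
soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
print('open files: soft=%d, hard=%d' % (soft, hard))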
There are two fixes:
1. Raise the open-files limit.
sudo ulimit -n does not work; instead run:
sudo sh -c "ulimit -n 65535 && exec su $LOGNAME"
The explanation (quoted):
ulimit is a shell builtin like cd, not a separate program. sudo looks for a binary to run, but there is no ulimit binary, which is why you get the error message. You need to run it in a shell.
However, while you do need to be root to raise the limit to 65535, you probably don’t want to run your program as root. So after you raise the limit you should switch back to the current user.
To do this, run:
sudo sh -c "ulimit -n 65535 && exec su $LOGNAME"
and you will get a new shell, without root privileges, but with the raised limit. The exec causes the new shell to replace the process with sudo privileges, so after you exit that shell, you won’t accidentally end up as root again.
2. Switch the multiprocessing tensor sharing strategy to file_system (the default, file_descriptor, is bounded by the open-files count):
torch.multiprocessing.set_sharing_strategy('file_system')
Method 2 fixed the problem.
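A minimal placement sketch: the call has to run once, before any DataLoader workers are spawned; the toy dataset below is only for illustration.

import torch
import torch.multiprocessing
from torch.utils.data import DataLoader, TensorDataset

# workers started after this point share tensors through the file system,
# so the open-files limit no longer applies
torch.multiprocessing.set_sharing_strategy('file_system')

dset = TensorDataset(torch.randn(100, 3), torch.zeros(100, dtype=torch.long))
loader = DataLoader(dset, batch_size=32, shuffle=False, num_workers=3)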
Reinstall TensorBoard (recommended):
pip install tb-nightly
http://www.voidcn.com/article/p-yfxuhjvo-brr.html
sudo dpkg --list | grep nvidia*
cat /proc/driver/nvidia/version
lsmod | grep nvidia
Check which processes are using the nvidia devices:
# fuser shows which PIDs hold /dev/nvidia* open
sudo fuser -v /dev/nvidia*
# lsof likewise lists the PIDs occupying GPU resources
sudo lsof -n -w /dev/nvidia*
import scipy.io as sio
import matplotlib.pyplot as plt
import numpy as np
# save .mat files
sio.savemat('testpython.mat', {'a': 1,'b': 2,'c': 3,'d': 4})
sio.savemat('testpython2.mat', {'x': [[1, 2, 3, 4],[ 5, 6, 7, 8]]})
# load a .mat file back
data = sio.loadmat('testpython.mat')
x1 = data['a']
x2 = data['b']
x3 = data['c']
x4 = data['d']
# list the variables stored in a .mat file
sio.whosmat('testpython.mat')
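One thing worth knowing when reading values back: loadmat returns every variable as an at-least-2-D NumPy array, so the scalars saved above come back wrapped, and data['a'] is not the scalar 1:

print(data['a'])        # array([[1]]) - a 1x1 array, not the scalar 1
print(data['a'][0, 0])  # 1 - index into it to recover the scalar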
《深度学习之PyTorch实战计算机视觉》, 唐进民, p. 182
7.2.1 Validation and test datasets
Take 2,500 cat images and 2,500 dog images out of the training set to build a validation set of 5,000 images. The raw download is flat:
chapter3/
    dogsandcats/
        train/
            dog.183.jpg
            cat.2.jpg
            cat.17.jpg
            dog.186.jpg
            cat.27.jpg
            dog.193.jpg
After the split, the layout becomes:
chapter3/
    dogsandcats/
        train/
            dog/
                dog.183.jpg
                dog.186.jpg
                dog.193.jpg
            cat/
                cat.17.jpg
                cat.2.jpg
                cat.27.jpg
        valid/
            dog/
                dog.173.jpg
                dog.156.jpg
                dog.123.jpg
            cat/
                cat.172.jpg
                cat.20.jpg
                cat.21.jpg
import os
import sys
import random, shutil

def moveSomeFileToNewDir(srcDir, tarDir, rate):
    # randomly pick a fraction `rate` of the files in srcDir and move them to tarDir
    srcPaths = os.listdir(srcDir)
    tarPaths = random.sample(srcPaths, int(len(srcPaths) * rate))
    for name in tarPaths:
        shutil.move(os.path.join(srcDir, name), os.path.join(tarDir, name))

#fileDir = r"train"
#valDir = r'valid'
#moveSomeFileToNewDir(fileDir, valDir, 0.2)
# read the arguments from the command line
moveSomeFileToNewDir(sys.argv[1], sys.argv[2], float(sys.argv[3]))
The train folder holds 25,000 images; moving 5,000 of them into the valid folder corresponds to a rate of 0.2, so run: python split.py train valid 0.2
import os
import numpy as np
from glob import glob

#path = '/home/yhr/data/DogsVSCats/train'
path = '/home/yhr/data/kaggle_DogsVSCats'
files = glob(os.path.join(path, '*/*.jpg'))  # e.g. dog.1224.jpg
print('Total no of images', len(files))  # 25000

if not os.path.exists(os.path.join(path, 'valid')):
    os.mkdir(os.path.join(path, 'valid'))
for t in ['train', 'valid']:
    for folder in ['dog/', 'cat/']:
        os.mkdir(os.path.join(path, t, folder))

no_of_images = len(files)
# move images from train to train/dog and train/cat
for i in range(no_of_images):
    folder = files[i].split('/')[-1].split('.')[0]  # 'dog' or 'cat' from the file name
    image = files[i].split('/')[-1]
    os.rename(files[i], os.path.join(path, 'train', folder, image))

# move 2500 images each from train/dog to valid/dog and train/cat to valid/cat
for folder in ['dog/', 'cat/']:
    sub_path = os.path.join(path, 'train', folder)
    files = glob(os.path.join(sub_path, '*.jpg'))
    no_of_images = len(files)
    shuffle = np.random.permutation(no_of_images)
    for i in shuffle[:2500]:
        #shutil.copyfile(files[i], '../chapter3/dogsandcats/valid/')
        folder = files[i].split('/')[-1].split('.')[0]
        image = files[i].split('/')[-1]
        os.rename(files[i], os.path.join(path, 'valid', folder, image))
import torch
from torchvision import transforms
from torchvision.datasets import ImageFolder

simple_transform = transforms.Compose([transforms.Resize((224, 224)),
                                       transforms.ToTensor(),
                                       transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])
train = ImageFolder('/home/yhr/data/DogsVSCats/train/', simple_transform)
valid = ImageFolder('/home/yhr/data/DogsVSCats/valid/', simple_transform)
train_data_gen = torch.utils.data.DataLoader(train, shuffle=True, batch_size=64, num_workers=3)
valid_data_gen = torch.utils.data.DataLoader(valid, batch_size=64, num_workers=3)
dataset_sizes = {'train': len(train_data_gen.dataset), 'valid': len(valid_data_gen.dataset)}
dataloaders = {'train': train_data_gen, 'valid': valid_data_gen}
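A quick sanity-check sketch: pull a single batch to confirm the pipeline produces the expected shapes.

imgs, labels = next(iter(dataloaders['train']))
print(imgs.shape, labels.shape)  # expected: torch.Size([64, 3, 224, 224]) torch.Size([64])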
Displaying an image:
import numpy as np
import matplotlib.pyplot as plt

def imshow(inp):
    """Imshow for Tensor."""
    inp = inp.numpy().transpose((1, 2, 0))  # CHW -> HWC
    mean = np.array([0.485, 0.456, 0.406])
    std = np.array([0.229, 0.224, 0.225])
    inp = std * inp + mean  # undo the normalization
    inp = np.clip(inp, 0, 1)
    plt.imshow(inp)

imshow(train[50][0])
import copy
import time
import torch.nn as nn
from torch.autograd import Variable
from torchvision import models

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_ft = models.resnet18(pretrained=True)
num_ftrs = model_ft.fc.in_features
model_ft.fc = nn.Linear(num_ftrs, 2)  # replace the 1000-class head with a 2-class one
if torch.cuda.is_available():
    model_ft = model_ft.cuda()
def train_model(model, criterion, optimizer, scheduler, num_epochs=5):
    since = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)
        # Each epoch has a training and validation phase
        for phase in ['train', 'valid']:
            if phase == 'train':
                print("training...")
                scheduler.step()
                model.train(True)  # Set model to training mode
            else:
                print("validating")
                model.train(False)  # Set model to evaluate mode
            running_loss = 0.0
            running_corrects = 0
            # Iterate over data.
            for data in dataloaders[phase]:
                # get the inputs and move them to the GPU if one is available
                inputs, labels = data
                if torch.cuda.is_available():
                    inputs, labels = inputs.to(device), labels.to(device)
                else:
                    inputs, labels = Variable(inputs), Variable(labels)
                # zero the parameter gradients
                optimizer.zero_grad()
                # forward
                outputs = model(inputs)
                _, preds = torch.max(outputs.data, 1)
                loss = criterion(outputs, labels)
                # backward + optimize only if in training phase
                if phase == 'train':
                    loss.backward()
                    optimizer.step()
                # statistics: criterion returns the batch mean, so scale it back up by the batch size
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))
            # deep copy the model
            if phase == 'valid' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))
    # load best model weights
    model.load_state_dict(best_model_wts)
    return model
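The excerpt stops at the definition; a minimal call sketch follows, where the loss, optimizer, and scheduler choices are assumptions mirroring the usual transfer-learning recipe, not taken from the book.

import torch.optim as optim
from torch.optim import lr_scheduler

criterion = nn.CrossEntropyLoss()
# fine-tune all parameters with a small learning rate, decayed every 7 epochs
optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)
model_ft = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler, num_epochs=5)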
from torchvision.models import resnet34

is_cuda = torch.cuda.is_available()
my_resnet = resnet34(pretrained=True)
if is_cuda:
    my_resnet = my_resnet.cuda()
# drop the final fully connected layer; m outputs the pooled convolutional features
m = nn.Sequential(*list(my_resnet.children())[:-1])
# freeze the backbone so no gradients are computed for it
for p in my_resnet.parameters():
    p.requires_grad = False
#For training data
#Iterate through the train data and store the calculated features and the labels
trn_labels = []
trn_features = []
for batch_idx, (d, la) in enumerate(train_loader):
    o = m(Variable(d.cuda()))
    o = o.view(o.size(0), -1)  # flatten (batch, 512, 1, 1) to (batch, 512)
    trn_labels.extend(la)
    trn_features.extend(o.cpu().data)

#For validation data
#Iterate through the validation data and store the calculated features and the labels
val_labels = []
val_features = []
for d, la in val_loader:
    o = m(Variable(d.cuda()))
    o = o.view(o.size(0), -1)
    val_labels.extend(la)
    val_features.extend(o.cpu().data)
from torch.utils.data import Dataset, DataLoader

class FeaturesDataset(Dataset):
    def __init__(self, featlst, labellst):
        self.featlst = featlst
        self.labellst = labellst
    def __getitem__(self, index):
        return (self.featlst[index], self.labellst[index])
    def __len__(self):
        return len(self.labellst)

#Creating dataset for train and validation
trn_feat_dset = FeaturesDataset(trn_features, trn_labels)
val_feat_dset = FeaturesDataset(val_features, val_labels)
#Creating data loader for train and validation
trn_feat_loader = DataLoader(trn_feat_dset, batch_size=64, shuffle=True)
val_feat_loader = DataLoader(val_feat_dset, batch_size=64)
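These loaders are normally fed to a small fully connected classifier trained on the precomputed features; a sketch of that last step follows (the layer size comes from resnet34's 512-d output, while the optimizer settings are assumptions):

import torch.optim as optim

fc = nn.Linear(trn_features[0].size(0), 2)  # 512 features -> 2 classes
if is_cuda:
    fc = fc.cuda()
optimizer = optim.Adam(fc.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()

# one pass over the precomputed training features
for features, labels in trn_feat_loader:
    if is_cuda:
        features, labels = features.cuda(), labels.cuda()
    optimizer.zero_grad()
    loss = criterion(fc(features), labels)
    loss.backward()
    optimizer.step()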
Problem | Method
---|---
Merge folders: combine the CUB dataset's train and test folders into a single data folder | cp -Rap train/* test/ && mv test data
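An equivalent Python sketch (requires Python 3.8+ for dirs_exist_ok; the folder names match the table above):

import shutil

# merge the contents of train/ into test/, allowing class subfolders
# that already exist in both, then rename the merged folder to data/
shutil.copytree('train', 'test', dirs_exist_ok=True)
shutil.move('test', 'data')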