####1. Saving and loading model parameters
(A) Saving parameters
```python
# 2 ways to save the net
torch.save(net1, 'net.pkl')                       # save the entire net
torch.save(net1.state_dict(), 'net_params.pkl')   # save only the parameters
```
(B) Loading parameters
```python
# copy net1's parameters into net3
net3.load_state_dict(torch.load('net_params.pkl'))
prediction = net3(x)
```
Both net1 and net3 above are instances of nn.Module.
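For completeness, a minimal sketch of the two restore paths; `net2` and `build_net()` are hypothetical names, not from the original code:

```python
import torch

# restore the whole network saved with torch.save(net1, 'net.pkl');
# this requires the original class definition to be importable
net2 = torch.load('net.pkl')

# restore only the parameters: the target net must be built with the same
# architecture as net1 before load_state_dict() is called
net3 = build_net()   # hypothetical constructor mirroring net1's architecture
net3.load_state_dict(torch.load('net_params.pkl'))
```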
####2. Clamping model parameters
```python
# Clip weights of discriminator
for p in discriminator.parameters():
    p.data.clamp_(-opt.clip_value, opt.clip_value)
```
Here p iterates over the parameters of the Module (nn.Module) discriminator. This snippet comes from a WGAN implementation. Clamping is not only what makes WGAN work; it can also suppress NaNs that appear during training, but the size of the clipping range is critical.
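A minimal, runnable sketch of where the clamping usually sits in the training loop; the toy discriminator, the random data, and the RMSprop settings are assumptions for illustration, while clip_value = 0.01 is the value used in the original WGAN paper:

```python
import torch
import torch.nn as nn

# toy critic and data, just to show where the clamping goes
discriminator = nn.Linear(10, 1)
optimizer_D = torch.optim.RMSprop(discriminator.parameters(), lr=5e-5)
clip_value = 0.01

for step in range(100):
    real = torch.randn(64, 10)
    fake = torch.randn(64, 10)
    # WGAN critic loss (no log, no sigmoid)
    d_loss = -(discriminator(real).mean() - discriminator(fake).mean())
    optimizer_D.zero_grad()
    d_loss.backward()
    optimizer_D.step()
    # clamp every weight into [-clip_value, clip_value] after the update
    for p in discriminator.parameters():
        p.data.clamp_(-clip_value, clip_value)
```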
####3. Moving models and data to CUDA
When training on a CUDA-equipped machine, both the model and the data have to be loaded onto CUDA. PyTorch tensors come in two variants; taking Float as an example: torch.FloatTensor for the CPU and torch.cuda.FloatTensor for CUDA. The full list:
| n | CPU | CUDA | Desc. |
|---|-----|------|-------|
| 1 | torch.FloatTensor | torch.cuda.FloatTensor | 32-bit floating point |
| 2 | torch.DoubleTensor | torch.cuda.DoubleTensor | 64-bit floating point |
| 3 | N/A | torch.cuda.HalfTensor | 16-bit floating point |
| 4 | torch.ByteTensor | torch.cuda.ByteTensor | 8-bit integer (unsigned) |
| 5 | torch.CharTensor | torch.cuda.CharTensor | 8-bit integer (signed) |
| 6 | torch.ShortTensor | torch.cuda.ShortTensor | 16-bit integer (signed) |
| 7 | torch.IntTensor | torch.cuda.IntTensor | 32-bit integer (signed) |
| 8 | torch.LongTensor | torch.cuda.LongTensor | 64-bit integer (signed) |
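A quick way to see which of these types a tensor currently has (a small sketch; the type only changes when a GPU is actually available):

```python
import torch

x = torch.FloatTensor(2, 3)   # CPU tensor
print(x.type())               # 'torch.FloatTensor'
if torch.cuda.is_available():
    x = x.cuda()              # now a CUDA tensor
    print(x.type())           # 'torch.cuda.FloatTensor'
```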
Tensors on the CPU frequently need to be exchanged with tensors on CUDA. Two ways to do this:
Method 1: a to_cuda() helper built on .to(DEVICE)
```python
import torch

MODEL_NAME = 'VanillaGAN'
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

'''2...helper that moves a tensor or module to DEVICE'''
def to_cuda(x):
    return x.to(DEVICE)

'''3...move the models to CUDA'''
D = to_cuda(Discriminator())
G = to_cuda(Generator())

'''4...move the input data to CUDA'''
x = to_cuda(images)

'''5...feed the CUDA data through the CUDA model'''
x_outputs = D(x)

"""6...back from CUDA to the CPU"""
def get_sample_image(G, n_noise=100):
    """
    save a 10x10 grid of sample images (100 in total)
    """
    for num in range(10):
        for i in range(10):
            z = to_cuda(torch.randn(1, n_noise))
            y_hat = G(z)
            line_img = torch.cat((line_img, y_hat.view(28, 28)), dim=1) if i > 0 else y_hat.view(28, 28)
        all_img = torch.cat((all_img, line_img), dim=0) if num > 0 else line_img
    img = all_img.cpu().data.numpy()
    return img
```
Method 2: use .cuda() and .cpu()
```python
# setup input tensors
x = torch.FloatTensor(opt.batch_size, nc, opt.image_size, opt.image_size)
z = torch.FloatTensor(opt.batch_size, nz, 1, 1)
noise = torch.FloatTensor(opt.batch_size, 1, 1, 1)
if opt.cuda:
    netGx.cuda(), netGz.cuda()
    netDx.cuda(), netDz.cuda(), netDxz.cuda()
    x, z, noise = x.cuda(), z.cuda(), noise.cuda()
```
Coming back from CUDA to the CPU:
```python
from torch.autograd import Variable
import torchvision.utils as vutils

def test(dataloader, epoch):
    real_cpu_first, _ = next(iter(dataloader))
    real_cpu_first = real_cpu_first.mul(0.5).add(0.5)  # denormalize
    if opt.cuda:
        real_cpu_first = real_cpu_first.cuda()
    netGx.eval(), netGz.eval()  # switch to test mode
    latent = netGz(Variable(real_cpu_first, volatile=True))
    # remove the last sigmoid activation to visualize the reconstruction correctly
    mu, sigma = latent[:, :opt.nz], latent[:, opt.nz:].exp()
    recon = netGx(mu + sigma)
    vutils.save_image(recon.data, '{0}/reconstruction.png'.format(opt.experiment))
    vutils.save_image(real_cpu_first, '{0}/real_samples.png'.format(opt.experiment))
```
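Note that Variable and volatile=True belong to the pre-0.4 API; in newer PyTorch the same "no gradients during inference" effect is obtained with torch.no_grad(). A small sketch with toy stand-ins for the objects in the snippet above:

```python
import torch
import torch.nn as nn

netGz = nn.Linear(8, 4)               # stand-in for the encoder above
real_cpu_first = torch.randn(2, 8)    # stand-in for one denormalized batch
with torch.no_grad():                 # replaces Variable(..., volatile=True)
    latent = netGz(real_cpu_first)
```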
####4. Iterating a DataLoader one batch at a time
```python
real_cpu_first, _ = next(iter(dataloader))
```
This trick is useful in unit tests to inspect the data a dataloader produces. The following shows how to display one batch of images from a dataloader:
```python
import numpy as np
import matplotlib.pyplot as plt
import torchvision

def imshow(inp, title=None):
    """Imshow for Tensor."""
    inp = inp.numpy().transpose((1, 2, 0))
    mean = np.array([0.485, 0.456, 0.406])
    std = np.array([0.229, 0.224, 0.225])
    inp = std * inp + mean
    inp = np.clip(inp, 0, 1)
    plt.imshow(inp)
    if title is not None:
        plt.title(title)
    plt.pause(0.001)  # pause a bit so that plots are updated

# Get a batch of training data
inputs, classes = next(iter(dataloaders['train']))

# Make a grid from the batch
out = torchvision.utils.make_grid(inputs)
imshow(out, title=[class_names[x] for x in classes])
```
####5. A builder pattern for assembling a model
```python
import torch
import torch.nn as nn

class CNN(nn.Module):
    def __init__(self, nc, input_size, hparams, ngpu=1, leaky_slope=0.01, std=0.01):
        super(CNN, self).__init__()
        self.ngpu = ngpu                 # num of gpu's to use
        self.leaky_slope = leaky_slope   # slope for leaky_relu activation
        self.std = std                   # standard deviation for weights initialization
        self.input_size = input_size     # expected input size
        main = nn.Sequential()
        in_feat, num = nc, 0
        for op, k, s, out_feat, b, bn, dp, h in hparams:
            # add operation: conv2d or convTranspose2d
            if op == 'conv2d':
                main.add_module(
                    '{0}.pyramid.{1}-{2}.conv'.format(num, in_feat, out_feat),
                    nn.Conv2d(in_feat, out_feat, k, s, 0, bias=b))
            elif op == 'convt2d':
                main.add_module(
                    '{0}.pyramid.{1}-{2}.convt'.format(num, in_feat, out_feat),
                    nn.ConvTranspose2d(in_feat, out_feat, k, s, 0, bias=b))
            else:
                raise Exception('Not supported operation: {0}'.format(op))
            num += 1
            # add batch normalization layer
            if bn:
                main.add_module(
                    '{0}.pyramid.{1}-{2}.batchnorm'.format(num, in_feat, out_feat),
                    nn.BatchNorm2d(out_feat))
                num += 1
            # add dropout layer
            main.add_module(
                '{0}.pyramid.{1}-{2}.dropout'.format(num, in_feat, out_feat),
                nn.Dropout2d(p=dp))
            num += 1
            # add activation
            if h == 'leaky_relu':
                main.add_module(
                    '{0}.pyramid.{1}-{2}.leaky_relu'.format(num, in_feat, out_feat),
                    nn.LeakyReLU(self.leaky_slope, inplace=True))
            elif h == 'sigmoid':
                main.add_module(
                    '{0}.pyramid.{1}-{2}.sigmoid'.format(num, in_feat, out_feat),
                    nn.Sigmoid())
            elif h == 'maxout':
                # TODO: implement me
                # https://github.com/IshmaelBelghazi/ALI/blob/master/ali/bricks.py#L338-L380
                raise NotImplementedError('Maxout is not implemented.')
            elif h == 'relu':
                main.add_module(
                    '{0}.pyramid.{1}-{2}.relu'.format(num, in_feat, out_feat),
                    nn.ReLU(inplace=True))
            elif h == 'tanh':
                main.add_module(
                    '{0}.pyramid.{1}-{2}.tanh'.format(num, in_feat, out_feat),
                    nn.Tanh())
            elif h == 'linear':
                num -= 1  # 'linear' adds no activation module
            else:
                raise Exception('Not supported activation: {0}'.format(h))
            num += 1
            in_feat = out_feat
        self.main = main
        # initialize weights
        for m in self.modules():
            if isinstance(m, nn.Conv2d) or isinstance(m, nn.ConvTranspose2d):
                m.weight.data.normal_(0.0, self.std)
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.normal_(1.0, self.std)
                m.bias.data.zero_()

    def forward(self, input):
        assert input.size(2) == self.input_size, \
            'Wrong input size: {0}. Expected {1}'.format(input.size(2),
                                                         self.input_size)
        if self.ngpu > 1 and isinstance(input.data, torch.cuda.FloatTensor):
            gpu_ids = range(self.ngpu)
            output = nn.parallel.data_parallel(self.main, input, gpu_ids)
        else:
            output = self.main(input)
        return output
```
How it is called:
```python
def create_svhn_gz(nz=256, ngpu=1):
    hparams = [
        # op // kernel // strides // fmaps // conv. bias // batch_norm // dropout // nonlinearity
        ['conv2d', 5, 1,   32, False, True,  0.0, 'leaky_relu'],
        ['conv2d', 4, 2,   64, False, True,  0.0, 'leaky_relu'],
        ['conv2d', 4, 1,  128, False, True,  0.0, 'leaky_relu'],
        ['conv2d', 4, 2,  256, False, True,  0.0, 'leaky_relu'],
        ['conv2d', 4, 1,  512, False, True,  0.0, 'leaky_relu'],
        ['conv2d', 1, 1,  512, False, True,  0.0, 'leaky_relu'],
        ['conv2d', 1, 1, 2*nz, True,  False, 0.0, 'linear'],
    ]
    return CNN(3, 32, hparams, ngpu)
```
The builder pattern takes care of model construction, hyperparameter wiring, and weight initialization in one place.
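A quick smoke test of the assembled network, assuming the hparams above (with valid padding, a 3x32x32 input should come out as a 2*nz x 1 x 1 feature map); the batch size of 4 is arbitrary:

```python
import torch

net = create_svhn_gz(nz=256)
dummy = torch.randn(4, 3, 32, 32)   # batch of 4 fake SVHN-sized images
out = net(dummy)
print(out.shape)                    # expected: torch.Size([4, 512, 1, 1])
```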
####6. Optimizing different model parameters differently
In a GAN, the generator and discriminator parameters are not optimized at the same time; the updates must alternate. With torch.optim this can be arranged much like a cross-cutting concern in Spring:
(A) Define two optimizers, each in charge of a different set of model parameters
```python
from itertools import chain
import torch.optim as optim

# setup optimizer
dis_params = chain(netDx.parameters(), netDz.parameters(), netDxz.parameters())
gen_params = chain(netGx.parameters(), netGz.parameters())
kwargs_adam = {'lr': opt.lr, 'betas': (opt.beta1, opt.beta2)}
optimizerD = optim.Adam(dis_params, **kwargs_adam)
optimizerG = optim.Adam(gen_params, **kwargs_adam)
```
(B) Call the optimizers alternately
```python
D_loss = compute_loss(batch_size, d_loss=True)
G_loss = compute_loss(batch_size, d_loss=False)

# discriminator step: freeze the generators, unfreeze the discriminators
for p in netGx.parameters():
    p.requires_grad = False  # avoid unnecessary gradient computation
for p in netGz.parameters():
    p.requires_grad = False
for p in netDx.parameters():
    p.requires_grad = True
for p in netDz.parameters():
    p.requires_grad = True
for p in netDxz.parameters():
    p.requires_grad = True
optimizerD.zero_grad()
D_loss.backward()
optimizerD.step()  # apply the discriminator optimization step

# generator step: unfreeze the generators, freeze the discriminators
for p in netGx.parameters():
    p.requires_grad = True
for p in netGz.parameters():
    p.requires_grad = True
for p in netDx.parameters():
    p.requires_grad = False  # avoid unnecessary gradient computation
for p in netDz.parameters():
    p.requires_grad = False
for p in netDxz.parameters():
    p.requires_grad = False
optimizerG.zero_grad()
G_loss.backward()
optimizerG.step()  # apply the generator optimization step
```
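The repetition above can be folded into a small helper; this is a hypothetical convenience function, not part of the original code:

```python
import torch.nn as nn

def set_requires_grad(modules, flag):
    """Freeze (flag=False) or unfreeze (flag=True) every parameter of the given modules."""
    for m in modules:
        for p in m.parameters():
            p.requires_grad = flag

# usage with toy stand-ins for the networks above
netG, netD = nn.Linear(4, 4), nn.Linear(4, 1)
set_requires_grad([netG], False)  # generator frozen during the discriminator step
set_requires_grad([netD], True)
```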
####7. Building a DataLoader
```python
from torch.utils import data

class CustomDataset(data.Dataset):  # must subclass data.Dataset
    def __init__(self):
        # TODO
        # 1. Initialize file paths or a list of file names.
        pass

    def __getitem__(self, index):
        # TODO
        # 1. Read ONE item from file (e.g. using numpy.fromfile, PIL.Image.open).
        # 2. Preprocess the data (e.g. torchvision.Transform).
        # 3. Return a data pair (e.g. image and label).
        # Note: step 1 reads a single item, not a batch.
        pass

    def __len__(self):
        # You should change 0 to the total size of your dataset.
        return 0
```
The Dataset is then passed to the DataLoader as its input. We pull data from the DataLoader either with enumerate() or with next(); a complete toy example follows the snippet below.
```python
dataloader = torch.utils.data.DataLoader(
    ListDataset(train_path),
    batch_size=opt.batch_size, shuffle=False, num_workers=opt.n_cpu)
...
for batch_i, (_, imgs, targets) in enumerate(dataloader):
    ...
```
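A self-contained toy example of the pattern; the dataset returning (x, x**2) pairs and the name SquaresDataset are illustrative only:

```python
import torch
from torch.utils.data import Dataset, DataLoader

class SquaresDataset(Dataset):
    """Toy dataset returning (x, x**2) pairs."""
    def __init__(self, n=100):
        self.x = torch.arange(n, dtype=torch.float32)

    def __getitem__(self, index):
        x = self.x[index]
        return x, x * x

    def __len__(self):
        return len(self.x)

loader = DataLoader(SquaresDataset(), batch_size=8, shuffle=True)

xb, yb = next(iter(loader))                   # grab a single batch, e.g. for a unit test
for batch_i, (xb, yb) in enumerate(loader):   # or walk through all batches
    pass
```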
####8. A few frequently used functions
view(), squeeze(), unsqueeze() and torch.max() come up all the time in PyTorch (a short sketch follows this list):
1. view() reshapes a tensor without copying its data; for example, view(-1) flattens a multi-row tensor into a single row.
2. squeeze(0) removes the first dimension if its size is 1; otherwise the tensor is left unchanged.
3. unsqueeze() does the opposite of squeeze(): it inserts a dimension of size 1.
4. torch.max() returns two results: the maximum values and their indices. The second argument selects the dimension: 0 takes the max over each column and returns the corresponding row indices, 1 takes the max over each row and returns the corresponding column indices.
Reference: https://blog.csdn.net/lanse_zhicheng/article/details/79148678
5. contiguous: view() only works on a contiguous tensor. If transpose, permute, etc. were applied before view(), call contiguous() first to get a contiguous copy.
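A short sketch exercising the five points above:

```python
import torch

x = torch.arange(6).view(2, 3)   # view(): reshape without copying data
flat = x.view(-1)                # flatten into one row -> shape (6,)

a = torch.zeros(1, 3)
print(a.squeeze(0).shape)        # torch.Size([3])       -- dim 0 removed (it was 1)
print(a.unsqueeze(0).shape)      # torch.Size([1, 1, 3]) -- new dim of size 1 added

vals, idx = torch.max(x, 1)      # max over each row, plus the column indices

y = x.t()                        # transpose makes the tensor non-contiguous
z = y.contiguous().view(-1)      # contiguous() is required before view()
```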
####9. Recording debug information with tensorboardX
Reference: https://blog.csdn.net/JNingWei/article/details/79740825
Reference: https://blog.csdn.net/qq_27278153/article/details/78916080
Reference: https://blog.csdn.net/kaixinjiuxing666/article/details/81004010
tensorboardX is simple and convenient to use; when training is done, run: tensorboard --logdir runs
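A minimal logging sketch with tensorboardX; the run directory, tag name, and placeholder loss are arbitrary choices for illustration:

```python
from tensorboardX import SummaryWriter

writer = SummaryWriter('runs/exp1')       # log directory under the 'runs' folder used above
for step in range(100):
    loss = 1.0 / (step + 1)               # placeholder value just for illustration
    writer.add_scalar('train/loss', loss, step)
writer.close()
```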