For fp16 AMP (automatic mixed precision) training, see:
AMP automatic mixed precision training: https://blog.csdn.net/ytusdc/article/details/122152244
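Before the full script, here is a minimal self-contained sketch of the apex AMP pattern it relies on (assumption: NVIDIA apex is installed and a GPU is available; the tiny model below is only illustrative and is not part of the script):

    import torch
    from apex import amp  # NVIDIA apex (assumed installed)

    net = torch.nn.Linear(8, 1).cuda()
    optimizer = torch.optim.SGD(net.parameters(), lr=0.01)
    # wrap model and optimizer once before training; 'O1' casts ops to fp16 where it is safe
    net, optimizer = amp.initialize(net, optimizer, opt_level='O1')

    loss = net(torch.randn(4, 8).cuda()).mean()
    optimizer.zero_grad()
    # scale the loss before backward so small fp16 gradients do not underflow
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
    optimizer.step()

The annotated training script follows.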
import time
import os
import numpy as np
import torch
from torch.autograd import Variable
from collections import OrderedDict
from subprocess import call
import math
def lcm(a, b): return abs(a * b) // math.gcd(a, b) if a and b else 0
# least common multiple (LCM) helper; math.gcd and integer division keep the result an exact int
# (fractions.gcd no longer exists in Python 3.9+)
from options.train_options import TrainOptions
from data.data_loader import CreateDataLoader
from models.models import create_model
import util.util as util
from util.visualizer import Visualizer
opt = TrainOptions().parse() # parse the training options
iter_path = os.path.join(opt.checkpoints_dir, opt.name, 'iter.txt') # record file that tracks training progress
if opt.continue_train: # resume a previous run
    try:
        start_epoch, epoch_iter = np.loadtxt(iter_path, delimiter=',', dtype=int) # load the saved (epoch, iteration) record
    except:
        start_epoch, epoch_iter = 1, 0 # no record file found, train from scratch
    print('Resuming from epoch %d at iteration %d' % (start_epoch, epoch_iter))
else:
    start_epoch, epoch_iter = 1, 0
opt.print_freq = lcm(opt.print_freq, opt.batchSize)
# round print_freq to the least common multiple of print_freq and batchSize,
# which guarantees that print_freq is a multiple of batchSize
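# e.g. print_freq=100 and batchSize=6 (illustrative values, not defaults) give lcm=300,
# so the print check below always lands exactly on a batch boundary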
if opt.debug:
    # debug is a dry run: a single epoch on a tiny subset of the data
    opt.display_freq = 1
    opt.print_freq = 1
    opt.niter = 1
    opt.niter_decay = 0
    opt.max_dataset_size = 10
data_loader = CreateDataLoader(opt)
# build the data loader object
dataset = data_loader.load_data()
# load_data() returns the dataset
dataset_size = len(data_loader)
print('#training images = %d' % dataset_size)
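# each batch `data` yielded by the loader is a dict with the keys 'label', 'inst', 'image'
# and 'feat' that the forward pass below unpacks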
model = create_model(opt)
# build the model object
visualizer = Visualizer(opt)
# visualization helper (loss printing/plotting and image display)
if opt.fp16:
    # use automatic mixed precision training (NVIDIA apex)
    from apex import amp
    model, [optimizer_G, optimizer_D] = amp.initialize(model, [model.optimizer_G, model.optimizer_D], opt_level='O1')
    model = torch.nn.DataParallel(model, device_ids=opt.gpu_ids)
else:
    optimizer_G, optimizer_D = model.module.optimizer_G, model.module.optimizer_D # fetch the optimizers defined inside the model
total_steps = (start_epoch-1) * dataset_size + epoch_iter
# total number of images already trained (non-zero when resuming)
# phase offsets (remainders) that decide when to print, display and save
display_delta = total_steps % opt.display_freq
print_delta = total_steps % opt.print_freq
save_delta = total_steps % opt.save_latest_freq
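# illustrative numbers (not from the script): resuming with start_epoch=3, dataset_size=1000 and
# epoch_iter=230 gives total_steps=2230; with print_freq=300 that makes print_delta=130, so losses
# keep being printed whenever total_steps % 300 == 130, i.e. at the same phase as before the restart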
for epoch in range(start_epoch, opt.niter + opt.niter_decay + 1):
    # train from start_epoch up to the requested total number of epochs (niter + niter_decay)
    epoch_start_time = time.time()
    if epoch != start_epoch:
        epoch_iter = epoch_iter % dataset_size
        # after the first (possibly resumed) epoch, every later epoch starts again from its first iteration;
        # on the first pass epoch_iter keeps the value loaded for continue_train
    for i, data in enumerate(dataset, start=epoch_iter):
        if total_steps % opt.print_freq == print_delta: # this iteration will be printed, so start the timer
            iter_start_time = time.time() # start time of the current iteration
        total_steps += opt.batchSize # total number of images trained so far
        epoch_iter += opt.batchSize # images trained within this epoch;
        # once epoch_iter reaches dataset_size we break out of the loop and move to the next epoch
        # whether to collect output images
        save_fake = total_steps % opt.display_freq == display_delta # whether to save the synthesized (fake) image this step
        ############## Forward Pass ######################
        # run the forward pass
        losses, generated = model(Variable(data['label']), Variable(data['inst']),
            Variable(data['image']), Variable(data['feat']), infer=save_fake)
        # sum per device losses
        losses = [ torch.mean(x) if not isinstance(x, int) else x for x in losses ]
        # average each loss over the devices; entries that are plain ints (disabled losses) are left as-is
        loss_dict = dict(zip(model.module.loss_names, losses))
        # zip the loss names and loss values together into a dict
        # calculate final loss scalar
        loss_D = (loss_dict['D_fake'] + loss_dict['D_real']) * 0.5 # discriminator loss
        loss_G = loss_dict['G_GAN'] + loss_dict.get('G_GAN_Feat',0) + loss_dict.get('G_VGG',0) # generator loss
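        # when the feature-matching (G_GAN_Feat) or VGG (G_VGG) losses are disabled, those entries may be
        # missing or plain int 0, which is why .get(..., 0) and the isinstance(x, int) check above are used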
        ############### Backward Pass ####################
        # update generator weights
        optimizer_G.zero_grad() # first zero all generator gradients
        if opt.fp16:
            with amp.scale_loss(loss_G, optimizer_G) as scaled_loss: scaled_loss.backward()
            # amp.scale_loss() is the loss-scaling helper of the apex AMP library.
            # Mixed precision keeps the forward pass largely in half precision (float16) to save memory and
            # compute, while master weights and numerically sensitive ops stay in single precision (float32);
            # scaling the loss before backward keeps the resulting fp16 gradients large enough not to
            # underflow, and they are un-scaled again before the optimizer applies them.
        else:
            loss_G.backward()
        optimizer_G.step() # update the generator parameters (the backward pass happened just above)
        # update discriminator weights
        optimizer_D.zero_grad() # zero the discriminator gradients
        if opt.fp16:
            with amp.scale_loss(loss_D, optimizer_D) as scaled_loss: scaled_loss.backward()
        else:
            loss_D.backward()
        optimizer_D.step() # update the discriminator parameters
        ############## Display results and errors ##########
        ### print out errors
        if total_steps % opt.print_freq == print_delta: # time to print the losses?
            errors = {k: v.data.item() if not isinstance(v, int) else v for k, v in loss_dict.items()}
            t = (time.time() - iter_start_time) / opt.print_freq # average time per image over this print interval
            visualizer.print_current_errors(epoch, epoch_iter, errors, t)
            visualizer.plot_current_errors(errors, total_steps)
            #call(["nvidia-smi", "--format=csv", "--query-gpu=memory.used,memory.free"])
        ### display output images
        if save_fake:
            visuals = OrderedDict([('input_label', util.tensor2label(data['label'][0], opt.label_nc)),
                                   ('synthesized_image', util.tensor2im(generated.data[0])),
                                   ('real_image', util.tensor2im(data['image'][0]))])
            visualizer.display_current_results(visuals, epoch, total_steps)
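            # tensor2label / tensor2im convert the label map and image tensors back into numpy images
            # that the visualizer can display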
        ### save latest model
        if total_steps % opt.save_latest_freq == save_delta: # time to save the latest checkpoint?
            print('saving the latest model (epoch %d, total_steps %d)' % (epoch, total_steps))
            model.module.save('latest')
            np.savetxt(iter_path, (epoch, epoch_iter), delimiter=',', fmt='%d')
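            # iter.txt stores "epoch,iteration", which is exactly what np.loadtxt reads back at the top of
            # the script when opt.continue_train is set, so an interrupted run can resume mid-epoch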
        if epoch_iter >= dataset_size: # the whole dataset has been seen this epoch, move on to the next one
            break
    # end of epoch
    iter_end_time = time.time()
    print('End of epoch %d / %d \t Time Taken: %d sec' %
          (epoch, opt.niter + opt.niter_decay, time.time() - epoch_start_time))
    # prints the epoch just finished, the total number of epochs, and the time this epoch took
    ### save model for this epoch
    if epoch % opt.save_epoch_freq == 0:
        print('saving the model at the end of epoch %d, iters %d' % (epoch, total_steps))
        model.module.save('latest')
        model.module.save(epoch)
        np.savetxt(iter_path, (epoch+1, 0), delimiter=',', fmt='%d')
    ### instead of only training the local enhancer, train the entire network after certain iterations
    if (opt.niter_fix_global != 0) and (epoch == opt.niter_fix_global):
        model.module.update_fixed_params()
    ### linearly decay learning rate after certain iterations
    if epoch > opt.niter: # once the epoch index exceeds niter, start updating (decaying) the learning rate
        model.module.update_learning_rate()
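    # e.g. with niter=100 and niter_decay=100 (illustrative values), the learning rate stays constant for
    # epochs 1-100 and is then lowered linearly towards 0 over epochs 101-200 by update_learning_rate()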