Torch7 Learning (1): Tensor
Torch7 Learning (2): Torch vs. Matlab syntax comparison
Torch7 Learning (3): Using the neural network package (1)
Torch7 Learning (4): Using the neural network package (2)
Torch7 Learning (5): Using the neural network package (3)
Torch7 Learning (6): Using the neural network package (4), training with optim
Torch7 Learning (7): A walkthrough of the Neural-Style code
Neural-style is probably one of the easiest codebases to start with: it is short and clearly written, and it uses only the simplest way of building a network, which is also the most fundamental and most important pattern to learn. Let's go straight to the neural-style code. The listing below carries extensive comments, and some analysis follows at the end.
Note that the latest version of the paper's code already supports multiple GPUs: different layers can be assigned to different GPUs for training, mainly by adding support for the code from Controlling Perceptual Factors in Neural Style Transfer. The version below is simpler; once you understand it, move on to the latest version at https://github.com/jcjohnson/neural-style
require 'torch'
require 'nn'
require 'image'
require 'optim'
require 'loadcaffe'
--------------------------------------------------------------------------------
local cmd = torch.CmdLine()
-- Basic options
cmd:option('-style_image', 'examples/inputs/starry_night.jpg',
'Style target image')
cmd:option('-style_blend_weights', 'nil')
cmd:option('-content_image', 'examples/inputs/5.jpg',
'Content target image')
cmd:option('-image_size', 512, 'Maximum height / width of generated image')
cmd:option('-gpu', 0, 'Zero-indexed ID of the GPU to use; for CPU mode set -gpu = -1')
-- Optimization options
cmd:option('-content_weight', 5e0)
cmd:option('-style_weight', 1e2)
cmd:option('-tv_weight', 1e-3)
cmd:option('-num_iterations', 1000)
cmd:option('-normalize_gradients', false)
cmd:option('-init', 'random', 'random|image')
cmd:option('-optimizer', 'lbfgs', 'lbfgs|adam')
cmd:option('-learning_rate', 1e1)
-- Output options
cmd:option('-print_iter', 50)
cmd:option('-save_iter', 100)
cmd:option('-output_image', 'out.png')
-- Other options
cmd:option('-style_scale', 1.0)
cmd:option('-pooling', 'max', 'max|avg')
cmd:option('-proto_file', 'models/VGG_ILSVRC_19_layers_deploy.prototxt')
cmd:option('-model_file', 'models/VGG_ILSVRC_19_layers.caffemodel')
cmd:option('-backend', 'nn', 'nn|cudnn|clnn')
cmd:option('-cudnn_autotune', false)
cmd:option('-seed', -1)
cmd:option('-content_layers', 'relu4_2', 'layers for content')
cmd:option('-style_layers', 'relu1_1,relu2_1,relu3_1,relu4_1,relu5_1', 'layers for style')
local function main(params)
if params.gpu >= 0 then
if params.backend ~= 'clnn' then
require 'cutorch'
require 'cunn'
cutorch.setDevice(params.gpu + 1)
else
require 'clnn'
require 'cltorch'
cltorch.setDevice(params.gpu + 1)
end
else
params.backend = 'nn'
end
if params.backend == 'cudnn' then
require 'cudnn'
if params.cudnn_autotune then
cudnn.benchmark = true
end
cudnn.SpatialConvolution.accGradParameters = nn.SpatialConvolutionMM.accGradParameters -- ie: nop
end
local loadcaffe_backend = params.backend
if params.backend == 'clnn' then loadcaffe_backend = 'nn' end
local cnn = loadcaffe.load(params.proto_file, params.model_file, loadcaffe_backend):float()
if params.gpu >= 0 then
if params.backend ~= 'clnn' then
cnn:cuda()
else
cnn:cl()
end
end
local content_image = image.load(params.content_image, 3)
content_image = image.scale(content_image, params.image_size, 'bilinear')
local content_image_caffe = preprocess(content_image):float()
local style_size = math.ceil(params.style_scale * params.image_size)
local style_image_list = params.style_image:split(',')
local style_images_caffe = {}
for _, img_path in ipairs(style_image_list) do
local img = image.load(img_path, 3)
img = image.scale(img, style_size, 'bilinear')
local img_caffe = preprocess(img):float()
table.insert(style_images_caffe, img_caffe)
end
-- Handle style blending weights for multiple style inputs
local style_blend_weights = nil
if params.style_blend_weights == 'nil' then
-- Style blending not specified, so use equal weighting
style_blend_weights = {}
-- # gives the length of a Lua table
for i = 1, #style_image_list do
table.insert(style_blend_weights, 1.0)
end
else
style_blend_weights = params.style_blend_weights:split(',')
assert(#style_blend_weights == #style_image_list,
'-style_blend_weights and -style_images must have the same number of elements')
end
-- Normalize the style blending weights so they sum to 1
local style_blend_sum = 0
for i = 1, #style_blend_weights do
style_blend_weights[i] = tonumber(style_blend_weights[i])
style_blend_sum = style_blend_sum + style_blend_weights[i]
end
for i = 1, #style_blend_weights do
style_blend_weights[i] = style_blend_weights[i] / style_blend_sum
end
if params.gpu >= 0 then
if params.backend ~= 'clnn' then
content_image_caffe = content_image_caffe:cuda()
for i = 1, #style_images_caffe do
style_images_caffe[i] = style_images_caffe[i]:cuda()
end
else
content_image_caffe = content_image_caffe:cl()
for i = 1, #style_images_caffe do
style_images_caffe[i] = style_images_caffe[i]:cl()
end
end
end
local content_layers = params.content_layers:split(",")
local style_layers = params.style_layers:split(",")
-- Set up the network, inserting style and content loss modules
local content_losses, style_losses = {}, {}
local next_content_idx, next_style_idx = 1, 1
local net = nn.Sequential()
if params.tv_weight > 0 then
local tv_mod = nn.TVLoss(params.tv_weight):float()
if params.gpu >= 0 then
if params.backend ~= 'clnn' then
tv_mod:cuda()
else
tv_mod:cl()
end
end
net:add(tv_mod)
end
for i = 1, #cnn do
if next_content_idx <= #content_layers or next_style_idx <= #style_layers then
local layer = cnn:get(i)
local name = layer.name
local layer_type = torch.type(layer)
local is_pooling = (layer_type == 'cudnn.SpatialMaxPooling' or layer_type == 'nn.SpatialMaxPooling')
if is_pooling and params.pooling == 'avg' then
assert(layer.padW == 0 and layer.padH == 0)
local kW, kH = layer.kW, layer.kH
local dW, dH = layer.dW, layer.dH
local avg_pool_layer = nn.SpatialAveragePooling(kW, kH, dW, dH):float()
if params.gpu >= 0 then
if params.backend ~= 'clnn' then
avg_pool_layer:cuda()
else
avg_pool_layer:cl()
end
end
local msg = 'Replacing max pooling at layer %d with average pooling'
print(string.format(msg, i))
net:add(avg_pool_layer)
else
-- If this is not a pooling layer, add it to the net as-is
net:add(layer)
end
if name == content_layers[next_content_idx] then
print("Setting up content layer", i, ":", layer.name)
-- If this layer is a content layer, insert a loss module after it; the loss module is constructed from content_weight, target, and norm.
-- target is the content image forwarded through all layers added so far: target = net:forward(content_image_caffe):clone().
local target = net:forward(content_image_caffe):clone()
local norm = params.normalize_gradients
local loss_module = nn.ContentLoss(params.content_weight, target, norm):float()
if params.gpu >= 0 then
if params.backend ~= 'clnn' then
loss_module:cuda()
else
loss_module:cl()
end
end
net:add(loss_module)
table.insert(content_losses, loss_module)
next_content_idx = next_content_idx + 1
end
if name == style_layers[next_style_idx] then
print("Setting up style layer ", i, ":", layer.name)
local gram = GramMatrix():float()
if params.gpu >= 0 then
if params.backend ~= 'clnn' then
gram = gram:cuda()
else
gram = gram:cl()
end
end
local target = nil
-- style_images_caffe is a table of style-image tensors.
-- Forward each style_images_caffe[i] through the net built so far, pass the output through gram,
-- normalize by the number of elements, and scale by that style image's blend weight, accumulating into target.
for i = 1, #style_images_caffe do
local target_features = net:forward(style_images_caffe[i]):clone()
local target_i = gram:forward(target_features):clone()
target_i:div(target_features:nElement())
target_i:mul(style_blend_weights[i])
if i == 1 then
target = target_i
else
target:add(target_i)
end
end
local norm = params.normalize_gradients
local loss_module = nn.StyleLoss(params.style_weight, target, norm):float()
if params.gpu >= 0 then
if params.backend ~= 'clnn' then
loss_module:cuda()
else
loss_module:cl()
end
end
net:add(loss_module)
table.insert(style_losses, loss_module)
next_style_idx = next_style_idx + 1
end
end
end
-- We don't need the base CNN anymore, so clean it up to save memory.
cnn = nil
for i=1,#net.modules do
local module = net.modules[i]
if torch.type(module) == 'nn.SpatialConvolutionMM' then
-- remove these, not used, but uses gpu memory
module.gradWeight = nil
module.gradBias = nil
end
end
collectgarbage()
-- Initialize the image
if params.seed >= 0 then
torch.manualSeed(params.seed)
end
local img = nil
if params.init == 'random' then
img = torch.randn(content_image:size()):float():mul(0.001)
elseif params.init == 'image' then
img = content_image_caffe:clone():float()
else
error('Invalid init type')
end
if params.gpu >= 0 then
if params.backend ~= 'clnn' then
img = img:cuda()
else
img = img:cl()
end
end
-- Run it through the network once to get the proper size for the gradient
-- All the gradients will come from the extra loss modules, so we just pass
-- zeros into the top of the net on the backward pass.
local y = net:forward(img)
local dy = img.new(#y):zero()
-- Declaring this here lets us access it in maybe_print
local optim_state = nil
if params.optimizer == 'lbfgs' then
optim_state = {
maxIter = params.num_iterations,
verbose=true,
}
elseif params.optimizer == 'adam' then
optim_state = {
learningRate = params.learning_rate,
}
else
error(string.format('Unrecognized optimizer "%s"', params.optimizer))
end
local function maybe_print(t, loss)
local verbose = (params.print_iter > 0 and t % params.print_iter == 0)
if verbose then
print(string.format('Iteration %d / %d', t, params.num_iterations))
for i, loss_module in ipairs(content_losses) do
print(string.format(' Content %d loss: %f', i, loss_module.loss))
end
for i, loss_module in ipairs(style_losses) do
print(string.format(' Style %d loss: %f', i, loss_module.loss))
end
print(string.format(' Total loss: %f', loss))
end
end
local function maybe_save(t)
local should_save = params.save_iter > 0 and t % params.save_iter == 0
should_save = should_save or t == params.num_iterations
if should_save then
local disp = deprocess(img:double())
disp = image.minmax{tensor=disp, min=0, max=1}
local filename = build_filename(params.output_image, t)
if t == params.num_iterations then
filename = params.output_image
end
image.save(filename, disp)
end
end
-- Function to evaluate loss and gradient. We run the net forward and
-- backward to get the gradient, and sum up losses from the loss modules.
-- optim.lbfgs internally handles iteration and calls this function many
-- times, so we manually count the number of iterations to handle printing
-- and saving intermediate results.
local num_calls = 0
local function feval(x)
num_calls = num_calls + 1
-- net:forward(x) makes every ContentLoss and StyleLoss module compute its loss
net:forward(x)
-- There is no ground-truth image, so dy is simply an all-zero tensor
local grad = net:updateGradInput(x, dy)
-- The total loss is the sum of the content-layer and style-layer losses; minimizing this sum is the objective.
local loss = 0
for _, mod in ipairs(content_losses) do
loss = loss + mod.loss
end
for _, mod in ipairs(style_losses) do
loss = loss + mod.loss
end
maybe_print(num_calls, loss)
maybe_save(num_calls)
collectgarbage()
-- The first return value is the loss to be minimized; the argument passed to feval is the variable being optimized.
-- optim.lbfgs expects a vector for gradients
return loss, grad:view(grad:nElement())
end
-- Run optimization.
if params.optimizer == 'lbfgs' then
print('Running optimization with L-BFGS')
local x, losses = optim.lbfgs(feval, img, optim_state)
elseif params.optimizer == 'adam' then
print('Running optimization with ADAM')
for t = 1, params.num_iterations do
local x, losses = optim.adam(feval, img, optim_state)
end
end
end
-- Helper functions used above
function build_filename(output_image, iteration)
local ext = paths.extname(output_image)
local basename = paths.basename(output_image, ext)
local directory = paths.dirname(output_image)
return string.format('%s/%s_%d.%s',directory, basename, iteration, ext)
end
-- Preprocess an image before passing it to a Caffe model.
-- We need to rescale from [0, 1] to [0, 255], convert from RGB to BGR,
-- and subtract the mean pixel.
function preprocess(img)
local mean_pixel = torch.DoubleTensor({103.939, 116.779, 123.68})
local perm = torch.LongTensor{3, 2, 1}
img = img:index(1, perm):mul(256.0)
mean_pixel = mean_pixel:view(3, 1, 1):expandAs(img)
img:add(-1, mean_pixel)
return img
end
-- Undo the above preprocessing.
function deprocess(img)
local mean_pixel = torch.DoubleTensor({103.939, 116.779, 123.68})
mean_pixel = mean_pixel:view(3, 1, 1):expandAs(img)
img = img + mean_pixel
local perm = torch.LongTensor{3, 2, 1}
img = img:index(1, perm):div(256.0)
return img
end
-- Define an nn Module to compute content loss in-place
local ContentLoss, parent = torch.class('nn.ContentLoss', 'nn.Module')
function ContentLoss:__init(strength, target, normalize)
parent.__init(self)
self.strength = strength
self.target = target
self.normalize = normalize or false
self.loss = 0
self.crit = nn.MSECriterion()
end
-- The standard pattern:
function ContentLoss:updateOutput(input)
if input:nElement() == self.target:nElement() then
-- A ContentLoss layer sits right after a layer such as relu1_1 and only computes a loss. Its target is that layer's output
-- for the content image, i.e. net:forward(content_image_caffe):clone(), where net at that point only reaches relu1_1.
-- A custom layer usually overrides the two functions updateOutput and updateGradInput.
-- Note: only the output and input-gradient updates; there is no weight update here!
-- updateOutput only needs the input, while updateGradInput needs both input and dLoss_dOutput to backpropagate.
-- Since this is the content loss layer, updateOutput computes self.loss = self.crit:forward(input, self.target) * self.strength.
self.loss = self.crit:forward(input, self.target) * self.strength
else
print('WARNING: Skipping content loss')
end
-- This layer must still produce an output, but it does not change it, so self.output = input;
self.output = input
return self.output
end
-- How to write a custom updateGradInput: it naturally takes input and gradOutput.
-- Its job is to compute dLoss_dInput; for the content loss layer this is self.crit:backward(input, self.target),
-- after which the returned gradInput gets the custom adjustments below.
function ContentLoss:updateGradInput(input, gradOutput)
if input:nElement() == self.target:nElement() then
self.gradInput = self.crit:backward(input, self.target)
end
if self.normalize then
self.gradInput:div(torch.norm(self.gradInput, 1) + 1e-8)
end
self.gradInput:mul(self.strength)
self.gradInput:add(gradOutput)
return self.gradInput
end
-- Returns a network that computes the CxC Gram matrix from inputs
-- of size C x H x W
function GramMatrix()
local net = nn.Sequential()
-- flatten the C x H x W input into a C x (H*W) matrix
net:add(nn.View(-1):setNumInputDims(2))
local concat = nn.ConcatTable()
concat:add(nn.Identity())
concat:add(nn.Identity())
net:add(concat)
net:add(nn.MM(false, true))
return net
end
-- Define an nn Module to compute style loss in-place
local StyleLoss, parent = torch.class('nn.StyleLoss', 'nn.Module')
function StyleLoss:__init(strength, target, normalize)
parent.__init(self)
self.normalize = normalize or false
self.strength = strength
self.target = target
self.loss = 0
self.gram = GramMatrix()
self.G = nil
self.crit = nn.MSECriterion()
end
function StyleLoss:updateOutput(input)
self.G = self.gram:forward(input)
self.G:div(input:nElement())
self.loss = self.crit:forward(self.G, self.target)
self.loss = self.loss * self.strength
self.output = input
return self.output
end
function StyleLoss:updateGradInput(input, gradOutput)
local dG = self.crit:backward(self.G, self.target)
dG:div(input:nElement())
self.gradInput = self.gram:backward(input, dG)
if self.normalize then
self.gradInput:div(torch.norm(self.gradInput, 1) + 1e-8)
end
self.gradInput:mul(self.strength)
self.gradInput:add(gradOutput)
return self.gradInput
end
local TVLoss, parent = torch.class('nn.TVLoss', 'nn.Module')
function TVLoss:__init(strength)
parent.__init(self)
self.strength = strength
self.x_diff = torch.Tensor()
self.y_diff = torch.Tensor()
end
function TVLoss:updateOutput(input)
self.output = input
return self.output
end
-- TV loss backward pass inspired by kaishengtai/neuralart
function TVLoss:updateGradInput(input, gradOutput)
self.gradInput:resizeAs(input):zero()
local C, H, W = input:size(1), input:size(2), input:size(3)
self.x_diff:resize(3, H - 1, W - 1)
self.y_diff:resize(3, H - 1, W - 1)
self.x_diff:copy(input[{{}, {1, -2}, {1, -2}}])
self.x_diff:add(-1, input[{{}, {1, -2}, {2, -1}}])
self.y_diff:copy(input[{{}, {1, -2}, {1, -2}}])
self.y_diff:add(-1, input[{{}, {2, -1}, {1, -2}}])
self.gradInput[{{}, {1, -2}, {1, -2}}]:add(self.x_diff):add(self.y_diff)
self.gradInput[{{}, {1, -2}, {2, -1}}]:add(-1, self.x_diff)
self.gradInput[{{}, {2, -1}, {1, -2}}]:add(-1, self.y_diff)
self.gradInput:mul(self.strength)
self.gradInput:add(gradOutput)
return self.gradInput
end
local params = cmd:parse(arg)
main(params)
cmd:option('-content_layers', 'relu4_2', 'layers for content')
cmd:option('-style_layers', 'relu1_1,relu2_1,relu3_1,relu4_1,relu5_1', 'layers for style')
So the last layer of net is relu5_1. Strictly speaking that glosses over the details: the code walks through the VGG model layer by layer; an ordinary layer is added as-is; a max-pooling layer is replaced with average pooling when -pooling avg is set; after each content layer a ContentLoss loss module is inserted; and after each style layer a StyleLoss loss module is inserted, each of which is built around a Gram network.
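With the default options the assembled net therefore looks roughly like this (a schematic sketch of the module order, not actual print(net) output; the conv/relu names follow the VGG-19 prototxt):
-- Schematic module order of the assembled net (illustrative only):
-- nn.Sequential {
--   nn.TVLoss,                         -- added first when -tv_weight > 0
--   conv1_1, relu1_1, nn.StyleLoss,    -- StyleLoss inserted after relu1_1
--   conv1_2, relu1_2, pool1,
--   conv2_1, relu2_1, nn.StyleLoss,
--   ...
--   conv4_2, relu4_2, nn.ContentLoss,  -- ContentLoss inserted after relu4_2
--   ...
--   conv5_1, relu5_1, nn.StyleLoss     -- construction stops after the last loss module
-- }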
Custom layers
Special classes such as the ContentLoss layer need to be defined by hand. A custom layer requires at least three overrides: __init, updateOutput, and updateGradInput, as in the sketch below.
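As a minimal sketch of that pattern (the class nn.ScaleBy and its behavior are invented here purely for illustration; it is not part of neural-style), a layer that scales its input by a constant would look like this:
-- Minimal custom-layer skeleton (hypothetical example, not from neural-style).
local ScaleBy, parent = torch.class('nn.ScaleBy', 'nn.Module')
function ScaleBy:__init(factor)
  parent.__init(self)
  self.factor = factor or 1
end
function ScaleBy:updateOutput(input)
  -- forward pass: output = factor * input
  self.output:resizeAs(input):copy(input):mul(self.factor)
  return self.output
end
function ScaleBy:updateGradInput(input, gradOutput)
  -- backward pass: dLoss_dInput = factor * dLoss_dOutput
  self.gradInput:resizeAs(gradOutput):copy(gradOutput):mul(self.factor)
  return self.gradInput
end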
The Gram matrix
GramMatrix builds a small network: a View layer flattens the input, and a ConcatTable duplicates the flattened input into two branches, each of which is just an nn.Identity(). After the concat is added to the net, nn.MM multiplies the two outputs together, transposing one of them. The function returns this custom gramNet; it helps to treat gramNet as a single unit.
function GramMatrix()
local net = nn.Sequential()
-- flatten the C x H x W input into a C x (H*W) matrix
net:add(nn.View(-1):setNumInputDims(2))
local concat = nn.ConcatTable()
concat:add(nn.Identity())
concat:add(nn.Identity())
net:add(concat)
net:add(nn.MM(false, true))
return net
end
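To see what this computes, trace the shapes through the network (the feature-map size 64 x 32 x 32 below is just an assumed example):
-- Shape walk-through (example sizes assumed for illustration):
local gram = GramMatrix()
local features = torch.randn(64, 32, 32)  -- a C x H x W feature map
-- nn.View(-1):setNumInputDims(2) flattens it to a 64 x 1024 matrix F,
-- nn.ConcatTable duplicates F, and nn.MM(false, true) computes F * F^T.
local G = gram:forward(features)
print(G:size())                           -- 64 x 64: the Gram matrix
Back in main, each style image's features are turned into such a Gram matrix, normalized, and blended into the target: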
local target = nil
for i = 1, #style_images_caffe do
local target_features = net:forward(style_images_caffe[i]):clone()
local target_i = gram:forward(target_features):clone()
target_i:div(target_features:nElement())
target_i:mul(style_blend_weights[i])
if i == 1 then
target = target_i
else
target:add(target_i)
end
end
Then a StyleLoss loss module is appended to the net built so far (i.e. the net up to the current style layer). The loss module is initialized as:
local norm = params.normalize_gradients
local loss_module = nn.StyleLoss(params.style_weight, target, norm):float()
In StyleLoss, __init contains:
...
self.gram = GramMatrix()
self.G = nil
self.crit = nn.MSECriterion()
updateOutput is fairly easy to follow:
function StyleLoss:updateOutput(input)
self.G = self.gram:forward(input)
self.G:div(input:nElement())
self.loss = self.crit:forward(self.G, self.target)
self.loss = self.loss * self.strength
self.output = input
return self.output
end
By the time updateGradInput runs, self.G holds the Gram matrix actually produced for the image being generated (whose input starts as random noise), while the desired output is self.target (the Gram matrix obtained by forwarding the style image up to this layer). Hence
self.crit:backward(self.G, self.target) yields dLoss_dG, and self.gram:backward(input, dG) then backpropagates through the Gram network to yield gradInput.
function StyleLoss:updateGradInput(input, gradOutput)
-- In updateOutput, self.G became the output of gramNet, so dG is obtained as follows.
local dG = self.crit:backward(self.G, self.target)
dG:div(input:nElement())
self.gradInput = self.gram:backward(input, dG)
if self.normalize then
self.gradInput:div(torch.norm(self.gradInput, 1) + 1e-8)
end
self.gradInput:mul(self.strength)
-- Why is gradOutput added here?? (See the explanation after the feval code below.)
self.gradInput:add(gradOutput)
return self.gradInput
end
local y = net:forward(img)
local dy = img.new(#y):zero()
local num_calls = 0
local function feval(x)
num_calls = num_calls + 1
-- net:forward(x) makes every ContentLoss and StyleLoss module compute its loss
net:forward(x)
-- There is no ground-truth image, so dy is simply an all-zero tensor
local grad = net:updateGradInput(x, dy)
-- The total loss is the sum of the content-layer and style-layer losses; minimizing this sum is the objective.
local loss = 0
for _, mod in ipairs(content_losses) do
loss = loss + mod.loss
end
for _, mod in ipairs(style_losses) do
loss = loss + mod.loss
end
maybe_print(num_calls, loss)
maybe_save(num_calls)
collectgarbage()
-- The first return value is the loss to be minimized; the argument passed to feval is the variable being optimized.
-- optim.lbfgs expects a vector for gradients
return loss, grad:view(grad:nElement())
end
-- Run optimization.
if params.optimizer == 'lbfgs' then
print('Running optimization with L-BFGS')
local x, losses = optim.lbfgs(feval, img, optim_state)
elseif params.optimizer == 'adam' then
print('Running optimization with ADAM')
for t = 1, params.num_iterations do
local x, losses = optim.adam(feval, img, optim_state)
end
end
When there are multiple style/content loss modules, the gradients generated by each module must be passed all the way back to the input layer. If we did not add the gradient coming from the modules above the current layer, a loss module's gradient would only survive until the next loss module encountered on the backward pass and would be discarded there, so it would never propagate all the way to the input layer as we need it to.
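A toy check of this accumulation (a hypothetical setup that assumes the nn.ContentLoss class defined above has been loaded): stack two content loss modules directly, backpropagate zeros from the top, and the gradient reaching the input is the sum of both modules' gradients.
-- Toy check (hypothetical): gradients from both loss modules reach the input.
local t1 = torch.ones(2)        -- target of the first loss module
local t2 = torch.ones(2):mul(2) -- target of the second loss module
local toy = nn.Sequential()
toy:add(nn.ContentLoss(1, t1, false))
toy:add(nn.ContentLoss(1, t2, false))
local x = torch.zeros(2)
toy:forward(x)
local g = toy:updateGradInput(x, torch.zeros(2))
-- MSECriterion gradients are 2*(x - t)/n, i.e. -1 and -2 per element here;
-- because each module adds its incoming gradOutput, g is their sum: {-3, -3}.
print(g)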
More complex code may be analyzed later, mainly the usage of ConcatTable together with modules such as nn.Replicate and nn.CDivTable. Many operations can be implemented by composing built-in layers instead of writing the underlying tensor operations yourself, which is rather pleasant.
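As one hedged illustration of that idea (a toy example of my own, not from neural-style): softsign(x) = x / (1 + |x|) can be assembled entirely from stock modules, with an nn.ConcatTable feeding an nn.CDivTable.
-- softsign(x) = x / (1 + |x|) built purely from built-in nn modules (toy example):
require 'nn'
local softsign = nn.Sequential()
local branches = nn.ConcatTable()
branches:add(nn.Identity())      -- numerator: x
branches:add(nn.Sequential()
  :add(nn.Abs())                 -- |x|
  :add(nn.AddConstant(1)))       -- denominator: 1 + |x|
softsign:add(branches)
softsign:add(nn.CDivTable())     -- elementwise x / (1 + |x|)
print(softsign:forward(torch.Tensor{-3, 0, 1})) -- -0.75, 0, 0.5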