This post analyzes the most important part of the codebase, LanguageModel.lua. The file has two parts: the nn.LanguageModel class (inheriting from nn.Module) and the nn.LanguageModelCriterion class (inheriting from nn.Criterion). Of the methods nn.LanguageModel uses to generate captions, I only walk through sample() here; given my limited experience I will leave sample_beam() for a later post once I have had time to read up on it, and in any case the original author also uses sample() for prediction.
The three required files below were all covered in my earlier posts, which you can refer back to.
require 'nn'
local utils = require 'misc.utils'
local net_utils = require 'misc.net_utils'
local LSTM = require 'misc.LSTM'
This function initializes the various parameters of the LanguageModel; I explain each one in the comments below.
function layer:__init(opt)
parent.__init(self)
-- options for core network
-- size of the vocabulary, i.e. how many distinct words there are
self.vocab_size = utils.getopt(opt, 'vocab_size') -- required
-- size of the joint embedding space for images and words: both the word vectors and the image feature are mapped to input_encoding_size before being fed into the LSTM
self.input_encoding_size = utils.getopt(opt, 'input_encoding_size')
-- size of the LSTM's hidden (recurrent) state
self.rnn_size = utils.getopt(opt, 'rnn_size')
-- number of LSTM layers
self.num_layers = utils.getopt(opt, 'num_layers', 1)
-- dropout rate, a standard regularization technique
local dropout = utils.getopt(opt, 'dropout', 0)
-- options for Language Model
-- caption length, i.e. the maximum length of the sentence describing each image
self.seq_length = utils.getopt(opt, 'seq_length')
-- create the core lstm network. note +1 for both the START and END tokens
-- create the core LSTM network; the +1 is for the special START/END token
self.core = LSTM.lstm(self.input_encoding_size, self.vocab_size + 1, self.rnn_size, self.num_layers, dropout)
-- nn.LookupTable also inherits from nn.Module in Torch, so the line below builds a mapping from the vocab_size+1 token indices to input_encoding_size vectors, i.e. an encoding step
self.lookup_table = nn.LookupTable(self.vocab_size + 1, self.input_encoding_size)
-- create the initial state; its entries correspond one-to-one to the LSTM's prev_c and prev_h
self:_createInitState(1) -- will be lazily resized later during forward passes
end
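To see how these options fit together, here is a minimal construction sketch. The concrete numbers (a vocabulary of 9567 words, 512-dimensional encoding, and so on) are hypothetical values I picked for illustration, and I am assuming the class is registered as nn.LanguageModel via torch.class, as in the original repository:
-- hypothetical option values, for illustration only
local opt = {}
opt.vocab_size = 9567          -- number of distinct words in the vocabulary
opt.input_encoding_size = 512  -- joint image/word embedding size
opt.rnn_size = 512             -- LSTM hidden state size
opt.num_layers = 1             -- number of stacked LSTM layers
opt.dropout = 0.5              -- dropout applied inside the LSTM
opt.seq_length = 16            -- maximum caption length
local lm = nn.LanguageModel(opt)  -- this calls layer:__init(opt) above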
This method is called at initialization time to set up the LSTM's prev_c and prev_h inputs; in the backward pass the same zero tensors are reused to initialize the gradients of prev_c and prev_h.
function layer:_createInitState(batch_size)
assert(batch_size ~= nil, 'batch size must be provided')
-- construct the initial state for the LSTM
if not self.init_state then self.init_state = {} end -- lazy init
for h=1,self.num_layers*2 do
-- note, the init state Must be zeros because we are using init_state to init grads in backward call too
-- note: num_layers*2 entries, because every layer has both a prev_c and a prev_h
-- check whether each initial state already matches batch_size; if not, resize it to batch_size
-- prev_c and prev_h are stored as separate entries of init_state here
if self.init_state[h] then
if self.init_state[h]:size(1) ~= batch_size then
self.init_state[h]:resize(batch_size, self.rnn_size):zero() -- expand the memory
end
else
self.init_state[h] = torch.zeros(batch_size, self.rnn_size)
end
end
--record how many state tensors there are
--num_state equals num_layers*2
self.num_state = #self.init_state
end
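To make the layout concrete, a small check of the state structure, using the hypothetical lm from above (rnn_size = 512, num_layers = 1) and a hypothetical batch size of 16:
lm:_createInitState(16)         -- resize the initial state to batch size 16
print(#lm.init_state)           -- 2 with num_layers = 1: one prev_c and one prev_h per layer
print(lm.init_state[1]:size())  -- 16 x 512, all zeros
print(lm.num_state)             -- 2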
Creates the clones of the network. Note that these clones share their parameters with the original, so a clone changes whenever the original changes.
function layer:createClones()
-- construct the net clones
print('constructing clones inside the LanguageModel')
--the core network itself acts as clone number 1 and can be looked up as self.clones[1]
self.clones = {self.core}
--likewise clone the lookup_table layer
self.lookup_tables = {self.lookup_table}
--we need self.seq_length+2 clones in total: step 1 takes the image, step 2 takes the special START token, and steps 3 to self.seq_length+2 take the caption words
for t=2,self.seq_length+2 do
--from the Torch docs: if arguments are provided to the clone(...) function it also calls share(...) with those arguments on the cloned module after creating it,
--hence making a deep copy of this module with some shared parameters. So the parameters of every clones[t] here are shared with self.core
self.clones[t] = self.core:clone('weight', 'bias', 'gradWeight', 'gradBias')
self.lookup_tables[t] = self.lookup_table:clone('weight', 'gradWeight')
end
end
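To see what this parameter sharing means in practice, here is a tiny standalone sketch; it uses a plain nn.Linear instead of the LSTM core, purely for illustration:
-- standalone illustration of clone-with-sharing, not part of the original file
local m = nn.Linear(4, 4)
local c = m:clone('weight', 'bias', 'gradWeight', 'gradBias')
m.weight:fill(1)       -- modify the original's weights
print(c.weight[1][1])  -- 1: the clone sees the change, because the weights are shared
c.weight:fill(2)       -- and the sharing works in both directions
print(m.weight[1][1])  -- 2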
getModulesList() simply returns the LanguageModel's core modules as a list.
function layer:getModulesList()
return {self.core, self.lookup_table}
end
I will not go over parameters(), training() and evaluate(); they are easy to follow.
sample() is the key method: once the model is trained, it takes images as input and generates the captions. My understanding here is limited, so there may be some mistakes; please bear with me.
--[[
takes a batch of images and runs the model forward in sampling mode
Careful: make sure model is in :evaluate() mode if you're calling this.
Returns: a DxN LongTensor with integer elements 1..M,
where D is sequence length and N is batch (so columns are sequences)
--]]
function layer:sample(imgs, opt)
--read the three sampling options: sample_max chooses between argmax decoding and sampling, beam_size > 1 switches to beam search, and temperature rescales the distribution when sampling
local sample_max = utils.getopt(opt, 'sample_max', 1)
local beam_size = utils.getopt(opt, 'beam_size', 1)
local temperature = utils.getopt(opt, 'temperature', 1.0)
if sample_max == 1 and beam_size > 1 then return self:sample_beam(imgs, opt) end -- indirection for beam search
--a note on the format of imgs: it is a 2D tensor whose first dimension is the batch size and whose second dimension is encoding_size, i.e. the images passed in here have already gone through the CNN and are image features
local batch_size = imgs:size(1)
self:_createInitState(batch_size)
--state stores prev_c and prev_h
local state = self.init_state
-- we will write output predictions into tensor seq
-- seq stores the predicted sequence, and seqLogprobs stores the log probability of each corresponding element
local seq = torch.LongTensor(self.seq_length, batch_size):zero()
local seqLogprobs = torch.FloatTensor(self.seq_length, batch_size)
local logprobs -- logprobs predicted in last time step
for t=1,self.seq_length+2 do
--xt is the vector x fed into the LSTM, it holds the sampled token indices, and sampleLogprobs holds the log probabilities of the sampled tokens
local xt, it, sampleLogprobs
--when t == 1, this step's input is the image information
if t == 1 then
-- feed in the images
xt = imgs
elseif t == 2 then
-- feed in the start tokens
-- it is a LongTensor of length batch_size whose every element is the START token index vocab_size+1 (not a one-hot vector)
it = torch.LongTensor(batch_size):fill(self.vocab_size+1)
--map the START token into an encoding_size-dimensional vector
xt = self.lookup_table:forward(it)
else
-- take predictions from previous time step and feed them in
if sample_max == 1 then
-- use argmax "sampling"
--take the largest prediction (argmax) as the predicted token
sampleLogprobs, it = torch.max(logprobs, 2)
it = it:view(-1):long()
else
--I found this part the least clear: it samples a token from the previous step's predicted distribution instead of taking the argmax, optionally rescaled by temperature (a small worked example follows after this function)
-- sample from the distribution of previous predictions
local prob_prev
if temperature == 1.0 then
prob_prev = torch.exp(logprobs) -- fetch prev distribution: shape Nx(M+1)
else
-- scale logprobs by temperature
prob_prev = torch.exp(torch.div(logprobs, temperature))
end
it = torch.multinomial(prob_prev, 1)
sampleLogprobs = logprobs:gather(2, it) -- gather the logprobs at sampled positions
it = it:view(-1):long() -- and flatten indices for downstream processing
end
xt = self.lookup_table:forward(it)
end
if t >= 3 then
--store the predicted token it into the prediction sequence seq; seqLogprobs likewise
seq[t-2] = it -- record the samples
seqLogprobs[t-2] = sampleLogprobs:view(-1):float() -- and also their log likelihoods
end
--inputs to this LSTM step: xt plus prev_c and prev_h
local inputs = {xt,unpack(state)}
--out is the output table of the LSTM step; its last element is the prediction vector of size vocab_size+1
local out = self.core:forward(inputs)
--take the prediction from the last element of out
--logprobs is an N x (vocab_size+1) tensor of log probabilities (the LSTM core ends in a LogSoftMax); the argmax above picks the predicted token out of it
logprobs = out[self.num_state+1] -- last element is the output vector
--store the state part of out; it becomes the input state of the next time step
state = {}
for i=1,self.num_state do table.insert(state, out[i]) end
end
-- return the samples and their log likelihoods
return seq, seqLogprobs
end
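A rough usage sketch, assuming lm is a trained LanguageModel and feats is a batch_size x input_encoding_size tensor of CNN features (both names are hypothetical), followed by the promised worked example of temperature scaling:
lm:evaluate()  -- sample() expects the model to be in evaluate mode
local seq, seqLogprobs = lm:sample(feats, {sample_max = 1})  -- greedy argmax decoding
-- seq is seq_length x batch_size; column j is the predicted caption (token indices) for image j
-- with sample_max = 0 the next token is sampled from the distribution instead:
-- local seq2 = lm:sample(feats, {sample_max = 0, temperature = 0.7})
-- temperature below 1 sharpens the distribution, above 1 flattens it. For example, for two
-- log probabilities {-0.2, -1.7} (roughly 0.82 vs 0.18), dividing by temperature 0.5 gives
-- {-0.4, -3.4}, and exponentiating yields roughly 0.95 vs 0.05 after normalization.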
Because nn.LanguageModel inherits from nn.Module and is a brand-new module, it has to override updateOutput.
--[[
input is a tuple of:
1. torch.Tensor of size NxK (K is dim of image code)
2. torch.LongTensor of size DxN, elements 1..M
where M = opt.vocab_size and D = opt.seq_length
returns a (D+2)xNx(M+1) Tensor giving (normalized) log probabilities for the
next token at every iteration of the LSTM (+2 because +1 for first dummy
img forward, and another +1 because of START/END tokens shift)
--]]
-- N is batch_size
-- note that during training the input sequence is fixed: it is the ground-truth caption (the correct labels)
function layer:updateOutput(input)
--imgs here has already been converted into encoding_size vectors
local imgs = input[1]
local seq = input[2]
--make sure the clones have been created
if self.clones == nil then self:createClones() end -- lazily create clones on first forward pass
--seq is a 2D tensor: the first dimension has size seq_length and the second has size batch_size
assert(seq:size(1) == self.seq_length)
local batch_size = seq:size(2)
--allocate the network's output tensor
self.output:resize(self.seq_length+2, batch_size, self.vocab_size+1)
self:_createInitState(batch_size)
--create the table of LSTM states
self.state = {[0] = self.init_state}
--create the table of per-step inputs
self.inputs = {}
--create the inputs of the encoding layer lookup_tables
self.lookup_tables_inputs = {}
--track the real length of the sequences
self.tmax = 0 -- we will keep track of max sequence length encountered in the data for efficiency
for t=1,self.seq_length+2 do
--can_skip decides whether the rest of this iteration is skipped, acting like a continue statement
local can_skip = false
local xt
if t == 1 then
-- feed in the images
-- N is batch_size, K is encoding_size
xt = imgs -- NxK sized input
elseif t == 2 then
-- feed in the start tokens
local it = torch.LongTensor(batch_size):fill(self.vocab_size+1)
--record the lookup table input
self.lookup_tables_inputs[t] = it
--forward it through the lookup table to obtain the LSTM input xt
xt = self.lookup_tables[t]:forward(it) -- NxK sized input (token embedding vectors)
else
-- feed in the rest of the sequence...
-- fetch the token indices that feed the lookup_table
local it = seq[t-2]:clone()
--check whether it is all zeros: it is a vector of length batch_size, and sum(it) == 0 means all batch_size sequences have already terminated, so we set can_skip to true and skip the rest
if torch.sum(it) == 0 then
-- computational shortcut for efficiency. All sequences have already terminated and only
-- contain null tokens from here on. We can skip the rest of the forward pass and save time
can_skip = true
end
--[[
seq may contain zeros as null tokens, make sure we take them out to any arbitrary token
that won't make lookup_table crash with an error.
token #1 will do, arbitrarily. This will be ignored anyway
because we will carefully set the loss to zero at these places
in the criterion, so computation based on this value will be noop for the optimization.
--]]
--replace null tokens for the sequences that have already terminated, so the lookup table does not error
it[torch.eq(it,0)] = 1
--compute xt
if not can_skip then
self.lookup_tables_inputs[t] = it
xt = self.lookup_tables[t]:forward(it)
end
end
if not can_skip then
-- construct the inputs
--assemble the inputs in the format the LSTM expects
self.inputs[t] = {xt,unpack(self.state[t-1])}
-- forward the network
-- forward the LSTM; out is the output table whose last element is the prediction vector
local out = self.clones[t]:forward(self.inputs[t])
-- process the outputs
-- update the output
self.output[t] = out[self.num_state+1] -- last element is the output vector
-- store the state
self.state[t] = {} -- the rest is state
for i=1,self.num_state do table.insert(self.state[t], out[i]) end
--record the last time step actually reached for this batch of training data
self.tmax = t
end
end
return self.output
end
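For reference, a minimal sketch of the training-time forward call, reusing the hypothetical feats from above and a hypothetical labels tensor standing in for the seq_length x batch_size LongTensor of ground-truth token indices:
local logprobs = lm:forward{feats, labels}
-- logprobs has size (seq_length+2) x batch_size x (vocab_size+1): step 1 is the dummy image
-- step, step 2 predicts the first word (the step that consumed the START token), and the
-- remaining steps predict the following words and finally the END token
print(logprobs:size())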
It likewise overrides updateGradInput(input, gradOutput).
--D is seq_length, M is vocab_size, N is batch_size
--[[
gradOutput is an (D+2)xNx(M+1) Tensor.
--]]
function layer:updateGradInput(input, gradOutput)
local dimgs -- grad on input images
-- go backwards and lets compute gradients
local dstate = {[self.tmax] = self.init_state} -- this works when init_state is all zeros
--note that self.tmax is at most seq_length+2
for t=self.tmax,1,-1 do
-- concat state gradients and output vector gradients at time step t
local dout = {}
--insert the gradient states into dout; they come in three kinds: the cell-state gradients, the hidden-state gradients, and the gradient of the output vector
for k=1,#dstate[t] do table.insert(dout, dstate[t][k]) end
table.insert(dout,gradOutput[t])
--dout here is a plain Lua table; this works because the core LSTM network is an nngraph gModule, whose backward accepts a table of gradients (I initially assumed it was an ordinary nn.Module and got tripped up). Note also that dout is created fresh on every iteration
local dinputs = self.clones[t]:backward(self.inputs[t], dout)
-- split the gradient to xt and to state
--dinputs likewise has three kinds of entries: the gradient w.r.t. xt, the prev_c gradients and the prev_h gradients
local dxt = dinputs[1] -- first element is the input vector
dstate[t-1] = {} -- copy over rest to state grad
--store the state gradients
for k=2,self.num_state+1 do table.insert(dstate[t-1], dinputs[k]) end
-- continue backprop of xt
if t == 1 then
--propagate the gradient back to the image features
dimgs = dxt
else
--propagate the gradient back into the lookup_table layer
local it = self.lookup_tables_inputs[t]
self.lookup_tables[t]:backward(it, dxt) -- backprop into lookup table
end
end
-- we have gradient on image, but for LongTensor gt sequence we only create an empty tensor - can't backprop
-- only the gradient w.r.t. the image features is returned; the integer token sequence is the end of the line and cannot receive a gradient
self.gradInput = {dimgs, torch.Tensor()}
return self.gradInput
end
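A small sketch of what the backward pass hands back, with the same hypothetical names as above; dlogprobs is the gradient coming from the criterion (see the sketch after the criterion below):
local dinputs = lm:backward({feats, labels}, dlogprobs)
local dfeats = dinputs[1]  -- gradient w.r.t. the image features, same size as feats;
                           -- this is the part that can be propagated further back toward the CNN
-- dinputs[2] is just an empty tensor: the integer label sequence receives no gradient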
nn.LanguageModelCriterion computes the final loss for the language model and inherits from nn.Criterion. A word on the evaluation criterion first: at every time step the model outputs a vector of size vocab_size+1 that has passed through a LogSoftMax, so its elements are normalized log probabilities; at prediction time the index of the largest element is taken as the predicted word. The criterion adds the negative log probability at the ground-truth index to the total loss and sets the gradient at that position to -1.
function crit:updateOutput(input, seq)
--reset the gradInput tensor
self.gradInput:resizeAs(input):zero() -- reset to zeros
--read off the input dimensions: L is seq_length+2, N is batch_size, Mp1 is vocab_size+1
local L,N,Mp1 = input:size(1), input:size(2), input:size(3)
--D is seq_length
local D = seq:size(1)
assert(D == L-2, 'input Tensor should be 2 larger in time')
local loss = 0
local n = 0
for b=1,N do -- iterate over batches
local first_time = true
for t=2,L do -- iterate over sequence time (ignore t=1, dummy forward for the image)
-- fetch the index of the next token in the sequence
-- index of the target token
local target_index
if t-1 > D then -- we are out of bounds of the index sequence: pad with null tokens
target_index = 0
else
--fetch the target index; the original author explains in the comment below why it is t-1
target_index = seq[{t-1,b}] -- t-1 is correct, since at t=2 START token was fed in and we want to predict first word (and 2-1 = 1).
end
-- the first time we see null token as next index, actually want the model to predict the END token
if target_index == 0 and first_time then
--this marks the END token
target_index = Mp1
first_time = false
end
-- if there is a non-null next token, enforce loss!
if target_index ~= 0 then
-- accumulate loss
--input[{ t,b,target_index }] is the same as input[t][b][target_index]
loss = loss - input[{ t,b,target_index }] -- log(p)
--set the gradient at the target position to -1 and leave everything else at 0
self.gradInput[{ t,b,target_index }] = -1
n = n + 1
end
end
end
--compute the final output
self.output = loss / n -- normalize by number of predictions that were made
self.gradInput:div(n)
return self.output
end
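Putting the two classes together, here is a condensed sketch of one loss computation; the loss is the average negative log-likelihood of the ground-truth tokens, i.e. loss = -(1/n) * sum of log p(target). I am assuming, as in the original repository, that the criterion's backward simply returns the gradInput that updateOutput above already filled in:
local crit = nn.LanguageModelCriterion()
local logprobs = lm:forward{feats, labels}         -- (seq_length+2) x N x (vocab_size+1)
local loss = crit:forward(logprobs, labels)        -- average negative log-likelihood
local dlogprobs = crit:backward(logprobs, labels)  -- -1/n at each target index, 0 elsewhere
local dinputs = lm:backward({feats, labels}, dlogprobs)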
That's it for LanguageModel.lua. The main reason I read this code slowly is that I am new to Torch and Lua, so I have to learn them while reading; my ultimate goal is to run my own experiments. After the last module, train.lua, I will go quiet for a few months, occasionally posting about the latest developments and the big names' progress in the small field of image captioning. Keep at it! Fighting!