neuraltalk2 Code Walkthrough (3): LanguageModel.lua

This post covers the most important part of the codebase, LanguageModel.lua. The file consists of two parts: the nn.LanguageModel class (which inherits from nn.Module) and the nn.LanguageModelCriterion class (which inherits from nn.Criterion). Of the caption-generation methods in nn.LanguageModel I only walk through sample(), given my limited experience; I may cover sample_beam() later once I have had a chance to read up on the relevant material. The original author also uses sample() for prediction.

The three files required below were all covered in my earlier posts; feel free to look back at them.

require 'nn'
local utils = require 'misc.utils'
local net_utils = require 'misc.net_utils'
local LSTM = require 'misc.LSTM'

This function initializes the various parameters of the LanguageModel; I have annotated them with inline comments.

function layer:__init(opt)
  parent.__init(self)

  -- options for core network
  -- vocabulary size, i.e. the number of distinct words
  self.vocab_size = utils.getopt(opt, 'vocab_size') -- required
  -- size of the joint image/word embedding space: both the word vectors and the image feature are mapped to input_encoding_size before being fed to the LSTM
  self.input_encoding_size = utils.getopt(opt, 'input_encoding_size')
  -- size of the LSTM hidden state
  self.rnn_size = utils.getopt(opt, 'rnn_size')
  -- number of stacked LSTM layers
  self.num_layers = utils.getopt(opt, 'num_layers', 1)
  -- dropout rate, a standard regularization technique
  local dropout = utils.getopt(opt, 'dropout', 0)
  -- options for Language Model
  -- maximum caption length, i.e. the longest token sequence per image
  self.seq_length = utils.getopt(opt, 'seq_length')
  -- create the core lstm network. note +1 for both the START and END tokens
  self.core = LSTM.lstm(self.input_encoding_size, self.vocab_size + 1, self.rnn_size, self.num_layers, dropout)
  -- nn.LookupTable is also an nn.Module; this builds the embedding from the vocab_size+1 token indices
  -- to input_encoding_size-dimensional vectors, i.e. the word-encoding step
  self.lookup_table = nn.LookupTable(self.vocab_size + 1, self.input_encoding_size)
  -- create the initial state; its entries correspond one-to-one to the LSTM's prev_c and prev_h
  self:_createInitState(1) -- will be lazily resized later during forward passes
end
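
For reference, here is a minimal construction sketch. The option values below are hypothetical placeholders (in the real train.lua they come from the data loader and the command-line options), and it assumes you are in the neuraltalk2 root so that misc/LanguageModel.lua is on the path:

require 'nn'
require 'misc.LanguageModel'

local opt = {}
opt.vocab_size = 9567            -- hypothetical vocabulary size
opt.input_encoding_size = 512    -- joint image/word embedding size
opt.rnn_size = 512               -- LSTM hidden state size
opt.num_layers = 1
opt.dropout = 0.5
opt.seq_length = 16              -- maximum caption length
local lm = nn.LanguageModel(opt)
local crit = nn.LanguageModelCriterion()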

This method is called at initialization time; it sets up the LSTM's initial prev_c/prev_h inputs, and the same zero tensors are also reused to initialize the prev_c/prev_h gradients in the backward pass.

function layer:_createInitState(batch_size)
  assert(batch_size ~= nil, 'batch size must be provided')
  -- construct the initial state for the LSTM
  if not self.init_state then self.init_state = {} end -- lazy init
  for h=1,self.num_layers*2 do
    -- note, the init state Must be zeros because we are using init_state to init grads in backward call too
    -- note that num_layers*2 corresponds to the prev_c/prev_h pair of each layer
    -- check whether each initial state tensor matches batch_size; if not, resize it to batch_size
    -- prev_c and prev_h are stored as separate entries of the table
    if self.init_state[h] then
      if self.init_state[h]:size(1) ~= batch_size then
        self.init_state[h]:resize(batch_size, self.rnn_size):zero() -- expand the memory
      end
    else
      self.init_state[h] = torch.zeros(batch_size, self.rnn_size)
    end
  end
  -- record how many state tensors there are; num_state equals num_layers*2
  self.num_state = #self.init_state
end
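
As a standalone illustration (hypothetical sizes, not part of the original file), this is the structure _createInitState builds:

local num_layers, rnn_size, batch_size = 2, 512, 16
local init_state = {}
for h = 1, num_layers * 2 do            -- one (prev_c, prev_h) pair per layer
  init_state[h] = torch.zeros(batch_size, rnn_size)
end
print(#init_state)                      -- 4, i.e. {c1, h1, c2, h2}; num_state = 4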

createClones() creates the per-time-step copies of the network. Note that each clone shares its 'weight', 'bias', 'gradWeight' and 'gradBias' tensors with the original module, so when the original is updated the clones change along with it.

function layer:createClones()
  -- construct the net clones
  print('constructing clones inside the LanguageModel')
  -- the core network itself is stored as clone #1 and can be indexed as self.clones[1]
  self.clones = {self.core}
  -- same for the lookup_table layer
  self.lookup_tables = {self.lookup_table}
  -- there are seq_length+2 time steps in total: step 1 takes the image, step 2 the special START token,
  -- and steps 3..seq_length+2 take the caption tokens
  for t=2,self.seq_length+2 do
  -- If arguments are provided to the clone(...) function it also calls share(...) with those arguments
  -- on the cloned module after creating it, hence making a deep copy with some shared parameters.
  -- So the parameters inside clones[t] are all shared with self.core.
    self.clones[t] = self.core:clone('weight', 'bias', 'gradWeight', 'gradBias')
    self.lookup_tables[t] = self.lookup_table:clone('weight', 'gradWeight')
  end
end
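
A quick way to convince yourself of the sharing (a sketch that continues from the construction example above; illustration only, since it overwrites the weights): writing into the original core's parameters is visible through any clone, because clone('weight', 'bias', 'gradWeight', 'gradBias') shares the storage of those tensors.

lm:createClones()
local core_params  = lm.core:parameters()        -- table of weight/bias tensors
local clone_params = lm.clones[2]:parameters()   -- clones[1] is the original itself
core_params[1]:fill(0.123)                       -- write through the original...
print(clone_params[1][1][1])                     -- ...and the clone sees 0.123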

getModulesList(): this method returns the LanguageModel's core modules as a list.

function layer:getModulesList()
  return {self.core, self.lookup_table}
end

parameters(), training() and evaluate() are not discussed here; they are straightforward.

sample() is a key method: once the model is trained, it takes images as input and generates captions. Given my limited experience there may be some mistakes here, so please bear with me.

--[[
takes a batch of images and runs the model forward in sampling mode
Careful: make sure model is in :evaluate() mode if you're calling this.
Returns: a DxN LongTensor with integer elements 1..M,
where D is sequence length and N is batch (so columns are sequences)
--]]
function layer:sample(imgs, opt)
  -- read these three options; with the author's default settings you do not need to fully understand each of them
  local sample_max = utils.getopt(opt, 'sample_max', 1)
  local beam_size = utils.getopt(opt, 'beam_size', 1)
  local temperature = utils.getopt(opt, 'temperature', 1.0)
  if sample_max == 1 and beam_size > 1 then return self:sample_beam(imgs, opt) end -- indirection for beam search
  -- about the format of imgs: a 2-D tensor, first dimension batch_size, second dimension encoding_size,
  -- i.e. imgs has already been passed through the CNN and is the image feature
  local batch_size = imgs:size(1)
  self:_createInitState(batch_size)
  -- state stores prev_c and prev_h
  local state = self.init_state
  -- we will write output predictions into tensor seq
  -- seq stores the predicted token sequence; seqLogprobs stores the log-probability of each predicted token
  local seq = torch.LongTensor(self.seq_length, batch_size):zero()
  local seqLogprobs = torch.FloatTensor(self.seq_length, batch_size)
  local logprobs -- logprobs predicted in last time step
  for t=1,self.seq_length+2 do
    -- xt is the input vector x fed to the LSTM, it holds the token indices, and sampleLogprobs holds the log-probabilities of the sampled tokens
    local xt, it, sampleLogprobs
    -- at t == 1 the input for this step is the image feature
    if t == 1 then
      -- feed in the images
      xt = imgs
    elseif t == 2 then
      -- feed in the start tokens
      -- feed in the START token: it is a LongTensor of length batch_size with every element set to the START index vocab_size+1
      it = torch.LongTensor(batch_size):fill(self.vocab_size+1)
      -- embed the START token into an input_encoding_size-dimensional vector
      xt = self.lookup_table:forward(it)
    else
      -- take predictions from previous time step and feed them in
      if sample_max == 1 then
        -- use argmax "sampling"
        -- take the token with the highest log-probability as the prediction
        sampleLogprobs, it = torch.max(logprobs, 2)
        it = it:view(-1):long()
      else
        -- here a token is sampled from the predicted distribution instead of taking the argmax; the temperature
        -- scaling below sharpens (temperature < 1) or flattens (temperature > 1) that distribution
        -- sample from the distribution of previous predictions
        local prob_prev
        if temperature == 1.0 then
          prob_prev = torch.exp(logprobs) -- fetch prev distribution: shape Nx(M+1)
        else
          -- scale logprobs by temperature
          prob_prev = torch.exp(torch.div(logprobs, temperature))
        end
        it = torch.multinomial(prob_prev, 1)
        sampleLogprobs = logprobs:gather(2, it) -- gather the logprobs at sampled positions
        it = it:view(-1):long() -- and flatten indices for downstream processing
      end
      xt = self.lookup_table:forward(it)
    end

    if t >= 3 then
      -- store the predicted token it into the seq output; likewise for seqLogprobs
      seq[t-2] = it -- record the samples
      seqLogprobs[t-2] = sampleLogprobs:view(-1):float() -- and also their log likelihoods
    end
    -- inputs to this LSTM step: xt together with prev_c and prev_h
    local inputs = {xt,unpack(state)}
    -- out is the table returned by the LSTM step: the new state tensors plus, as the last element,
    -- a batch_size x (vocab_size+1) tensor of log-probabilities
    local out = self.core:forward(inputs)
    -- take the prediction output: logprobs is the last element of out, a batch_size x (vocab_size+1) tensor
    -- of log-probabilities (the output of a LogSoftMax, so all values are <= 0); the argmax branch above
    -- picks its row-wise maximum as the predicted token
    logprobs = out[self.num_state+1] -- last element is the output vector
    -- store the new state tensors from out into state, to be fed in at the next time step
    state = {}
    for i=1,self.num_state do table.insert(state, out[i]) end
  end
  -- return the samples and their log likelihoods
  return seq, seqLogprobs
end
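
A usage sketch for sample(): the names protos.cnn, protos.lm and vocab are assumptions borrowed from the surrounding train.lua/eval.lua context (they are not defined in this file), and decode_sequence comes from misc/net_utils.lua:

protos.lm:evaluate()                               -- sample assumes :evaluate() mode
local feats = protos.cnn:forward(images)           -- batch_size x input_encoding_size image codes
local seq, seqLogprobs = protos.lm:sample(feats, {sample_max = 1, temperature = 1.0})
-- seq is seq_length x batch_size; map the indices back to readable sentences
local sents = net_utils.decode_sequence(vocab, seq)
for k, sent in ipairs(sents) do print(sent) end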

Because nn.LanguageModel inherits from nn.Module and is a brand-new module, it has to override updateOutput.

--[[
input is a tuple of:
1. torch.Tensor of size NxK (K is dim of image code)
2. torch.LongTensor of size DxN, elements 1..M
   where M = opt.vocab_size and D = opt.seq_length

returns a (D+2)xNx(M+1) Tensor giving (normalized) log probabilities for the
next token at every iteration of the LSTM (+2 because +1 for first dummy
img forward, and another +1 because of START/END tokens shift)
--]]
-- N is batch_size
-- note that during training the input sequence is fixed: it is the ground-truth caption (the correct labels)
function layer:updateOutput(input)
  -- imgs has already been encoded into input_encoding_size-sized vectors
  local imgs = input[1]
  local seq = input[2]
  -- make sure the clones have been created
  if self.clones == nil then self:createClones() end -- lazily create clones on first forward pass
  -- seq is a 2-D tensor: first dimension seq_length, second dimension batch_size
  assert(seq:size(1) == self.seq_length)
  local batch_size = seq:size(2)
  -- allocate the network's output tensor
  self.output:resize(self.seq_length+2, batch_size, self.vocab_size+1)
  self:_createInitState(batch_size)
  -- the table of LSTM states, one entry per time step
  self.state = {[0] = self.init_state}
  -- the table of LSTM inputs, one entry per time step
  self.inputs = {}
  -- the inputs fed to the lookup_tables (embedding) layers
  self.lookup_tables_inputs = {}
  -- tracks the longest sequence actually seen in this batch
  self.tmax = 0 -- we will keep track of max sequence length encountered in the data for efficiency
  for t=1,self.seq_length+2 do
    -- can_skip controls whether the rest of this iteration is skipped, emulating a continue statement
    local can_skip = false
    local xt
    if t == 1 then
      -- feed in the images
      -- N is batch_size, K is input_encoding_size
      xt = imgs -- NxK sized input
    elseif t == 2 then
      -- feed in the start tokens
      local it = torch.LongTensor(batch_size):fill(self.vocab_size+1)
      -- record the embedding-layer input for this time step
      self.lookup_tables_inputs[t] = it
      -- forward through the lookup table to obtain the LSTM input xt
      xt = self.lookup_tables[t]:forward(it) -- NxK sized input (token embedding vectors)
    else
      -- feed in the rest of the sequence...
      -- fetch the token indices to feed into the lookup table
      local it = seq[t-2]:clone()
      -- it is a vector of length batch_size; sum(it) == 0 means all batch_size sequences have already ended,
      -- so can_skip is set to true and the rest of this time step is skipped
      if torch.sum(it) == 0 then
        -- computational shortcut for efficiency. All sequences have already terminated and only
        -- contain null tokens from here on. We can skip the rest of the forward pass and save time
        can_skip = true
      end
      --[[
        seq may contain zeros as null tokens, make sure we take them out to any arbitrary token
        that won't make lookup_table crash with an error.
        token #1 will do, arbitrarily. This will be ignored anyway
        because we will carefully set the loss to zero at these places
        in the criterion, so computation based on this value will be noop for the optimization.
      --]]
      -- guard against sequences that have already ended (a 0 index would crash the lookup table)
      it[torch.eq(it,0)] = 1
      -- compute xt
      if not can_skip then
        self.lookup_tables_inputs[t] = it
        xt = self.lookup_tables[t]:forward(it)
      end
    end

    if not can_skip then
      -- construct the inputs
      -- package the inputs in the format the LSTM expects
      self.inputs[t] = {xt,unpack(self.state[t-1])}
      -- forward the network
      -- forward through this time step's LSTM clone; out is a table whose last element is the output (log-probability) vector
      local out = self.clones[t]:forward(self.inputs[t])
      -- process the outputs
      -- store the output
      self.output[t] = out[self.num_state+1] -- last element is the output vector
      -- store the state
      self.state[t] = {} -- the rest is state
      for i=1,self.num_state do table.insert(self.state[t], out[i]) end
      -- remember the last time step actually processed for this batch
      self.tmax = t
    end
  end

  return self.output
end
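
A quick shape check of the forward pass (a sketch with hypothetical sizes, continuing from the lm constructed earlier):

local N = 4
local imgs = torch.randn(N, lm.input_encoding_size)                       -- N x K image codes
local seq  = torch.LongTensor(lm.seq_length, N):random(1, lm.vocab_size)  -- D x N token indices
local logprobs = lm:forward{imgs, seq}
print(logprobs:size())                                                    -- (D+2) x N x (M+1)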

updateGradInput(input, gradOutput) is likewise overridden.

-- D is seq_length, M is vocab_size, N is batch_size
--[[
gradOutput is an (D+2)xNx(M+1) Tensor.
--]]
function layer:updateGradInput(input, gradOutput)
  local dimgs -- grad on input images
  -- go backwards and lets compute gradients
  local dstate = {[self.tmax] = self.init_state} -- this works when init_state is all zeros
  -- note that self.tmax is at most seq_length+2
  for t=self.tmax,1,-1 do
    -- concat state gradients and output vector gradients at time step t
    local dout = {}
    -- gather the incoming gradients into dout: the cell-state gradients, the hidden-state gradients,
    -- and the gradient w.r.t. the output vector
    for k=1,#dstate[t] do table.insert(dout, dstate[t][k]) end
    table.insert(dout,gradOutput[t])
    -- dout is a table here because the LSTM core is an nngraph gModule with multiple outputs
    -- (I initially assumed a plain nn.Module and got tripped up); a fresh dout table is built at every step
    local dinputs = self.clones[t]:backward(self.inputs[t], dout)
    -- split the gradient to xt and to state
    -- dinputs likewise holds three kinds of gradients: w.r.t. xt, w.r.t. prev_c and w.r.t. prev_h
    local dxt = dinputs[1] -- first element is the input vector
    dstate[t-1] = {} -- copy over rest to state grad
    -- store the state gradients to pass on to the previous time step
    for k=2,self.num_state+1 do table.insert(dstate[t-1], dinputs[k]) end
    -- continue backprop of xt
    if t == 1 then
      -- propagate the gradient back to the image
      dimgs = dxt
    else
      -- propagate the gradient back into the lookup_table layer
      local it = self.lookup_tables_inputs[t]
      self.lookup_tables[t]:backward(it, dxt) -- backprop into lookup table
    end
  end
  -- we have gradient on image, but for LongTensor gt sequence we only create an empty tensor - can't backprop
  -- only the gradient w.r.t. the image is returned; the gradient cannot be propagated into the LongTensor of token indices
  self.gradInput = {dimgs, torch.Tensor()}
  return self.gradInput
end
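
And the matching interface check for the backward pass (sketch; it reuses imgs and seq from the forward check above, since backward relies on the state cached by the last forward; in real training the gradient comes from the criterion rather than torch.randn):

local dlogprobs = torch.randn(lm.seq_length + 2, 4, lm.vocab_size + 1)  -- same shape as the forward output
local gradInput = lm:backward({imgs, seq}, dlogprobs)
print(gradInput[1]:size())      -- N x input_encoding_size: gradient w.r.t. the image codes
print(gradInput[2]:nElement())  -- 0: no gradient for the LongTensor of token indices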

The nn.LanguageModelCriterion class scores the final output of the language model; it inherits from nn.Criterion. First, a word on how the model is evaluated: at every time step the model outputs a vector of size vocab_size+1 containing log-probabilities (the output of a LogSoftMax, so every element is <= 0); the index of the largest element is the predicted word index, i.e. the word the model predicts. The criterion adds the negative log-probability at the correct index to the total loss and sets the gradient at that position to -1 (later normalized by the number of predictions made).

function crit:updateOutput(input, seq)
  -- reset the gradInput tensor to zeros
  self.gradInput:resizeAs(input):zero() -- reset to zeros
  -- read the input dimensions: L is seq_length+2, N is batch_size, Mp1 is vocab_size+1
  local L,N,Mp1 = input:size(1), input:size(2), input:size(3)
  -- D is seq_length
  local D = seq:size(1)
  assert(D == L-2, 'input Tensor should be 2 larger in time')
  local loss = 0
  local n = 0
  for b=1,N do -- iterate over batches
    local first_time = true
    for t=2,L do -- iterate over sequence time (ignore t=1, dummy forward for the image)
      -- fetch the index of the next token in the sequence
      -- index of the target token
      local target_index
      if t-1 > D then -- we are out of bounds of the index sequence: pad with null tokens
        target_index = 0
      else
        -- fetch the target index; the author's comment explains why it is t-1
        target_index = seq[{t-1,b}] -- t-1 is correct, since at t=2 START token was fed in and we want to predict first word (and 2-1 = 1).
      end
      -- the first time we see null token as next index, actually want the model to predict the END token
      if target_index == 0 and first_time then
        -- this marks the END token
        target_index = Mp1
        first_time = false
      end

      -- if there is a non-null next token, enforce loss!
      if target_index ~= 0 then
        -- accumulate loss
        -- input[{t,b,target_index}] is the same as input[t][b][target_index]
        loss = loss - input[{ t,b,target_index }] -- log(p)
        -- set the gradient at the target position to -1; everything else stays 0
        self.gradInput[{ t,b,target_index }] = -1
        n = n + 1
      end

    end
  end
  -- final loss
  self.output = loss / n -- normalize by number of predictions that were made
  self.gradInput:div(n)
  return self.output
end
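
Putting the two classes together, one training step looks roughly like this (a sketch; in the real code this logic lives in train.lua, and imgs/seq come from the data loader):

local logprobs  = lm:forward{imgs, seq}              -- (D+2) x N x (M+1) log-probabilities
local loss      = crit:forward(logprobs, seq)        -- average negative log-likelihood per predicted token
local dlogprobs = crit:backward(logprobs, seq)       -- -1/n at every target position, 0 elsewhere
local dinputs   = lm:backward({imgs, seq}, dlogprobs)
-- dinputs[1] is the gradient w.r.t. the image codes, which train.lua then pushes back into the CNN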

That's it for LanguageModel.lua. Reading this code has been slow going, mainly because I'm new to Torch and Lua and have been learning them as I read; my ultimate goal is still to run my own experiments. Once I finish the write-up of the last module, train.lua, I'll go quiet for a few months, with only occasional posts on the latest developments in the image captioning field and the progress of the leading groups. Keep going, Fighting!
