语音识别系列4--语音识别CTC之模型训练源码解析

一、介绍

上一节我们简单介绍了CTC及数据准备过程,做好了数据准备,本节我们介绍CTC模型训练及源码解析。

CTC(Connectionist Temporal Classification)连接时间分类,直观上理解,循环神经网络(RNN)更适合于CTC训练,关于CTC的原理上的介绍,大家已经写得很多了,本节我们主要从代码着手,帮助大家从零搭建CTC-ASR训练系统。既然是系统,我们就让代码的扩展性更强一些,我们现在支持LSTM网络结构。

 

二、训练源码及解析

2.1配置文件(config-lstm.yml):

param: #配置参数
  num_classes: 219 #我们使用声韵母建模,音素总个数为219个(不含blank,模型内部会自动加1)
  encoder_type: lstm #网络结构使用LSTM结构
  input_size: 40 #输入我们使用40维的MFCC
  left_context: 10 #输入左边拼帧10帧
  right_context: 10 #输入右边拼帧10帧
  num_units: 512 #隐层单元个数
  num_layers: 4 #隐层数
  lstm_impl: BasicLSTMCell #LSTM结构类型
  use_peephole: True #LSTM结构是否使用peephole连接(仅对LSTMCell和LSTMBlockCell生效,BasicLSTMCell不使用该参数)
  weight_init: 0.1 #初始化参数
  clip_grad_norm: 5.0 #梯度裁剪的范数阈值
  clip_activation: 50 #激活函数截断参数
  num_proj: 256 #映射层维数(仅lstm_impl为LSTMCell时生效,0表示不使用映射层)
  weight_decay: 0 #正则化系数
  train_data_size: 3000 #训练数据量
  label_type: monophone #建模单元类型
  optimizer: adam #使用的优化器
  learning_rate: 0.0001 #初始学习率
  dropout: 0.8 #dropout的结点保留概率(训练时作为keep_prob传入网络)
  bottleneck_dim: 0 #瓶颈层维数
  train_data_file: ./data/th30h.tfrecords #训练数据及标签
  label_file: ./data/dict.txt #音素对应的字典
  beam_width: 1 #解码beam宽度
  batch_size: 32 #更新一次参数batch大小
  print_step: 50 #保存模型的频率,50次迭代保存一次模型
  num_epoch: 6 #数据迭代轮数

2.2网络结构文件(lstm.py):

# -*- coding: utf-8 -*-

"""Unidirectional LSTM encoder."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf

# LSTM encoder supporting the BasicLSTMCell, LSTMCell and LSTMBlockCell
# implementations. It is called an "encoder" because it encodes the speech
# features on their way to the classification labels.
class LSTMEncoder(object):
  """Unidirectional LSTM encoder.

  Args:
    num_units (int): number of units in each layer
    num_proj (int): number of units in the projection layer. It is only
      kept when `lstm_impl` is 'LSTMCell'; for every other implementation
      it is stored as None.
    num_layers (int): number of stacked layers
    lstm_impl (string): which LSTM implementation to build
      - BasicLSTMCell: tf.contrib.rnn.BasicLSTMCell (no peephole)
      - LSTMCell: tf.contrib.rnn.LSTMCell
      - LSTMBlockCell: tf.contrib.rnn.LSTMBlockCell
    use_peephole (bool): whether to use peephole connections; forwarded
      only to the LSTMCell and LSTMBlockCell builders
    parameter_init (float): variables are initialised uniformly in
      `[-parameter_init, parameter_init]`
    clip_activation (float): cell clipping value (> 0); forwarded only to
      the LSTMCell builder
    time_major (bool, optional): whether to compute in time-major order
    name (string, optional): name of the network
  """

  # The implementations __call__ knows how to build.
  SUPPORTED_IMPLS = ('BasicLSTMCell', 'LSTMCell', 'LSTMBlockCell')

  def __init__(self,
               num_units,
               num_proj,
               num_layers,
               lstm_impl,
               use_peephole,
               parameter_init,
               clip_activation,
               time_major=False,
               name='lstm_encoder'):
    self.num_units = num_units
    # Only LSTMCell is built with a projection layer.
    self.num_proj = num_proj if lstm_impl == 'LSTMCell' else None
    self.num_layers = num_layers
    self.lstm_impl = lstm_impl
    self.use_peephole = use_peephole
    self.parameter_init = parameter_init
    self.clip_activation = clip_activation
    self.time_major = time_major
    self.name = name

  # Makes the encoder instance callable.
  def __call__(self, inputs, inputs_seq_len, keep_prob, is_training):
    """Construct model graph.

    Args:
      inputs (placeholder): A tensor of size `[B, T, input_size]`
      inputs_seq_len (placeholder): A tensor of size `[B]`
      keep_prob (placeholder, float): A probability to keep nodes
        (applied to each layer's output by the cell builders)
      is_training (bool): accepted for interface compatibility; not used
        by this encoder
    Returns:
      outputs: Encoder states.
        if time_major is True, a tensor of size
          `[T, B, num_units (num_proj)]`
        otherwise, `[B, T, num_units (num_proj)]`
      final_state: A final hidden state of the encoder
    Raises:
      IndexError: if `lstm_impl` is not one of SUPPORTED_IMPLS. (The
        exception type is kept for backward compatibility.)
    """
    # Fail fast, before any op is added to the graph.
    if self.lstm_impl not in self.SUPPORTED_IMPLS:
      raise IndexError(
        'lstm_impl must be one of "BasicLSTMCell", "LSTMCell" or '
        '"LSTMBlockCell", you provided "%s".' % (self.lstm_impl,))

    initializer = tf.random_uniform_initializer(
      minval=-self.parameter_init, maxval=self.parameter_init)

    if self.lstm_impl == 'BasicLSTMCell':
      outputs, final_state = basiclstmcell(
        self.num_units, self.num_layers,
        inputs, inputs_seq_len, keep_prob, initializer,
        self.time_major)
    elif self.lstm_impl == 'LSTMCell':
      outputs, final_state = lstmcell(
        self.num_units, self.num_proj, self.num_layers,
        self.use_peephole, self.clip_activation,
        inputs, inputs_seq_len, keep_prob, initializer,
        self.time_major)
    else:  # 'LSTMBlockCell'
      outputs, final_state = lstmblockcell(
        self.num_units, self.num_layers,
        self.use_peephole,
        inputs, inputs_seq_len, keep_prob, initializer,
        self.time_major)
    return outputs, final_state

# Stacked BasicLSTMCell network.
def basiclstmcell(num_units, num_layers, inputs, inputs_seq_len,
                  keep_prob, initializer, time_major):
  """Run `num_layers` stacked BasicLSTMCell layers over `inputs`.

  Args:
    num_units (int): units per layer
    num_layers (int): number of stacked layers
    inputs: batch-major input tensor (transposed with perm [1, 0, 2] when
      `time_major` is True, so it is expected to be rank 3)
    inputs_seq_len: per-example sequence lengths passed to dynamic_rnn
    keep_prob: keep probability applied to every layer's output
    initializer: default initializer of the 'multi_lstm' variable scope
    time_major (bool): run dynamic_rnn in time-major order
  Returns:
    (outputs, final_state) exactly as returned by tf.nn.dynamic_rnn
  """
  # dynamic_rnn is told `time_major` below, so feed it time-major data.
  rnn_inputs = tf.transpose(inputs, [1, 0, 2]) if time_major else inputs

  def _dropout_layer():
    cell = tf.contrib.rnn.BasicLSTMCell(
      num_units,
      forget_bias=1.0,
      state_is_tuple=True,
      activation=tf.tanh)
    # Dropout on the layer output.
    return tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=keep_prob)

  with tf.variable_scope('multi_lstm', initializer=initializer) as scope:
    layers = [_dropout_layer() for _ in range(num_layers)]
    # Stack the layers into one multi-layer cell.
    stack = tf.contrib.rnn.MultiRNNCell(layers, state_is_tuple=True)
    outputs, final_state = tf.nn.dynamic_rnn(
      cell=stack,
      inputs=rnn_inputs,
      sequence_length=inputs_seq_len,
      dtype=tf.float32,
      time_major=time_major,
      scope=scope)
  return outputs, final_state

# Stacked standard LSTMCell network (peephole / cell clipping / projection).
def lstmcell(num_units, num_proj, num_layers, use_peephole, clip_activation,
             inputs, inputs_seq_len, keep_prob, initializer, time_major):
  """Run `num_layers` stacked LSTMCell layers over `inputs`.

  Args:
    num_units (int): units per layer
    num_proj (int or None): projection size passed to every LSTMCell
    num_layers (int): number of stacked layers
    use_peephole (bool): passed as LSTMCell `use_peepholes`
    clip_activation (float or None): passed as LSTMCell `cell_clip`
    inputs: batch-major input tensor (transposed with perm [1, 0, 2] when
      `time_major` is True, so it is expected to be rank 3)
    inputs_seq_len: per-example sequence lengths passed to dynamic_rnn
    keep_prob: keep probability applied to every layer's output
    initializer: default initializer of the 'multi_lstm' variable scope
    time_major (bool): run dynamic_rnn in time-major order
  Returns:
    (outputs, final_state) exactly as returned by tf.nn.dynamic_rnn
  """
  if time_major:
    # Convert from batch-major to time-major.
    inputs = tf.transpose(inputs, [1, 0, 2])

  with tf.variable_scope('multi_lstm', initializer=initializer) as scope:
    cells = []
    remaining = num_layers
    while remaining > 0:
      base_cell = tf.contrib.rnn.LSTMCell(
        num_units,
        use_peepholes=use_peephole,
        cell_clip=clip_activation,
        num_proj=num_proj,
        forget_bias=1.0,
        state_is_tuple=True)
      # Dropout on the layer output.
      cells.append(tf.contrib.rnn.DropoutWrapper(
        base_cell, output_keep_prob=keep_prob))
      remaining -= 1

    # Stack the layers into one multi-layer cell and unroll it.
    rnn_result = tf.nn.dynamic_rnn(
      cell=tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=True),
      inputs=inputs,
      sequence_length=inputs_seq_len,
      dtype=tf.float32,
      time_major=time_major,
      scope=scope)
  outputs, final_state = rnn_result
  return outputs, final_state

# Stacked LSTMBlockCell network.
def lstmblockcell(num_units, num_layers, use_peephole, inputs,
                  inputs_seq_len, keep_prob, initializer, time_major):
  """Run `num_layers` stacked LSTMBlockCell layers over `inputs`.

  Args:
    num_units (int): units per layer
    num_layers (int): number of stacked layers
    use_peephole (bool): passed as LSTMBlockCell `use_peephole`
    inputs: batch-major input tensor (transposed with perm [1, 0, 2] when
      `time_major` is True, so it is expected to be rank 3)
    inputs_seq_len: per-example sequence lengths passed to dynamic_rnn
    keep_prob: keep probability applied to every layer's output
    initializer: default initializer of the 'multi_lstm' variable scope
    time_major (bool): run dynamic_rnn in time-major order
  Returns:
    (outputs, final_state) exactly as returned by tf.nn.dynamic_rnn
  """
  rnn_inputs = inputs
  if time_major:
    # Convert from batch-major to time-major.
    rnn_inputs = tf.transpose(rnn_inputs, [1, 0, 2])

  with tf.variable_scope('multi_lstm', initializer=initializer) as scope:
    wrapped_cells = [
      tf.contrib.rnn.DropoutWrapper(
        tf.contrib.rnn.LSTMBlockCell(
          num_units, forget_bias=1.0, use_peephole=use_peephole),
        output_keep_prob=keep_prob)
      for _ in range(num_layers)
    ]
    multi_cell = tf.contrib.rnn.MultiRNNCell(
      wrapped_cells, state_is_tuple=True)
    outputs, final_state = tf.nn.dynamic_rnn(
      cell=multi_cell,
      inputs=rnn_inputs,
      sequence_length=inputs_seq_len,
      dtype=tf.float32,
      time_major=time_major,
      scope=scope)
  return outputs, final_state

2.3选择网络结构(choose_encoder.py):

# -*- coding: utf-8 -*-

"""Select & load encoder."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from lstm import LSTMEncoder

# Registry mapping an `encoder_type` string to the encoder class it selects.
ENCODERS = { 
  "lstm": LSTMEncoder,
}

# Select the model structure; only the LSTM structure is registered here.
def load(encoder_type):
  """Look up an encoder class by name.

  Args:
    encoder_type (string): one of the keys of ENCODERS
  Returns:
    The encoder class registered under `encoder_type` (the class itself,
    not an instance; the caller instantiates it)
  Raises:
    ValueError: if `encoder_type` is not a key of ENCODERS
  """
  if encoder_type in ENCODERS:
    return ENCODERS[encoder_type]

  known_names = ", ".join(ENCODERS)
  raise ValueError(
    "encoder_type should be one of [%s], you provided %s." %
    (known_names, encoder_type))

2.4基本的工具函数(basic_util.py):

# -*- coding: utf-8 -*-

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
from os.path import join, isdir
#这里是一些工具函数

# Create a directory.
def mkdir(path_to_dir):
  """Create `path_to_dir` (and missing parents) unless it already exists.

  Args:
    path_to_dir (string or None): directory to create; None is a no-op
  Returns:
    `path_to_dir`, unchanged
  """
  if path_to_dir is None:
    return path_to_dir
  if isdir(path_to_dir):
    return path_to_dir
  os.makedirs(path_to_dir)
  return path_to_dir

# Create nested sub-directories.
def mkdir_join(path_to_dir, *dir_name):
  """Join `dir_name` components onto `path_to_dir`, creating directories.

  A component containing '.' is only joined onto the path, never passed
  to mkdir; every other component is joined and then passed to mkdir.

  Args:
    path_to_dir (string or None): base path; None is returned as is
    *dir_name (string): path components appended in order
  Returns:
    The fully joined path (or None when `path_to_dir` is None)
  """
  if path_to_dir is None:
    return path_to_dir
  for component in dir_name:
    joined = join(path_to_dir, component)
    path_to_dir = joined if '.' in component else mkdir(joined)
  return path_to_dir

# Count the total number of parameters.
def count_total_parameters(variables):
  """Count the elements of each variable and of all variables together.

  Args:
    variables: iterable of objects exposing `.name` and `.get_shape()`,
      where iterating the shape yields dimensions with a `.value`
  Returns:
    parameters_dict (dict): variable name -> number of elements
    total_parameters (int): sum of the element counts of all variables
  """
  sizes_by_name = {}
  grand_total = 0
  for var in variables:
    # Product of all dimensions; an empty shape counts as one element.
    size = 1
    for dim in var.get_shape():
      size = size * dim.value
    sizes_by_name[var.name] = size
    grand_total = grand_total + size
  return sizes_by_name, grand_total

2.5 模型基类(model_base.py):

# -*- coding: utf-8 -*-

"""Base class for all models."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf

# Supported optimizers: lower-case name -> tf.train optimizer class.
# "momentum" and "nestrov" both map to MomentumOptimizer; ModelBase._set_optimizer
# tells them apart by passing use_nesterov=True for "nestrov".
# NOTE: the key is spelled "nestrov" (not "nesterov"); configs must use that spelling.
OPTIMIZER_CLS_NAMES = { 
  "adagrad": tf.train.AdagradOptimizer,
  "adadelta": tf.train.AdadeltaOptimizer,
  "adam": tf.train.AdamOptimizer,
  "rmsprop": tf.train.RMSPropOptimizer,
  "sgd": tf.train.GradientDescentOptimizer,
  "momentum": tf.train.MomentumOptimizer,
  "nestrov": tf.train.MomentumOptimizer
}

# Base class of all models.
class ModelBase(object):
  """Common optimizer / training-op plumbing shared by all models.

  Subclasses implement `_build`, `create_placeholders` and `compute_loss`
  and must set `self.clip_grad_norm` (a positive float, or None to disable
  gradient clipping); this base class does not set it.
  """

  def __init__(self, *args, **kwargs):
    pass

  def _build(self, *args, **kwargs):
    """Construct model graph."""
    raise NotImplementedError

  def create_placeholders(self):
    """Create placeholders and append them to list."""
    raise NotImplementedError

  def compute_loss(self, *args, **kwargs):
    """Operation for computing loss."""
    raise NotImplementedError

  def _add_noise_to_inputs(self, inputs, stddev=0.075):
    """Add gaussian noise to the inputs (not implemented).
    Args:
      inputs: the noise free input-features.
      stddev (float, optional): The standard deviation of the noise.
        Default is 0.075.
    Returns:
      inputs: Input features plus noise.
    """
    raise NotImplementedError

  # Declared without `self` in its parameter list, so it is a static method;
  # this keeps the parameter list unchanged while making calls through an
  # instance bind the arguments correctly.
  @staticmethod
  def _add_noise_to_gradients(grads_and_vars, gradient_noise_scale,
                              stddev=0.075):
    """Add scaled zero-mean normal noise to gradients (not implemented).
    Args:
      grads_and_vars:
      gradient_noise_scale:
      stddev (float):
    Returns:
    """
    raise NotImplementedError

  # Set the optimizer.
  def _set_optimizer(self, optimizer, learning_rate):
    """Instantiate an optimizer by name.
    Args:
      optimizer (string): the name of the optimizer in
        OPTIMIZER_CLS_NAMES (matched case-insensitively)
      learning_rate (float): A learning rate
    Returns:
      optimizer: an instance of the selected tf.train optimizer
    Raises:
      ValueError: if the name is not a key of OPTIMIZER_CLS_NAMES
    """
    optimizer = optimizer.lower()
    if optimizer not in OPTIMIZER_CLS_NAMES:
      raise ValueError(
        "Optimizer name should be one of [%s], you provided %s." %
        (", ".join(OPTIMIZER_CLS_NAMES), optimizer))

    # The two momentum variants need extra constructor arguments.
    if optimizer == 'momentum':
      return OPTIMIZER_CLS_NAMES[optimizer](
        learning_rate=learning_rate,
        momentum=0.9)
    elif optimizer == 'nestrov':
      return OPTIMIZER_CLS_NAMES[optimizer](
        learning_rate=learning_rate,
        momentum=0.9,
        use_nesterov=True)
    else:
      return OPTIMIZER_CLS_NAMES[optimizer](
        learning_rate=learning_rate)

  def train(self, loss, optimizer, learning_rate):
    """Operation for training. Only single-GPU training is supported.
    Args:
      loss: An operation for computing loss
      optimizer (string): name of the optimizer in OPTIMIZER_CLS_NAMES
      learning_rate (placeholder): A learning rate
    Returns:
      train_op: operation for training
    """
    # Create a variable to track the global step
    global_step = tf.Variable(0, name='global_step', trainable=False)

    # Set optimizer
    self.optimizer = self._set_optimizer(optimizer, learning_rate)

    if self.clip_grad_norm is not None:
      # Compute gradients
      grads_and_vars = self.optimizer.compute_gradients(loss)

      # Clip gradients
      clipped_grads_and_vars = self._clip_gradients(grads_and_vars)

      # Apply the clipped gradients only after the ops registered in the
      # UPDATE_OPS collection have run.
      with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
        train_op = self.optimizer.apply_gradients(
          clipped_grads_and_vars,
          global_step=global_step)
    else:
      # Use the optimizer to apply the gradients that minimize the loss
      # and also increment the global step counter as a single training
      # step
      with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
        train_op = self.optimizer.minimize(
          loss, global_step=global_step)
    return train_op

  def _clip_gradients(self, grads_and_vars):
    """Clip each gradient by norm, dropping pairs whose gradient is None.
    Args:
      grads_and_vars (list): list of tuples of `(grads, vars)`
    Returns:
      clipped_grads_and_vars (list): list of tuple of
        `(clipped grads, vars)`. When `self.clip_grad_norm` is None the
        gradients are returned unclipped (None gradients still dropped).
    """
    kept = [(grad, var) for grad, var in grads_and_vars if grad is not None]

    # train() treats clip_grad_norm=None as "clipping disabled"; honour the
    # same convention here instead of handing None to tf.clip_by_norm.
    if self.clip_grad_norm is None:
      return kept

    # Clip gradient norm
    return [(tf.clip_by_norm(grad, clip_norm=self.clip_grad_norm), var)
            for grad, var in kept]

2.6 CTC模型(model_ctc.py):

# -*- coding: utf-8 -*-
"""CTC model."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf
from model_base import ModelBase
from choose_encoder import load

class CTC(ModelBase):
  """Connectionist Temporal Classification (CTC) network.
  Args:
    encoder_type (string): The type of an encoder
      lstm: Unidirectional LSTM
    input_size (int): the dimensions of input vectors
    num_units (int): the number of units in each layer
    num_layers (int): the number of layers
    num_classes (int): the number of classes of target labels
      (except for a blank label; one extra class is added for it)
    lstm_impl (string, optional): a base implementation of LSTM.
      - BasicLSTMCell: tf.contrib.rnn.BasicLSTMCell (no peephole)
      - LSTMCell: tf.contrib.rnn.LSTMCell
      - LSTMBlockCell: tf.contrib.rnn.LSTMBlockCell
      Default is LSTMBlockCell.
    use_peephole (bool, optional): if True, use peephole connection.
    left_context (int, optional): the number of left context to slice
    right_context (int, optional): the number of right context to slice
    parameter_init (float, optional): the range of uniform distribution to
      initialize weight parameters (>= 0)
    clip_grad_norm (float, optional): the range of clipping of gradient
      norm (> 0)
    clip_activation (float, optional): the range of clipping of cell
      activation (> 0).
    num_proj (int, optional): the number of nodes in the projection layer.
      None and 0 both mean "no projection layer".
    weight_decay (float, optional): a parameter for weight decay
    bottleneck_dim (int, optional): the dimensions of the bottleneck layer
      (None and 0 both disable it)
    time_major (bool, optional): if True, time-major computation will be
      performed
  """
  def __init__(self,
               encoder_type,
               input_size,
               num_units,
               num_layers,
               num_classes,
               lstm_impl='LSTMBlockCell',
               use_peephole=True,
               left_context=10,
               right_context=10,
               parameter_init=0.1,
               clip_grad_norm=None,
               clip_activation=None,
               num_proj=None,
               weight_decay=0.0,
               bottleneck_dim=None,
               time_major=True):
    super(CTC, self).__init__()
    if clip_grad_norm is not None:
      assert float(clip_grad_norm) > 0, 'clip_grad_norm must be larger than 0.'
    assert float(weight_decay) >= 0, 'weight_decay must not be a negative value.'

    self.encoder_type = encoder_type
    self.input_size = input_size
    self.left_context = left_context
    self.right_context = right_context
    self.num_units = num_units
    # Check for None before converting: the default num_proj=None must not
    # reach int(), which would raise TypeError.
    if num_proj is None or int(num_proj) == 0:
      self.num_proj = None
    else:
      self.num_proj = int(num_proj)
    self.num_layers = num_layers
    self.bottleneck_dim = bottleneck_dim
    # Add one class for the CTC blank label.
    self.num_classes = num_classes + 1
    self.lstm_impl = lstm_impl
    self.use_peephole = use_peephole

    # Regularization
    self.parameter_init = parameter_init
    self.clip_grad_norm = clip_grad_norm
    self.clip_activation = clip_activation
    self.weight_decay = weight_decay

    # Summaries for TensorBoard
    self.summaries_train = []
    self.summaries_dev = []

    # Placeholders (one entry is appended per create_placeholders() call)
    self.inputs_pl_list = []
    self.labels_pl_list = []
    self.inputs_seq_len_pl_list = []
    self.keep_prob_pl_list = []

    self.time_major = time_major
    self.name = encoder_type + '_ctc'

    if encoder_type in ['lstm']:
      self.encoder = load(encoder_type)(
        num_units=num_units,
        num_proj=self.num_proj,
        num_layers=num_layers,
        lstm_impl=lstm_impl,
        use_peephole=use_peephole,
        parameter_init=parameter_init,
        clip_activation=clip_activation,
        time_major=time_major)
    else:
      raise NotImplementedError

  def _build(self, inputs, inputs_seq_len, keep_prob, is_training):
    """Construct model graph.
    Args:
      inputs: A tensor of size `[B, T, input_size]`
      inputs_seq_len (placeholder): A tensor of size `[B]`
      keep_prob (placeholder, float): A probability to keep nodes
        in the hidden-hidden connection
      is_training (bool):
    Returns:
      logits: A tensor of size `[T, B, num_classes]`
    """
    # inputs: `[B, T, input_size]`
    batch_size = tf.shape(inputs)[0]
    max_time = tf.shape(inputs)[1]
    encoder_outputs, final_state = self.encoder(
      inputs, inputs_seq_len, keep_prob, is_training)
    self.encoder_outputs = encoder_outputs
    # Reshape to apply the same weights over the timesteps
    output_dim = encoder_outputs.shape.as_list()[-1]
    outputs_2d = tf.reshape(encoder_outputs, shape=[batch_size * max_time, output_dim])
    if self.bottleneck_dim is not None and self.bottleneck_dim != 0:
      with tf.variable_scope('bottleneck') as scope:
        outputs_2d = tf.contrib.layers.fully_connected(
          outputs_2d,
          num_outputs=self.bottleneck_dim,
          activation_fn=tf.nn.relu,
          weights_initializer=tf.truncated_normal_initializer(stddev=self.parameter_init),
          biases_initializer=tf.zeros_initializer(),
          scope=scope)
      # Dropout for the hidden-output connections
      outputs_2d = tf.nn.dropout(
        outputs_2d, keep_prob, name='dropout_bottleneck')
    with tf.variable_scope('output') as scope:
      logits_2d = tf.contrib.layers.fully_connected(
        outputs_2d,
        num_outputs=self.num_classes,
        activation_fn=None,
        weights_initializer=tf.truncated_normal_initializer(
          stddev=self.parameter_init),
        biases_initializer=tf.zeros_initializer(),
        scope=scope)
      if self.time_major:
        # The encoder output was already time-major, so the flattened rows
        # are ordered time-first: reshape straight to `[T, B, num_classes]`.
        logits = tf.reshape(logits_2d, shape=[max_time, batch_size, self.num_classes])
      else:
        # Reshape back to the original batch-major shape
        logits = tf.reshape(logits_2d, shape=[batch_size, max_time, self.num_classes])
        # Convert to time-major: `[T, B, num_classes]`
        logits = tf.transpose(logits, [1, 0, 2])
    return logits

  def create_placeholders(self):
    """Create one set of placeholders and append them to the lists.

    The input feature dimension is
    `input_size * (left_context + right_context + 1)`, i.e. the size of a
    spliced frame.
    """
    self.inputs_pl_list.append(
      tf.placeholder(tf.float32, shape=[None, None, self.input_size * (self.left_context + self.right_context + 1)],
                     name='input'))
    self.labels_pl_list.append(
      tf.SparseTensor(tf.placeholder(tf.int64, name='indices'),
                      tf.placeholder(tf.int32, name='values'),
                      tf.placeholder(tf.int64, name='shape')))
    self.inputs_seq_len_pl_list.append(
      tf.placeholder(tf.int32, shape=[None], name='inputs_seq_len'))
    self.keep_prob_pl_list.append(
      tf.placeholder(tf.float32, name='keep_prob'))

  def compute_loss(self, inputs, labels, inputs_seq_len,
                   keep_prob, scope=None, softmax_temperature=1,
                   is_training=True):
    """Operation for computing CTC loss.
    Args:
      inputs: A tensor of size `[B, T, input_size]`
      labels: A SparseTensor of target labels
      inputs_seq_len: A tensor of size `[B]`
      keep_prob (placeholder, float): A probability to keep nodes
        in the hidden-hidden connection
      scope (optional): A scope in the model tower; used to filter the
        'losses' collection when summing the total loss
      softmax_temperature (int, optional): the logits are divided by this
        value before the CTC loss is computed
      is_training (bool, optional):
    Returns:
      total_loss: operation for computing total ctc loss (ctc loss + L2).
        This is a single scalar tensor to minimize.
      logits: A tensor of size `[T, B, num_classes]`
    """
    # Build model graph
    logits = self._build(inputs, inputs_seq_len, keep_prob,
                         is_training=is_training)
    # Weight decay: L2 loss over every trainable variable whose name does
    # not contain 'bias'.
    if self.weight_decay > 0:
      with tf.name_scope("weight_decay_loss"):
        weight_sum = 0
        for var in tf.trainable_variables():
          if 'bias' not in var.name.lower():
            weight_sum += tf.nn.l2_loss(var)
        tf.add_to_collection('losses', weight_sum * self.weight_decay)
    with tf.name_scope("ctc_loss"):
      ctc_losses = tf.nn.ctc_loss(
        labels,
        logits / softmax_temperature,
        tf.cast(inputs_seq_len, tf.int32),
        preprocess_collapse_repeated=False,
        ctc_merge_repeated=True,
        ignore_longer_outputs_than_inputs=True,
        time_major=True)
      ctc_loss = tf.reduce_mean(ctc_losses, name='ctc_loss_mean')
      tf.add_to_collection('losses', ctc_loss)
    # Compute total loss
    total_loss = tf.add_n(tf.get_collection('losses', scope),
                          name='total_loss')
    # Add scalar summaries for the snapshot of loss.
    # NOTE(review): the total-loss summaries are only emitted when weight
    # decay is enabled (without it total_loss is just the CTC loss summed
    # from the collection) -- confirm this is intended.
    if self.weight_decay > 0:
      self.summaries_train.append(
        tf.summary.scalar('weight_loss_train',
                          weight_sum * self.weight_decay))
      self.summaries_dev.append(
        tf.summary.scalar('weight_loss_dev',
                          weight_sum * self.weight_decay))

      self.summaries_train.append(
        tf.summary.scalar('total_loss_train', total_loss))
      self.summaries_dev.append(
        tf.summary.scalar('total_loss_dev', total_loss))

    self.summaries_train.append(
      tf.summary.scalar('ctc_loss_train', ctc_loss))
    self.summaries_dev.append(
      tf.summary.scalar('ctc_loss_dev', ctc_loss))
    return total_loss, logits

  def decoder(self, logits, inputs_seq_len, beam_width=1):
    """Operation for decoding.
    Args:
      logits: A tensor of size `[T, B, num_classes]`
      inputs_seq_len: A tensor of size `[B]`
      beam_width (int, optional): beam width for beam search.
        1 disables beam search, which means greedy decoding.
    Return:
      decode_op: A SparseTensor (the first decoded path, cast to int32)
    """
    assert isinstance(beam_width, int), "beam_width must be integer."
    assert beam_width >= 1, "beam_width must be >= 1"

    if beam_width == 1:
      decoded, _ = tf.nn.ctc_greedy_decoder(logits, inputs_seq_len)
    else:
      decoded, _ = tf.nn.ctc_beam_search_decoder(
        logits, inputs_seq_len, beam_width=beam_width)
    decode_op = tf.to_int32(decoded[0])
    return decode_op

  def posteriors(self, logits, blank_prior=1):
    """Operation for computing posteriors of each time steps.
    Args:
      logits: A tensor of size `[T, B, num_classes]`
      blank_prior (float): A prior for the blank class.
        NOTE(review): currently unused -- the posteriors are NOT divided
        by this prior.
    Return:
      posteriors_op: softmax over classes, as a 2-D tensor of size
        `[B * T, num_classes]`
    """
    # Convert to batch-major: `[B, T, num_classes]`
    logits = tf.transpose(logits, (1, 0, 2))
    logits_2d = tf.reshape(logits, [-1, self.num_classes])
    posteriors_op = tf.nn.softmax(logits_2d)
    return posteriors_op

  def compute_ler(self, decode_op, labels):
    """Operation for computing LER (Label Error Rate).
    Args:
      decode_op: operation for decoding
      labels: A SparseTensor of target labels
    Return:
      ler_op: operation for computing LER
    """
    # Compute LER (normalize by label length)
    ler_op = tf.reduce_mean(tf.edit_distance(
      decode_op, labels, normalize=True))
    # Add a scalar summary for the snapshot of LER
    self.summaries_train.append(tf.summary.scalar('ler_train', ler_op))
    self.summaries_dev.append(tf.summary.scalar('ler_dev', ler_op))
    return ler_op

2.7 训练主程序(train_ctc.py):

# -*- coding: utf-8 -*-

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import math
import os
import shutil
import sys
import time
from os.path import join, isfile, abspath

import numpy as np
import tensorflow as tf
import yaml
from setproctitle import setproctitle
from tensorflow.python.framework import graph_util

from basic_util import count_total_parameters
from basic_util import mkdir_join, mkdir
from model_ctc import CTC

# Frame-splicing configuration used by general_frame.
left_context = 10   # frames of left context spliced onto each output frame
right_context = 10  # frames of right context spliced onto each output frame
skip = 4            # keep one spliced frame every `skip` input frames

def general_frame(feature, seq_len):
  """Splice context frames around every `skip`-th frame of `feature`.

  For each centre frame n in 0, skip, 2*skip, ... (< seq_len) the frames
  n-left_context .. n+right_context are concatenated into one row. Frame
  indices outside [0, seq_len-1] are clamped, i.e. the first / last valid
  frame is repeated at the edges.

  Args:
    feature (np.ndarray): 2-D array `[num_frames, feat_dim]`
    seq_len (int): number of valid frames in `feature`
  Returns:
    new_feature (np.ndarray, float32): array of size
      `[ceil(seq_len / skip), (left_context + right_context + 1) * feat_dim]`
    new_seq_len (np.int64): number of rows of `new_feature`; an explicit
      int64 so it matches the tf.int64 output declared by the tf.py_func
      caller on every platform / Python version
  """
  window = np.arange(-left_context, right_context + 1)
  centers = np.arange(0, seq_len, skip)
  # frame_idx[i, j] is the (clamped) source frame of the j-th context slot
  # of the i-th output frame.
  frame_idx = np.clip(centers[:, np.newaxis] + window[np.newaxis, :],
                      0, seq_len - 1)
  # Gather `[num_out, window, feat_dim]` and flatten the last two axes;
  # the explicit column count keeps the reshape valid for empty input.
  spliced = feature[frame_idx]
  new_feature = spliced.reshape(
    len(centers), window.size * feature.shape[1]).astype(np.float32)
  new_seq_len = np.int64(len(centers))
  return new_feature, new_seq_len

def parse_function(example_proto):
  """Decode one serialized example into `(feature, label, seq_len)`.

  The record holds raw bytes under 'feature' (decoded as float32 and
  reshaped to 40 columns) and 'label' (decoded as int64), plus an int64
  'seq_len'. The feature is then spliced / sub-sampled by general_frame.

  Args:
    example_proto: a serialized tf.Example
  Returns:
    feature: float32 tensor produced by general_frame
    label: int64 label tensor
    seq_len: int32 sequence length produced by general_frame
  """
  schema = {'feature': tf.VarLenFeature(tf.string),
            'label'  : tf.VarLenFeature(tf.string),
            'seq_len': tf.FixedLenFeature([], tf.int64)}
  parsed = tf.parse_single_example(example_proto, schema)

  # Only the first byte string of each field is decoded.
  feature_bytes = tf.sparse_tensor_to_dense(parsed['feature'], default_value=b'0.0')
  feature = tf.reshape(tf.decode_raw(feature_bytes[0], tf.float32), [-1, 40])

  label_bytes = tf.sparse_tensor_to_dense(parsed['label'], default_value=b'0')
  label = tf.decode_raw(label_bytes[0], tf.int64)

  # Context splicing / frame skipping runs in Python via py_func.
  feature, seq_len = tf.py_func(
    general_frame, [feature, parsed['seq_len']], [tf.float32, tf.int64])
  return feature, label, tf.cast(seq_len, tf.int32)

def dense_to_sparse(dense):
  """Convert a padded 2-D label batch into SparseTensor components.

  Each row is cut at the position of its minimum value after a -1 has
  been appended, i.e. at the first -1 padding entry when the real labels
  are non-negative.

  Args:
    dense (np.ndarray): 2-D array `[batch, max_label_len]` padded with -1
  Returns:
    indices (np.ndarray, int64): `[num_labels, 2]` (row, column) positions;
      always 2-D, also when the batch contains no labels at all
    values (np.ndarray, int32): `[num_labels]` label values
    shape (np.ndarray, int64): the shape of `dense`
  """
  indices = []
  values = []
  for row, seq in enumerate(dense):
    # The appended -1 guarantees a cut position even for unpadded rows.
    seq = np.append(seq, -1)
    seq = seq[:np.argmin(seq)]
    indices.extend((row, col) for col in range(len(seq)))
    values.extend(seq)
  # reshape keeps the (N, 2) layout a SparseTensor needs even when N == 0,
  # where np.asarray([]) alone would give shape (0,).
  indices = np.asarray(indices, dtype=np.int64).reshape(-1, 2)
  values = np.asarray(values, dtype=np.int32)
  shape = np.asarray(dense.shape, dtype=np.int64)
  return indices, values, shape

def average_gradients(tower_grads):
  """Average the gradient of every variable over all towers.

  Args:
    tower_grads (list): one list of `(gradient, variable)` pairs per
      tower; the pairs are matched position by position across towers
  Returns:
    list of `(mean gradient, variable)` pairs, where the variable is taken
    from the first tower
  """
  averaged = []
  for per_variable in zip(*tower_grads):
    # Stack the towers' gradients along a new leading axis, then reduce it.
    stacked = tf.concat(
      [tf.expand_dims(grad, 0) for grad, _ in per_variable], 0)
    mean_grad = tf.reduce_mean(stacked, 0)
    first_tower_var = per_variable[0][1]
    averaged.append((mean_grad, first_tower_var))
  return averaged

def do_train(model, params, gpu_indices):
  """Build the multi-tower training graph and run the training loop.

  Args:
    model: the CTC model; its `save_path` attribute must already be set,
      since summaries and checkpoints are written there
    params (dict): the 'param' section of the config; the keys read here
      are 'optimizer', 'beam_width', 'train_data_file', 'batch_size',
      'learning_rate', 'num_epoch', 'dropout' and 'print_step'
    gpu_indices (list): only its length is used; one tower is built per
      entry, on devices '/gpu:0' .. '/gpu:len-1'
  """
  # Build everything in a fresh graph. Ops are placed on the CPU unless a
  # tower below pins them to a GPU.
  with tf.Graph().as_default(), tf.device('/cpu:0'):
    global_step = tf.Variable(0, name='global_step', trainable=False)
    # Set optimizer; the learning rate is fed at run time through a
    # placeholder.
    learning_rate_pl = tf.placeholder(tf.float32, name='learning_rate')
    optimizer = model._set_optimizer(params['optimizer'], learning_rate_pl)
    # Per-tower gradients and losses
    total_grads_and_vars, total_losses = [], []
    decode_ops, ler_ops = [], []
    # Names of all devices (indexed by position, not by the values in
    # gpu_indices)
    all_devices = ['/gpu:%d' % i_gpu for i_gpu in range(len(gpu_indices))]
    with tf.variable_scope(tf.get_variable_scope()):
      for i_gpu in range(len(all_devices)):
        with tf.device(all_devices[i_gpu]):
          with tf.name_scope('tower_gpu%d' % i_gpu) as scope:
            # Define placeholders in each tower
            model.create_placeholders()
            tower_loss, tower_logits = model.compute_loss(
                   model.inputs_pl_list[i_gpu],
                   model.labels_pl_list[i_gpu],
                   model.inputs_seq_len_pl_list[i_gpu],
                   model.keep_prob_pl_list[i_gpu],
                   scope)
            tower_loss = tf.expand_dims(tower_loss, axis=0)
            total_losses.append(tower_loss)
            # Variables are created by the first tower only; from here on
            # the variable scope is switched to reuse them.
            tf.get_variable_scope().reuse_variables()
            tower_grads_and_vars = optimizer.compute_gradients(
              tower_loss)
            tower_grads_and_vars = model._clip_gradients(tower_grads_and_vars)
            total_grads_and_vars.append(tower_grads_and_vars)
            decode_op_tower = model.decoder(tower_logits, model.inputs_seq_len_pl_list[i_gpu], 
                              beam_width=params['beam_width'])
            decode_ops.append(decode_op_tower)
            ler_op_tower = model.compute_ler(decode_op_tower, model.labels_pl_list[i_gpu])
            ler_op_tower = tf.expand_dims(ler_op_tower, axis=0)
            ler_ops.append(ler_op_tower)
    # Average the loss over towers
    total_losses = tf.concat(axis=0, values=total_losses)
    loss_op = tf.reduce_mean(total_losses, axis=0)
    # Average the LER over towers
    ler_ops = tf.concat(axis=0, values=ler_ops)
    ler_op = tf.reduce_mean(ler_ops, axis=0)
    # Average the gradients over towers and apply them once
    average_grads_and_vars = average_gradients(total_grads_and_vars)
    train_op = optimizer.apply_gradients(average_grads_and_vars,global_step=global_step)

    summary_train = tf.summary.merge(model.summaries_train)
    summary_dev = tf.summary.merge(model.summaries_dev)
    # Initialize all variables at once
    init_op = tf.global_variables_initializer()
    saver = tf.train.Saver(max_to_keep=None)
    # Count and print the size of every trainable variable
    parameters_dict, total_parameters = count_total_parameters(tf.trainable_variables())

    for parameter_name in sorted(parameters_dict.keys()):
      print("%s %d" % (parameter_name, parameters_dict[parameter_name]))
    print("Total %d variables, %s M parameters" %  (len(parameters_dict.keys()),
           "{:,}".format(total_parameters / 1000000)))
    # Input pipeline: parse each record, shuffle, and pad every batch
    # (features with 0.0, labels with -1, which dense_to_sparse strips).
    train_dataset = tf.data.TFRecordDataset(params['train_data_file'])
    train_dataset = train_dataset.map(parse_function)
    train_dataset = train_dataset.shuffle(1000)
    train_dataset = train_dataset.padded_batch(params['batch_size'], padded_shapes=([None, None],
               [None], []), padding_values=(0.0, tf.cast(-1, tf.int64), tf.cast(0, tf.int32)))
    # One pass per iterator initialization; the epoch loop below
    # re-initializes the iterator.
    train_dataset = train_dataset.repeat(1)

    iterator = train_dataset.make_initializable_iterator()
    # Fetch one batch and turn the padded labels into a SparseTensor
    batch_feat, batch_label, batch_seq_len = iterator.get_next()
    s_indices, s_value, s_shape = tf.py_func(dense_to_sparse, [batch_label],  [tf.int64, tf.int32, tf.int64])
    batch_label = tf.SparseTensor(s_indices, s_value, s_shape)

    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True,log_device_placement=False)) as sess:
      summary_writer = tf.summary.FileWriter(model.save_path, sess.graph)
      sess.run(init_op)
      start_time_train = time.time()
      learning_rate = float(params['learning_rate'])
      step_id = 0
      last_id = 0
      # Find the checkpoint of the previous run and resume from it; the
      # step number is parsed from the text after the last '-' in the
      # checkpoint path.
      ckpt = tf.train.latest_checkpoint(model.save_path)
      print("===========================")
      print("ckpt:", ckpt)
      if ckpt != None:
        saver.restore(sess, ckpt)
        ind = ckpt.rfind("-")
        last_id = int(ckpt[ind + 1:])
        print("++++++++++++++++++++++++++++")
        print("last_id: ", last_id)
      print("===========================")
      for epoch in range(params['num_epoch']):
        start_time_epoch = time.time()
        sess.run(iterator.initializer)
        print("global_step: ", sess.run(global_step))
        try:
          while (True):
            start_time_step = time.time()
            # When resuming, only the step counter is advanced up to
            # last_id: no batch is fetched and no op is run for the
            # skipped steps.
            if step_id < last_id:
              step_id += 1
              print("===skip: ", step_id)
              sys.stdout.flush()
              continue
            feed_dict_train = {}
            # Fetch one batch per tower and fill that tower's placeholders
            for i_gpu in range(len(gpu_indices)):
              new_feat, new_label, new_seq_len = sess.run([batch_feat, batch_label, batch_seq_len])
              feed_dict_train[model.inputs_pl_list[i_gpu]] = new_feat
              feed_dict_train[model.labels_pl_list[i_gpu]] = new_label
              feed_dict_train[model.inputs_seq_len_pl_list[i_gpu]] = new_seq_len
              # params['dropout'] is fed as the keep probability
              feed_dict_train[model.keep_prob_pl_list[i_gpu]] = float(params['dropout'])
            feed_dict_train[learning_rate_pl] = learning_rate
            # One training step
            step_loss, step_ler, _, _ = sess.run([loss_op, 
                            ler_op, global_step, train_op], feed_dict=feed_dict_train)
            step_id += 1
            end_time_step = time.time()
            step_time = end_time_step - start_time_step
            # Print the result of this step
            print("batch: ", step_id, " loss: ", step_loss, " ler: ", step_ler, " time: ", step_time)
            sys.stdout.flush()
            # Every print_step steps: write a summary and save a checkpoint
            if step_id % params['print_step'] == 0:
              summary_str_train = sess.run(summary_train, feed_dict=feed_dict_train)
              summary_writer.add_summary(summary_str_train, step_id)
              summary_writer.flush()
              checkpoint_file = join(model.save_path, 'model.ckpt')
              save_path = saver.save(sess, checkpoint_file, global_step=step_id)
              print("Model saved in file: %s" % save_path)
              sys.stdout.flush()
        # OutOfRangeError is how the iterator signals that the dataset is
        # exhausted, i.e. the epoch is over; continue with the next epoch.
        except tf.errors.OutOfRangeError:
          end_time_epoch = time.time()
          epoch_time = end_time_epoch - start_time_epoch
          print("epoch: ", epoch, " end, use time: ", epoch_time)
          sys.stdout.flush()
      end_time_train = time.time()
      train_time = end_time_train - start_time_train
      print("train end, total time: ", train_time)
      sys.stdout.flush()
      summary_writer.close()

def main(config_path, model_save_path, gpu_indices):
  """Build the CTC model described by a YAML config and launch training.

  Args:
    config_path (string): path of the YAML config file; every
      hyper-parameter is read from its top-level 'param' section
    model_save_path (string): root directory under which the model
      directory is created
    gpu_indices (list): GPU indices, forwarded to do_train; when two or
      more are given, their count is appended to the model name
  """
  # Load the config file. safe_load only builds plain YAML types, which is
  # all this config needs; yaml.load without an explicit Loader can construct
  # arbitrary Python objects and is rejected by recent PyYAML releases.
  with open(config_path, "r") as f:
    config = yaml.safe_load(f)
  params = config['param']

  # Model setting
  model = CTC(encoder_type=params['encoder_type'],
              input_size=params['input_size'],
              left_context=params['left_context'],
              right_context=params['right_context'],
              num_units=params['num_units'],
              num_layers=params['num_layers'],
              num_classes=params['num_classes'],
              lstm_impl=params['lstm_impl'],
              use_peephole=params['use_peephole'],
              parameter_init=params['weight_init'],
              clip_grad_norm=params['clip_grad_norm'],
              clip_activation=params['clip_activation'],
              num_proj=params['num_proj'],
              weight_decay=params['weight_decay'])
  # Set process name (uses the base model name, before suffixes are added)
  setproctitle('tf' + model.name + '_' + str(params['train_data_size']) + '_' + params['label_type'])
  # Extend the model name with the main hyper-parameters
  model.name += '_' + str(params['num_units'])
  model.name += '_' + str(params['num_layers'])
  model.name += '_' + params['optimizer']
  model.name += '_lr' + str(params['learning_rate'])
  # Optional settings only show up in the name when they are enabled
  if params['num_proj'] != 0:
    model.name += '_proj' + str(params['num_proj'])
  if params['dropout'] != 0:
    model.name += '_drop' + str(params['dropout'])
  if params['weight_decay'] != 0:
    model.name += '_wd' + str(params['weight_decay'])
  if params['bottleneck_dim'] != 0:
    model.name += '_bottle' + str(params['bottleneck_dim'])
  if len(gpu_indices) >= 2:
    model.name += '_gpu' + str(len(gpu_indices))

  # Set save path: <root>/ctc/<label_type>/<train_data_size>/<model name>
  model.save_path = mkdir_join(
        model_save_path, 'ctc', params['label_type'],
        str(params['train_data_size']), model.name)

  # Reset model directory: a directory holding complete.txt belongs to a
  # finished run, so append an increasing index until an unused name is found.
  # A directory that only holds config.yml (unfinished run) is reused as is.
  model_index = 0
  new_model_path = model.save_path
  while isfile(join(new_model_path, 'complete.txt')):
    model_index += 1
    new_model_path = model.save_path + '_' + str(model_index)
  # Create the new model directory
  model.save_path = mkdir(new_model_path)

  # Save a copy of the config file next to the model
  shutil.copyfile(config_path, join(model.save_path, 'config.yml'))

  #sys.stdout = open(join(model.save_path, 'train.log'), 'w')
  # Start training
  do_train(model=model, params=params, gpu_indices=gpu_indices)

def parse_gpu_indices(args, default=(0,)):
  """Return the GPU indices requested on the command line.

  Args:
    args (list): argument vector laid out as
      [script, config file, model store path(, comma separated GPU indices)]
    default (tuple): indices used when the optional GPU argument is absent
  Returns:
    list of int: the GPU indices to train on
  """
  # The GPU list is optional; without this guard a three-argument call
  # would pass the usage check and then fail with an IndexError on args[3].
  if len(args) < 4:
    return list(default)
  return [int(index) for index in args[3].split(',')]

if __name__ == '__main__':
  args = sys.argv
  if len(args) != 3 and len(args) != 4:
    print("input config file, model store path, [0,1,2,3]")
    # sys.exit does not depend on the site module the way the bare exit() does
    sys.exit(-1)
  import tensorflow as tf
  # Select which GPUs are visible to the process
  os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2'
  main(config_path=args[1], model_save_path=args[2],
    gpu_indices=parse_gpu_indices(args))

三、结论

以上就是CTC的完整训练脚本。在3000小时数据上,该模型的字准确率达到了97%,句准确率达到了91%。

你可能感兴趣的:(tensorflow,asr,ctc)