1. Introduction
In the previous section we briefly introduced CTC and walked through data preparation. With the data ready, this section covers CTC model training and a walkthrough of the source code.
CTC (Connectionist Temporal Classification) learns sequence labelings without frame-level alignments, and recurrent neural networks (RNNs) are a natural fit for CTC training. The theory behind CTC has been written about extensively, so here we start from the code and build a CTC-ASR training system from scratch. Since it is meant to be a system, the code is written with extensibility in mind; for now it supports an LSTM network structure.
2. Training Source Code and Walkthrough
2.1 Configuration file (config-lstm.yml):
param:  # configuration parameters
  num_classes: 219  # we model initials/finals; 219 phones in total
  encoder_type: lstm  # network structure: LSTM
  input_size: 40  # input features: 40-dimensional MFCCs
  left_context: 10  # splice 10 frames of left context
  right_context: 10  # splice 10 frames of right context
  num_units: 512  # number of units per hidden layer
  num_layers: 4  # number of hidden layers
  lstm_impl: BasicLSTMCell  # LSTM cell implementation
  use_peephole: True  # whether the LSTM uses peephole connections
  weight_init: 0.1  # range for uniform parameter initialization
  clip_grad_norm: 5.0  # gradient-clipping norm
  clip_activation: 50  # cell-activation clipping threshold
  num_proj: 256  # projection-layer dimension (only used with LSTMCell)
  weight_decay: 0  # L2 regularization coefficient
  train_data_size: 3000  # amount of training data (hours)
  label_type: monophone  # type of modeling unit
  optimizer: adam  # optimizer
  learning_rate: 0.0001  # initial learning rate
  dropout: 0.8  # keep probability for dropout (fraction of units kept)
  bottleneck_dim: 0  # bottleneck-layer dimension (0 disables it)
  train_data_file: ./data/th30h.tfrecords  # training data and labels
  label_file: ./data/dict.txt  # phone dictionary
  beam_width: 1  # decoding beam width
  batch_size: 32  # batch size per parameter update
  print_step: 50  # checkpoint frequency: save the model every 50 steps
  num_epoch: 6  # number of training epochs
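Before moving on to the model code, here is a minimal sketch of how this YAML is consumed (the same pattern appears in train_ctc.py below); the arithmetic also shows the spliced input dimension the network will actually see:

import yaml

# Load config-lstm.yml and pull out the `param` dictionary
with open('config-lstm.yml', 'r') as f:
    config = yaml.safe_load(f)
params = config['param']

# Spliced input dimension: 40 MFCCs x (10 left + 1 current + 10 right)
input_dim = params['input_size'] * (params['left_context'] + params['right_context'] + 1)
print(input_dim)  # 840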
2.2 Network structure file (lstm.py):
# -*- coding: utf-8 -*-
"""Unidirectional LSTM encoder."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
# LSTM encoder; supports BasicLSTMCell, LSTMCell, and LSTMBlockCell.
# It is called an encoder because it encodes the speech features into
# a representation from which the classification labels are predicted.
class LSTMEncoder(object):
    """Unidirectional LSTM encoder.
    Args:
        num_units (int): number of units in each layer
        num_proj (int): number of nodes in the projection layer
        num_layers (int): number of layers
        lstm_impl (string, optional): implementation of the LSTM cell
            - BasicLSTMCell: tf.contrib.rnn.BasicLSTMCell, basic LSTM (no peephole)
            - LSTMCell: tf.contrib.rnn.LSTMCell, standard LSTM
            - LSTMBlockCell: tf.contrib.rnn.LSTMBlockCell, block LSTM
        use_peephole (bool): whether to use peephole connections
        parameter_init (float): range of the uniform distribution used to
            initialize the network parameters
        clip_activation (float): clipping threshold for the cell activation (> 0)
        time_major (bool, optional): whether to compute in time-major order
        name (string, optional): name of the encoder
    """
def __init__(self,
num_units,
num_proj,
num_layers,
lstm_impl,
use_peephole,
parameter_init,
clip_activation,
time_major=False,
name='lstm_encoder'):
self.num_units = num_units
if lstm_impl != 'LSTMCell':
self.num_proj = None
else:
self.num_proj = num_proj
self.num_layers = num_layers
self.lstm_impl = lstm_impl
self.use_peephole = use_peephole
self.parameter_init = parameter_init
self.clip_activation = clip_activation
self.time_major = time_major
self.name = name
    # Make instances callable: encoder(inputs, ...) builds the graph
def __call__(self, inputs, inputs_seq_len, keep_prob, is_training):
"""Construct model graph.
Args:
            inputs (placeholder): A tensor of size `[B, T, input_size]`
            inputs_seq_len (placeholder): A tensor of size `[B]`
keep_prob (placeholder, float): A probability to keep nodes
in the hidden-hidden connection
is_training (bool):
Returns:
outputs: Encoder states.
if time_major is True, a tensor of size
`[T, B, num_units (num_proj)]`
otherwise, `[B, T, num_units (num_proj)]`
final_state: A final hidden state of the encoder
"""
initializer = tf.random_uniform_initializer(
minval=-self.parameter_init, maxval=self.parameter_init)
if self.lstm_impl == 'BasicLSTMCell':
outputs, final_state = basiclstmcell(
self.num_units, self.num_layers,
inputs, inputs_seq_len, keep_prob, initializer,
self.time_major)
elif self.lstm_impl == 'LSTMCell':
outputs, final_state = lstmcell(
self.num_units, self.num_proj, self.num_layers,
self.use_peephole, self.clip_activation,
inputs, inputs_seq_len, keep_prob, initializer,
self.time_major)
elif self.lstm_impl == 'LSTMBlockCell':
outputs, final_state = lstmblockcell(
self.num_units, self.num_layers,
self.use_peephole,
inputs, inputs_seq_len, keep_prob, initializer,
self.time_major)
        else:
            raise ValueError(
                'lstm_impl must be one of "BasicLSTMCell", "LSTMCell" '
                'or "LSTMBlockCell".')
return outputs, final_state
# BasicLSTM network structure
def basiclstmcell(num_units, num_layers, inputs, inputs_seq_len,
keep_prob, initializer, time_major):
if time_major:
# Convert from batch-major to time-major
inputs = tf.transpose(inputs, [1, 0, 2])
lstm_list = []
with tf.variable_scope('multi_lstm', initializer=initializer) as scope:
for i_layer in range(1, num_layers + 1, 1):
lstm = tf.contrib.rnn.BasicLSTMCell(
num_units,
forget_bias=1.0,
state_is_tuple=True,
activation=tf.tanh)
# Dropout for the hidden-hidden connections
lstm = tf.contrib.rnn.DropoutWrapper(
lstm, output_keep_prob=keep_prob)
lstm_list.append(lstm)
# Stack multiple cells
stacked_lstm = tf.contrib.rnn.MultiRNNCell(
lstm_list, state_is_tuple=True)
        # Run the stacked cells; dynamic_rnn also returns the final state
outputs, final_state = tf.nn.dynamic_rnn(
cell=stacked_lstm,
inputs=inputs,
sequence_length=inputs_seq_len,
dtype=tf.float32,
time_major=time_major,
scope=scope)
return outputs, final_state
# Standard LSTM network structure (peephole and projection supported)
def lstmcell(num_units, num_proj, num_layers, use_peephole, clip_activation,
inputs, inputs_seq_len, keep_prob, initializer, time_major):
if time_major:
        # Convert from batch-major to time-major
inputs = tf.transpose(inputs, [1, 0, 2])
lstm_list = []
with tf.variable_scope('multi_lstm', initializer=initializer) as scope:
for i_layer in range(1, num_layers + 1, 1):
lstm = tf.contrib.rnn.LSTMCell(
num_units,
use_peepholes=use_peephole,
cell_clip=clip_activation,
num_proj=num_proj,
forget_bias=1.0,
state_is_tuple=True)
# Dropout for the hidden-hidden connections
lstm = tf.contrib.rnn.DropoutWrapper(
lstm, output_keep_prob=keep_prob)
lstm_list.append(lstm)
# Stack multiple cells
stacked_lstm = tf.contrib.rnn.MultiRNNCell(
lstm_list, state_is_tuple=True)
        # Run the stacked cells; dynamic_rnn also returns the final state
outputs, final_state = tf.nn.dynamic_rnn(
cell=stacked_lstm,
inputs=inputs,
sequence_length=inputs_seq_len,
dtype=tf.float32,
time_major=time_major,
scope=scope)
return outputs, final_state
# Block LSTM network structure
def lstmblockcell(num_units, num_layers, use_peephole, inputs,
inputs_seq_len, keep_prob, initializer, time_major):
if time_major:
inputs = tf.transpose(inputs, [1, 0, 2])
lstm_list = []
with tf.variable_scope('multi_lstm', initializer=initializer) as scope:
for i_layer in range(1, num_layers + 1, 1):
lstm = tf.contrib.rnn.LSTMBlockCell(
num_units, forget_bias=1.0,
use_peephole=use_peephole)
lstm = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)
lstm_list.append(lstm)
stacked_lstm = tf.contrib.rnn.MultiRNNCell(lstm_list, state_is_tuple=True)
        outputs, final_state = tf.nn.dynamic_rnn(
            cell=stacked_lstm,
            inputs=inputs,
            sequence_length=inputs_seq_len,
            dtype=tf.float32,
            time_major=time_major,
            scope=scope)
return outputs, final_state
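As a quick sanity check, here is a minimal sketch (shapes and hyperparameter values are illustrative, not from the project) that drives the encoder on random spliced features:

import numpy as np
import tensorflow as tf
from lstm import LSTMEncoder

# 2-layer LSTM with a 256-dim projection over 840-dim spliced inputs
encoder = LSTMEncoder(num_units=512, num_proj=256, num_layers=2,
                      lstm_impl='LSTMCell', use_peephole=True,
                      parameter_init=0.1, clip_activation=50,
                      time_major=False)
inputs = tf.placeholder(tf.float32, shape=[None, None, 840])
inputs_seq_len = tf.placeholder(tf.int32, shape=[None])
keep_prob = tf.placeholder(tf.float32)
outputs, final_state = encoder(inputs, inputs_seq_len, keep_prob, is_training=True)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    feats = np.random.randn(4, 25, 840).astype(np.float32)
    out = sess.run(outputs, feed_dict={inputs: feats,
                                       inputs_seq_len: [25] * 4,
                                       keep_prob: 1.0})
    print(out.shape)  # (4, 25, 256): num_proj applies because impl is LSTMCell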
2.3 Selecting the network structure (choose_encoder.py):
# -*- coding: utf-8 -*-
"""Select & load encoder."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from lstm import LSTMEncoder
ENCODERS = {
"lstm": LSTMEncoder,
}
# Select the model structure; only the LSTM encoder is supported for now
def load(encoder_type):
"""Select & load encoder.
Args:
encoder_type (string): name of the ctc model in the key of ENCODERS
Returns:
An instance of the encoder
"""
if encoder_type not in ENCODERS.keys():
raise ValueError(
"encoder_type should be one of [%s], you provided %s." %
(", ".join(ENCODERS), encoder_type))
return ENCODERS[encoder_type]
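Usage is a plain dictionary lookup, for example:

from choose_encoder import load

encoder_cls = load('lstm')  # returns the LSTMEncoder class itself
# load('gru') would raise:
# ValueError: encoder_type should be one of [lstm], you provided gru.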
2.4 Basic utility functions (basic_util.py):
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from os.path import join, isdir
# A few utility functions used throughout the project.
# Create a directory if it does not exist.
def mkdir(path_to_dir):
if path_to_dir is not None and (not isdir(path_to_dir)):
os.makedirs(path_to_dir)
return path_to_dir
# Create nested subdirectories and return the joined path.
def mkdir_join(path_to_dir, *dir_name):
if path_to_dir is None:
return path_to_dir
for i in range(len(dir_name)):
if '.' not in dir_name[i]:
path_to_dir = mkdir(join(path_to_dir, dir_name[i]))
else:
path_to_dir = join(path_to_dir, dir_name[i])
return path_to_dir
# Count the total number of trainable parameters.
def count_total_parameters(variables):
total_parameters = 0
parameters_dict = {}
for variable in variables:
shape = variable.get_shape()
variable_parameters = 1
for dim in shape:
variable_parameters *= dim.value
total_parameters += variable_parameters
parameters_dict[variable.name] = variable_parameters
return parameters_dict, total_parameters
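For example, mkdir_join builds a nested save path one level at a time, creating any missing directories (components containing a '.' are treated as file names and joined without mkdir):

from basic_util import mkdir_join

path = mkdir_join('./models', 'ctc', 'monophone', '3000')
print(path)  # ./models/ctc/monophone/3000, with every directory created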
2.5 Model base class (model_base.py):
# -*- coding: utf-8 -*-
"""Base class for all models."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
# Supported optimizers
OPTIMIZER_CLS_NAMES = {
"adagrad": tf.train.AdagradOptimizer,
"adadelta": tf.train.AdadeltaOptimizer,
"adam": tf.train.AdamOptimizer,
"rmsprop": tf.train.RMSPropOptimizer,
"sgd": tf.train.GradientDescentOptimizer,
"momentum": tf.train.MomentumOptimizer,
"nestrov": tf.train.MomentumOptimizer
}
# Base class for all models
class ModelBase(object):
def __init__(self, *args, **kwargs):
pass
    def _build(self, *args, **kwargs):
        """Construct model graph."""
        raise NotImplementedError
def create_placeholders(self):
"""Create placeholders and append them to list."""
raise NotImplementedError
def compute_loss(self, *args, **kwargs):
"""Operation for computing loss."""
raise NotImplementedError
def _add_noise_to_inputs(self, inputs, stddev=0.075):
"""Add gaussian noise to the inputs.
Args:
inputs: the noise free input-features.
            stddev (float, optional): The standard deviation of the noise.
                Default is 0.075.
Returns:
inputs: Input features plus noise.
"""
raise NotImplementedError
    def _add_noise_to_gradients(self, grads_and_vars, gradient_noise_scale,
                                stddev=0.075):
"""Adds scaled noise from a 0-mean normal distribution to gradients.
Args:
grads_and_vars:
gradient_noise_scale:
stddev (float):
Returns:
"""
raise NotImplementedError
    # Set the optimizer
def _set_optimizer(self, optimizer, learning_rate):
"""Set optimizer.
Args:
optimizer (string): the name of the optimizer in
OPTIMIZER_CLS_NAMES
learning_rate (float): A learning rate
Returns:
optimizer:
"""
optimizer = optimizer.lower()
if optimizer not in OPTIMIZER_CLS_NAMES:
raise ValueError(
"Optimizer name should be one of [%s], you provided %s." %
(", ".join(OPTIMIZER_CLS_NAMES), optimizer))
# Select optimizer
if optimizer == 'momentum':
return OPTIMIZER_CLS_NAMES[optimizer](
learning_rate=learning_rate,
momentum=0.9)
        elif optimizer == 'nesterov':
return OPTIMIZER_CLS_NAMES[optimizer](
learning_rate=learning_rate,
momentum=0.9,
use_nesterov=True)
else:
return OPTIMIZER_CLS_NAMES[optimizer](
learning_rate=learning_rate)
def train(self, loss, optimizer, learning_rate):
"""Operation for training. Only the sigle GPU training is supported.
Args:
loss: An operation for computing loss
optimizer (string): name of the optimizer in OPTIMIZER_CLS_NAMES
learning_rate (placeholder): A learning rate
Returns:
train_op: operation for training
"""
# Create a variable to track the global step
global_step = tf.Variable(0, name='global_step', trainable=False)
# Set optimizer
self.optimizer = self._set_optimizer(optimizer, learning_rate)
if self.clip_grad_norm is not None:
# Compute gradients
grads_and_vars = self.optimizer.compute_gradients(loss)
# Clip gradients
clipped_grads_and_vars = self._clip_gradients(grads_and_vars)
# Create operation for gradient update
with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
train_op = self.optimizer.apply_gradients(
clipped_grads_and_vars,
global_step=global_step)
else:
# Use the optimizer to apply the gradients that minimize the loss
# and also increment the global step counter as a single training
# step
with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
train_op = self.optimizer.minimize(
loss, global_step=global_step)
return train_op
def _clip_gradients(self, grads_and_vars):
"""Clip gradients.
Args:
grads_and_vars (list): list of tuples of `(grads, vars)`
Returns:
clipped_grads_and_vars (list): list of tuple of
`(clipped grads, vars)`
"""
clipped_grads_and_vars = []
# Clip gradient norm
for grad, var in grads_and_vars:
if grad is not None:
clipped_grads_and_vars.append(
(tf.clip_by_norm(grad, clip_norm=self.clip_grad_norm),
var))
return clipped_grads_and_vars
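To see how train() wires these pieces together, here is a minimal sketch with a hypothetical toy subclass (not part of the project) that only sets the clip_grad_norm attribute train() expects:

import tensorflow as tf
from model_base import ModelBase

class ToyModel(ModelBase):
    def __init__(self):
        super(ToyModel, self).__init__()
        self.clip_grad_norm = 5.0  # enables the gradient-clipping branch

w = tf.Variable(3.0)
loss = tf.square(w - 1.0)  # minimized at w == 1
model = ToyModel()
train_op = model.train(loss, optimizer='adam', learning_rate=0.0001)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(3):
        print(sess.run([loss, train_op])[0])  # the loss shrinks step by step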
2.6 CTC model (model_ctc.py):
# -*- coding: utf-8 -*-
"""CTC model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from model_base import ModelBase
from choose_encoder import load
class CTC(ModelBase):
"""Connectionist Temporal Classification (CTC) network.
Args:
encoder_type (string): The type of an encoder
lstm: Unidirectional LSTM
input_size (int): the dimensions of input vectors
num_units (int): the number of units in each layer
num_layers (int): the number of layers
num_classes (int): the number of classes of target labels
(except for a blank label)
lstm_impl (string, optional): a base implementation of LSTM. This is
not used for GRU models.
- BasicLSTMCell: tf.contrib.rnn.BasicLSTMCell (no peephole)
- LSTMCell: tf.contrib.rnn.LSTMCell
- LSTMBlockCell: tf.contrib.rnn.LSTMBlockCell
            Choose the backend implementation in TensorFlow.
            Default is LSTMBlockCell.
use_peephole (bool, optional): if True, use peephole connection. This
is not used for GRU models.
left_context (int, optional): the number of left context to slice
right_context (int, optional): the number of right context to slice
parameter_init (float, optional): the range of uniform distribution to
initialize weight parameters (>= 0)
clip_grad_norm (float, optional): the range of clipping of gradient
norm (> 0)
clip_activation (float, optional): the range of clipping of cell
activation (> 0). This is not used for GRU models.
num_proj (int, optional): the number of nodes in the projection layer.
This is not used for GRU models.
weight_decay (float, optional): a parameter for weight decay
bottleneck_dim (int, optional): the dimensions of the bottleneck layer
time_major (bool, optional): if True, time-major computation will be
performed
"""
def __init__(self,
encoder_type,
input_size,
num_units,
num_layers,
num_classes,
lstm_impl='LSTMBlockCell',
use_peephole=True,
left_context=10,
right_context=10,
parameter_init=0.1,
clip_grad_norm=None,
clip_activation=None,
num_proj=None,
weight_decay=0.0,
bottleneck_dim=None,
time_major=True):
super(CTC, self).__init__()
if clip_grad_norm is not None:
assert float(clip_grad_norm) > 0, 'clip_grad_norm must be larger than 0.'
assert float(weight_decay) >= 0, 'weight_decay must not be a negative value.'
self.encoder_type = encoder_type
self.input_size = input_size
self.left_context = left_context
self.right_context = right_context
self.num_units = num_units
        if num_proj is None or int(num_proj) == 0:
            self.num_proj = None
        else:
            self.num_proj = int(num_proj)
self.num_layers = num_layers
self.bottleneck_dim = bottleneck_dim
        # Add one class for the CTC blank label
self.num_classes = num_classes + 1
self.lstm_impl = lstm_impl
self.use_peephole = use_peephole
# Regularization
self.parameter_init = parameter_init
self.clip_grad_norm = clip_grad_norm
self.clip_activation = clip_activation
self.weight_decay = weight_decay
# Summaries for TensorBoard
self.summaries_train = []
self.summaries_dev = []
# Placeholders
self.inputs_pl_list = []
self.labels_pl_list = []
self.inputs_seq_len_pl_list = []
self.keep_prob_pl_list = []
self.time_major = time_major
self.name = encoder_type + '_ctc'
if encoder_type in ['lstm']:
self.encoder = load(encoder_type)(
num_units=num_units,
num_proj=self.num_proj,
num_layers=num_layers,
lstm_impl=lstm_impl,
use_peephole=use_peephole,
parameter_init=parameter_init,
clip_activation=clip_activation,
time_major=time_major)
else:
raise NotImplementedError
def _build(self, inputs, inputs_seq_len, keep_prob, is_training):
"""Construct model graph.
Args:
            inputs: A tensor of size `[B, T, input_size]`
            inputs_seq_len (placeholder): A tensor of size `[B]`
keep_prob (placeholder, float): A probability to keep nodes
in the hidden-hidden connection
is_training (bool):
Returns:
logits: A tensor of size `[T, B, num_classes]`
"""
# inputs: `[B, T, input_size]`
batch_size = tf.shape(inputs)[0]
max_time = tf.shape(inputs)[1]
encoder_outputs, final_state = self.encoder(
inputs, inputs_seq_len, keep_prob, is_training)
self.encoder_outputs = encoder_outputs
# Reshape to apply the same weights over the timesteps
output_dim = encoder_outputs.shape.as_list()[-1]
outputs_2d = tf.reshape(encoder_outputs, shape=[batch_size * max_time, output_dim])
if self.bottleneck_dim is not None and self.bottleneck_dim != 0:
with tf.variable_scope('bottleneck') as scope:
outputs_2d = tf.contrib.layers.fully_connected(
outputs_2d,
num_outputs=self.bottleneck_dim,
activation_fn=tf.nn.relu,
weights_initializer=tf.truncated_normal_initializer(stddev=self.parameter_init),
biases_initializer=tf.zeros_initializer(),
scope=scope)
# Dropout for the hidden-output connections
outputs_2d = tf.nn.dropout(
outputs_2d, keep_prob, name='dropout_bottleneck')
with tf.variable_scope('output') as scope:
logits_2d = tf.contrib.layers.fully_connected(outputs_2d,
num_outputs=self.num_classes,
activation_fn=None,
weights_initializer=tf.truncated_normal_initializer(
stddev=self.parameter_init),
biases_initializer=tf.zeros_initializer(),
scope=scope)
if self.time_major:
# Reshape back to the original shape
logits = tf.reshape(logits_2d, shape=[max_time, batch_size, self.num_classes])
else:
# Reshape back to the original shape
logits = tf.reshape(logits_2d, shape=[batch_size, max_time, self.num_classes])
            # Convert to time-major: `[T, B, num_classes]`
logits = tf.transpose(logits, [1, 0, 2])
return logits
def create_placeholders(self):
"""Create placeholders and append them to list."""
self.inputs_pl_list.append(
tf.placeholder(tf.float32, shape=[None, None, self.input_size * (self.left_context + self.right_context + 1)],
name='input'))
self.labels_pl_list.append(
tf.SparseTensor(tf.placeholder(tf.int64, name='indices'),
tf.placeholder(tf.int32, name='values'),
tf.placeholder(tf.int64, name='shape')))
self.inputs_seq_len_pl_list.append(
tf.placeholder(tf.int32, shape=[None], name='inputs_seq_len'))
self.keep_prob_pl_list.append(
tf.placeholder(tf.float32, name='keep_prob'))
def compute_loss(self, inputs, labels, inputs_seq_len,
keep_prob, scope=None, softmax_temperature=1,
is_training=True):
"""Operation for computing CTC loss.
Args:
inputs: A tensor of size `[B, T, input_size]`
labels: A SparseTensor of target labels
inputs_seq_len: A tensor of size `[B]`
keep_prob (placeholder, float): A probability to keep nodes
in the hidden-hidden connection
scope (optional): A scope in the model tower
            softmax_temperature (int, optional): temperature parameter for
                the softmax layer
is_training (bool, optional):
Returns:
total_loss: operation for computing total ctc loss (ctc loss + L2).
This is a single scalar tensor to minimize.
logits: A tensor of size `[T, B, num_classes]`
"""
# Build model graph
logits = self._build(inputs, inputs_seq_len, keep_prob,
is_training=is_training)
# Weight decay
if self.weight_decay > 0:
with tf.name_scope("weight_decay_loss"):
weight_sum = 0
for var in tf.trainable_variables():
if 'bias' not in var.name.lower():
weight_sum += tf.nn.l2_loss(var)
tf.add_to_collection('losses', weight_sum * self.weight_decay)
with tf.name_scope("ctc_loss"):
ctc_losses = tf.nn.ctc_loss(labels,
logits / softmax_temperature,
tf.cast(inputs_seq_len, tf.int32),
preprocess_collapse_repeated=False,
ctc_merge_repeated=True,
ignore_longer_outputs_than_inputs=True,
time_major=True)
ctc_loss = tf.reduce_mean(ctc_losses, name='ctc_loss_mean')
tf.add_to_collection('losses', ctc_loss)
# Compute total loss
total_loss = tf.add_n(tf.get_collection('losses', scope),
name='total_loss')
# Add a scalar summary for the snapshot of loss
if self.weight_decay > 0:
self.summaries_train.append(
tf.summary.scalar('weight_loss_train',
weight_sum * self.weight_decay))
self.summaries_dev.append(
tf.summary.scalar('weight_loss_dev',
weight_sum * self.weight_decay))
self.summaries_train.append(
tf.summary.scalar('total_loss_train', total_loss))
self.summaries_dev.append(
tf.summary.scalar('total_loss_dev', total_loss))
self.summaries_train.append(
tf.summary.scalar('ctc_loss_train', ctc_loss))
self.summaries_dev.append(
tf.summary.scalar('ctc_loss_dev', ctc_loss))
return total_loss, logits
def decoder(self, logits, inputs_seq_len, beam_width=1):
"""Operation for decoding.
Args:
logits: A tensor of size `[T, B, num_classes]`
inputs_seq_len: A tensor of size `[B]`
beam_width (int, optional): beam width for beam search.
                1 disables beam search, which means greedy decoding.
Return:
decode_op: A SparseTensor
"""
assert isinstance(beam_width, int), "beam_width must be integer."
assert beam_width >= 1, "beam_width must be >= 1"
if beam_width == 1:
decoded, _ = tf.nn.ctc_greedy_decoder(logits, inputs_seq_len)
else:
decoded, _ = tf.nn.ctc_beam_search_decoder(logits, inputs_seq_len,
beam_width=beam_width)
decode_op = tf.to_int32(decoded[0])
return decode_op
def posteriors(self, logits, blank_prior=1):
"""Operation for computing posteriors of each time steps.
Args:
logits: A tensor of size `[T, B, num_classes]`
            blank_prior (float): a prior for the blank class; posteriors are
                divided by this prior (not used in the current implementation).
Return:
posteriors_op: operation for computing posteriors for each class
"""
        # Convert to batch-major: `[B, T, num_classes]`
logits = tf.transpose(logits, (1, 0, 2))
logits_2d = tf.reshape(logits, [-1, self.num_classes])
posteriors_op = tf.nn.softmax(logits_2d)
return posteriors_op
def compute_ler(self, decode_op, labels):
"""Operation for computing LER (Label Error Rate).
Args:
decode_op: operation for decoding
labels: A SparseTensor of target labels
Return:
ler_op: operation for computing LER
"""
# Compute LER (normalize by label length)
ler_op = tf.reduce_mean(tf.edit_distance(
decode_op, labels, normalize=True))
# Add a scalar summary for the snapshot of LER
self.summaries_train.append(tf.summary.scalar('ler_train', ler_op))
self.summaries_dev.append(tf.summary.scalar('ler_dev', ler_op))
return ler_op
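Wiring the model up takes four calls, which is exactly what train_ctc.py below does once per GPU tower. A minimal single-tower sketch, with hyperparameters taken from config-lstm.yml:

import tensorflow as tf
from model_ctc import CTC

model = CTC(encoder_type='lstm', input_size=40, num_units=512,
            num_layers=4, num_classes=219, lstm_impl='BasicLSTMCell',
            use_peephole=True, left_context=10, right_context=10,
            parameter_init=0.1, clip_grad_norm=5.0,
            clip_activation=50, num_proj=256)

model.create_placeholders()
loss_op, logits = model.compute_loss(model.inputs_pl_list[0],
                                     model.labels_pl_list[0],
                                     model.inputs_seq_len_pl_list[0],
                                     model.keep_prob_pl_list[0])
decode_op = model.decoder(logits, model.inputs_seq_len_pl_list[0], beam_width=1)
ler_op = model.compute_ler(decode_op, model.labels_pl_list[0])
train_op = model.train(loss_op, optimizer='adam', learning_rate=0.0001)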
2.7 Training main program (train_ctc.py):
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from os.path import join, isfile, abspath
import sys
import time
from setproctitle import setproctitle
import shutil
import yaml
import os
from model_ctc import CTC
from basic_util import mkdir_join, mkdir
from basic_util import count_total_parameters
import numpy as np
import math
import tensorflow as tf
from tensorflow.python.framework import graph_util

# Frame-splicing parameters: context window size and frame-skipping factor
left_context = 10
right_context = 10
skip = 4
# Splice left/right context frames and subsample the utterance by `skip`.
def general_frame(feature, seq_len):
    # For every `skip`-th frame, concatenate the surrounding context frames;
    # indices outside [0, seq_len - 1] are clamped to the utterance edges.
    frame_list = [np.concatenate([feature[min(max(m, 0), seq_len - 1)]
                                  for m in range(n - left_context,
                                                 n + right_context + 1)])
                  for n in range(0, seq_len, skip)]
    new_seq_len = math.ceil(seq_len / skip)
    new_feature = np.asarray(frame_list).astype(np.float32)
    return new_feature, new_seq_len
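# Example (hypothetical shapes): with left_context = right_context = 10 and
# skip = 4, an utterance of 100 frames x 40 MFCCs becomes
# ceil(100 / 4) = 25 spliced frames of dimension 40 * (10 + 1 + 10) = 840.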
# Parse one TFRecord example into (feature, label, seq_len).
def parse_function(example_proto):
    features = {'feature': tf.VarLenFeature(tf.string),
                'label': tf.VarLenFeature(tf.string),
                'seq_len': tf.FixedLenFeature([], tf.int64)}
    parsed_features = tf.parse_single_example(example_proto, features)
    # Features are stored as one raw byte string of float32 MFCC frames
    feature = tf.sparse_tensor_to_dense(parsed_features['feature'], default_value=b'0.0')
    feature = tf.decode_raw(feature[0], tf.float32)
    feature = tf.reshape(feature, [-1, 40])
    # Labels are stored as one raw byte string of int64 phone ids
    label = tf.sparse_tensor_to_dense(parsed_features['label'], default_value=b'0')
    label = tf.decode_raw(label[0], tf.int64)
    seq_len = parsed_features['seq_len']
    # Frame splicing/subsampling runs as a NumPy op inside the graph
    feature, seq_len = tf.py_func(general_frame, [feature, seq_len], [tf.float32, tf.int64])
    seq_len = tf.cast(seq_len, tf.int32)
    return feature, label, seq_len
# Convert a batch of dense labels padded with -1 into the
# (indices, values, shape) triplet of a tf.SparseTensor.
def dense_to_sparse(dense):
    indices = []
    values = []
    for n, seq in enumerate(dense):
        # Append a sentinel so argmin always finds the first -1 pad
        seq = np.append(seq, -1)
        seq = seq[:np.argmin(seq)]
        indices.extend(zip([n] * len(seq), range(len(seq))))
        values.extend(seq)
    indices = np.asarray(indices, dtype=np.int64)
    values = np.asarray(values, dtype=np.int32)
    shape = np.asarray(dense.shape, dtype=np.int64)
    return indices, values, shape
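# Example (hypothetical values): dense = np.array([[3, 7, -1], [5, -1, -1]])
# yields indices [[0, 0], [0, 1], [1, 0]], values [3, 7, 5], shape [2, 3].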
# Average the gradients computed on each GPU tower.
def average_gradients(tower_grads):
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        # grad_and_vars holds one variable's (grad, var) pair from every
        # tower; stack the gradients and average them over towers.
        grads = []
        for g, _ in grad_and_vars:
            expanded_g = tf.expand_dims(g, 0)
            grads.append(expanded_g)
        grad = tf.concat(grads, 0)
        grad = tf.reduce_mean(grad, 0)
        # All towers share variables, so take the variable from tower 0
        v = grad_and_vars[0][1]
        average_grads.append((grad, v))
    return average_grads
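# Example structure (hypothetical, 2 towers and 2 variables):
#   tower_grads = [[(g0_w, w), (g0_b, b)],   # tower 0
#                  [(g1_w, w), (g1_b, b)]]   # tower 1
# zip(*tower_grads) pairs up gradients of the same variable across towers,
# so the result is [(mean(g0_w, g1_w), w), (mean(g0_b, g1_b), b)].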
def do_train(model, params, gpu_indices):
# Tell TensorFlow that the model will be built into the default graph
with tf.Graph().as_default(), tf.device('/cpu:0'):
global_step = tf.Variable(0, name='global_step', trainable=False)
        # Set the optimizer
learning_rate_pl = tf.placeholder(tf.float32, name='learning_rate')
optimizer = model._set_optimizer(params['optimizer'], learning_rate_pl)
# Calculate the gradients for each model tower
        # Gradients and losses collected from every GPU tower
total_grads_and_vars, total_losses = [], []
decode_ops, ler_ops = [], []
        # Device names of all GPUs in use
all_devices = ['/gpu:%d' % i_gpu for i_gpu in range(len(gpu_indices))]
with tf.variable_scope(tf.get_variable_scope()):
for i_gpu in range(len(all_devices)):
with tf.device(all_devices[i_gpu]):
with tf.name_scope('tower_gpu%d' % i_gpu) as scope:
# Define placeholders in each tower
model.create_placeholders()
tower_loss, tower_logits = model.compute_loss(
model.inputs_pl_list[i_gpu],
model.labels_pl_list[i_gpu],
model.inputs_seq_len_pl_list[i_gpu],
model.keep_prob_pl_list[i_gpu],
scope)
tower_loss = tf.expand_dims(tower_loss, axis=0)
total_losses.append(tower_loss)
                        # Reuse (share) the model variables across all towers
tf.get_variable_scope().reuse_variables()
tower_grads_and_vars = optimizer.compute_gradients(
tower_loss)
tower_grads_and_vars = model._clip_gradients(tower_grads_and_vars)
total_grads_and_vars.append(tower_grads_and_vars)
decode_op_tower = model.decoder(tower_logits, model.inputs_seq_len_pl_list[i_gpu],
beam_width=params['beam_width'])
decode_ops.append(decode_op_tower)
ler_op_tower = model.compute_ler(decode_op_tower, model.labels_pl_list[i_gpu])
ler_op_tower = tf.expand_dims(ler_op_tower, axis=0)
ler_ops.append(ler_op_tower)
        # Average the loss over towers
total_losses = tf.concat(axis=0, values=total_losses)
loss_op = tf.reduce_mean(total_losses, axis=0)
        # Average the label error rate over towers
ler_ops = tf.concat(axis=0, values=ler_ops)
ler_op = tf.reduce_mean(ler_ops, axis=0)
        # Average the gradients over towers
average_grads_and_vars = average_gradients(total_grads_and_vars)
        train_op = optimizer.apply_gradients(average_grads_and_vars, global_step=global_step)
summary_train = tf.summary.merge(model.summaries_train)
summary_dev = tf.summary.merge(model.summaries_dev)
        # Initialize all variables at once
init_op = tf.global_variables_initializer()
saver = tf.train.Saver(max_to_keep=None)
        # Count all trainable parameters
parameters_dict, total_parameters = count_total_parameters(tf.trainable_variables())
for parameter_name in sorted(parameters_dict.keys()):
print("%s %d" % (parameter_name, parameters_dict[parameter_name]))
print("Total %d variables, %s M parameters" % (len(parameters_dict.keys()),
"{:,}".format(total_parameters / 1000000)))
        # Build the input pipeline
train_dataset = tf.data.TFRecordDataset(params['train_data_file'])
train_dataset = train_dataset.map(parse_function)
train_dataset = train_dataset.shuffle(1000)
        train_dataset = train_dataset.padded_batch(
            params['batch_size'],
            padded_shapes=([None, None], [None], []),
            padding_values=(0.0, tf.cast(-1, tf.int64), tf.cast(0, tf.int32)))
train_dataset = train_dataset.repeat(1)
iterator = train_dataset.make_initializable_iterator()
        # Get one batch; labels are converted to a SparseTensor for CTC
batch_feat, batch_label, batch_seq_len = iterator.get_next()
s_indices, s_value, s_shape = tf.py_func(dense_to_sparse, [batch_label], [tf.int64, tf.int32, tf.int64])
batch_label = tf.SparseTensor(s_indices, s_value, s_shape)
        with tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                              log_device_placement=False)) as sess:
summary_writer = tf.summary.FileWriter(model.save_path, sess.graph)
sess.run(init_op)
start_time_train = time.time()
learning_rate = float(params['learning_rate'])
step_id = 0
last_id = 0
            # Find the latest checkpoint, if any, and resume training from it
ckpt = tf.train.latest_checkpoint(model.save_path)
print("===========================")
print("ckpt:", ckpt)
            if ckpt is not None:
saver.restore(sess, ckpt)
ind = ckpt.rfind("-")
last_id = int(ckpt[ind + 1:])
print("++++++++++++++++++++++++++++")
print("last_id: ", last_id)
print("===========================")
for epoch in range(params['num_epoch']):
start_time_epoch = time.time()
sess.run(iterator.initializer)
print("global_step: ", sess.run(global_step))
try:
while (True):
start_time_step = time.time()
if step_id < last_id:
step_id += 1
print("===skip: ", step_id)
sys.stdout.flush()
continue
feed_dict_train = {}
                        # Fetch one batch for each GPU tower
for i_gpu in range(len(gpu_indices)):
new_feat, new_label, new_seq_len = sess.run([batch_feat, batch_label, batch_seq_len])
feed_dict_train[model.inputs_pl_list[i_gpu]] = new_feat
feed_dict_train[model.labels_pl_list[i_gpu]] = new_label
feed_dict_train[model.inputs_seq_len_pl_list[i_gpu]] = new_seq_len
feed_dict_train[model.keep_prob_pl_list[i_gpu]] = float(params['dropout'])
feed_dict_train[learning_rate_pl] = learning_rate
                        # One training step
                        step_loss, step_ler, _, _ = sess.run(
                            [loss_op, ler_op, global_step, train_op],
                            feed_dict=feed_dict_train)
step_id += 1
end_time_step = time.time()
step_time = end_time_step - start_time_step
                        # Print the result of this step
print("batch: ", step_id, " loss: ", step_loss, " ler: ", step_ler, " time: ", step_time)
sys.stdout.flush()
                        # Periodically save the model
if step_id % params['print_step'] == 0:
summary_str_train = sess.run(summary_train, feed_dict=feed_dict_train)
summary_writer.add_summary(summary_str_train, step_id)
summary_writer.flush()
checkpoint_file = join(model.save_path, 'model.ckpt')
save_path = saver.save(sess, checkpoint_file, global_step=step_id)
print("Model saved in file: %s" % save_path)
sys.stdout.flush()
                # The dataset is exhausted: this epoch is finished, move on to the next
                except tf.errors.OutOfRangeError:
end_time_epoch = time.time()
epoch_time = end_time_epoch - start_time_epoch
print("epoch: ", epoch, " end, use time: ", epoch_time)
sys.stdout.flush()
end_time_train = time.time()
train_time = end_time_train - start_time_train
print("train end, total time: ", train_time)
sys.stdout.flush()
summary_writer.close()
def main(config_path, model_save_path, gpu_indices):
    # Load the configuration file
    with open(config_path, "r") as f:
        config = yaml.safe_load(f)
params = config['param']
# Model setting
model = CTC(encoder_type=params['encoder_type'],
input_size=params['input_size'],
left_context=params['left_context'],
right_context=params['right_context'],
num_units=params['num_units'],
num_layers=params['num_layers'],
num_classes=params['num_classes'],
lstm_impl=params['lstm_impl'],
use_peephole=params['use_peephole'],
parameter_init=params['weight_init'],
clip_grad_norm=params['clip_grad_norm'],
clip_activation=params['clip_activation'],
num_proj=params['num_proj'],
weight_decay=params['weight_decay'])
# Set process name
setproctitle('tf' + model.name + '_' + str(params['train_data_size']) + '_' + params['label_type'])
    # Compose the model name from its hyperparameters
model.name += '_' + str(params['num_units'])
model.name += '_' + str(params['num_layers'])
model.name += '_' + params['optimizer']
model.name += '_lr' + str(params['learning_rate'])
if params['num_proj'] != 0:
model.name += '_proj' + str(params['num_proj'])
if params['dropout'] != 0:
model.name += '_drop' + str(params['dropout'])
if params['weight_decay'] != 0:
model.name += '_wd' + str(params['weight_decay'])
if params['bottleneck_dim'] != 0:
model.name += '_bottle' + str(params['bottleneck_dim'])
if len(gpu_indices) >= 2:
model.name += '_gpu' + str(len(gpu_indices))
    # Set and create the model save path
model.save_path = mkdir_join(
model_save_path, 'ctc', params['label_type'],
str(params['train_data_size']), model.name)
    # Reset model directory
    model_index = 0
    new_model_path = model.save_path
    while True:
        # If complete.txt exists, a finished model is already stored in this
        # directory, so switch to a new directory name.
        if isfile(join(new_model_path, 'complete.txt')):
            model_index += 1
            new_model_path = model.save_path + '_' + str(model_index)
        else:
            # A directory without complete.txt is reused, so interrupted
            # training can resume in place.
            break
    # Create the new model directory
model.save_path = mkdir(new_model_path)
    # Copy the config file into the model directory
shutil.copyfile(config_path, join(model.save_path, 'config.yml'))
#sys.stdout = open(join(model.save_path, 'train.log'), 'w')
    # Start training
do_train(model=model, params=params, gpu_indices=gpu_indices)
if __name__ == '__main__':
    args = sys.argv
    if len(args) != 4:
        print("Usage: python train_ctc.py <config_file> <model_save_path> <gpu_indices, e.g. 0,1,2>")
        exit(-1)
    # Select which GPUs are visible to TensorFlow
    os.environ['CUDA_VISIBLE_DEVICES'] = args[3]
    main(config_path=args[1], model_save_path=args[2],
         gpu_indices=list(map(int, args[3].split(','))))
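To launch training, pass the config file, the model save directory, and the comma-separated GPU indices, for example: python train_ctc.py config-lstm.yml ./models 0,1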
3. Conclusion
That is the complete CTC training pipeline. Trained on 3,000 hours of data, it reaches a character accuracy of 97% and a sentence accuracy of 91%.