/examples/timit/training/run_ctc.sh
./run_ctc.sh ../config/ctc/blstm_rmsprop_phone61.yml 0
#!/bin/zsh
# Launch CTC training (train_ctc.py) on the selected GPU as a background job.
# Usage: ./run_ctc.sh path_to_config_file gpu_index
# The interpreter is zsh; every construct below is also valid in bash.

# Root directory handed to train_ctc.py as the place to save models
MODEL_SAVE_PATH="/n/sd8/inaguma/result/tensorflow/timit"

# Select GPU: both the config path and the GPU index are required
if [ $# -ne 2 ]; then
  echo "Error: set GPU number & config path." 1>&2
  echo "Usage: ./run_ctc.sh path_to_config_file gpu_index" 1>&2
  exit 1
fi

# Set path to CUDA
export PATH="$PATH:/usr/local/cuda-8.0/bin"
# Only prepend the old value (plus ':') when it is non-empty; a leading empty
# entry would make the dynamic loader search the current directory.
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+$LD_LIBRARY_PATH:}/usr/local/cuda-8.0/lib64:/usr/local/cuda-8.0/extras/CUPTI/lib64"

# Set path to python
# PYTHON=/home/lab5/inaguma/.pyenv/versions/anaconda3-4.1.1/bin/python
PYTHON=/usr/bin/python

# First argument: path to the YAML config file
config_path=$1
# Second argument: GPU index (several may be given, e.g. "0,1")
gpu_index=$2

# Fail early instead of discovering a bad path inside the background job
if [ ! -f "$config_path" ]; then
  echo "Error: config file not found: $config_path" 1>&2
  exit 1
fi

# Log name = config file name up to its first dot
# (basename strips the directory part; awk keeps the first '.'-separated field)
filename=$(basename -- "$config_path" | awk -F. '{print $1}')

# Directory that receives the training logs
mkdir -p log

# Standard output version (runs in the foreground, prints to the terminal)
# CUDA_VISIBLE_DEVICES=$gpu_index $PYTHON train_ctc.py \
# $config_path $MODEL_SAVE_PATH

# Background job version
# nohup keeps the job alive after the terminal is closed, the trailing '&'
# puts it in the background, and '>' redirects the output of train_ctc.py
# to log/<filename>.log
CUDA_VISIBLE_DEVICES="$gpu_index" nohup "$PYTHON" train_ctc.py \
  "$config_path" "$MODEL_SAVE_PATH" > "log/${filename}.log" &
选择 GPU 的方法可以参考:https://blog.csdn.net/u014381600/article/details/72911262
可以看到,这个 shell 脚本主要是在指定的 GPU 上执行 train_ctc.py。train_ctc.py 接收两个参数:一个是保存超参数的配置文件的路径,另一个是保存训练好的模型的目录路径。配置文件是 YAML 格式的,这里给了一个示例。
# ! /usr/bin/env python
# -*- coding: utf-8 -*-
"""Train the CTC model (TIMIT corpus)."""
# 引入需要的模块
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from os.path import join, isfile, abspath
import sys
import time
import tensorflow as tf
from setproctitle import setproctitle
import yaml
import shutil
# Add the project root (three directories up) to the module search path
sys.path.append(abspath('../../../'))
# Project-local imports: dataset loader, metrics, utilities and the CTC model
from examples.timit.data.load_dataset_ctc import Dataset
from examples.timit.metrics.ctc import do_eval_per, do_eval_cer
from utils.io.labels.sparsetensor import list2sparsetensor
from utils.training.learning_rate_controller import Controller
from utils.training.plot import plot_loss, plot_ler
from utils.directory import mkdir_join, mkdir
from utils.parameter import count_total_parameters
from models.ctc.ctc import CTC
def do_train(model, params):
    """Run training and evaluate the model once per epoch.

    If the target labels are phones, the model is evaluated by PER on the
    39-phone test set; if they are characters, it is evaluated by CER / WER.

    Args:
        model: the model to train (``main`` passes a ``CTC`` instance). It is
            expected to provide ``create_placeholders``, ``compute_loss``,
            ``train``, ``decoder`` and ``compute_ler``, the ``*_pl_list``
            placeholder lists, ``summaries_train`` / ``summaries_dev`` and
            ``save_path``, all of which are used below.
        params (dict): A dictionary of hyperparameters
    """
    # Load dataset (Dataset comes from examples/timit/data/load_dataset_ctc)
    train_data = Dataset(
        data_type='train', label_type=params['label_type'],
        batch_size=params['batch_size'], max_epoch=params['num_epoch'],
        splice=params['splice'],
        num_stack=params['num_stack'], num_skip=params['num_skip'],
        sort_utt=True, sort_stop_epoch=params['sort_stop_epoch'])
    dev_data = Dataset(
        data_type='dev', label_type=params['label_type'],
        batch_size=params['batch_size'], splice=params['splice'],
        num_stack=params['num_stack'], num_skip=params['num_skip'],
        sort_utt=False)
    # Character models are tested with their own label type; every phone
    # model is tested against the 39-phone labels.
    if 'char' in params['label_type']:
        test_label_type = params['label_type']
    else:
        test_label_type = 'phone39'
    test_data = Dataset(
        data_type='test', label_type=test_label_type,
        batch_size=1, splice=params['splice'],
        num_stack=params['num_stack'], num_skip=params['num_skip'],
        sort_utt=False)

    # Tell TensorFlow that the model will be built into the default graph
    with tf.Graph().as_default():

        # Define placeholders
        model.create_placeholders()
        learning_rate_pl = tf.placeholder(tf.float32, name='learning_rate')

        # Add to the graph each operation (including model definition).
        # All inputs are placeholders; concrete values are supplied later
        # through the feed_dict argument of sess.run().
        loss_op, logits = model.compute_loss(
            model.inputs_pl_list[0],
            model.labels_pl_list[0],
            model.inputs_seq_len_pl_list[0],
            model.keep_prob_pl_list[0])
        train_op = model.train(
            loss_op,
            optimizer=params['optimizer'],
            learning_rate=learning_rate_pl)
        decode_op = model.decoder(logits,
                                  model.inputs_seq_len_pl_list[0],
                                  beam_width=params['beam_width'])
        ler_op = model.compute_ler(decode_op, model.labels_pl_list[0])

        # Define learning rate controller
        lr_controller = Controller(
            learning_rate_init=params['learning_rate'],
            decay_start_epoch=params['decay_start_epoch'],
            decay_rate=params['decay_rate'],
            decay_patient_epoch=params['decay_patient_epoch'],
            lower_better=True)

        # Build the summary tensor based on the TensorFlow collection of
        # summaries (used for visualization)
        summary_train = tf.summary.merge(model.summaries_train)
        summary_dev = tf.summary.merge(model.summaries_dev)

        # Add the variable initializer operation
        init_op = tf.global_variables_initializer()

        # Create a saver for writing training checkpoints
        saver = tf.train.Saver(max_to_keep=None)

        # Count total parameters
        parameters_dict, total_parameters = count_total_parameters(
            tf.trainable_variables())
        for parameter_name in sorted(parameters_dict.keys()):
            print("%s %d" % (parameter_name, parameters_dict[parameter_name]))
        print("Total %d variables, %s M parameters" %
              (len(parameters_dict.keys()),
               "{:,}".format(total_parameters / 1000000)))

        # Histories that are plotted at the end of every epoch
        csv_steps, csv_loss_train, csv_loss_dev = [], [], []
        csv_ler_train, csv_ler_dev = [], []

        # Create a session for running operation on the graph
        with tf.Session() as sess:

            # Instantiate a SummaryWriter to output summaries and the graph
            summary_writer = tf.summary.FileWriter(
                model.save_path, sess.graph)

            # Initialize parameters
            sess.run(init_op)

            # Train model
            start_time_train = time.time()
            start_time_epoch = time.time()
            start_time_step = time.time()
            ler_dev_best = 1
            not_improved_epoch = 0
            learning_rate = float(params['learning_rate'])
            for step, (data, is_new_epoch) in enumerate(train_data):

                # Create feed dictionary for next mini batch (train)
                inputs, labels, inputs_seq_len, _ = data
                # Index 0 is taken from every batch element; presumably the
                # leading axis indexes the device and a single GPU is used
                # here -- TODO confirm against Dataset.
                feed_dict_train = {
                    model.inputs_pl_list[0]: inputs[0],
                    model.labels_pl_list[0]: list2sparsetensor(
                        labels[0], padded_value=train_data.padded_value),
                    model.inputs_seq_len_pl_list[0]: inputs_seq_len[0],
                    # keep probability = 1 - dropout rate
                    model.keep_prob_pl_list[0]: 1 - float(params['dropout']),
                    learning_rate_pl: learning_rate
                }

                # Update parameters
                sess.run(train_op, feed_dict=feed_dict_train)

                if (step + 1) % params['print_step'] == 0:

                    # Create feed dictionary for next mini batch (dev)
                    (inputs, labels, inputs_seq_len, _), _ = dev_data.next()
                    feed_dict_dev = {
                        model.inputs_pl_list[0]: inputs[0],
                        model.labels_pl_list[0]: list2sparsetensor(
                            labels[0], padded_value=dev_data.padded_value),
                        model.inputs_seq_len_pl_list[0]: inputs_seq_len[0],
                        model.keep_prob_pl_list[0]: 1.0
                    }

                    # Compute loss
                    # NOTE(review): the train loss is still computed with the
                    # training keep probability; dropout is only switched off
                    # a few lines below, before the LER is computed.
                    loss_train = sess.run(loss_op, feed_dict=feed_dict_train)
                    loss_dev = sess.run(loss_op, feed_dict=feed_dict_dev)
                    csv_steps.append(step)
                    csv_loss_train.append(loss_train)
                    csv_loss_dev.append(loss_dev)

                    # Change to evaluation mode
                    feed_dict_train[model.keep_prob_pl_list[0]] = 1.0

                    # Compute accuracy & update event files
                    ler_train, summary_str_train = sess.run(
                        [ler_op, summary_train], feed_dict=feed_dict_train)
                    ler_dev, summary_str_dev = sess.run(
                        [ler_op, summary_dev], feed_dict=feed_dict_dev)
                    csv_ler_train.append(ler_train)
                    csv_ler_dev.append(ler_dev)
                    summary_writer.add_summary(summary_str_train, step + 1)
                    summary_writer.add_summary(summary_str_dev, step + 1)
                    summary_writer.flush()

                    duration_step = time.time() - start_time_step
                    print("Step %d (epoch: %.3f): loss = %.3f (%.3f) / ler = %.3f (%.3f) / lr = %.5f (%.3f min)" %
                          (step + 1, train_data.epoch_detail,
                           loss_train, loss_dev, ler_train, ler_dev,
                           learning_rate, duration_step / 60))
                    sys.stdout.flush()
                    start_time_step = time.time()

                # Save checkpoint and evaluate model per epoch
                if is_new_epoch:
                    duration_epoch = time.time() - start_time_epoch
                    print('-----EPOCH:%d (%.3f min)-----' %
                          (train_data.epoch, duration_epoch / 60))

                    # Save figure of loss & ler
                    plot_loss(csv_loss_train, csv_loss_dev, csv_steps,
                              save_path=model.save_path)
                    plot_ler(csv_ler_train, csv_ler_dev, csv_steps,
                             label_type=params['label_type'],
                             save_path=model.save_path)

                    if train_data.epoch >= params['eval_start_epoch']:
                        start_time_eval = time.time()
                        if 'char' in params['label_type']:
                            print('=== Dev Data Evaluation ===')
                            ler_dev_epoch, wer_dev_epoch = do_eval_cer(
                                session=sess,
                                decode_op=decode_op,
                                model=model,
                                dataset=dev_data,
                                label_type=params['label_type'],
                                eval_batch_size=1)
                            print(' CER: %f %%' % (ler_dev_epoch * 100))
                            print(' WER: %f %%' % (wer_dev_epoch * 100))

                            if ler_dev_epoch < ler_dev_best:
                                ler_dev_best = ler_dev_epoch
                                not_improved_epoch = 0
                                print('■■■ ↑Best Score (CER)↑ ■■■')

                                # Save model only when best accuracy is
                                # obtained (check point)
                                checkpoint_file = join(
                                    model.save_path, 'model.ckpt')
                                save_path = saver.save(
                                    sess, checkpoint_file,
                                    global_step=train_data.epoch)
                                print("Model saved in file: %s" % save_path)

                                print('=== Test Data Evaluation ===')
                                ler_test, wer_test = do_eval_cer(
                                    session=sess,
                                    decode_op=decode_op,
                                    model=model,
                                    dataset=test_data,
                                    label_type=params['label_type'],
                                    is_test=True,
                                    eval_batch_size=1)
                                print(' CER: %f %%' % (ler_test * 100))
                                print(' WER: %f %%' % (wer_test * 100))
                            else:
                                not_improved_epoch += 1

                        else:
                            print('=== Dev Data Evaluation ===')
                            ler_dev_epoch = do_eval_per(
                                session=sess,
                                decode_op=decode_op,
                                per_op=ler_op,
                                model=model,
                                dataset=dev_data,
                                label_type=params['label_type'],
                                eval_batch_size=1)
                            print(' PER: %f %%' % (ler_dev_epoch * 100))

                            if ler_dev_epoch < ler_dev_best:
                                ler_dev_best = ler_dev_epoch
                                not_improved_epoch = 0
                                print('■■■ ↑Best Score (PER)↑ ■■■')

                                # Save model only when best accuracy is
                                # obtained (check point)
                                checkpoint_file = join(
                                    model.save_path, 'model.ckpt')
                                save_path = saver.save(
                                    sess, checkpoint_file,
                                    global_step=train_data.epoch)
                                print("Model saved in file: %s" % save_path)

                                print('=== Test Data Evaluation ===')
                                ler_test = do_eval_per(
                                    session=sess,
                                    decode_op=decode_op,
                                    per_op=ler_op,
                                    model=model,
                                    dataset=test_data,
                                    label_type=params['label_type'],
                                    is_test=True,
                                    eval_batch_size=1)
                                print(' PER: %f %%' % (ler_test * 100))
                            else:
                                not_improved_epoch += 1

                        duration_eval = time.time() - start_time_eval
                        print('Evaluation time: %.3f min' %
                              (duration_eval / 60))

                        # Early stopping
                        if not_improved_epoch == params['not_improved_patient_epoch']:
                            break

                        # Update learning rate (needs the dev score computed
                        # above, hence it lives inside the evaluation branch)
                        learning_rate = lr_controller.decay_lr(
                            learning_rate=learning_rate,
                            epoch=train_data.epoch,
                            value=ler_dev_epoch)

                    start_time_epoch = time.time()

            duration_train = time.time() - start_time_train
            print('Total time: %.3f hour' % (duration_train / 3600))

            # Release the event file once no more summaries are written
            summary_writer.close()

            # Training was finished correctly: leave an empty marker file
            with open(join(model.save_path, 'complete.txt'), 'w') as f:
                f.write('')
# Entry point; both arguments are passed through from the shell script
def main(config_path, model_save_path):
    """Build the CTC model described by a YAML config file and train it.

    Args:
        config_path (str): path to the YAML config file; the hyperparameters
            are read from its 'param' entry
        model_save_path (str): root directory under which the model
            directory is created
    Raises:
        TypeError: if params['label_type'] is not a supported label type
    """
    # Load a config file (.yml)
    # safe_load only builds plain Python objects; yaml.load without an
    # explicit Loader is unsafe and is rejected by recent PyYAML releases.
    with open(config_path, "r") as f:
        config = yaml.safe_load(f)
    params = config['param']

    # Number of output classes per label type (except for a blank class)
    num_classes_by_label_type = {
        'phone61': 61,
        'phone48': 48,
        'phone39': 39,
        'character': 28,
        'character_capital_divide': 72,
    }
    if params['label_type'] not in num_classes_by_label_type:
        raise TypeError(
            'Unsupported label_type: %s' % str(params['label_type']))
    params['num_classes'] = num_classes_by_label_type[params['label_type']]

    # Model setting (the returned model is handed to do_train below)
    model = CTC(encoder_type=params['encoder_type'],
                input_size=params['input_size'],
                splice=params['splice'],
                num_stack=params['num_stack'],
                num_units=params['num_units'],
                num_layers=params['num_layers'],
                num_classes=params['num_classes'],
                lstm_impl=params['lstm_impl'],
                use_peephole=params['use_peephole'],
                parameter_init=params['weight_init'],
                clip_grad_norm=params['clip_grad_norm'],
                clip_activation=params['clip_activation'],
                num_proj=params['num_proj'],
                weight_decay=params['weight_decay'])

    # Set process name (uses the model name before the suffixes are added)
    setproctitle('tf_timit_' + model.name + '_' + params['label_type'])

    # Encode the main hyperparameters into the model name
    model.name += '_' + str(params['num_units'])
    model.name += '_' + str(params['num_layers'])
    model.name += '_' + params['optimizer']
    model.name += '_lr' + str(params['learning_rate'])
    if params['num_proj'] != 0:
        model.name += '_proj' + str(params['num_proj'])
    if params['dropout'] != 0:
        model.name += '_drop' + str(params['dropout'])
    if params['num_stack'] != 1:
        model.name += '_stack' + str(params['num_stack'])
    if params['weight_decay'] != 0:
        model.name += '_wd' + str(params['weight_decay'])

    # Set save path
    model.save_path = mkdir_join(
        model_save_path, 'ctc', params['label_type'], model.name)

    # Reset model directory: append _1, _2, ... until a directory is found
    # that holds neither 'complete.txt' (a finished run) nor 'config.yml'
    # (a run that was started but has not finished).
    model_index = 0
    new_model_path = model.save_path
    while (isfile(join(new_model_path, 'complete.txt')) or
           isfile(join(new_model_path, 'config.yml'))):
        model_index += 1
        new_model_path = model.save_path + '_' + str(model_index)
    model.save_path = mkdir(new_model_path)

    # Save config file: copy it next to the model
    shutil.copyfile(config_path, join(model.save_path, 'config.yml'))

    # Redirect print() output to train.log for the duration of training;
    # the previous stream is restored and the log closed afterwards, even
    # when training raises.
    # TODO(hirofumi): change to logger
    original_stdout = sys.stdout
    log_file = open(join(model.save_path, 'train.log'), 'w')
    sys.stdout = log_file
    try:
        # Start training
        do_train(model=model, params=params)
    finally:
        sys.stdout = original_stdout
        log_file.close()
# Runs only when this file is executed directly, not when it is imported.
# See http://blog.konghy.cn/2017/04/24/python-entry-program/
if __name__ == '__main__':
    # sys.argv holds the script name followed by the two arguments passed
    # on the command line: the config path and the model save path.
    if len(sys.argv) != 3:
        raise ValueError('Length of args should be 3.')
    _, config_file, save_root = sys.argv
    main(config_path=config_file, model_save_path=save_root)