# Flat start and monophone training, with delta-delta features.
# This script applies cepstral mean normalization (per speaker).
#
# Train the monophone model on data/mfcc/train with data/lang; exp/mono is
# the last argument (presumably the output directory). Abort with status 1
# if training fails.
# NOTE(review): $n and $train_cmd are not defined in this excerpt, and $n is
# expanded unquoted -- confirm it is always a single non-empty word.
steps/train_mono.sh --boost-silence 1.25 --nj $n --cmd "$train_cmd" data/mfcc/train data/lang exp/mono || exit 1;
# Test the monophone model. The trailing '&' runs the decode in the
# background, so its exit status is not checked on this line.
local/thchs-30_decode.sh --mono true --nj $n "steps/decode.sh" exp/mono data/mfcc &
# Usage text for steps/train_mono.sh (printed verbatim by the echo lines).
echo "Usage: steps/train_mono.sh [options] "
echo " e.g.: steps/train_mono.sh data/train.1k data/lang exp/mono"
echo "main options (for others, see top of script file)"
# Begin configuration section.
# Default option values. NOTE(review): these look like the defaults behind the
# steps/train_mono.sh invocation in this file; the mechanism that overrides
# them from command-line flags is not visible here -- confirm.
nj=4
cmd=run.pl
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
num_iters=40 # Number of iterations of training
max_iter_inc=30 # Last iteration on which the number of Gaussians is increased.
totgauss=1000 # Target number of Gaussians.
careful=false
boost_silence=1.0 # Factor by which to boost silence likelihoods in alignment
# Iterations listed here are the ones on which realignment is done
# (going by the variable name -- confirm against the full script).
realign_iters="1 2 3 4 5 6 7 8 9 10 12 14 16 18 20 23 26 29 32 35 38";
config= # Name of config file (empty by default).
stage=-4
power=0.25 # Exponent to determine number of Gaussians from occurrence counts
norm_vars=false # Deprecated; prefer --cmvn-opts "--norm-vars=false"
cmvn_opts= # Can be used to add extra options to cmvn.
# End configuration section.
# Word-level decoding: run utils/mkgraph.sh on data/graph/lang and $srcdir with
# $srcdir/graph_word as the last argument, then run $decoder with that graph
# directory on $datadir/test, with $srcdir/decode_test_word as the last
# argument. Either command failing exits with status 1.
# NOTE(review): $opt, $srcdir, $datadir, $decoder, $decode_cmd and $nj are set
# outside this excerpt; all but $decode_cmd are expanded unquoted.
utils/mkgraph.sh $opt data/graph/lang $srcdir $srcdir/graph_word || exit 1;
$decoder --cmd "$decode_cmd" --nj $nj $srcdir/graph_word $datadir/test $srcdir/decode_test_word || exit 1
# Phone-level decoding: same two steps, using data/graph_phone/lang,
# $srcdir/graph_phone, $datadir/test_phone and $srcdir/decode_test_phone.
utils/mkgraph.sh $opt data/graph_phone/lang $srcdir $srcdir/graph_phone || exit 1;
$decoder --cmd "$decode_cmd" --nj $nj $srcdir/graph_phone $datadir/test_phone $srcdir/decode_test_phone || exit 1
# Monophone alignment: run steps/align_si.sh on data/mfcc/train with data/lang
# and the exp/mono model directory; exp/mono_ali is the last argument
# (presumably the output directory). Abort with status 1 on failure.
steps/align_si.sh --boost-silence 1.25 --nj $n --cmd "$train_cmd" data/mfcc/train data/lang exp/mono exp/mono_ali || exit 1;
# Computes training alignments using a model with delta or
# LDA+MLLT features.
# If you supply the "--use-graphs true" option, it will use the training
# graphs from the source directory (where the model is). In this
# case the number of jobs must match with the source directory.
#
# Usage text for steps/align_si.sh (printed verbatim by the echo lines).
echo "usage: steps/align_si.sh "
echo "main options (for others, see top of script file)"
echo " --config # config containing options"
echo " --nj # number of parallel jobs"
echo " --use-graphs true # use graphs in src-dir"
echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs."
# Triphone training (tri1): run steps/train_deltas.sh with the numeric
# arguments 2000 and 10000 on data/mfcc/train and data/lang, using the
# exp/mono_ali alignments; exp/tri1 is the last argument (presumably the
# output directory). Abort with status 1 on failure.
# NOTE(review): the meaning of 2000 and 10000 is not shown here -- see the
# steps/train_deltas.sh usage text for the argument order.
steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" 2000 10000 data/mfcc/train data/lang exp/mono_ali exp/tri1 || exit 1;
# Test the tri1 model. The trailing '&' runs the decode in the background,
# so its exit status is not checked on this line.
local/thchs-30_decode.sh --nj $n "steps/decode.sh" exp/tri1 data/mfcc &
# Begin configuration.
# Default option values. NOTE(review): these look like the defaults behind the
# steps/train_deltas.sh invocation in this file -- confirm.
stage=-4 # This allows restarting partway through, when something went wrong.
config=
cmd=run.pl
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
realign_iters="10 20 30";
num_iters=35 # Number of iterations of training
max_iter_inc=25 # Last iteration on which the number of Gaussians is increased.
beam=10
careful=false
retry_beam=40
boost_silence=1.0 # Factor by which to boost silence likelihoods in alignment
power=0.25 # Exponent for number of Gaussians according to occurrence counts
cluster_thresh=-1 # For build-tree: controls the final bottom-up clustering of leaves
norm_vars=false # Deprecated. Prefer --cmvn-opts "--norm-vars=true"
# use the option --cmvn-opts "--norm-means=false"
# NOTE(review): the comment line above appears truncated in this excerpt;
# presumably it describes how to turn mean normalization off -- confirm.
cmvn_opts=
delta_opts=
context_opts= # Use "--context-width=5 --central-position=2" for quinphone
# End configuration.
# Usage text for steps/train_deltas.sh (printed verbatim by the echo lines).
echo "Usage: steps/train_deltas.sh "
echo "e.g.: steps/train_deltas.sh 2000 10000 data/train_si84_half data/lang exp/mono_ali exp/tri1"
对特征使用LDA和MLLT进行变换,训练加入LDA和MLLT的三音素模型。
LDA+MLLT refers to the way we transform the features after computing the MFCCs: we splice across several frames, reduce the dimension (to 40 by default) using Linear Discriminant Analysis (LDA), and then later estimate, over multiple iterations, a diagonalizing transform known as MLLT or STC.
详情可参考 http://kaldi-asr.org/doc/transform.html
# Triphone alignment: run steps/align_si.sh on data/mfcc/train with the
# exp/tri1 model directory; exp/tri1_ali is the last argument (presumably the
# output directory). Abort with status 1 on failure.
steps/align_si.sh --nj $n --cmd "$train_cmd" data/mfcc/train data/lang exp/tri1 exp/tri1_ali || exit 1;
# LDA+MLLT training (tri2b): run steps/train_lda_mllt.sh with splice options
# of 3 frames of left and right context and the numeric arguments 2500 and
# 15000, using the exp/tri1_ali alignments; exp/tri2b is the last argument.
steps/train_lda_mllt.sh --cmd "$train_cmd" --splice-opts "--left-context=3 --right-context=3" 2500 15000 data/mfcc/train data/lang exp/tri1_ali exp/tri2b || exit 1;
# Test the tri2b model. The trailing '&' runs the decode in the background,
# so its exit status is not checked on this line.
local/thchs-30_decode.sh --nj $n "steps/decode.sh" exp/tri2b data/mfcc &
# Begin configuration.
# Default option values. NOTE(review): these look like the defaults behind the
# steps/train_lda_mllt.sh invocation in this file -- confirm.
cmd=run.pl
config=
stage=-5
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
realign_iters="10 20 30";
mllt_iters="2 4 6 12";
num_iters=35 # Number of iterations of training
max_iter_inc=25 # Last iteration on which the number of Gaussians is increased.
dim=40
beam=10
retry_beam=40
careful=false
boost_silence=1.0 # Factor by which to boost silence likelihoods in alignment
power=0.25 # Exponent for number of Gaussians according to occurrence counts
randprune=4.0 # This is approximately the ratio by which we will speed up the
# LDA and MLLT calculations via randomized pruning.
splice_opts=
cluster_thresh=-1 # For build-tree: controls the final bottom-up clustering of leaves
norm_vars=false # Deprecated. Prefer --cmvn-opts "--norm-vars=false"
cmvn_opts=
context_opts= # Use "--context-width=5 --central-position=2" for quinphone.
# End configuration.
# LDA+MLLT alignment: run steps/align_si.sh with "--use-graphs true" on
# data/mfcc/train with the exp/tri2b model directory; exp/tri2b_ali is the
# last argument (presumably the output directory). Abort with status 1 on
# failure.
steps/align_si.sh --nj $n --cmd "$train_cmd" --use-graphs true data/mfcc/train data/lang exp/tri2b exp/tri2b_ali || exit 1;
# SAT training (tri3b): run steps/train_sat.sh with the numeric arguments 2500
# and 15000, using the exp/tri2b_ali alignments; exp/tri3b is the last
# argument.
steps/train_sat.sh --cmd "$train_cmd" 2500 15000 data/mfcc/train data/lang exp/tri2b_ali exp/tri3b || exit 1;
# Test the tri3b model with steps/decode_fmllr.sh as the decoder. The trailing
# '&' runs the decode in the background, so its exit status is not checked on
# this line.
local/thchs-30_decode.sh --nj $n "steps/decode_fmllr.sh" exp/tri3b data/mfcc &
# Begin configuration section.
# Default option values. NOTE(review): these look like the defaults behind the
# steps/train_sat.sh invocation in this file -- confirm.
stage=-5
exit_stage=-100 # You can use this to require it to exit at the
# beginning of a specific stage. Not all values are
# supported.
fmllr_update_type=full
cmd=run.pl
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
beam=10
retry_beam=40
careful=false
boost_silence=1.0 # Factor by which to boost silence likelihoods in alignment
context_opts= # e.g. set this to "--context-width 5 --central-position 2" for quinphone.
realign_iters="10 20 30";
fmllr_iters="2 4 6 12";
silence_weight=0.0 # Weight on silence in fMLLR estimation.
num_iters=35 # Number of iterations of training
max_iter_inc=25 # Last iteration on which the number of Gaussians is increased.
power=0.2 # Exponent for number of Gaussians according to occurrence counts
cluster_thresh=-1 # For build-tree: controls the final bottom-up clustering of leaves
phone_map=
train_tree=true
tree_stats_opts=
cluster_phones_opts=
compile_questions_opts=
# End configuration section.
# There are 3 models involved potentially in this script,
# and for a standard, speaker-independent system they will all be the same.
# The "alignment model" is for the 1st-pass decoding and to get the
# Gaussian-level alignments for the "adaptation model" the first time we
# do fMLLR. The "adaptation model" is used to estimate fMLLR transforms
# and to generate state-level lattices. The lattices are then rescored
# with the "final model".
#
# The following table explains where we get these 3 models from.
# Note: $srcdir is one level up from the decoding directory.
#
# Model               Default source:
#
# "alignment model"   $srcdir/final.alimdl              --alignment-model
#                     (or $srcdir/final.mdl if alimdl absent)
# "adaptation model"  $srcdir/final.mdl                 --adapt-model
# "final model"       $srcdir/final.mdl                 --final-model
# SAT alignment: run steps/align_fmllr.sh on data/mfcc/train with the
# exp/tri3b model directory; exp/tri3b_ali is the last argument (presumably
# the output directory). Abort with status 1 on failure.
steps/align_fmllr.sh --nj $n --cmd "$train_cmd" data/mfcc/train data/lang exp/tri3b exp/tri3b_ali || exit 1;
# Quick training (tri4b): run steps/train_quick.sh with the numeric arguments
# 4200 and 40000, using the exp/tri3b_ali alignments; exp/tri4b is the last
# argument.
steps/train_quick.sh --cmd "$train_cmd" 4200 40000 data/mfcc/train data/lang exp/tri3b_ali exp/tri4b || exit 1;
# Test the tri4b model with steps/decode_fmllr.sh as the decoder. The trailing
# '&' runs the decode in the background, so its exit status is not checked on
# this line.
local/thchs-30_decode.sh --nj $n "steps/decode_fmllr.sh" exp/tri4b data/mfcc &
# Begin configuration.
# Default option values. NOTE(review): these look like the defaults behind the
# steps/train_quick.sh invocation in this file -- confirm.
cmd=run.pl
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
realign_iters="10 15"; # Only realign twice.
num_iters=20 # Number of iterations of training
# NOTE(review): spelled "maxiterinc" here, unlike "max_iter_inc" in the other
# configuration sections of this file; confirm which name the script reads.
maxiterinc=15 # Last iteration on which the number of Gaussians is increased.
batch_size=750 # Batch size to use while compiling graphs (memory/speed tradeoff).
beam=10 # Alignment beam.
retry_beam=40
stage=-5
cluster_thresh=-1 # For build-tree: controls the final bottom-up clustering of leaves
# End configuration section.