train_mono.sh
#!/bin/bash
# Copyright 2012  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0

# To be run from ..
# Flat start and monophone training, with delta-delta features.
# This script applies cepstral mean normalization (per speaker).
#
# Positional arguments: <data-dir> <lang-dir> <exp-dir>
# Outputs (under <exp-dir>): num_jobs, cmvn_opts, phones.txt, tree, fsts.JOB.gz,
# ali.JOB.gz, the per-iteration N.mdl / N.occs, and the final.mdl / final.occs
# symlinks, plus logs under <exp-dir>/log.

# Begin configuration section.
nj=4            # Number of parallel jobs.
cmd=run.pl      # Job runner; may carry extra options, so it is expanded unquoted.
# Scaling factors passed to gmm-align-compiled.
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
num_iters=40    # Number of iterations of training
max_iter_inc=30 # Last iter to increase #Gauss on.
totgauss=1000   # Target #Gaussians.
careful=false
boost_silence=1.0 # Factor by which to boost silence likelihoods in alignment
# Iterations on which the data is re-aligned.
realign_iters="1 2 3 4 5 6 7 8 9 10 12 14 16 18 20 23 26 29 32 35 38";
config= # name of config file.
stage=-4
power=0.25 # exponent to determine number of gaussians from occurrence counts
norm_vars=false # deprecated, prefer --cmvn-opts "--norm-vars=false"
cmvn_opts=  # can be used to add extra options to cmvn.
# End configuration section.

echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

if [ $# != 3 ]; then
  echo "Usage: steps/train_mono.sh [options] <data-dir> <lang-dir> <exp-dir>"
  echo " e.g.: steps/train_mono.sh data/train.1k data/lang exp/mono"
  echo "main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config containing options"
  echo "  --nj <nj>                                        # number of parallel jobs"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  exit 1;
fi

data=$1   # e.g. data/waves_train
lang=$2   # e.g. data/lang
dir=$3    # e.g. exp/mono0

# Integer id of the OOV word; used below as the --map-oov argument of sym2int.pl.
oov_sym=$(cat "$lang/oov.int") || exit 1;

mkdir -p "$dir/log"
echo "$nj" > "$dir/num_jobs"

# Split the data directory into $nj parts unless an up-to-date split exists.
sdata=$data/split$nj;
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh "$data" "$nj" || exit 1;

cp "$lang/phones.txt" "$dir" || exit 1;

# Honor the deprecated --norm-vars flag by folding it into the CMVN options.
$norm_vars && cmvn_opts="--norm-vars=true $cmvn_opts"
echo $cmvn_opts > "$dir/cmvn_opts" # keep track of options to CMVN.

# Feature pipeline (a Kaldi rspecifier): apply-cmvn reads utt2spk, cmvn.scp and
# feats.scp of split JOB and pipes into add-deltas.  "JOB" is substituted with
# the job index by $cmd.
feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |"
# Same pipeline pinned to the first split; used only to query the feature dimension.
example_feats=${feats//JOB/1}

echo "$0: Initializing monophone system."

# The phone-set list is mandatory; say so instead of exiting silently.
if [ ! -f "$lang/phones/sets.int" ]; then
  echo "$0: expected file $lang/phones/sets.int to exist" >&2
  exit 1;
fi
shared_phones_opt="--shared-phones=$lang/phones/sets.int"

# Stage -3: flat-start initialization with gmm-init-mono, which writes
# $dir/0.mdl and $dir/tree from the topology and a small subset of features.
if [ $stage -le -3 ]; then
  # Note: JOB=1 just uses the 1st part of the features-- we only need a subset anyway.
  if ! feat_dim=$(feat-to-dim "$example_feats" - 2>/dev/null) || [ -z "$feat_dim" ]; then
    # Re-run without suppressing stderr so the user sees the actual error.
    feat-to-dim "$example_feats" -
    echo "error getting feature dimension"
    exit 1;
  fi
  $cmd JOB=1 $dir/log/init.log \
    gmm-init-mono $shared_phones_opt "--train-feats=$feats subset-feats --n=10 ark:- ark:-|" $lang/topo $feat_dim \
    $dir/0.mdl $dir/tree || exit 1;
fi

# Number of Gaussians in the initial model: last field of the "gaussians" line
# printed by gmm-info.
numgauss=$(gmm-info --print-args=false $dir/0.mdl | grep gaussians | awk '{print $NF}')
if ! [[ $numgauss =~ ^[0-9]+$ ]]; then
  echo "$0: could not read the number of Gaussians from $dir/0.mdl" >&2
  exit 1;
fi
# Per-iteration increment for #Gauss.  With --max-iter-inc 0 the increment is
# never applied (the loop starts at x=1), so avoid the division by zero.
if [ "$max_iter_inc" -gt 0 ]; then
  incgauss=$(( (totgauss - numgauss) / max_iter_inc ))
else
  incgauss=0
fi
echo "目标总高斯数: $totgauss!"
echo "初始化高斯数: $numgauss!"

# Stage -2: compile one training graph per utterance from the tree, the model
# and L.fst; transcripts are mapped to integers with sym2int.pl (OOVs -> $oov_sym).
# Output: $dir/fsts.JOB.gz.
if [ $stage -le -2 ]; then
  echo "$0: Compiling training graphs"
  $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
    compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $dir/0.mdl  $lang/L.fst \
    "ark:sym2int.pl --map-oov $oov_sym -f 2- $lang/words.txt < $sdata/JOB/text|" \
    "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
fi

# Stage -1: no model exists yet that could align the data, so start from an
# equally-spaced alignment (align-equal-compiled) and accumulate statistics from
# it with gmm-acc-stats-ali into $dir/0.JOB.acc.  The alignment is refined by
# the re-alignment passes in the main loop.
if [ $stage -le -1 ]; then
  echo "$0: Aligning data equally (pass 0)"
  $cmd JOB=1:$nj $dir/log/align.0.JOB.log \
    align-equal-compiled "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" ark,t:-  \| \
    gmm-acc-stats-ali --binary=true $dir/0.mdl "$feats" ark:- \
    $dir/0.JOB.acc || exit 1;
fi

# In the following steps, the --min-gaussian-occupancy=3 option is important, otherwise
# we fail to est "rare" phones and later on, they never align properly.

# Stage 0: first model update from the summed pass-0 accumulators -> $dir/1.mdl.
if [ $stage -le 0 ]; then
  gmm-est --min-gaussian-occupancy=3  --mix-up=$numgauss --power=$power \
    $dir/0.mdl "gmm-sum-accs - $dir/0.*.acc|" $dir/1.mdl 2> $dir/log/update.0.log || exit 1;
  rm $dir/0.*.acc
fi

beam=6 # will change to 10 below after 1st pass
# note: using slightly wider beams for WSJ vs. RM.
x=1
# Main training loop.  On each pass:
#   - on iterations listed in $realign_iters, re-align the data with
#     gmm-align-compiled, using a model whose optional-silence phones are
#     boosted by $boost_silence (gmm-boost-silence); --retry-beam is 4x --beam;
#   - accumulate statistics from the current alignments (gmm-acc-stats-ali);
#   - re-estimate the model (gmm-est), mixing up to $numgauss Gaussians;
#   - grow $numgauss by $incgauss until iteration $max_iter_inc.
while [ $x -lt $num_iters ]; do
  echo "$0: Pass $x"
  if [ $stage -le $x ]; then
    if echo $realign_iters | grep -w $x >/dev/null; then
      echo "$0: Aligning data"
      mdl="gmm-boost-silence --boost=$boost_silence $(cat $lang/phones/optional_silence.csl) $dir/$x.mdl - |"
      $cmd JOB=1:$nj $dir/log/align.$x.JOB.log \
        gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$((beam * 4)) --careful=$careful "$mdl" \
        "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" "ark,t:|gzip -c >$dir/ali.JOB.gz" \
        || exit 1;
    fi
    $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
      gmm-acc-stats-ali  $dir/$x.mdl "$feats" "ark:gunzip -c $dir/ali.JOB.gz|" \
      $dir/$x.JOB.acc || exit 1;

    $cmd $dir/log/update.$x.log \
      gmm-est --write-occs=$dir/$((x + 1)).occs --mix-up=$numgauss --power=$power $dir/$x.mdl \
      "gmm-sum-accs - $dir/$x.*.acc|" $dir/$((x + 1)).mdl || exit 1;
    # Intermediate files of this pass are no longer needed; some may not exist.
    rm $dir/$x.mdl $dir/$x.*.acc $dir/$x.occs 2>/dev/null
  fi
  if [ $x -le $max_iter_inc ]; then
    numgauss=$((numgauss + incgauss))
  fi
  beam=10
  x=$((x + 1))
done

# Replace any stale final.{mdl,occs} with symlinks to the last iteration.
# Bail out if the cd fails so that rm/ln never run in the wrong directory.
( cd "$dir" || exit 1
  rm final.{mdl,occs} 2>/dev/null
  ln -s $x.mdl final.mdl
  ln -s $x.occs final.occs ) || exit 1;

# Diagnostics and summaries.
steps/diagnostic/analyze_alignments.sh --cmd "$cmd" $lang $dir
utils/summarize_warnings.pl $dir/log

steps/info/gmm_dir_info.pl $dir

echo "$0: Done training monophone system in $dir"

exit 0

# example of showing the alignments:
# show-alignments data/lang/phones.txt $dir/30.mdl "ark:gunzip -c $dir/ali.0.gz|" | head -4
prepare_lang.sh
#!/bin/bash
# Copyright 2012-2013  Johns Hopkins University (Author: Daniel Povey);
#                      Arnab Ghoshal
#                2014  Guoguo Chen
#                2015  Hainan Xu
#                2016  FAU Erlangen (Author: Axel Horndasch)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# This script prepares a directory such as data/lang/, in the standard format,
# given a source directory containing a dictionary lexicon.txt in a form like:
# word phone1 phone2 ... phoneN
# per line (alternate prons would be separate lines), or a dictionary with probabilities
# called lexiconp.txt in a form:
# word pron-prob phone1 phone2 ... phoneN
# (with 0.0 < pron-prob <= 1.0); note: if lexiconp.txt exists, we use it even if
# lexicon.txt exists.
# and also files silence_phones.txt, nonsilence_phones.txt, optional_silence.txt
# and extra_questions.txt
# Here, silence_phones.txt and nonsilence_phones.txt are lists of silence and
# non-silence phones respectively (where silence includes various kinds of
# noise, laugh, cough, filled pauses etc., and nonsilence phones includes the
# "real" phones.)
# In each line of those files is a list of phones, and the phones on each line
# are assumed to correspond to the same "base phone", i.e. they will be
# different stress or tone variations of the same basic phone.
# The file "optional_silence.txt" contains just a single phone (typically SIL)
# which is used for optional silence in the lexicon.
# extra_questions.txt might be empty; typically will consist of lists of phones,
# all members of each list with the same stress or tone; and also possibly a
# list for the silence phones.  This will augment the automatically generated
# questions (note: the automatically generated ones will treat all the
# stress/tone versions of a phone the same, so will not "get to ask" about
# stress or tone).

# This script adds word-position-dependent phones and constructs a host of other
# derived files, that go in data/lang/.

# Begin configuration section.
num_sil_states=5      # number of HMM states in silence models
num_nonsil_states=3   # number of HMM states in non-silence models
position_dependent_phones=true
# position_dependent_phones is false also when position dependent phones and word_boundary.txt
# have been generated by another source
# When true, each phone is expanded into _B/_E/_I/_S (begin, end, internal,
# singleton) word-position variants.
share_silence_phones=false  # if true, then share pdfs of different silence
                            # phones together (they are then written on one
                            # line of roots.txt; only the transitions differ).
sil_prob=0.5
unk_fst=        # if you want to model the unknown-word (<oov-dict-entry>)
                # with a phone-level LM as created by make_unk_lm.sh,
                # provide the text-form FST via this flag, e.g. <work-dir>/unk_fst.txt
                # where <work-dir> was the 2nd argument of make_unk_lm.sh.
phone_symbol_table=              # if set, use a specified phones.txt file.
extra_word_disambig_syms=        # if set, add disambiguation symbols from this file (one per line)
                                 # to phones/disambig.txt, phones/wdisambig.txt and words.txt
num_extra_phone_disambig_syms=1 # Standard one phone disambiguation symbol is used for optional silence.
                                # Increasing this number does not harm, but is only useful if you later
                                # want to introduce this labels to L_disambig.fst
# end configuration sections

echo "$0 $@"  # Print the command line for logging

. utils/parse_options.sh || exit 1;

if [ $# -ne 4 ]; then
  echo "usage: utils/prepare_lang.sh <dict-src-dir> <oov-dict-entry> <tmp-dir> <lang-dir>"
  echo "e.g.: utils/prepare_lang.sh data/local/dict <SPOKEN_NOISE> data/local/lang data/lang"
  echo "<dict-src-dir> should contain the following files:"
  echo " extra_questions.txt  lexicon.txt nonsilence_phones.txt  optional_silence.txt  silence_phones.txt"
  echo "See http://kaldi-asr.org/doc/data_prep.html#data_prep_lang_creating for more info."
  echo "options: "
  echo "     --num-sil-states <number of states>             # default: 5, #states in silence models."
  echo "     --num-nonsil-states <number of states>          # default: 3, #states in non-silence models."
  echo "     --position-dependent-phones (true|false)        # default: true; if true, use _B, _E, _S & _I"
  echo "                                                     # markers on phones to indicate word-internal positions. "
  echo "     --share-silence-phones (true|false)             # default: false; if true, share pdfs of "
  echo "                                                     # all silence phones. "
  echo "     --sil-prob <probability of silence>             # default: 0.5 [must have 0 <= silprob < 1]"
  echo "     --phone-symbol-table <filename>                 # default: \"\"; if not empty, use the provided "
  echo "                                                     # phones.txt as phone symbol table. This is useful "
  echo "                                                     # if you use a new dictionary for the existing setup."
  echo "     --unk-fst <text-fst>                            # default: none.  e.g. exp/make_unk_lm/unk_fst.txt."
  echo "                                                     # This is for if you want to model the unknown word"
  echo "                                                     # via a phone-level LM rather than a special phone"
  echo "                                                     # (this should be more useful for test-time than train-time)."
  echo "     --extra-word-disambig-syms <filename>           # default: \"\"; if not empty, add disambiguation symbols"
  echo "                                                     # from this file (one per line) to phones/disambig.txt,"
  echo "                                                     # phones/wdisambig.txt and words.txt"
  exit 1;
fi

srcdir=$1     # e.g. data/local/dict
oov_word=$2   # dictionary entry that OOV words are mapped to
tmpdir=$3     # e.g. data/local/lang
dir=$4        # e.g. data/lang
mkdir -p $dir $tmpdir $dir/phones

# Silence probabilities are used iff the dictionary provides lexiconp_silprob.txt.
silprob=false
[ -f $srcdir/lexiconp_silprob.txt ] && silprob=true

[ -f path.sh ] && . ./path.sh

if ! utils/validate_dict_dir.pl $srcdir; then
  echo "*Error validating directory $srcdir*"
  exit 1;
fi

# Derive whichever of lexicon.txt / lexiconp.txt is missing from the other one.
if [[ ! -f $srcdir/lexicon.txt ]]; then
  echo "**Creating $srcdir/lexicon.txt from $srcdir/lexiconp.txt"
  perl -ape 's/(\S+\s+)\S+\s+(.+)/$1$2/;' < $srcdir/lexiconp.txt > $srcdir/lexicon.txt || exit 1;
fi
if [[ ! -f $srcdir/lexiconp.txt ]]; then
  echo "**Creating $srcdir/lexiconp.txt from $srcdir/lexicon.txt"
  perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < $srcdir/lexicon.txt > $srcdir/lexiconp.txt || exit 1;
fi

if [ ! -z "$unk_fst" ] && [ ! -f "$unk_fst" ]; then
  echo "$0: expected --unk-fst $unk_fst to exist as a file"
  exit 1
fi

if ! utils/validate_dict_dir.pl $srcdir >&/dev/null; then
  utils/validate_dict_dir.pl $srcdir  # show the output.
  echo "Validation failed (second time)"
  exit 1;
fi

# phones.txt file provided, we will do some sanity check here.
if [[ ! -z $phone_symbol_table ]]; then
  # Checks if we have position dependent phones: count distinct symbols with
  # and without the _B/_I/_E/_S suffix (disambiguation symbols excluded).
  n1=$(cat $phone_symbol_table | grep -v -E "^#[0-9]+$" | cut -d' ' -f1 | sort -u | wc -l)
  n2=$(cat $phone_symbol_table | grep -v -E "^#[0-9]+$" | cut -d' ' -f1 | sed 's/_[BIES]$//g' | sort -u | wc -l)
  if $position_dependent_phones && [ $n1 -eq $n2 ]; then
    echo "$0: Position dependent phones requested, but not in provided phone symbols"
    exit 1;
  fi
  if ! $position_dependent_phones && [ $n1 -ne $n2 ]; then
    echo "$0: Position dependent phones not requested, but appear in the provided phones.txt"
    exit 1;
  fi
  # Checks if the phone sets match.
  cat $srcdir/{,non}silence_phones.txt | awk -v f=$phone_symbol_table '
  BEGIN { while ((getline < f) > 0) { sub(/_[BEIS]$/, "", $1); phones[$1] = 1; }}
  { for (x = 1; x <= NF; ++x) { if (!($x in phones)) {
      print "Phone appears in the lexicon but not in the provided phones.txt: "$x; exit 1; }}}' || exit 1;
fi

# In case there are extra word-level disambiguation symbols we need
# to make sure that all symbols in the provided file are valid.
if [ ! -z "$extra_word_disambig_syms" ]; then
  if ! utils/lang/validate_disambig_sym_file.pl --allow-numeric "false" $extra_word_disambig_syms; then
    echo "$0: Validation of disambiguation file \"$extra_word_disambig_syms\" failed."
    exit 1;
  fi
fi

if $position_dependent_phones; then
  # Create $tmpdir/lexiconp.txt from $srcdir/lexiconp.txt (or
  # $tmpdir/lexiconp_silprob.txt from $srcdir/lexiconp_silprob.txt) by
  # adding the markers _B, _E, _S, _I depending on word position.
  # In this recipe, these markers apply to silence also.
  # Do this starting from lexiconp.txt only.
  if "$silprob"; then
    perl -ane '@A=split(" ",$_); $w = shift @A; $p = shift @A; $silword_p = shift @A;
              $wordsil_f = shift @A; $wordnonsil_f = shift @A; @A>0||die;
         if(@A==1) { print "$w $p $silword_p $wordsil_f $wordnonsil_f $A[0]_S\n"; }
         else { print "$w $p $silword_p $wordsil_f $wordnonsil_f $A[0]_B ";
         for($n=1;$n<@A-1;$n++) { print "$A[$n]_I "; } print "$A[$n]_E\n"; } ' \
                < $srcdir/lexiconp_silprob.txt > $tmpdir/lexiconp_silprob.txt || exit 1;
  else
    perl -ane '@A=split(" ",$_); $w = shift @A; $p = shift @A; @A>0||die;
         if(@A==1) { print "$w $p $A[0]_S\n"; } else { print "$w $p $A[0]_B ";
         for($n=1;$n<@A-1;$n++) { print "$A[$n]_I "; } print "$A[$n]_E\n"; } ' \
         < $srcdir/lexiconp.txt > $tmpdir/lexiconp.txt || exit 1;
  fi

  # create $tmpdir/phone_map.txt
  # this has the format (on each line)
  # <original phone> <version 1 of original phone> <version 2> ...
  # where the versions depend on the position of the phone within a word.
  # For instance, we'd have:
  # AA AA_B AA_E AA_I AA_S
  # for (B)egin, (E)nd, (I)nternal and (S)ingleton
  # and in the case of silence
  # SIL SIL SIL_B SIL_E SIL_I SIL_S
  # [because SIL on its own is one of the variants; this is for when it doesn't
  #  occur inside a word but as an option in the lexicon.]

  # This phone map expands the phone lists into all the word-position-dependent
  # versions of the phone lists.
  # (The doubled "" in the silence loop is deliberate: the first is the map
  # key, the second is the bare variant.)
  cat <(set -f; for x in $(cat $srcdir/silence_phones.txt); do for y in "" "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \
    <(set -f; for x in $(cat $srcdir/nonsilence_phones.txt); do for y in "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \
    > $tmpdir/phone_map.txt
else
  if "$silprob"; then
    cp $srcdir/lexiconp_silprob.txt $tmpdir/lexiconp_silprob.txt
  else
    cp $srcdir/lexiconp.txt $tmpdir/lexiconp.txt
  fi

  # Identity phone map: every phone maps to itself.
  cat $srcdir/silence_phones.txt $srcdir/nonsilence_phones.txt | \
    awk '{for(n=1;n<=NF;n++) print $n; }' > $tmpdir/phones
  paste -d' ' $tmpdir/phones $tmpdir/phones > $tmpdir/phone_map.txt
fi

mkdir -p $dir/phones  # various sets of phones...

# Sets of phones for use in clustering, and making monophone systems.

if $share_silence_phones; then
  # build a roots file that will force all the silence phones to share the
  # same pdf's. [three distinct states, only the transitions will differ.]
  # 'shared'/'not-shared' means, do we share the 3 states of the HMM
  # in the same tree-root?
  # Sharing across models(phones) is achieved by writing several phones
  # into one line of roots.txt (shared/not-shared doesn't affect this).
  # 'not-shared not-split' means we have separate tree roots for the 3 states,
  # but we never split the tree so they remain stumps,
  # so all phones in the line correspond to the same model.

  cat $srcdir/silence_phones.txt | awk '{printf("%s ", $0); } END{printf("\n");}' | cat - $srcdir/nonsilence_phones.txt | \
    utils/apply_map.pl $tmpdir/phone_map.txt > $dir/phones/sets.txt
  cat $dir/phones/sets.txt | \
    awk '{if(NR==1) print "not-shared", "not-split", $0; else print "shared", "split", $0;}' > $dir/phones/roots.txt
else
  # different silence phones will have different GMMs.  [note: here, all "shared split" means
  # is that we may have one GMM for all the states, or we can split on states.  because they're
  # context-independent phones, they don't see the context.]
  cat $srcdir/{,non}silence_phones.txt | utils/apply_map.pl $tmpdir/phone_map.txt > $dir/phones/sets.txt
  cat $dir/phones/sets.txt | awk '{print "shared", "split", $0;}' > $dir/phones/roots.txt
fi

cat $srcdir/silence_phones.txt | utils/apply_map.pl $tmpdir/phone_map.txt | \
  awk '{for(n=1;n<=NF;n++) print $n;}' > $dir/phones/silence.txt
cat $srcdir/nonsilence_phones.txt | utils/apply_map.pl $tmpdir/phone_map.txt | \
  awk '{for(n=1;n<=NF;n++) print $n;}' > $dir/phones/nonsilence.txt
cp $srcdir/optional_silence.txt $dir/phones/optional_silence.txt
cp $dir/phones/silence.txt $dir/phones/context_indep.txt

# if extra_questions.txt is empty, it's OK.
cat $srcdir/extra_questions.txt 2>/dev/null | utils/apply_map.pl $tmpdir/phone_map.txt \
  >$dir/phones/extra_questions.txt

# Want extra questions about the word-start/word-end stuff. Make it separate for
# silence and non-silence. Probably doesn't matter, as silence will rarely
# be inside a word.
if $position_dependent_phones; then
  for suffix in _B _E _I _S; do
    (set -f; for x in $(cat $srcdir/nonsilence_phones.txt); do echo -n "$x$suffix "; done; echo) >>$dir/phones/extra_questions.txt
  done
  for suffix in "" _B _E _I _S; do
    (set -f; for x in $(cat $srcdir/silence_phones.txt); do echo -n "$x$suffix "; done; echo) >>$dir/phones/extra_questions.txt
  done
fi

# add_lex_disambig.pl is responsible for adding disambiguation symbols to
# the lexicon, for telling us how many disambiguation symbols it used,
# and and also for modifying the unknown-word's pronunciation (if the
# --unk-fst was provided) to the sequence "#1 #2 #3", and reserving those
# disambig symbols for that purpose.
# The #2 will later be replaced with the actual unk model.  The reason
# for the #1 and the #3 is for disambiguation and also to keep the
# FST compact.  If we didn't have the #1, we might have a different copy of
# the unk-model FST, or at least some of its arcs, for each start-state from
# which an <unk> transition comes (instead of per end-state, which is more compact);
# and adding the #3 prevents us from potentially having 2 copies of the unk-model
# FST due to the optional-silence [the last phone of any word gets 2 arcs].
if [ ! -z "$unk_fst" ]; then  # if the --unk-fst option was provided...
  if "$silprob"; then
    utils/lang/internal/modify_unk_pron.py $tmpdir/lexiconp_silprob.txt "$oov_word" || exit 1
  else
    utils/lang/internal/modify_unk_pron.py $tmpdir/lexiconp.txt "$oov_word" || exit 1
  fi
  unk_opt="--first-allowed-disambig 4"
else
  unk_opt=
fi

# add_lex_disambig.pl prints the number of disambiguation symbols it used.
# Stop on failure: an empty result would otherwise silently count as zero.
if "$silprob"; then
  ndisambig=$(utils/add_lex_disambig.pl $unk_opt --pron-probs --sil-probs $tmpdir/lexiconp_silprob.txt $tmpdir/lexiconp_silprob_disambig.txt) || exit 1;
else
  ndisambig=$(utils/add_lex_disambig.pl $unk_opt --pron-probs $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt) || exit 1;
fi
ndisambig=$((ndisambig + num_extra_phone_disambig_syms)); # add (at least) one disambig symbol for silence in lexicon FST.
echo $ndisambig > $tmpdir/lex_ndisambig

# Format of lexiconp_disambig.txt:
# !SIL	1.0   SIL_S
# <SPOKEN_NOISE>	1.0   SPN_S #1
# <UNK>	1.0  SPN_S #2
# <NOISE>	1.0  NSN_S
# !EXCLAMATION-POINT	1.0  EH2_B K_I S_I K_I L_I AH0_I M_I EY1_I SH_I AH0_I N_I P_I OY2_I N_I T_E

# Phone-level disambiguation symbols #0 .. #$ndisambig.
for ((n = 0; n <= ndisambig; n++)); do echo "#$n"; done >$dir/phones/disambig.txt

# In case there are extra word-level disambiguation symbols they also
# need to be added to the list of phone-level disambiguation symbols.
if [ ! -z "$extra_word_disambig_syms" ]; then
  # We expect a file containing valid word-level disambiguation symbols.
  cat $extra_word_disambig_syms | awk '{ print $1 }' >> $dir/phones/disambig.txt
fi

# Create phone symbol table.  <eps> always comes first (id 0 in the
# generated table).
if [[ ! -z $phone_symbol_table ]]; then
  # Reuse the ids of the provided table; disambiguation symbols are numbered
  # consecutively starting at the id that "#0" has in that table.
  start_symbol=$(grep \#0 $phone_symbol_table | awk '{print $2}')
  echo "<eps>" | cat - $dir/phones/{silence,nonsilence}.txt | awk -v f=$phone_symbol_table '
  BEGIN { while ((getline < f) > 0) { phones[$1] = $2; }} { print $1" "phones[$1]; }' | sort -k2 -g |\
    cat - <(cat $dir/phones/disambig.txt | awk -v x=$start_symbol '{n=x+NR-1; print $1, n;}') > $dir/phones.txt
else
  echo "<eps>" | cat - $dir/phones/{silence,nonsilence,disambig}.txt | \
    awk '{n=NR-1; print $1, n;}' > $dir/phones.txt
fi

# Create a file that describes the word-boundary information for
# each phone.  5 categories.
if $position_dependent_phones; then
  cat $dir/phones/{silence,nonsilence}.txt | \
    awk '/_I$/{print $1, "internal"; next;} /_B$/{print $1, "begin"; next; }
         /_S$/{print $1, "singleton"; next;} /_E$/{print $1, "end"; next; }
         {print $1, "nonword";} ' > $dir/phones/word_boundary.txt
else
  # word_boundary.txt might have been generated by another source
  [ -f $srcdir/word_boundary.txt ] && cp $srcdir/word_boundary.txt $dir/phones/word_boundary.txt
fi

# Create word symbol table.
# <s> and </s> are only needed due to the need to rescore lattices with
# ConstArpaLm format language model. They do not normally appear in G.fst or
# L.fst.

if "$silprob"; then
  # remove the silprob (fields 3-5) to obtain a plain lexiconp.txt
  cat $tmpdir/lexiconp_silprob.txt |\
    awk '{
      for(i=1; i<=NF; i++) {
        if(i!=3 && i!=4 && i!=5) printf("%s\t", $i);
        if(i==NF) print "";
      }
    }' > $tmpdir/lexiconp.txt || exit 1;
fi

cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort | uniq  | awk '
  BEGIN {
    print "<eps> 0";
  }
  {
    if ($1 == "<s>") {
      print "<s> is in the vocabulary!" | "cat 1>&2"
      exit 1;
    }
    if ($1 == "</s>") {
      print "</s> is in the vocabulary!" | "cat 1>&2"
      exit 1;
    }
    printf("%s %d\n", $1, NR);
  }
  END {
    printf("#0 %d\n", NR+1);
    printf("<s> %d\n", NR+2);
    printf("</s> %d\n", NR+3);
  }' > $dir/words.txt || exit 1;

# In case there are extra word-level disambiguation symbols they also
# need to be added to words.txt
if [ ! -z "$extra_word_disambig_syms" ]; then
  # Since words.txt already exists, we need to extract the current word count.
  word_count=$(tail -n 1 $dir/words.txt | awk '{ print $2 }')

  # We expect a file containing valid word-level disambiguation symbols.
  # The list of symbols is attached to the current words.txt (including
  # a numeric identifier for each symbol).
  cat $extra_word_disambig_syms | \
    awk -v WC=$word_count '{ printf("%s %d\n", $1, ++WC); }' >> $dir/words.txt || exit 1;
fi

# format of $dir/words.txt:
#<eps> 0
#!EXCLAMATION-POINT 1
#!SIL 2
#"CLOSE-QUOTE 3
#...

silphone=$(cat $srcdir/optional_silence.txt) || exit 1;
if [ -z "$silphone" ]; then
  echo "You have no optional-silence phone; it is required in the current scripts"
  echo "but you may use the option --sil-prob 0.0 to stop it being used."
  exit 1;
fi

# create $dir/phones/align_lexicon.{txt,int}.
# This is the method we use for lattice word alignment if we are not
# using word-position-dependent phones.

# First remove pron-probs from the lexicon.
perl -ape 's/(\S+\s+)\S+\s+(.+)/$1$2/;' <$tmpdir/lexiconp.txt >$tmpdir/align_lexicon.txt

# Note: here, $silphone will have no suffix e.g. _S because it occurs as optional-silence,
# and is not part of a word.
[ ! -z "$silphone" ] && echo "<eps> $silphone" >> $tmpdir/align_lexicon.txt

# Duplicate the word field, then sort and de-duplicate.
cat $tmpdir/align_lexicon.txt | \
  perl -ane '@A = split; print $A[0], " ", join(" ", @A), "\n";' | sort | uniq > $dir/phones/align_lexicon.txt

# create phones/align_lexicon.int
cat $dir/phones/align_lexicon.txt | utils/sym2int.pl -f 3- $dir/phones.txt | \
  utils/sym2int.pl -f 1-2 $dir/words.txt > $dir/phones/align_lexicon.int

# Create the basic L.fst without disambiguation symbols, for use
# in training.

if "$silprob"; then
  # Add silence probabilities (modlels the prob. of silence before and after each
  # word).  On some setups this helps a bit.  See utils/dict_dir_add_pronprobs.sh
  # and where it's called in the example scripts (run.sh).
  utils/make_lexicon_fst_silprob.pl $tmpdir/lexiconp_silprob.txt $srcdir/silprob.txt $silphone "<eps>" | \
     fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \
     --keep_isymbols=false --keep_osymbols=false |   \
     fstarcsort --sort_type=olabel > $dir/L.fst || exit 1;
else
  utils/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp.txt $sil_prob $silphone | \
    fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \
    --keep_isymbols=false --keep_osymbols=false | \
     fstarcsort --sort_type=olabel > $dir/L.fst || exit 1;
fi

# The file oov.txt contains a word that we will map any OOVs to during
# training.
echo "$oov_word" > $dir/oov.txt || exit 1;
cat $dir/oov.txt | utils/sym2int.pl $dir/words.txt >$dir/oov.int || exit 1;
# integer version of oov symbol, used in some scripts.

# the file wdisambig.txt contains a (line-by-line) list of the text-form of the
# disambiguation symbols that are used in the grammar and passed through by the
# lexicon.  At this stage it's hardcoded as '#0', but we're laying the groundwork
# for more generality (which probably would be added by another script).
# wdisambig_words.int contains the corresponding list interpreted by the
# symbol table words.txt, and wdisambig_phones.int contains the corresponding
# list interpreted by the symbol table phones.txt.
echo '#0' >$dir/phones/wdisambig.txt

# In case there are extra word-level disambiguation symbols they need
# to be added to the existing word-level disambiguation symbols file.
if [ ! -z "$extra_word_disambig_syms" ]; then
  # We expect a file containing valid word-level disambiguation symbols.
  # The regular expression for awk is just a paranoia filter (e.g. for empty lines).
  cat $extra_word_disambig_syms | awk '{ print $1 }' >> $dir/phones/wdisambig.txt
fi

utils/sym2int.pl $dir/phones.txt <$dir/phones/wdisambig.txt >$dir/phones/wdisambig_phones.int
utils/sym2int.pl $dir/words.txt <$dir/phones/wdisambig.txt >$dir/phones/wdisambig_words.int

# Create these lists of phones in colon-separated integer list form too,
# for purposes of being given to programs as command-line options.
for f in silence nonsilence optional_silence disambig context_indep; do
  utils/sym2int.pl $dir/phones.txt <$dir/phones/$f.txt >$dir/phones/$f.int || exit 1;
  utils/sym2int.pl $dir/phones.txt <$dir/phones/$f.txt | \
   awk '{printf(":%d", $1);} END{printf "\n"}' | sed s/:// > $dir/phones/$f.csl || exit 1;
done

for x in sets extra_questions; do
  utils/sym2int.pl $dir/phones.txt <$dir/phones/$x.txt > $dir/phones/$x.int || exit 1;
done

utils/sym2int.pl -f 3- $dir/phones.txt <$dir/phones/roots.txt \
   > $dir/phones/roots.int || exit 1;

if [ -f $dir/phones/word_boundary.txt ]; then
  utils/sym2int.pl -f 1 $dir/phones.txt <$dir/phones/word_boundary.txt \
    > $dir/phones/word_boundary.int || exit 1;
fi

silphonelist=$(cat $dir/phones/silence.csl)
nonsilphonelist=$(cat $dir/phones/nonsilence.csl)

# Note: it's OK, after generating the 'lang' directory, to overwrite the topo file
# with another one of your choice if the 'topo' file you want can't be generated by
# utils/gen_topo.pl.  We do this in the 'chain' recipes.  Of course, the 'topo' file
# should cover all the phones.  Try running utils/validate_lang.pl to check that
# everything is OK after modifying the topo file.
utils/gen_topo.pl $num_nonsil_states $num_sil_states $nonsilphonelist $silphonelist >$dir/topo

# Create the lexicon FST with disambiguation symbols, and put it in lang_test.
# There is an extra step where we create a loop to "pass through" the
# disambiguation symbols from G.fst.

if "$silprob"; then
  utils/make_lexicon_fst_silprob.pl $tmpdir/lexiconp_silprob_disambig.txt $srcdir/silprob.txt $silphone '#'$ndisambig | \
     fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \
     --keep_isymbols=false --keep_osymbols=false |   \
     fstaddselfloops  $dir/phones/wdisambig_phones.int $dir/phones/wdisambig_words.int | \
     fstarcsort --sort_type=olabel > $dir/L_disambig.fst || exit 1;
else
  utils/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp_disambig.txt $sil_prob $silphone '#'$ndisambig | \
     fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \
     --keep_isymbols=false --keep_osymbols=false |   \
     fstaddselfloops  $dir/phones/wdisambig_phones.int $dir/phones/wdisambig_words.int | \
     fstarcsort --sort_type=olabel > $dir/L_disambig.fst || exit 1;
fi

if [ ! -z "$unk_fst" ]; then
  utils/lang/internal/apply_unk_lm.sh $unk_fst $dir || exit 1

  if ! $position_dependent_phones; then
    echo "$0: warning: you are using the --unk-fst option and setting --position-dependent-phones false."
    echo " ... this will make it impossible to properly work out the word boundaries after"
    echo " ... decoding; quite a few scripts will not work as a result, and many scoring scripts"
    echo " ... will die."
    sleep 4
  fi
fi

echo "$(basename $0): validating output directory"
if ! utils/validate_lang.pl $dir; then
  echo "$(basename $0): error validating output"
  exit 1;
fi

exit 0;
run.sh
#!/bin/bash
# Top-level recipe: prepares data and the lang directory, extracts MFCC + CMVN
# features, trains a monophone system, builds the decoding graph, decodes the
# test set and prints the best WER per decode directory.
# Every stage aborts the recipe on failure so that later stages never run on
# missing or partial outputs.

train_cmd="utils/run.pl"
decode_cmd="utils/run.pl"

# The data is already available locally, so the download step is disabled.
#if [ ! -d waves_yesno ]; then
#  wget http://www.openslr.org/resources/1/waves_yesno.tar.gz || exit 1;
#  # was:
#  # wget http://sourceforge.net/projects/kaldi/files/waves_yesno.tar.gz || exit 1;
#  tar -xvzf waves_yesno.tar.gz || exit 1;
#fi

train_yesno=waves_train      # name of the training set under data/
test_base_name=waves_test    # name of the test set under data/

# Dictionary entry that out-of-vocabulary words are mapped to.  It must not be
# empty: prepare_lang.sh writes it to oov.txt/oov.int, and train_mono.sh passes
# the integer form to "sym2int.pl --map-oov".
# NOTE(review): "<SIL>" is the entry used by the yesno recipe this script is
# derived from -- confirm it exists in the lexicon written by local/prepare_dict.sh.
oov_word="<SIL>"

# Start from scratch: remove the outputs of any previous run.
rm -rf data exp mfcc

# Data preparation
# The local/ scripts below had to be rewritten for this data set: the
# directory structure and file names differ, and the dictionary contains
# 10 words, not 2.
local/prepare_data.sh Nestle || exit 1
local/prepare_dict.sh || exit 1
# Build the lang directory (lexicon FST etc.) from the dictionary.
utils/prepare_lang.sh --position-dependent-phones false --sil-prob 0.8 \
  --share-silence-phones true --num-sil-states 5 --num-nonsil-states 4 \
  data/local/dict "$oov_word" data/local/lang data/lang || exit 1
# Prepare the language model.
local/prepare_lm.sh || exit 1
echo "Data Prepraration finish!"

# Feature extraction
for x in $test_base_name $train_yesno; do
  steps/make_mfcc.sh --nj 8 data/$x exp/make_mfcc/$x mfcc || exit 1
  steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x mfcc || exit 1
  utils/fix_data_dir.sh data/$x || exit 1
done
echo "Feature extraction finish!"

# Mono training
steps/train_mono.sh --nj 8 --cmd "$train_cmd" \
  --totgauss 800 \
  data/$train_yesno data/lang exp/mono0 || exit 1
echo "Mono training finish!"

# Graph compilation: combine the language-model FST with the acoustic model
# into the full recognition network.
utils/mkgraph.sh data/lang_test_tg exp/mono0 exp/mono0/graph_tgpr || exit 1
echo "Graph compilation finish!"

# Decoding
steps/decode.sh --nj 1 --cmd "$decode_cmd" \
  exp/mono0/graph_tgpr data/$test_base_name exp/mono0/decode_$test_base_name || exit 1

# Report the best WER found among the wer_* files of each decode directory.
for x in exp/*/decode*; do [ -d $x ] && echo $x && grep WER $x/wer_* | utils/best_wer.sh; done