make_fbank.sh脚本使用方式:
make_fbank.sh [options] <data-dir> [<log-dir> [<fbank-dir>] ]
其中data-dir代表了原始音频文件路径,log-dir代表提取特征的日志文件的路径,fbank-dir代表声学特征提取之后存放的路径。选项options中可以指定特征提取的配置文件,以及并发数目,例如nj为40时,相当于把原来的数据平均分为40份,并行地同时提取特征。
具体的make_fbank.sh代码如下:
#!/bin/bash
# Copyright 2012-2016 Karel Vesely
# Copyright 2012-2016 Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0
# To be run from .. (one directory up from here)
# see ../run.sh for example
# Begin configuration section.
# Defaults only: parse_options.sh (sourced below) presumably lets the matching
# command-line options (--nj, --cmd, --fbank-config, ...) override them.
nj=4 # number of parallel jobs the input is split into
cmd=run.pl # launcher used to run the parallel extraction jobs
fbank_config=conf/fbank.conf # config file handed to compute-fbank-feats via --config
compress=true # forwarded to copy-feats as --compress
write_utt2num_frames=true # If true writes utt2num_frames.
write_utt2dur=true # if true writes duration of each utterance
# End configuration section.
echo "$0 $@" # Print the command line for logging.
if [ -f path.sh ]; then . ./path.sh; fi # source ./path.sh when present (presumably sets up the environment)
. parse_options.sh || exit 1; # parse the command-line options; abort if that fails
# Print the usage message to stderr and abort unless between one and three
# positional arguments remain after option parsing.
if [ $# -lt 1 ] || [ $# -gt 3 ]; then
  cat >&2 <<EOF
Usage: $0 [options] <data-dir> [<log-dir> [<fbank-dir>] ]
 e.g.: $0 data/train
Note: <log-dir> defaults to <data-dir>/log, and
      <fbank-dir> defaults to <data-dir>/data
Options:
  --fbank-config <config-file>         # config passed to compute-fbank-feats.
  --nj <nj>                            # number of parallel jobs.
  --cmd <run.pl|queue.pl <queue opts>> # how to run jobs.
  --write-utt2num-frames <true|false>  # If true, write utt2num_frames file.
  --write-utt2dur <true|false>         # If true, write utt2dur file.
EOF
  exit 1;
fi
# Positional arguments: <data-dir> [<log-dir> [<fbank-dir>]].
# ${N-default} falls back only when argument N was not supplied at all, so an
# explicitly passed (even empty) value is kept as given.
data=$1
logdir=${2-$data/log}
fbankdir=${3-$data/data}
# Make $fbankdir an absolute pathname: a value that does not start with "/"
# is prefixed with the current working directory.
fbankdir=$(perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $fbankdir ${PWD})

# The last component of the data directory is used as part of the names of
# the archives and log files.
name=$(basename $data)

# Create the feature and log directories; stop if either cannot be created.
{ mkdir -p $fbankdir && mkdir -p $logdir; } || exit 1
# An existing regular file feats.scp is moved into $data/.backup instead of
# being overwritten further down.
if [ -f $data/feats.scp ]; then
  mkdir -p $data/.backup
  echo "$0: moving $data/feats.scp to $data/.backup"
  mv $data/feats.scp $data/.backup
fi

# wav.scp is the list of recordings that drives the extraction (expected to
# have been written by the data-preparation step).
scp=$data/wav.scp
required="$scp $fbank_config"

# Abort if the recordings list or the fbank config is not a regular file.
for needed in $required; do
  [ ! -f $needed ] && { echo "$0: no such file $needed"; exit 1; }
done

# Validate the data directory; text and feature files are not required here.
utils/validate_data_dir.sh --no-text --no-feats $data || exit 1
# Optional VTLN warp factors: $data/spk2warp (used together with
# $data/utt2spk) takes precedence over $data/utt2warp.  vtln_opts is not
# assigned here when neither file exists.
if [ -f $data/spk2warp ]; then
  echo "$0 [info]: using VTLN warp factors from $data/spk2warp"
  vtln_opts="--vtln-map=ark:$data/spk2warp --utt2spk=ark:$data/utt2spk"
elif [ -f $data/utt2warp ]; then
  echo "$0 [info]: using VTLN warp factors from $data/utt2warp"
  vtln_opts="--vtln-map=ark:$data/utt2warp"
fi

# One call per job archive.  Per the original note this does nothing unless
# $fbankdir/storage/ exists; see utils/create_data_link.pl for more info.
for job in $(seq $nj); do
  utils/create_data_link.pl $fbankdir/raw_fbank_$name.$job.ark
done

# Optional per-job utt2num_frames output; empty unless the flag command succeeds.
write_num_frames_opt=
if $write_utt2num_frames; then
  write_num_frames_opt="--write-num-frames=ark,t:$logdir/utt2num_frames.JOB"
fi

# Optional per-job utt2dur output; empty unless the flag command succeeds.
write_utt2dur_opt=
if $write_utt2dur; then
  write_utt2dur_opt="--write-utt2dur=ark,t:$logdir/utt2dur.JOB"
fi
# Run the feature extraction as $nj parallel jobs.  Each job writes one
# archive/index pair raw_fbank_$name.JOB.{ark,scp} into $fbankdir and logs to
# $logdir/make_fbank_${name}.JOB.log.
#
# No comment may follow a trailing "\" inside the multi-line commands below:
# "\ # ..." escapes the space instead of the newline and ends the command
# right there, so all explanations are kept on their own lines.
if [ -f $data/segments ]; then
  # A segments file exists in $data: split it into $nj pieces and let every
  # job cut its segments out of the recordings listed in wav.scp.
  echo "$0 [info]: segments file exists: using that."
  split_segments=
  for n in $(seq $nj); do
    split_segments="$split_segments $logdir/segments.$n"
  done
  utils/split_scp.pl $data/segments $split_segments || exit 1;
  rm $logdir/.error 2>/dev/null
  # Pipeline per job: extract-segments | compute-fbank-feats | copy-feats.
  $cmd JOB=1:$nj $logdir/make_fbank_${name}.JOB.log \
    extract-segments scp,p:$scp $logdir/segments.JOB ark:- \| \
    compute-fbank-feats $vtln_opts $write_utt2dur_opt --verbose=2 \
      --config=$fbank_config ark:- ark:- \| \
    copy-feats --compress=$compress $write_num_frames_opt ark:- \
      ark,scp:$fbankdir/raw_fbank_$name.JOB.ark,$fbankdir/raw_fbank_$name.JOB.scp \
    || exit 1;
else
  # No segments file in $data.
  echo "$0: [info]: no segments file exists: assuming wav.scp indexed by utterance."
  # Split wav.scp into $nj lists stored under $logdir:
  # wav.1.scp, wav.2.scp, ..., wav.$nj.scp
  split_scps=""
  for n in $(seq $nj); do
    split_scps="$split_scps $logdir/wav.$n.scp"
  done
  utils/split_scp.pl $scp $split_scps || exit 1;
  # Pipeline per job: compute-fbank-feats computes the features from the
  # job's wav.JOB.scp list, and copy-feats writes them out as an archive
  # (.ark) together with its matching index (.scp).
  $cmd JOB=1:$nj $logdir/make_fbank_${name}.JOB.log \
    compute-fbank-feats $vtln_opts $write_utt2dur_opt --verbose=2 \
      --config=$fbank_config scp,p:$logdir/wav.JOB.scp ark:- \| \
    copy-feats --compress=$compress $write_num_frames_opt ark:- \
      ark,scp:$fbankdir/raw_fbank_$name.JOB.ark,$fbankdir/raw_fbank_$name.JOB.scp \
    || exit 1;
fi
# If an error marker for this data set exists, show the end of the first
# job's log and give up.
# NOTE(review): no command visible in this script writes
# $logdir/.error.$name - confirm whether this check can ever trigger.
if [ -f $logdir/.error.$name ]; then
  echo "$0: Error producing filterbank features for $name:"
  tail $logdir/make_fbank_${name}.1.log
  exit 1
fi

# Concatenate the per-job .scp files, in job order, into $data/feats.scp.
for job in $(seq $nj); do
  cat $fbankdir/raw_fbank_$name.$job.scp || exit 1
done > $data/feats.scp || exit 1

# Likewise merge the per-job utt2num_frames files when they were requested.
if $write_utt2num_frames; then
  for job in $(seq $nj); do
    cat $logdir/utt2num_frames.$job || exit 1
  done > $data/utt2num_frames || exit 1
fi

# Likewise merge the per-job utt2dur files when they were requested.
if $write_utt2dur; then
  for job in $(seq $nj); do
    cat $logdir/utt2dur.$job || exit 1
  done > $data/utt2dur || exit 1
fi
# Store frame_shift and fbank_config along with features.
# The frame shift is taken from the first "--frame-shift=<integer>" line of
# the config and scaled by 0.001 (presumably milliseconds to seconds); when
# the config has no such line, 0.01 is written instead.
frame_shift=$(perl -ne 'if (/^--frame-shift=(\d+)/) {
printf "%.3f", 0.001 * $1; exit; }' $fbank_config)
echo ${frame_shift:-'0.01'} > $data/frame_shift
mkdir -p $data/conf && cp $fbank_config $data/conf/fbank.conf || exit 1
# Remove the per-job intermediate files.  The split recording lists are
# created as $logdir/wav.<job>.scp, so that is the pattern to delete.
# Errors are silenced because, depending on the branch taken and the options,
# some of these files never existed.
rm $logdir/wav.*.scp $logdir/segments.* \
  $logdir/utt2num_frames.* $logdir/utt2dur.* 2>/dev/null
# Check that features were produced for every utterance by comparing the
# number of lines in feats.scp with the number of lines in utt2spk.
nf=$(wc -l < $data/feats.scp)
nu=$(wc -l < $data/utt2spk)
# Any mismatch is only reported, with a hint on how to repair the data dir.
if [ $nf -ne $nu ]; then
  echo "$0: It seems not all of the feature files were successfully procesed" \
    "($nf != $nu); consider using utils/fix_data_dir.sh $data"
fi
# Losing more than 5% of the utterances (integer arithmetic) is fatal.
# The two message parts are separate words so that echo puts a space
# between the sentences.
if (( nf < nu - nu/20 )); then
  echo "$0: Less than 95% the features were successfully generated." \
    "Probably a serious error."
  exit 1
fi
echo "$0: Succeeded creating filterbank features for $name"
copy-feats 还可以用来改变特征数据的格式,因此可以转换ark格式文件为txt格式:
~/kaldi/src/featbin/copy-feats ark:xxx.ark ark,t:xxx.txt
上面文件基本完成了raw-fbank-feature的提取,但是一般来说都会对特征进行CMVN操作。
利用compute-cmvn-stats可以计算均值方差归一化统计量,可以根据选项进行每条utterance的cmvn、每个说话人的cmvn、全局的cmvn。使用方法实例:
compute-cmvn-stats scp:data/train/feats.scp ark:data/train/cmvn.ark
上面的例子利用训练数据计算每个utterance的均值方差统计量:接受feats.scp,其中记录了每条utterance的特征的位置;利用这个文件又生成cmvn.ark,其中记录了计算得到的每个utterance的均值和方差。
然后利用apply-cmvn可以轻松地将计算出来的均值方差应用于提取出来的原始特征上。使用例子如下:
apply-cmvn [options] (<cmvn-stats-rspecifier>|<cmvn-stats-rxfilename>) <feats-rspecifier> <feats-wspecifier>
apply-cmvn --utt2spk=ark:data/train/utt2spk scp:data/train/cmvn.scp scp:data/train/feats.scp ark:-
这个例子中按照每个说话人进行CMVN。
除此之外,还可以应用add-deltas来加入特征的一阶差分和二阶差分,使用较为简单,此处不再赘述。
上面三项是基本的处理流程内包括的,此外,可能还需要进行特征的变换等操作,具体请见blog: kaldi特征提取详解