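// All the parts I changed are marked with "zhangfeifan" comments; to see only
// the differences, search for that keyword.
// Approach: the change flows from
// OnlineNnet2FeaturePipelineConfig -> OnlineNnet2FeaturePipelineInfo
// -> OnlineNnet2FeaturePipeline.
// There are two main modifications: (1) check whether the configuration that
// is read in requests CMVN processing; (2) in the constructor, if CMVN is
// configured, apply it as part of feature extraction.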
// online2/online-nnet2-feature-pipeline.cc
// Copyright 2013-2014 Johns Hopkins University (author: Daniel Povey)
#include "online2/online-nnet2-feature-pipeline.h"
#include "transform/cmvn.h"
namespace kaldi {
/// Builds the "info" object from the command-line level configuration: checks
/// the base feature type, reads every sub-configuration file named in
/// @p config, and initializes the iVector extractor info if an iVector
/// extraction config was given.
///
/// @param config  Option values (mostly configuration-file names).
///
/// Reports an error via KALDI_ERR if the feature type is not one of
/// mfcc/plp/fbank, or if --cmvn-config is given without --global-cmvn-stats.
OnlineNnet2FeaturePipelineInfo::OnlineNnet2FeaturePipelineInfo(
    const OnlineNnet2FeaturePipelineConfig &config):
    silence_weighting_config(config.silence_weighting_config) {
  if (config.feature_type == "mfcc" || config.feature_type == "plp" ||
      config.feature_type == "fbank") {
    feature_type = config.feature_type;
  } else {
    // "fbank" is accepted above, so it is listed here as well.
    KALDI_ERR << "Invalid feature type: " << config.feature_type << ". "
              << "Supported feature types: mfcc, plp, fbank.";
  }

  // For each base feature type: read its config file if one was given, and
  // warn if that config belongs to a feature type that is not selected.
  if (config.mfcc_config != "") {
    ReadConfigFromFile(config.mfcc_config, &mfcc_opts);
    if (feature_type != "mfcc")
      KALDI_WARN << "--mfcc-config option has no effect "
                 << "since feature type is set to " << feature_type << ".";
  }  // else use the defaults.

  if (config.plp_config != "") {
    ReadConfigFromFile(config.plp_config, &plp_opts);
    if (feature_type != "plp")
      KALDI_WARN << "--plp-config option has no effect "
                 << "since feature type is set to " << feature_type << ".";
  }  // else use the defaults.

  if (config.fbank_config != "") {
    ReadConfigFromFile(config.fbank_config, &fbank_opts);
    if (feature_type != "fbank")
      KALDI_WARN << "--fbank-config option has no effect "
                 << "since feature type is set to " << feature_type << ".";
  }  // else use the defaults.

  add_pitch = config.add_pitch;
  if (config.online_pitch_config != "") {
    ReadConfigsFromFile(config.online_pitch_config,
                        &pitch_opts,
                        &pitch_process_opts);
    if (!add_pitch)
      KALDI_WARN << "--online-pitch-config option has no effect "
                 << "since you did not supply --add-pitch option.";
  }  // else use the defaults.

  // zhangfeifan start
  // Online CMVN is only set up when a CMVN config file was given; in that
  // case the global CMVN stats filename is mandatory.
  if (config.cmvn_config != "") {
    ReadConfigFromFile(config.cmvn_config, &cmvn_opts);
    global_cmvn_stats_rxfilename = config.global_cmvn_stats_rxfilename;
    if (global_cmvn_stats_rxfilename == "")
      KALDI_ERR << "--global-cmvn-stats option is required.";
  } else if (config.global_cmvn_stats_rxfilename != "") {
    // The stats filename is not stored in this case, so tell the user rather
    // than dropping the option silently (same convention as the options above).
    KALDI_WARN << "--global-cmvn-stats option has no effect "
               << "since you did not supply --cmvn-config option.";
  }  // else CMVN is not used.
  // zhangfeifan end

  if (config.ivector_extraction_config != "") {
    use_ivectors = true;
    OnlineIvectorExtractionConfig ivector_extraction_opts;
    ReadConfigFromFile(config.ivector_extraction_config,
                       &ivector_extraction_opts);
    ivector_extractor_info.Init(ivector_extraction_opts);
  } else {
    use_ivectors = false;
  }
}
/// Constructor (per the original author's note, it plays the same role as the
/// Init() of the online-feature pipeline).  Builds the chain of feature
/// objects described by @p info:
///
///   base feature (MFCC/PLP/fbank)
///     -> optional online CMVN (only if global CMVN stats were loaded)
///     -> optional append of processed pitch features
///     -> optional append of the iVector feature
///
/// @param info  Configuration object; only a reference to it is stored
///              (info_), so it must remain valid while this object is used.
OnlineNnet2FeaturePipeline::OnlineNnet2FeaturePipeline(
    const OnlineNnet2FeaturePipelineInfo &info):
    info_(info), cmvn_(NULL) {
  // cmvn_ starts out as NULL so that it has a well-defined value when CMVN is
  // not configured.

  // zhangfeifan start
  // Global CMVN stats are read only if a filename was configured; otherwise
  // global_cmvn_stats_ stays empty and no CMVN stage is created below.
  if (info_.global_cmvn_stats_rxfilename != "")
    ReadKaldiObject(info_.global_cmvn_stats_rxfilename, &global_cmvn_stats_);
  // zhangfeifan end

  if (info_.feature_type == "mfcc") {
    base_feature_ = new OnlineMfcc(info_.mfcc_opts);
  } else if (info_.feature_type == "plp") {
    base_feature_ = new OnlinePlp(info_.plp_opts);
  } else if (info_.feature_type == "fbank") {
    base_feature_ = new OnlineFbank(info_.fbank_opts);
  } else {
    KALDI_ERR << "Code error: invalid feature type " << info_.feature_type;
  }

  // zhangfeifan start
  if (global_cmvn_stats_.NumRows() != 0) {
    if (info_.add_pitch) {
      // With pitch enabled the stats are allowed to have more feature columns
      // than the base feature (presumably because they were accumulated on
      // base+pitch features -- TODO confirm).  Keep the first `dim` columns
      // and move the original last column to position `dim`.
      int32 global_dim = global_cmvn_stats_.NumCols() - 1;
      int32 dim = base_feature_->Dim();
      KALDI_ASSERT(global_dim >= dim);
      if (global_dim > dim) {
        Matrix<BaseFloat> last_col(global_cmvn_stats_.ColRange(global_dim, 1));
        global_cmvn_stats_.Resize(global_cmvn_stats_.NumRows(), dim + 1,
                                  kCopyData);
        global_cmvn_stats_.ColRange(dim, 1).CopyFromMat(last_col);
      }
    }
    Matrix<double> global_cmvn_stats_dbl(global_cmvn_stats_);
    OnlineCmvnState initial_state(global_cmvn_stats_dbl);
    // The CMVN stage takes base_feature_ as its input.
    cmvn_ = new OnlineCmvn(info_.cmvn_opts, initial_state, base_feature_);
  }
  // zhangfeifan end

  // The feature that pitch (if any) is appended to: the CMVN output when CMVN
  // is enabled, otherwise the raw base feature.
  OnlineFeatureInterface *normalized_feature = base_feature_;
  if (cmvn_ != NULL)
    normalized_feature = cmvn_;

  if (info_.add_pitch) {
    pitch_ = new OnlinePitchFeature(info_.pitch_opts);
    pitch_feature_ = new OnlineProcessPitch(info_.pitch_process_opts,
                                            pitch_);
    feature_plus_optional_pitch_ = new OnlineAppendFeature(normalized_feature,
                                                           pitch_feature_);
  } else {
    pitch_ = NULL;
    pitch_feature_ = NULL;
    // No new object here: this is just another pointer to cmvn_ or
    // base_feature_, which the destructor has to take into account.
    feature_plus_optional_pitch_ = normalized_feature;
  }

  if (info_.use_ivectors) {
    // Note: the iVector extractor is given base_feature_ directly, i.e. the
    // features before the CMVN stage.
    ivector_feature_ = new OnlineIvectorFeature(info_.ivector_extractor_info,
                                                base_feature_);
    final_feature_ = new OnlineAppendFeature(feature_plus_optional_pitch_,
                                             ivector_feature_);
  } else {
    ivector_feature_ = NULL;
    final_feature_ = feature_plus_optional_pitch_;
  }
  dim_ = final_feature_->Dim();
}
/// Returns the output feature dimension, as cached in dim_ by the constructor.
int32 OnlineNnet2FeaturePipeline::Dim() const {
  return dim_;
}
/// Reports whether @p frame is the last frame, by asking the final stage of
/// the pipeline (final_feature_).
bool OnlineNnet2FeaturePipeline::IsLastFrame(int32 frame) const {
  const bool is_last = final_feature_->IsLastFrame(frame);
  return is_last;
}
/// Returns the number of frames currently available from the final stage of
/// the pipeline (final_feature_).
int32 OnlineNnet2FeaturePipeline::NumFramesReady() const {
  const int32 num_ready = final_feature_->NumFramesReady();
  return num_ready;
}
void OnlineNnet2FeaturePipeline::GetFrame(int32 frame,
VectorBase *feat) {
return final_feature_->GetFrame(frame, feat);
}
// SetAdaptationState() passes the iVector adaptation state on to the iVector
// feature.  (The original note here also mentions CMVN; the code below only
// touches the iVector feature.)
void OnlineNnet2FeaturePipeline::SetAdaptationState(
    const OnlineIvectorExtractorAdaptationState &adaptation_state) {
  // Without iVectors there is nothing to do, so silently return.
  if (!info_.use_ivectors)
    return;
  ivector_feature_->SetAdaptationState(adaptation_state);
}
// GetAdaptationState() fetches the iVector adaptation state from the iVector
// feature into *adaptation_state.
void OnlineNnet2FeaturePipeline::GetAdaptationState(
    OnlineIvectorExtractorAdaptationState *adaptation_state) const {
  // Without iVectors there is nothing to do, so silently return and leave
  // *adaptation_state untouched.
  if (!info_.use_ivectors)
    return;
  ivector_feature_->GetAdaptationState(adaptation_state);
}
//zhangfeifan start
void OnlineNnet2FeaturePipeline::SetCmvnState(const OnlineCmvnState &cmvn_state) {
cmvn_->SetState(cmvn_state);
}
/// Retrieves the state of the online-CMVN stage, as of its most recent ready
/// frame, into *cmvn_state.
///
/// If online CMVN is not part of this pipeline (no global CMVN stats were
/// loaded by the constructor), this silently does nothing and leaves
/// *cmvn_state untouched.
void OnlineNnet2FeaturePipeline::GetCmvnState(OnlineCmvnState *cmvn_state) {
  // The constructor only creates cmvn_ when global_cmvn_stats_ is non-empty,
  // so that is the condition under which cmvn_ may be dereferenced.
  if (global_cmvn_stats_.NumRows() == 0)
    return;
  int32 frame = cmvn_->NumFramesReady() - 1;
  // NOTE: per the original comment, the following call will crash if no frames
  // are ready (frame is then -1).
  cmvn_->GetState(frame, cmvn_state);
}
void OnlineNnet2FeaturePipeline::FreezeCmvn() {
cmvn_->Freeze(cmvn_->NumFramesReady() - 1);
}
//zhangfeifan end
/// Destructor: frees every feature object that the constructor allocated,
/// exactly once.
OnlineNnet2FeaturePipeline::~OnlineNnet2FeaturePipeline() {
  // Note: the delete command only deletes pointers that are non-NULL.  Not all
  // of the pointers below will be non-NULL.
  // Some of the online-feature pointers are just copies of other pointers,
  // and we do have to avoid deleting them in those cases.

  // The constructor creates cmvn_ exactly when global CMVN stats were loaded,
  // so cmvn_ is only looked at (and deleted) under this condition.
  const bool cmvn_enabled = (global_cmvn_stats_.NumRows() != 0);

  if (final_feature_ != feature_plus_optional_pitch_)
    delete final_feature_;
  delete ivector_feature_;

  // zhangfeifan asked whether the pitch case has to be distinguished before
  // deleting cmvn_: yes.  Without pitch, feature_plus_optional_pitch_ is only
  // another pointer to base_feature_ or (with CMVN) to cmvn_; deleting it here
  // and then deleting cmvn_ below would free the same object twice.
  const bool plus_pitch_is_alias =
      feature_plus_optional_pitch_ == base_feature_ ||
      (cmvn_enabled && feature_plus_optional_pitch_ == cmvn_);
  if (!plus_pitch_is_alias)
    delete feature_plus_optional_pitch_;

  delete pitch_feature_;
  delete pitch_;
  if (cmvn_enabled)
    delete cmvn_;
  delete base_feature_;
}
/// Passes a new chunk of waveform to the base feature extractor and, if pitch
/// features are in use, to the pitch extractor as well.
///
/// @param sampling_rate  Sampling rate of @p waveform.
/// @param waveform       The new audio samples.
void OnlineNnet2FeaturePipeline::AcceptWaveform(
    BaseFloat sampling_rate,
    const VectorBase<BaseFloat> &waveform) {
  base_feature_->AcceptWaveform(sampling_rate, waveform);
  // pitch_ is NULL when pitch features are not used.
  if (pitch_)
    pitch_->AcceptWaveform(sampling_rate, waveform);
}
void OnlineNnet2FeaturePipeline::InputFinished() {
base_feature_->InputFinished();
if (pitch_)
pitch_->InputFinished();
}
/// Returns the frame shift of the selected base feature type in seconds, i.e.
/// the frame_shift_ms of the matching options divided by 1000.
BaseFloat OnlineNnet2FeaturePipelineInfo::FrameShiftInSeconds() const {
  if (feature_type == "mfcc")
    return mfcc_opts.frame_opts.frame_shift_ms / 1000.0f;
  if (feature_type == "fbank")
    return fbank_opts.frame_opts.frame_shift_ms / 1000.0f;
  if (feature_type == "plp")
    return plp_opts.frame_opts.frame_shift_ms / 1000.0f;

  // None of the known feature types matched.
  KALDI_ERR << "Unknown feature type " << feature_type;
  return 0.0;
}
} // namespace kaldi
// online2/online-nnet2-feature-pipeline.h
// Copyright 2013-2014 Johns Hopkins University (author: Daniel Povey)
#ifndef KALDI_ONLINE2_ONLINE_NNET2_FEATURE_PIPELINE_H_
#define KALDI_ONLINE2_ONLINE_NNET2_FEATURE_PIPELINE_H_
#include
#include
#include
#include "matrix/matrix-lib.h"
#include "util/common-utils.h"
#include "base/kaldi-error.h"
#include "feat/online-feature.h"
#include "feat/pitch-functions.h"
#include "online2/online-ivector-feature.h"
namespace kaldi {
/// @addtogroup onlinefeat OnlineFeatureExtraction
/// @{
/// @file
/// This file contains a different version of the feature-extraction pipeline in
/// \ref online-feature-pipeline.h, specialized for use in neural network
/// decoding with iVectors. Our recipe is that we extract iVectors that will
/// be used as an additional input to the neural network, in addition to
/// a window of several frames of spliced raw features (MFCC, PLP or filterbanks).
/// The iVectors are extracted on top of a (splice+LDA+MLLT) feature pipeline,
/// with the added complication that the GMM posteriors used for the iVector
/// extraction are obtained with a version of the features that has online
/// cepstral mean (and optionally variance) normalization, whereas the stats for
/// iVector are accumulated with a non-mean-normalized version of the features.
/// The idea here is that we want the iVector to learn the mean offset, but
/// we want the posteriors to be somewhat invariant to mean offsets.
///
/// Most of the logic for the actual iVector estimation is in \ref
/// online-ivector-feature.h, this header contains mostly glue.
///
/// Although the name of this header mentions nnet2, actually the code is
/// used in the online decoding with nnet3 also.
/// This configuration class is to set up OnlineNnet2FeaturePipelineInfo, which
/// in turn is the configuration class for OnlineNnet2FeaturePipeline.
/// Instead of taking the options for the parts of the feature pipeline
/// directly, it reads in the names of configuration classes.
struct OnlineNnet2FeaturePipelineConfig {
  // Holds the option values (mostly names of configuration files) that are
  // registered on the command line by Register().

  std::string feature_type;  // "plp" or "mfcc" or "fbank"
  std::string mfcc_config;   // config file for MFCC options
  std::string plp_config;    // config file for PLP options
  std::string fbank_config;  // config file for filterbank options

  // Note: if we do add pitch, it will not be added to the features we give to
  // the iVector extractor but only to the features we give to the neural
  // network, after the base features but before the iVector.  We don't think
  // the iVector will be particularly helpful in normalizing the pitch features,
  // and we wanted to avoid complications with things like online CMVN.
  bool add_pitch;

  // the following contains the type of options that you could give to
  // compute-and-process-kaldi-pitch-feats.
  std::string online_pitch_config;

  // zhangfeifan start
  std::string cmvn_config;                   // config file for online CMVN
  std::string global_cmvn_stats_rxfilename;  // where to read global CMVN stats
  // zhangfeifan end

  // The configuration variables in ivector_extraction_config relate to the
  // iVector extractor and options related to it, see type
  // OnlineIvectorExtractionConfig.
  std::string ivector_extraction_config;

  // Config that relates to how we weight silence for (ivector) adaptation
  // this is registered directly to the command line as you might want to
  // play with it in test time.
  OnlineSilenceWeightingConfig silence_weighting_config;

  // Defaults: MFCC features without pitch; all filename options start empty.
  OnlineNnet2FeaturePipelineConfig():
      feature_type("mfcc"),
      add_pitch(false) { }

  // Registers every option of this struct with *opts.
  void Register(OptionsItf *opts) {
    opts->Register("feature-type", &feature_type,
                   "Base feature type [mfcc, plp, fbank]");
    opts->Register("mfcc-config", &mfcc_config,
                   "Configuration file for "
                   "MFCC features (e.g. conf/mfcc.conf)");
    opts->Register("plp-config", &plp_config,
                   "Configuration file for "
                   "PLP features (e.g. conf/plp.conf)");
    opts->Register("fbank-config", &fbank_config,
                   "Configuration file for "
                   "filterbank features (e.g. conf/fbank.conf)");
    opts->Register("add-pitch", &add_pitch,
                   "Append pitch features to raw "
                   "MFCC/PLP/filterbank features [but not for iVector extraction]");
    opts->Register("online-pitch-config", &online_pitch_config,
                   "Configuration "
                   "file for online pitch features, if --add-pitch=true (e.g. "
                   "conf/online_pitch.conf)");
    // zhangfeifan start
    opts->Register("cmvn-config", &cmvn_config,
                   "Configuration class "
                   "file for online CMVN features (e.g. conf/online_cmvn.conf)");
    opts->Register("global-cmvn-stats", &global_cmvn_stats_rxfilename,
                   "(Extended) filename for global CMVN stats, e.g. obtained "
                   "from 'matrix-sum scp:data/train/cmvn.scp -'");
    // zhangfeifan end
    opts->Register("ivector-extraction-config", &ivector_extraction_config,
                   "Configuration file for online iVector extraction, "
                   "see class OnlineIvectorExtractionConfig in the code");
    // The silence-weighting options are registered under their own prefix.
    silence_weighting_config.RegisterWithPrefix("ivector-silence-weighting", opts);
  }
};
/// This class is responsible for storing configuration variables, objects and
/// options for OnlineNnet2FeaturePipeline (including the actual LDA and
/// CMVN-stats matrices, and the iVector extractor, which is a member of
/// ivector_extractor_info).  This class does not register options on the
/// command line; instead, it is initialized from class
/// OnlineNnet2FeaturePipelineConfig which reads the options from the command
/// line.  The reason for structuring it this way is to make it easier to
/// configure from code as well as from the command line, as well as for
/// easier multithreaded operation.
struct OnlineNnet2FeaturePipelineInfo {
  // Default constructor: MFCC features, no pitch, no iVectors.
  // use_ivectors is set explicitly so that a default-constructed object never
  // carries an indeterminate flag into OnlineNnet2FeaturePipeline.
  OnlineNnet2FeaturePipelineInfo():
      feature_type("mfcc"), add_pitch(false), use_ivectors(false) { }

  // Initializes this object from the command-line level configuration (reads
  // the configuration files named in `config`).
  OnlineNnet2FeaturePipelineInfo(
      const OnlineNnet2FeaturePipelineConfig &config);

  // Frame shift of the selected base feature type, in seconds.
  BaseFloat FrameShiftInSeconds() const;

  std::string feature_type;  // "mfcc" or "plp" or "fbank"
  MfccOptions mfcc_opts;     // options for MFCC computation,
                             // if feature_type == "mfcc"
  PlpOptions plp_opts;       // Options for PLP computation, if
                             // feature_type == "plp"
  FbankOptions fbank_opts;   // Options for filterbank computation, if
                             // feature_type == "fbank"
  bool add_pitch;
  PitchExtractionOptions pitch_opts;  // Options for pitch extraction, if done.
  ProcessPitchOptions pitch_process_opts;  // Options for pitch post-processing

  // zhangfeifan start
  OnlineCmvnOptions cmvn_opts;  // Options for online CMN/CMVN computation.
  std::string global_cmvn_stats_rxfilename;  // Filename used for reading global
                                             // CMVN stats; empty means that
                                             // no stats are read.
  // zhangfeifan end

  // If the user specified --ivector-extraction-config, we assume we're using
  // iVectors as an extra input to the neural net.  Actually, we don't
  // anticipate running this setup without iVectors.
  bool use_ivectors;
  OnlineIvectorExtractionInfo ivector_extractor_info;

  // Config for weighting silence in iVector adaptation.
  // We declare this outside of ivector_extractor_info... it was
  // just easier to set up the code that way; and also we think
  // it's the kind of thing you might want to play with directly
  // on the command line instead of inside sub-config-files.
  OnlineSilenceWeightingConfig silence_weighting_config;

  // Dimension of the iVector, as reported by the iVector extractor.
  int32 IvectorDim() { return ivector_extractor_info.extractor.IvectorDim(); }
 private:
  KALDI_DISALLOW_COPY_AND_ASSIGN(OnlineNnet2FeaturePipelineInfo);
};
/// OnlineNnet2FeaturePipeline is a class that's responsible for putting
/// together the various parts of the feature-processing pipeline for neural
/// networks, in an online setting. The recipe here does not include fMLLR;
/// instead, it assumes we're giving raw features such as MFCC or PLP or
/// filterbank (with online CMVN applied only if a CMVN config and global CMVN
/// stats were supplied) to the neural network, and optionally augmenting
/// these with an iVector that describes the speaker characteristics.  The
/// iVector is extracted using class OnlineIvectorFeature (see that class for
/// more info on how it's done).
/// No splicing is currently done in this code, as we're currently only supporting
/// the nnet2 neural network in which the splicing is done inside the network.
/// Probably our strategy for nnet1 network conversion would be to convert to nnet2
/// and just add layers to do the splicing.
class OnlineNnet2FeaturePipeline: public OnlineFeatureInterface {
public:
/// Constructor from the "info" object. After calling this for a
/// non-initial utterance of a speaker, you may want to call
/// SetAdaptationState().
explicit OnlineNnet2FeaturePipeline(
const OnlineNnet2FeaturePipelineInfo &info);
/// Member functions from OnlineFeatureInterface:
/// Dim() will return the base-feature dimension (e.g. 13 for normal MFCC);
/// plus the pitch-feature dimension (e.g. 3), if used; plus the iVector
/// dimension, if used. Any frame-splicing happens inside the neural-network
/// code.
virtual int32 Dim() const;
virtual bool IsLastFrame(int32 frame) const;
virtual int32 NumFramesReady() const;
virtual void GetFrame(int32 frame, VectorBase *feat);
/// Set the adaptation state to a particular value, e.g. reflecting previous
/// utterances of the same speaker; this will generally be called after
/// Copy().
void SetAdaptationState(
const OnlineIvectorExtractorAdaptationState &adaptation_state);
/// Get the adaptation state; you may want to call this before destroying this
/// object, to get adaptation state that can be used to improve decoding of
/// later utterances of this speaker. You might not want to do this, though,
/// if you have reason to believe that something went wrong in the recognition
/// (e.g., low confidence).
void GetAdaptationState(
OnlineIvectorExtractorAdaptationState *adaptation_state) const;
//zhangfeifan start
void FreezeCmvn(); // stop it from moving further (do this when you start
// using fMLLR). This will crash if NumFramesReady() == 0.
/// Set the CMVN state to a particular value (will generally be
/// called after Copy().
void SetCmvnState(const OnlineCmvnState &cmvn_state);
void GetCmvnState(OnlineCmvnState *cmvn_state);
//zhangfeifan end
/// Accept more data to process. It won't actually process it until you call
/// GetFrame() [probably indirectly via (decoder).AdvanceDecoding()], when you
/// call this function it will just copy it). sampling_rate is necessary just
/// to assert it equals what's in the config.
void AcceptWaveform(BaseFloat sampling_rate,
const VectorBase &waveform);
BaseFloat FrameShiftInSeconds() const { return info_.FrameShiftInSeconds(); }
/// If you call InputFinished(), it tells the class you won't be providing any
/// more waveform. This will help flush out the last few frames of delta or
/// LDA features, and finalize the pitch features (making them more
/// accurate)... although since in neural-net decoding we don't anticipate
/// rescoring the lattices, this may not be much of an issue.
void InputFinished();
// This function returns the ivector-extracting part of the feature pipeline
// (or NULL if iVectors are not being used); the pointer is owned here and not
// given to the caller. This function is used in nnet3, and also in the
// silence-weighting code used to exclude silence from the iVector estimation.
OnlineIvectorFeature *IvectorFeature() {
return ivector_feature_;
}
// This function returns the part of the feature pipeline that would be given
// as the primary (non-iVector) input to the neural network in nnet3
// applications.
OnlineFeatureInterface *InputFeature() {
return feature_plus_optional_pitch_;
}
virtual ~OnlineNnet2FeaturePipeline();
private:
const OnlineNnet2FeaturePipelineInfo &info_;
//zhangfeifan start
Matrix global_cmvn_stats_; // Global CMVN stats.
OnlineCmvn *cmvn_;
//zhangfeifan end
OnlineBaseFeature *base_feature_; // MFCC/PLP/filterbank
OnlinePitchFeature *pitch_; // Raw pitch, if used
OnlineProcessPitch *pitch_feature_; // Processed pitch, if pitch used.
// feature_plus_pitch_ is the base_feature_ appended (OnlineAppendFeature)
/// with pitch_feature_, if used; otherwise, points to the same address as
/// base_feature_.
OnlineFeatureInterface *feature_plus_optional_pitch_;
OnlineIvectorFeature *ivector_feature_; // iVector feature, if used.
// final_feature_ is feature_plus_optional_pitch_ appended
// (OnlineAppendFeature) with ivector_feature_, if ivector_feature_ is used;
// otherwise, points to the same address as feature_plus_optional_pitch_.
OnlineFeatureInterface *final_feature_;
// we cache the feature dimension, to save time when calling Dim().
int32 dim_;
};
/// @} End of "addtogroup onlinefeat"
} // namespace kaldi
#endif // KALDI_ONLINE2_ONLINE_NNET2_FEATURE_PIPELINE_H_