kaldi 源码分析(六) - 指令参数分析

在 kaldi 系统中通常会出现很多参数相关的解析,比如:

# 对于 声纹识别 中获取 ivector 特征向量使用如下脚本
steps/online/nnet2/extract_ivectors_online.sh
# 其主要的步骤如下: 
#      1. 特征处理:cmvn+splice+lda
#      2. 根据特征和m(final.dubm)获得每个speaker对应的s
#      3. 根据s、m(final.dubm)、T(final.ie)得到w

# 查看 ivector 特征
copy-feats --binary=false --compress=false ark:ivector_online.1.ark ark,t:ivector_online.1.ark.txt

如上所示, 在 copy-feats 中存在 ark:ivector_online.1.ark 类似的东西,为了深入理解 kaldi 运行机制,我们先从尝试解析这些参数开始深入

指令源码分析

在 src/featbin/copy-feats.cc 文件中我们能看到 copy-feats 实现的全貌:

int main(int argc, char *argv[]) {
  try {
    using namespace kaldi;

    const char *usage =
        "Copy features [and possibly change format]\n"
        "Usage: copy-feats [options]  \n"
        "or:   copy-feats [options]  \n"
        "e.g.: copy-feats ark:- ark,scp:foo.ark,foo.scp\n"
        " or: copy-feats ark:foo.ark ark,t:txt.ark\n"
        "See also: copy-matrix, copy-feats-to-htk, copy-feats-to-sphinx, select-feats,\n"
        "extract-feature-segments, subset-feats, subsample-feats, splice-feats, paste-feats,\n"
        "concat-feats\n";

    ParseOptions po(usage);
    bool binary = true;
    bool htk_in = false;
    bool sphinx_in = false;
    bool compress = false;
    int32 compression_method_in = 1;
    std::string num_frames_wspecifier;
//  注册选项参数,并将相应的选项值输入到对应的变量中
    po.Register("htk-in", &htk_in, "Read input as HTK features");
    po.Register("sphinx-in", &sphinx_in, "Read input as Sphinx features");
    po.Register("binary", &binary, "Binary-mode output (not relevant if writing "
                "to archive)");
    po.Register("compress", &compress, "If true, write output in compressed form"
                "(only currently supported for wxfilename, i.e. archive/script,"
                "output)");
    po.Register("compression-method", &compression_method_in,
                "Only relevant if --compress=true; the method (1 through 7) to "
                "compress the matrix.  Search for CompressionMethod in "
                "src/matrix/compressed-matrix.h.");
    po.Register("write-num-frames", &num_frames_wspecifier,
                "Wspecifier to write length in frames of each utterance. "
                "e.g. 'ark,t:utt2num_frames'.  Only applicable if writing tables, "
                "not when this program is writing individual files.  See also "
                "feat-to-len.");

// 解析相应的参数列表内容
    po.Read(argc, argv);

    if (po.NumArgs() != 2) {
      po.PrintUsage();
      exit(1);
    }

    int32 num_done = 0;

    CompressionMethod compression_method = static_cast(
        compression_method_in);

    if (ClassifyRspecifier(po.GetArg(1), NULL, NULL) != kNoRspecifier) {
      // Copying tables of features.
// 获取输入文件的配置
      std::string rspecifier = po.GetArg(1);
// 获取输出到文件的配置
      std::string wspecifier = po.GetArg(2);
      Int32Writer num_frames_writer(num_frames_wspecifier);

      if (!compress) {
// 输出Matrix到文件
        BaseFloatMatrixWriter kaldi_writer(wspecifier);
        if (htk_in) {
// 序列化读入文件
          SequentialTableReader htk_reader(rspecifier);
          for (; !htk_reader.Done(); htk_reader.Next(), num_done++) {
            kaldi_writer.Write(htk_reader.Key(), htk_reader.Value().first);
            if (!num_frames_wspecifier.empty())
              num_frames_writer.Write(htk_reader.Key(),
                                      htk_reader.Value().first.NumRows());
          }
        } else if (sphinx_in) {
          SequentialTableReader > sphinx_reader(rspecifier);
          for (; !sphinx_reader.Done(); sphinx_reader.Next(), num_done++) {
            kaldi_writer.Write(sphinx_reader.Key(), sphinx_reader.Value());
            if (!num_frames_wspecifier.empty())
              num_frames_writer.Write(sphinx_reader.Key(),
                                      sphinx_reader.Value().NumRows());
          }
        } else {
          SequentialBaseFloatMatrixReader kaldi_reader(rspecifier);
          for (; !kaldi_reader.Done(); kaldi_reader.Next(), num_done++) {
            kaldi_writer.Write(kaldi_reader.Key(), kaldi_reader.Value());
            if (!num_frames_wspecifier.empty())
              num_frames_writer.Write(kaldi_reader.Key(),
                                      kaldi_reader.Value().NumRows());
          }
        }
      } else {
        CompressedMatrixWriter kaldi_writer(wspecifier);
        if (htk_in) {
          SequentialTableReader htk_reader(rspecifier);
          for (; !htk_reader.Done(); htk_reader.Next(), num_done++) {
            kaldi_writer.Write(htk_reader.Key(),
                               CompressedMatrix(htk_reader.Value().first,
                                                compression_method));
            if (!num_frames_wspecifier.empty())
              num_frames_writer.Write(htk_reader.Key(),
                                      htk_reader.Value().first.NumRows());
          }
        } else if (sphinx_in) {
          SequentialTableReader > sphinx_reader(rspecifier);
          for (; !sphinx_reader.Done(); sphinx_reader.Next(), num_done++) {
            kaldi_writer.Write(sphinx_reader.Key(),
                               CompressedMatrix(sphinx_reader.Value(),
                                                compression_method));
            if (!num_frames_wspecifier.empty())
              num_frames_writer.Write(sphinx_reader.Key(),
                                      sphinx_reader.Value().NumRows());
          }
        } else {
          SequentialBaseFloatMatrixReader kaldi_reader(rspecifier);
          for (; !kaldi_reader.Done(); kaldi_reader.Next(), num_done++) {
            kaldi_writer.Write(kaldi_reader.Key(),
                               CompressedMatrix(kaldi_reader.Value(),
                                                compression_method));
            if (!num_frames_wspecifier.empty())
              num_frames_writer.Write(kaldi_reader.Key(),
                                      kaldi_reader.Value().NumRows());
          }
        }
      }
      KALDI_LOG << "Copied " << num_done << " feature matrices.";
      return (num_done != 0 ? 0 : 1);
    } else {
      KALDI_ASSERT(!compress && "Compression not yet supported for single files");
      if (!num_frames_wspecifier.empty())
        KALDI_ERR << "--write-num-frames option not supported when writing/reading "
                  << "single files.";

      std::string feat_rxfilename = po.GetArg(1), feat_wxfilename = po.GetArg(2);

      Matrix feat_matrix;
      if (htk_in) {
        Input ki(feat_rxfilename); // Doesn't look for read binary header \0B, because
        // no bool* pointer supplied.
        HtkHeader header; // we discard this info.
        ReadHtk(ki.Stream(), &feat_matrix, &header);
      } else if (sphinx_in) {
        KALDI_ERR << "For single files, sphinx input is not yet supported.";
      } else {
        ReadKaldiObject(feat_rxfilename, &feat_matrix);
      }
      WriteKaldiObject(feat_matrix, feat_wxfilename, binary);
      KALDI_LOG << "Copied features from " << PrintableRxfilename(feat_rxfilename)
                << " to " << PrintableWxfilename(feat_wxfilename);
    }
  } catch(const std::exception &e) {
    std::cerr << e.what();
    return -1;
  }
}

从上述看到, 其中大多数是封装好了的 Reader/Writer 并将参数直接传入到 Reader/Writer 中进行解析。
这些 Reader/Writer 都是在 src/util/kaldi-table.h 文件中进行定义的, 大多数是定义的模板类,其相关实现则 在 src/util/kaldi-table-inl.h 文件进行实现。以 SequentialTableReader 为例子:

template
SequentialTableReader::SequentialTableReader(const std::string
                                                     &rspecifier): impl_(NULL) {
// 调用 SequentialTableReader.Open() 函数
  if (rspecifier != "" && !Open(rspecifier))
    KALDI_ERR << "Error constructing TableReader: rspecifier is " << rspecifier;
}

template
bool SequentialTableReader::Open(const std::string &rspecifier) {
  if (IsOpen())
    if (!Close())
      KALDI_ERR << "Could not close previously open object.";
  // now impl_ will be NULL.

  RspecifierOptions opts;
// 调用 ClassifyRspecifier 类来解析相应的参数内容
// 根据 RspecifierType 来将操作交给相应的实现来执行
  RspecifierType wt = ClassifyRspecifier(rspecifier, NULL, &opts);
  switch (wt) {
    case kArchiveRspecifier:
      impl_ = new SequentialTableReaderArchiveImpl();
      break;
    case kScriptRspecifier:
      impl_ = new SequentialTableReaderScriptImpl();
      break;
    case kNoRspecifier: default:
      KALDI_WARN << "Invalid rspecifier " << rspecifier;
      return false;
  }
  if (!impl_->Open(rspecifier)) {
    delete impl_;
    impl_ = NULL;
    return false;  // sub-object will have printed warnings.
  }
  if (opts.background) {
    impl_ = new SequentialTableReaderBackgroundImpl(
        impl_);
    if (!impl_->Open("")) {
      // the rxfilename is ignored in that Open() call.
      // It should only return false on code error.
      return false;
    }
  }
  return true;
}

这里以 SequentialTableReaderArchiveImpl 为例:

template  class SequentialTableReaderArchiveImpl:
      public SequentialTableReaderImplBase {
 public:
  typedef typename Holder::T T;

  SequentialTableReaderArchiveImpl(): state_(kUninitialized) { }

  virtual bool Open(const std::string &rspecifier) {
    if (state_ != kUninitialized) {
      if (!Close()) {  // call Close() yourself to suppress this exception.
        if (opts_.permissive)
          KALDI_WARN << "Error closing previous input "
              "(only warning, since permissive mode).";
        else
          KALDI_ERR << "Error closing previous input.";
      }
    }
    rspecifier_ = rspecifier;
// 解析 RspecifierType 来获取类型, 并将文件信息存放在 archive_rxfilename_ 变量中,将其选项参数放置到 opts_ 变量中
    RspecifierType rs = ClassifyRspecifier(rspecifier,
                                           &archive_rxfilename_,
                                           &opts_);
    KALDI_ASSERT(rs == kArchiveRspecifier);

    bool ans;
    // NULL means don't expect binary-mode header
    if (Holder::IsReadInBinary())
      ans = input_.Open(archive_rxfilename_, NULL);
    else
      ans = input_.OpenTextMode(archive_rxfilename_);
    if (!ans) {  // header.
      KALDI_WARN << "Failed to open stream "
                 << PrintableRxfilename(archive_rxfilename_);
      state_ = kUninitialized;  // Failure on Open
      return false;  // User should print the error message.
    }
    state_ = kFileStart;
    Next();
    if (state_ == kError) {
      KALDI_WARN << "Error beginning to read archive file (wrong filename?): "
                 << PrintableRxfilename(archive_rxfilename_);
      input_.Close();
      state_ = kUninitialized;
      return false;
    }
    KALDI_ASSERT(state_ == kHaveObject || state_ == kEof);
    return true;
  }
....

整体看下来,基本是由 ClassifyRspecifier 来进行解析相应参数的,因此可以专注查看 ClassifyRspecifier 实现,就能明白这些参数是干啥的了。
在 src/util/kaldi-table.cc 文件中,我们看到 ClassifyRspecifier 相关实现如下:

// Classifies `rspecifier` as an archive or script rspecifier and extracts
// its parts.
//
//   rspecifier - the string to classify, of the form "<options>:<filename>".
//   wxfilename - if non-NULL, cleared on entry and set to the text after the
//                first ':' when the rspecifier is valid. (The text is the
//                filename to read from, despite the parameter's name.)
//   opts       - if non-NULL, reset to a default-constructed
//                RspecifierOptions and then updated from the option tokens.
//
// Returns kArchiveRspecifier, kScriptRspecifier, or kNoRspecifier for an
// improperly formed rspecifier.
RspecifierType ClassifyRspecifier(const std::string &rspecifier,
                                  std::string *wxfilename,
                                  RspecifierOptions *opts) {
  // Examples
  // ark:rxfilename  ->  kArchiveRspecifier
  // scp:rxfilename  -> kScriptRspecifier
  //
  // We also allow the meaningless prefixes b and t,
  // plus the options o (once), no (not-once),
  // s (sorted) and ns (not-sorted), p (permissive)
  // and np (not-permissive), cs (called-sorted) and ncs
  // (not-called-sorted), and bg (background).
  // Any other token makes the rspecifier invalid.
  // So the following would be valid:
  //
  // o,b,np,ark:rxfilename  ->  kArchiveRspecifier
  //
  // Examples:
  //
  // b,ark:rxfilename  ->  kArchiveRspecifier
  // t,ark:rxfilename  ->  kArchiveRspecifier
  // b,scp:rxfilename  -> kScriptRspecifier
  // t,no,s,scp:rxfilename  -> kScriptRspecifier
  // t,ns,scp:rxfilename  -> kScriptRspecifier
  //
  // NOTE(review): empty tokens are not omitted when splitting, so writing the
  // options with a space after each comma presumably yields an empty token
  // and kNoRspecifier -- confirm against SplitStringToVector.

  // Improperly formed Rspecifiers will be classified as kNoRspecifier.

  if (wxfilename) wxfilename->clear();

  if (opts != NULL)
    *opts = RspecifierOptions();  // Make sure all the defaults are as in the
                                  // default constructor of the options class.

  size_t pos = rspecifier.find(':');
  if (pos == std::string::npos) return kNoRspecifier;

  // Trailing space disallowed.  rspecifier is non-empty here because it
  // contains ':'.  The cast keeps isspace() well-defined for bytes >= 0x80
  // (passing a negative char value is undefined behaviour).
  if (isspace(static_cast<unsigned char>(*(rspecifier.rbegin()))))
    return kNoRspecifier;

  std::string before_colon(rspecifier, 0, pos),
      after_colon(rspecifier, pos+1);

  std::vector<std::string> split_first_part;  // Split part before ':' on ', '.
  SplitStringToVector(before_colon, ", ", false, &split_first_part);  // false==
  // don't omit empty strings between commas.

  RspecifierType rs = kNoRspecifier;

  for (size_t i = 0; i < split_first_part.size(); i++) {
    const std::string &str = split_first_part[i];  // e.g. "b", "t", "o", "ark",
    // "scp".
    const char *c = str.c_str();
    if (!strcmp(c, "b") || !strcmp(c, "t")) {
      // Ignore these options.  They are accepted so we can use the same
      // specifiers for rspecifiers and wspecifiers.
    } else if (!strcmp(c, "o")) {
      if (opts) opts->once = true;
    } else if (!strcmp(c, "no")) {
      if (opts) opts->once = false;
    } else if (!strcmp(c, "p")) {
      if (opts) opts->permissive = true;
    } else if (!strcmp(c, "np")) {
      if (opts) opts->permissive = false;
    } else if (!strcmp(c, "s")) {
      if (opts) opts->sorted = true;
    } else if (!strcmp(c, "ns")) {
      if (opts) opts->sorted = false;
    } else if (!strcmp(c, "cs")) {
      if (opts) opts->called_sorted = true;
    } else if (!strcmp(c, "ncs")) {
      if (opts) opts->called_sorted = false;
    } else if (!strcmp(c, "bg")) {
      if (opts) opts->background = true;
    } else if (!strcmp(c, "ark")) {
      if (rs == kNoRspecifier) rs = kArchiveRspecifier;
      else
        return kNoRspecifier;  // Repeated or combined ark and scp options
      // invalid.
    } else if (!strcmp(c, "scp")) {
      if (rs == kNoRspecifier) rs = kScriptRspecifier;
      else
        return kNoRspecifier;  // Repeated or combined ark and scp options
      // invalid.
    } else {
      return kNoRspecifier;  // Could not interpret this option.
    }
  }
  // Only hand back the filename when a table type was actually recognised.
  if ((rs == kArchiveRspecifier || rs == kScriptRspecifier)
     && wxfilename != NULL)
    *wxfilename = after_colon;
  return rs;
}

注意:

1. 在 kaldi 中 Reader/Writer 都是以模板的形式实现的,其模板参数是一个 Holder。这个 Holder 可以简单看作是负责 对象 与 流形式(Binary)之间的相互转换,然后再由 Reader/Writer 用它来完成相应的读取和写入操作

2. Holder 的实现多在  src/util/kaldi-holder.h 中定义,然后在 src/util/kaldi-holder-inl.h 文件中进行实现

3. 对于外部引用,则是在 src/util/table-types.h 中进行 typedef,从而在外部可以直接使用这些类型。

4. 对于实际进行读写操作的 Input/Output 类,则是在 src/util/kaldi-io.cc 文件中进行实现的,其中包含了比如 pipe/file/std 等方式的读写操作,输出类型如下:
     // Kinds of output destination a wxfilename can be classified as.
     enum OutputType {
       kNoOutput = 0,        // invalid filename
       kFileOutput = 1,      // normal filename
       kStandardOutput = 2,  // the empty string or "-": standard output
       kPipeOutput = 3       // a pipe
     };
    而 读类型如下:
     // Kinds of input source an rxfilename can be classified as.
     enum InputType {
       kNoInput = 0,          // presumably an invalid filename, by analogy with kNoOutput
       kFileInput = 1,        // normal filename
       kStandardInput = 2,    // the empty string or "-": standard input
       kOffsetFileInput = 3,  // file plus offset, e.g. /some/filename:12345
       kPipeInput = 4         // a pipe, e.g. "some command |"
     };
      /// ClassifyWxfilename interprets filenames as follows:
      ///  - kNoOutput: invalid filenames (leading or trailing space, things that look
      ///     like wspecifiers and rspecifiers or like pipes to read from with trailing
      ///     |).
      ///  - kFileOutput: Normal filenames
      ///  - kStandardOutput: The empty string or "-", interpreted as standard output
      ///  - kPipeOutput: pipes, e.g. "| gzip -c > some_file.gz"

最后, 总结下 Rspecifier 参数相关的说明:

// Documentation for "rspecifier"
// "rspecifier" describes how we read a set of objects indexed by keys.
// The possibilities are:
//
// ark:rxfilename
// scp:rxfilename
//
// We also allow various modifiers:
//   o   means the program will only ask for each key once, which enables
//       the reader to discard already-asked-for values.
//   s   means the keys are sorted on input (means we don't have to read till
//       eof if someone asked for a key that wasn't there).
//   cs  means that it is called in sorted order (we are generally asserting
//       this based on knowledge of how the program works).
//   p   means "permissive", and causes it to skip over keys whose corresponding
//       scp-file entries cannot be read. [and to ignore errors in archives and
//       script files, and just consider the "good" entries].
//       We allow the negation of the options above, as in no, ns, np,
//       but these aren't currently very useful (just equivalent to omitting the
//       corresponding option).
//       [any of the above options can be prefixed by n to negate them, e.g. no,
//       ns, ncs, np; but these aren't currently useful as you could just omit
//       the option].
//   bg means "background".  It currently has no effect for random-access readers,
//       but for sequential readers it will cause it to "read ahead" to the next
//       value, in a background thread.  Recommended when reading larger objects
//       such as neural-net training examples, especially when you want to
//       maximize GPU usage.
//
//   b   is ignored [for scripting convenience]; it only has an effect in a wspecifier (binary-mode output)
//   t   is ignored [for scripting convenience]; it only has an effect in a wspecifier (text-mode output)
//
//
//  So for instance the following would be a valid rspecifier:
//
//   "o, s, p, ark:gunzip -c foo.gz|"

最后总结下:

  • rxfilename的类型如下:
 1. “-”或“” 表示标准输入
 2. “some command |” 表示一个输入管道命令,i.e.我们去掉管道符“|”,把剩下的字符串通过popen()传入shell
 3. “/some/filename:12345” 表示带偏移量的文件,i.e.我们打开文件并定位至偏移 12345 处
 4. “/some/filename”... 与以上不匹配的模式都会被当做普通的文件名(当然,一些明显的错误会被检测出来,在它们被打开之前)
  • wxfilename的类型如下:
 1.  “-”或“” 表示标准输出
 2.  “| some command” 表示一个输出管道命令,i.e.我们去掉管道符“|”,把剩下的字符串通过popen()传入shell
 3. “/some/filename”... 与以上不匹配的模式都会被当做普通的文件名(当然,会检测并过滤掉明显的错误)

你可能感兴趣的:(kaldi 源码分析(六) - 指令参数分析)