Speech Recognition with CMUSphinx (3): Extracting Feature Information

To extract useful information in the Android demo, first look at the .c source code (the Android demo is just a JNI wrapper around the .c implementation).

Extracting the needed acoustic scores from the .c source

[Screenshot: the decoder's INFO log output, one line per word with start, end, pprob, ascr, lscr, lback]

What I want here are parameters such as start, end, and ascr. Looking at the continuous.c source code, the main logic is as follows:

    /* Keep processing as long as audio can be read */
    while ((k = fread(adbuf, sizeof(int16), 2048, rawfd)) > 0) {
        ps_process_raw(ps, adbuf, k, FALSE, FALSE);
        /* in_speech == TRUE means speech; FALSE means silence */
        in_speech = ps_get_in_speech(ps);
        /* The utterance starts */
        if (in_speech && !utt_started) {
            utt_started = TRUE;
        }
        /* Speech stopped after the utterance started, i.e. we reached
         * the silence pause at the end of a sentence */
        if (!in_speech && utt_started) {
            /* The debug info is printed inside this function */
            ps_end_utt(ps);
            hyp = ps_get_hyp(ps, NULL);
            if (hyp != NULL)
                printf("%s\n", hyp);
            if (print_times)
                print_word_times();
            fflush(stdout);

            ps_start_utt(ps);
            utt_started = FALSE;
        }
    } /* end of while */
    /* Audio exhausted: output one last time */
    ps_end_utt(ps);
    if (utt_started) {
        hyp = ps_get_hyp(ps, NULL);
        if (hyp != NULL) {
            printf("%s\n", hyp);
            if (print_times) {
                print_word_times();
            }
        }
    }

Jumping to the corresponding lines inside ps_end_utt:

            E_INFO("%s (%d)\n", hyp, score);
            E_INFO_NOFN("%-20s %-5s %-5s %-5s %-10s %-10s %-3s\n",
                    "word", "start", "end", "pprob", "ascr", "lscr", "lback");
            for (seg = ps_seg_iter(ps); seg;
             seg = ps_seg_next(seg)) {
                char const *word;
            int sf, ef;
            int32 post, lscr, ascr, lback;

            word = ps_seg_word(seg);
            ps_seg_frames(seg, &sf, &ef);
            post = ps_seg_prob(seg, &ascr, &lscr, &lback);
            E_INFO_NOFN("%-20s %-5d %-5d %-1.3f %-10d %-10d %-3d\n",
                            word, sf, ef, logmath_exp(ps_get_logmath(ps), post),
                        ascr, lscr, lback);
            }

This matches the INFO output at the beginning. The functions we need are therefore ps_seg_frames and ps_seg_prob;
https://sourceforge.net/p/cmusphinx/discussion/help/thread/dd80eb2a/
confirms this as well.


In the Android environment, the Hypothesis object is the recognition result.
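
The same per-word values should then be reachable from Java through the SWIG-wrapped Decoder. A minimal sketch, assuming the generated Segment class exposes getters such as getStartFrame(), getEndFrame(), getAscore(), getLscore() and getProb() (decoder.seg() and seg.getWord() appear in the example later in this article; the other getter names are inferred from the SWIG interface, so verify them against your binding):

    // Sketch: after decoder.endUtt(), walk the segments just like the
    // logging loop in ps_end_utt does (getter names assumed from SWIG)
    for (Segment seg : decoder.seg()) {
        Log.d("Sphinx", String.format("%s %d %d ascr=%d lscr=%d prob=%d",
                seg.getWord(),
                seg.getStartFrame(),   // sf from ps_seg_frames (frames; default frame rate is 100 fps)
                seg.getEndFrame(),     // ef from ps_seg_frames
                seg.getAscore(),       // ascr from ps_seg_prob
                seg.getLscore(),       // lscr from ps_seg_prob
                seg.getProb()));       // posterior, still in the log domain
    }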

One pity compared with Kaldi, though: there is no way to recognize the stress associated with each phoneme.
https://sourceforge.net/p/cmusphinx/discussion/sphinx4/thread/736c772a/?limit=25#d425

How it works on Android

To see how callbacks such as onPartialResult and onResult work, look at the logic inside SpeechRecognizer.class, which is a layer of wrapping around Decoder.class. Adding grammars and the like can be done at this layer (it can also be done at the Decoder layer, which is just more low-level; see the next subsection). This class also handles the audio: it uses Android's AudioRecord class to capture the recording (for AudioRecord usage see https://blog.csdn.net/qq_36982160/article/details/79383046), stores it into a buffer, and then decides whether the utterance has ended and so on, in logic that largely matches the .c version.

        public void run() {
            SpeechRecognizer.this.recorder.startRecording();
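            // 1 == AudioRecord.RECORDSTATE_STOPPED, i.e. recording failed to start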
            if (SpeechRecognizer.this.recorder.getRecordingState() == 1) {
                SpeechRecognizer.this.recorder.stop();
                IOException ioe = new IOException("Failed to start recording. Microphone might be already in use.");
                SpeechRecognizer.this.mainHandler.post(SpeechRecognizer.this.new OnErrorEvent(ioe));
            } else {
                Log.d(SpeechRecognizer.TAG, "Starting decoding");
                SpeechRecognizer.this.decoder.startUtt();
                short[] buffer = new short[SpeechRecognizer.this.bufferSize];
                // As in the .c source: query whether we are currently in speech
                boolean inSpeech = SpeechRecognizer.this.decoder.getInSpeech();
                SpeechRecognizer.this.recorder.read(buffer, 0, buffer.length);

                while(!interrupted() && (this.timeoutSamples == -1 || this.remainingSamples > 0)) {
                    // nread is the number of samples read
                    int nread = SpeechRecognizer.this.recorder.read(buffer, 0, buffer.length);
                    if (-1 == nread) {
                        throw new RuntimeException("error reading audio buffer");
                    }

                    if (nread > 0) {
                        SpeechRecognizer.this.decoder.processRaw(buffer, (long)nread, false, false);
                        if (SpeechRecognizer.this.decoder.getInSpeech() != inSpeech) {
                            inSpeech = SpeechRecognizer.this.decoder.getInSpeech();
                            SpeechRecognizer.this.mainHandler.post(SpeechRecognizer.this.new InSpeechChangeEvent(inSpeech));
                        }

                        if (inSpeech) {
                            this.remainingSamples = this.timeoutSamples;
                        }

                        Hypothesis hypothesis = SpeechRecognizer.this.decoder.hyp();
                    // Tracing this `false` shows the utterance is not yet final,
                    // so the callback invoked is onPartialResult
                        SpeechRecognizer.this.mainHandler.post(SpeechRecognizer.this.new ResultEvent(hypothesis, false));
                    }

                    if (this.timeoutSamples != -1) {
                        this.remainingSamples -= nread;
                    }
                }
                // After stop() here, onResult will be invoked
                SpeechRecognizer.this.recorder.stop();
                SpeechRecognizer.this.decoder.endUtt();
                SpeechRecognizer.this.mainHandler.removeCallbacksAndMessages((Object)null);
                if (this.timeoutSamples != -1 && this.remainingSamples <= 0) {
                    SpeechRecognizer.this.mainHandler.post(SpeechRecognizer.this.new TimeoutEvent());
                }

            }
        }
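
On the receiving side, the events posted above arrive through a RecognitionListener. A minimal consumer sketch (listener method names as in the pocketsphinx-android demo; "mysearch" is a placeholder for whatever search you registered):

    recognizer.addListener(new RecognitionListener() {
        @Override
        public void onPartialResult(Hypothesis hyp) {
            // delivered by ResultEvent(hypothesis, false) in the loop above
            if (hyp != null) Log.d("Sphinx", "partial: " + hyp.getHypstr());
        }
        @Override
        public void onResult(Hypothesis hyp) {
            // delivered once stop() has ended the utterance
            if (hyp != null) Log.d("Sphinx", "final: " + hyp.getHypstr());
        }
        @Override
        public void onBeginningOfSpeech() {}
        @Override
        public void onEndOfSpeech() {}
        @Override
        public void onError(Exception e) {}
        @Override
        public void onTimeout() {}
    });
    recognizer.startListening("mysearch");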

Decoder.class

From the logic above, it is Decoder.class that deals with the raw audio data and with JNI. To get at the PCM audio stored in each buffer you can use getRawdata, and the other parameters normally set on the command line should be settable through this interface as well; see
https://stackoverflow.com/questions/29008111/give-a-file-as-input-to-pocketsphinx-on-android

    Config c = Decoder.defaultConfig();
    c.setString("-hmm", "../../model/en-us/en-us");
    c.setString("-lm", "../../model/en-us/en-us.lm.dmp");
    c.setString("-dict", "../../model/en-us/cmudict-en-us.dict");
    Decoder d = new Decoder(c);

    URL testwav = new URL("file:../../test/data/goforward.wav");
    FileInputStream stream = new FileInputStream(new File(testwav.getFile()));

    d.startUtt();
    byte[] b = new byte[4096];
    try {
        int nbytes;
        while ((nbytes = stream.read(b)) >= 0) {
            ByteBuffer bb = ByteBuffer.wrap(b, 0, nbytes);

            // Not needed on desktop but required on android
            bb.order(ByteOrder.LITTLE_ENDIAN); 

            short[] s = new short[nbytes/2];
            bb.asShortBuffer().get(s);
            d.processRaw(s, nbytes/2, false, false);
        }
    } catch (IOException e) {
        fail("Error when reading goforward.wav" + e.getMessage());
    }
    d.endUtt();
    System.out.println(d.hyp().getHypstr());
    for (Segment seg : d.seg()) {
        System.out.println(seg.getWord());
    }
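As for the getRawdata mentioned above: pocketsphinx has ps_set_rawdata_size / ps_get_rawdata, which the SWIG wrapper should expose roughly as below. A sketch under that assumption (the Java method names are unverified):

    // Sketch: ask the decoder to retain the utterance's PCM, then fetch it
    d.setRawdataSize(256000);       // max samples to keep (assumed wrapper of ps_set_rawdata_size)
    d.startUtt();
    // ... processRaw() calls as in the loop above ...
    d.endUtt();
    short[] pcm = d.getRawdata();   // 16-bit PCM of the utterance just decoded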

Adjusting the alignment information

Sometimes the alignment information for each word does not start from 0 but carries over from where the previous recording ended, which is a bit of a nuisance. How can this offset be reset?

/**
 * Get the offset of the utterance start of the current stream, helpful for stream-wide timing.
 */
int32
acmod_stream_offset(acmod_t *acmod)
{
    return acmod->utt_start_frame;
}

/**
 * Reset the current stream
 */
void acmod_start_stream(acmod_t *acmod)
{
    fe_start_stream(acmod->fe);
    acmod->utt_start_frame = 0;
}

Tracing further leads to:

int
ps_start_stream(ps_decoder_t *ps)
{
    acmod_start_stream(ps->acmod);
    return 0;
}

which corresponds to this method in Decoder.class:

    public void startStream() {
        PocketSphinxJNI.Decoder_startStream(this.swigCPtr, this);
    }

Calling this function each time recognition restarts resets the offset. In practice on Android, however, the reset occasionally fails, presumably because it takes some time and a new recognition may begin before it completes. So I fell back to the most brute-force reset: save the Config when the decoder is created, and on each new recognition rebuild the decoder from that original Config.
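
A minimal sketch of that brute-force reset (the field and method names are hypothetical; the point is only that the initial Config is kept around):

    private Config savedConfig;   // saved when the decoder is first created
    private Decoder decoder;

    void createDecoder(Config c) {
        savedConfig = c;
        decoder = new Decoder(c);
    }

    // Brute-force "reset": discard the old decoder and rebuild it from the
    // original Config, so frame offsets start from 0 again
    void resetDecoder() {
        decoder = new Decoder(savedConfig);
    }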

File storage

The Java class passes rawlogdir straight through to JNI; going back to the C source, you can see where this parameter is used to save the audio:

    if (ps->rawlogdir) {
        char *logfn = string_join(ps->rawlogdir, "/",
                                  uttid, ".raw", NULL);
        FILE *rawfh;
        E_INFO("Writing raw audio file: %s\n", logfn);
        if ((rawfh = fopen(logfn, "wb")) == NULL) {
            E_ERROR_SYSTEM("Failed to open raw audio file %s", logfn);
            ckd_free(logfn);
            return -1;
        }
        ckd_free(logfn);
        acmod_set_rawfh(ps->acmod, rawfh);
    }

Just search for the references to rawlogdir to find this.
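
On the Android side this is normally switched on through SpeechRecognizerSetup, as in the official pocketsphinx-android demo (assetsDir here is an assumption standing in for wherever the model was synced to):

    SpeechRecognizer recognizer = SpeechRecognizerSetup.defaultSetup()
            .setAcousticModel(new File(assetsDir, "en-us-ptm"))
            .setDictionary(new File(assetsDir, "cmudict-en-us.dict"))
            .setRawLogDir(assetsDir)   // each utterance is dumped here as a .raw file
            .getRecognizer();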

Rawdata

        if (acmod->rawfh) {
            fwrite(prev_audio_inptr, sizeof(int16),
                   processed_samples, acmod->rawfh);
        }

Each unit here is an int16, with sizeof(int16) = 2, so the saved .raw file is plain 16-bit PCM samples.

File name format

Searching for the references to uttid shows the format: ps_start_utt keeps a per-decoder counter and, judging from the resulting file names, formats it as a zero-padded nine-digit number (something like sprintf(uttid, "%09u", ps->uttno)).

On the phone the files end up stored under names like 000000000.raw; the MFCC feature files use the same naming scheme. The counter is incremented on every start_utt, and on Android start_utt is called each time startListening is invoked.
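
Putting the last two sections together, loading one logged utterance back into memory looks roughly like this (rawLogDir and the exact file name are assumptions following the naming above):

    // Sketch: read one logged utterance (16-bit little-endian PCM) into a short[]
    byte[] raw = Files.readAllBytes(Paths.get(rawLogDir, "000000000.raw"));
    short[] pcm = new short[raw.length / 2];
    ByteBuffer.wrap(raw).order(ByteOrder.LITTLE_ENDIAN).asShortBuffer().get(pcm);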

MFCC

When the file handle is first set, an int 0 is written, as below:

int
acmod_set_mfcfh(acmod_t *acmod, FILE *logfh)
{
    int rv = 0;

    if (acmod->mfcfh)
        fclose(acmod->mfcfh);
    acmod->mfcfh = logfh;
    fwrite(&rv, 4, 1, acmod->mfcfh);
    return rv;
}

The processing in the middle looks like this. Here cep is a two-dimensional array: the first dimension is n_frames (which varies from call to call), the second is feat_cepsize(acmod->fcb) = 13, i.e. the number of features per frame, giving a total size of n:

static int
acmod_log_mfc(acmod_t *acmod,
              mfcc_t **cep, int n_frames)
{
    int n = n_frames * feat_cepsize(acmod->fcb);
    /* Write features. */
    if (fwrite(cep[0], sizeof(mfcc_t), n, acmod->mfcfh) != n) {
        E_ERROR_SYSTEM("Failed to write %d values to file", n);
    }
    return 0;
}

And when the file is closed, the total number of feature values is computed (the file length minus the leading rv = 0 int (4 bytes), divided by 4) and written back:

    if (acmod->mfcfh) {
        long outlen;
        int32 rv;
        outlen = (ftell(acmod->mfcfh) - 4) / 4;
        /* Try to seek and write */
        if ((rv = fseek(acmod->mfcfh, 0, SEEK_SET)) == 0) {
            fwrite(&outlen, 4, 1, acmod->mfcfh);
        }
        fclose(acmod->mfcfh);
        acmod->mfcfh = NULL;
    }

Note that the fseek goes back to offset 0, so the leading 0 is just a placeholder that gets overwritten with the total length. The final MFCC file format is therefore: total value count (one int) + (n frames × 13 dimensions) of MFCC features, with every unit 4 bytes.
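
Given that layout, a minimal reader sketch. This assumes a floating-point build where mfcc_t is a 4-byte little-endian float (Android builds are often compiled with FIXED_POINT, in which case each value is a fixed-point int32 instead), and the .mfc file name is assumed to follow the same scheme as the .raw files:

    // Sketch: parse an .mfc file as written above (leading int = number of values)
    byte[] raw = Files.readAllBytes(Paths.get(mfcLogDir, "000000000.mfc"));
    ByteBuffer bb = ByteBuffer.wrap(raw).order(ByteOrder.LITTLE_ENDIAN);
    int n = bb.getInt();                 // total mfcc_t count, written back on close
    int cepSize = 13;                    // feat_cepsize(acmod->fcb) from above
    float[][] cep = new float[n / cepSize][cepSize];
    for (int i = 0; i < n / cepSize; i++)
        for (int j = 0; j < cepSize; j++)
            cep[i][j] = bb.getFloat();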

With all of the above, we can now post-process the segment information, the MFCC features, and the original audio files.

Appendix

C library functions used in the .c source: fwrite, fseek, ftell
Android audio processing references:
https://www.cnblogs.com/lsjwzh/p/4361457.html
https://blog.csdn.net/tongsiw/article/details/51469686
