To extract useful information from the Android demo, first look at the .c source, since the Android demo is just a JNI wrapper around the .c implementation.
Extracting the needed acoustic scores from the .c source
The parameters I want here are start, end, and ascr. Looking at the continuous.c source, the main logic is as follows:
// Keep processing as long as audio can be read
while ((k = fread(adbuf, sizeof(int16), 2048, rawfd)) > 0) {
    ps_process_raw(ps, adbuf, k, FALSE, FALSE);
    // in_speech == TRUE means speech is present; FALSE means silence
    in_speech = ps_get_in_speech(ps);
    // The utterance starts
    if (in_speech && !utt_started) {
        utt_started = TRUE;
    }
    // Speech has stopped after the utterance started, i.e. we reached the trailing silence
    if (!in_speech && utt_started) {
        // The debug output is printed inside this function
        ps_end_utt(ps);
        hyp = ps_get_hyp(ps, NULL);
        if (hyp != NULL)
            printf("%s\n", hyp);
        if (print_times)
            print_word_times();
        fflush(stdout);
        ps_start_utt(ps);
        utt_started = FALSE;
    }
} // end of the while loop
// Audio exhausted: flush the last utterance once more
ps_end_utt(ps);
if (utt_started) {
    hyp = ps_get_hyp(ps, NULL);
    if (hyp != NULL) {
        printf("%s\n", hyp);
        if (print_times) {
            print_word_times();
        }
    }
}
Moving into ps_end_utt, the relevant lines are:
E_INFO("%s (%d)\n", hyp, score);
E_INFO_NOFN("%-20s %-5s %-5s %-5s %-10s %-10s %-3s\n",
"word", "start", "end", "pprob", "ascr", "lscr", "lback");
for (seg = ps_seg_iter(ps); seg;
seg = ps_seg_next(seg)) {
char const *word;
int sf, ef;
int32 post, lscr, ascr, lback;
word = ps_seg_word(seg);
ps_seg_frames(seg, &sf, &ef);
post = ps_seg_prob(seg, &ascr, &lscr, &lback);
E_INFO_NOFN("%-20s %-5d %-5d %-1.3f %-10d %-10d %-3d\n",
word, sf, ef, logmath_exp(ps_get_logmath(ps), post),
ascr, lscr, lback);
}
This matches what we wanted at the start: the functions we need are ps_seg_frames and ps_seg_prob. The discussion at https://sourceforge.net/p/cmusphinx/discussion/help/thread/dd80eb2a/ confirms this as well.
In the Android environment, the Hypothesis object is the recognition result. Unfortunately, unlike Kaldi, there is no way to get the stress of each phone; see https://sourceforge.net/p/cmusphinx/discussion/sphinx4/thread/736c772a/?limit=25#d425
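To pull the same start/end/ascr information on the Android side, the segments behind the Hypothesis can be iterated via Decoder.seg(), mirroring the ps_seg_frames()/ps_seg_prob() loop above. A minimal sketch, assuming the SWIG-generated Segment class exposes getters such as getStartFrame(), getEndFrame(), getAscore() and getLscore() (verify the exact names against the generated bindings), and assuming the default -frate of 100 frames per second:

import edu.cmu.pocketsphinx.Decoder;
import edu.cmu.pocketsphinx.Segment;

public class WordTimings {
    // Assumed default frame rate (-frate) of 100 frames per second.
    private static final int FRAME_RATE = 100;

    /** Print per-word timing and score info after decoder.endUtt(). */
    public static void dump(Decoder decoder) {
        if (decoder.hyp() == null)
            return; // nothing was recognized in this utterance
        for (Segment seg : decoder.seg()) {
            // Getter names are assumptions based on the ps_seg_t fields shown above.
            double start = (double) seg.getStartFrame() / FRAME_RATE; // seconds
            double end = (double) seg.getEndFrame() / FRAME_RATE;     // seconds
            System.out.println(seg.getWord() + "\t" + start + "\t" + end
                    + "\tascr=" + seg.getAscore() + "\tlscr=" + seg.getLscore());
        }
    }
}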
How things work on Android
To see how callbacks such as onPartialResult and onResult work, look at the logic inside SpeechRecognizer.class, which is a wrapper around Decoder.class. Adding grammars and so on can be done at this layer (it can also be done at the Decoder layer, which is just lower level; see the next subsection). This class also handles the audio: it captures the recording with Android's AudioRecord class (for AudioRecord usage see https://blog.csdn.net/qq_36982160/article/details/79383046), stores the samples in a buffer, and then decides whether the utterance has ended and so on, much like the .c code above.
public void run() {
    SpeechRecognizer.this.recorder.startRecording();
    if (SpeechRecognizer.this.recorder.getRecordingState() == 1) {
        SpeechRecognizer.this.recorder.stop();
        IOException ioe = new IOException("Failed to start recording. Microphone might be already in use.");
        SpeechRecognizer.this.mainHandler.post(SpeechRecognizer.this.new OnErrorEvent(ioe));
    } else {
        Log.d(SpeechRecognizer.TAG, "Starting decoding");
        SpeechRecognizer.this.decoder.startUtt();
        short[] buffer = new short[SpeechRecognizer.this.bufferSize];
        // Same as the .c source: check whether there is speech to recognize
        boolean inSpeech = SpeechRecognizer.this.decoder.getInSpeech();
        SpeechRecognizer.this.recorder.read(buffer, 0, buffer.length);
        while (!interrupted() && (this.timeoutSamples == -1 || this.remainingSamples > 0)) {
            // nread is the number of samples read
            int nread = SpeechRecognizer.this.recorder.read(buffer, 0, buffer.length);
            if (-1 == nread) {
                throw new RuntimeException("error reading audio buffer");
            }
            if (nread > 0) {
                SpeechRecognizer.this.decoder.processRaw(buffer, (long)nread, false, false);
                if (SpeechRecognizer.this.decoder.getInSpeech() != inSpeech) {
                    inSpeech = SpeechRecognizer.this.decoder.getInSpeech();
                    SpeechRecognizer.this.mainHandler.post(SpeechRecognizer.this.new InSpeechChangeEvent(inSpeech));
                }
                if (inSpeech) {
                    this.remainingSamples = this.timeoutSamples;
                }
                Hypothesis hypothesis = SpeechRecognizer.this.decoder.hyp();
                // The "false" here means the utterance is not finished yet, so the callback fired is onPartialResult
                SpeechRecognizer.this.mainHandler.post(SpeechRecognizer.this.new ResultEvent(hypothesis, false));
            }
            if (this.timeoutSamples != -1) {
                this.remainingSamples -= nread;
            }
        }
        // After stop() here, onResult gets called
        SpeechRecognizer.this.recorder.stop();
        SpeechRecognizer.this.decoder.endUtt();
        SpeechRecognizer.this.mainHandler.removeCallbacksAndMessages((Object)null);
        if (this.timeoutSamples != -1 && this.remainingSamples <= 0) {
            SpeechRecognizer.this.mainHandler.post(SpeechRecognizer.this.new TimeoutEvent());
        }
    }
}
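For context, this is roughly how that layer is driven from application code: build the recognizer with SpeechRecognizerSetup, register a listener to receive onPartialResult/onResult, add a search, and call startListening, which spawns the run() loop above. A sketch modeled on the official demo; the model and grammar file names are placeholders:

import java.io.File;
import java.io.IOException;

import edu.cmu.pocketsphinx.RecognitionListener;
import edu.cmu.pocketsphinx.SpeechRecognizer;
import edu.cmu.pocketsphinx.SpeechRecognizerSetup;

public class RecognizerFactory {
    /** Builds a SpeechRecognizer, wires up the callbacks and starts a grammar search. */
    public static SpeechRecognizer create(File modelDir, RecognitionListener listener)
            throws IOException {
        SpeechRecognizer recognizer = SpeechRecognizerSetup.defaultSetup()
                .setAcousticModel(new File(modelDir, "en-us-ptm"))       // placeholder paths
                .setDictionary(new File(modelDir, "cmudict-en-us.dict"))
                .getRecognizer();
        recognizer.addListener(listener);                  // onPartialResult/onResult arrive here
        recognizer.addGrammarSearch("digits", new File(modelDir, "digits.gram"));
        recognizer.startListening("digits");               // starts the run() loop shown above
        return recognizer;
    }
}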
Decoder.class
From the logic above, the class that deals with the raw audio data and with JNI is Decoder.class. To process the PCM audio stored in each buffer you can use getRawdata, and the other parameters normally set on the command line should also be settable through this interface; see
https://stackoverflow.com/questions/29008111/give-a-file-as-input-to-pocketsphinx-on-android
Config c = Decoder.defaultConfig();
c.setString("-hmm", "../../model/en-us/en-us");
c.setString("-lm", "../../model/en-us/en-us.lm.dmp");
c.setString("-dict", "../../model/en-us/cmudict-en-us.dict");
Decoder d = new Decoder(c);

URL testwav = new URL("file:../../test/data/goforward.wav");
FileInputStream stream = new FileInputStream(new File(testwav.getPath()));

d.startUtt();
byte[] b = new byte[4096];
try {
    int nbytes;
    while ((nbytes = stream.read(b)) >= 0) {
        ByteBuffer bb = ByteBuffer.wrap(b, 0, nbytes);
        // Not needed on desktop but required on android
        bb.order(ByteOrder.LITTLE_ENDIAN);
        short[] s = new short[nbytes/2];
        bb.asShortBuffer().get(s);
        d.processRaw(s, nbytes/2, false, false);
    }
} catch (IOException e) {
    fail("Error when reading goforward.wav" + e.getMessage());
}
d.endUtt();
System.out.println(d.hyp().getHypstr());
for (Segment seg : d.seg()) {
    System.out.println(seg.getWord());
}
Adjusting the alignment information
Sometimes the alignment of each word does not start from 0 but is offset by where the previous recording ended, which is a bit of a nuisance. How can this information be reset?
/**
* Get the offset of the utterance start of the current stream, helpful for stream-wide timing.
*/
int32
acmod_stream_offset(acmod_t *acmod)
{
    return acmod->utt_start_frame;
}

/**
 * Reset the current stream
 */
void
acmod_start_stream(acmod_t *acmod)
{
    fe_start_stream(acmod->fe);
    acmod->utt_start_frame = 0;
}
Tracing this up the call chain leads to
int
ps_start_stream(ps_decoder_t *ps)
{
    acmod_start_stream(ps->acmod);
    return 0;
}
which corresponds to the following method in Decoder.class:
public void startStream() {
    PocketSphinxJNI.Decoder_startStream(this.swigCPtr, this);
}
Calling this function whenever recognition restarts resets the offset. However, in real tests on Android the reset seems to take some time; if recognition starts again before the reset finishes, it sometimes fails. So I switched to the most brute-force reset: save the Config used to create the decoder, and whenever a new recognition starts, simply rebuild the decoder from that original Config.
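A minimal sketch of that brute-force reset, using nothing beyond the Decoder(Config) constructor from the example earlier: keep the original Config around and construct a fresh Decoder whenever frame numbering should start from 0 again.

import edu.cmu.pocketsphinx.Config;
import edu.cmu.pocketsphinx.Decoder;

public class ResettableDecoder {
    private final Config config; // the Config used to build the very first decoder
    private Decoder decoder;

    public ResettableDecoder(Config config) {
        this.config = config;
        this.decoder = new Decoder(config);
    }

    public Decoder get() {
        return decoder;
    }

    /** Brute-force reset: rebuild the decoder from the saved Config so the next
     *  utterance's frames are numbered from 0 again. */
    public void reset() {
        decoder = new Decoder(config);
    }
}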
File storage
The Java class passes rawlogdir straight down through JNI; looking at the C source shows where the files controlled by this parameter are written:
if (ps->rawlogdir) {
char *logfn = string_join(ps->rawlogdir, "/",
uttid, ".raw", NULL);
FILE *rawfh;
E_INFO("Writing raw audio file: %s\n", logfn);
if ((rawfh = fopen(logfn, "wb")) == NULL) {
E_ERROR_SYSTEM("Failed to open raw audio file %s", logfn);
ckd_free(logfn);
return -1;
}
ckd_free(logfn);
acmod_set_rawfh(ps->acmod, rawfh);
}
Just search for the references to rawlogdir.
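For reference, a sketch of switching this on from the Java side: the -rawlogdir option can be set on the Config before constructing the Decoder. At the SpeechRecognizer layer the official demo exposes the same thing through SpeechRecognizerSetup.setRawLogDir; treat that method name as an assumption to verify against your version of the bindings.

import java.io.File;

import edu.cmu.pocketsphinx.Config;
import edu.cmu.pocketsphinx.Decoder;

public class RawLogging {
    /** Low-level route: pass -rawlogdir straight into the decoder config so every
     *  utterance is dumped as <uttid>.raw into the given directory. */
    public static Decoder decoderWithRawLog(Config c, File logDir) {
        c.setString("-rawlogdir", logDir.getAbsolutePath());
        return new Decoder(c);
    }
}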
Rawdata
if (acmod->rawfh) {
fwrite(prev_audio_inptr, sizeof(int16),
processed_samples, acmod->rawfh);
}
此处每个单位是int16
,sizeof(int16)=2
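To post-process these dumps, the .raw files can be read back as headerless 16-bit PCM (mono, the recognizer's 16 kHz sample rate, native little-endian on the device). A small sketch:

import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;

public class RawReader {
    /** Reads a 000000000.raw dump back into 16-bit PCM samples. */
    public static short[] read(String path) throws IOException {
        try (RandomAccessFile f = new RandomAccessFile(path, "r")) {
            byte[] bytes = new byte[(int) f.length()];
            f.readFully(bytes);
            short[] samples = new short[bytes.length / 2]; // sizeof(int16) == 2
            ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN)
                      .asShortBuffer().get(samples);
            return samples;
        }
    }
}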
File name format
Searching for the references to uttid shows it is a zero-padded utterance counter, so the files stored on the phone end up with names like
000000000.raw
The MFCC feature files use the same naming scheme.
The counter increases on every start_utt; on Android, start_utt is called on every startListening.
MFCC
When the log file handle is first set, a single int 0 is written as a placeholder:
int
acmod_set_mfcfh(acmod_t *acmod, FILE *logfh)
{
    int rv = 0;

    if (acmod->mfcfh)
        fclose(acmod->mfcfh);
    acmod->mfcfh = logfh;
    fwrite(&rv, 4, 1, acmod->mfcfh);
    return rv;
}
While decoding, the features are appended as below. cep is a two-dimensional array: the first dimension is n_frames (it varies from call to call) and the second is feat_cepsize(acmod->fcb) = 13, i.e. the number of coefficients per frame, giving n values in total:
static int
acmod_log_mfc(acmod_t *acmod,
              mfcc_t **cep, int n_frames)
{
    int n = n_frames * feat_cepsize(acmod->fcb);
    /* Write features. */
    if (fwrite(cep[0], sizeof(mfcc_t), n, acmod->mfcfh) != n) {
        E_ERROR_SYSTEM("Failed to write %d values to file", n);
    }
    return 0;
}
When the utterance ends, the total number of feature values is computed (excluding the 4-byte header written above), and the code seeks back to the start of the file and overwrites the placeholder 0 with that count:
if (acmod->mfcfh) {
long outlen;
int32 rv;
outlen = (ftell(acmod->mfcfh) - 4) / 4;
/* Try to seek and write */
if ((rv = fseek(acmod->mfcfh, 0, SEEK_SET)) == 0) {
fwrite(&outlen, 4, 1, acmod->mfcfh);
}
fclose(acmod->mfcfh);
acmod->mfcfh = NULL;
}
So the final MFCC file layout is: one 4-byte int holding the total number of feature values (the initial 0 placeholder gets overwritten), followed by the n_frames * 13 MFCC values, each of them 4 bytes.
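A sketch of reading such a file back, assuming mfcc_t is a 4-byte float (the default non-fixed-point build), 13 coefficients per frame as observed above, and the device's native little-endian byte order:

import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;

public class MfcReader {
    private static final int CEPSIZE = 13; // feat_cepsize(acmod->fcb) observed above

    /** Reads an MFCC log file into a [n_frames][13] array. */
    public static float[][] read(String path) throws IOException {
        try (RandomAccessFile f = new RandomAccessFile(path, "r")) {
            byte[] all = new byte[(int) f.length()];
            f.readFully(all);
            ByteBuffer bb = ByteBuffer.wrap(all).order(ByteOrder.LITTLE_ENDIAN);
            int count = bb.getInt();            // header: total number of mfcc_t values
            int nFrames = count / CEPSIZE;
            float[][] cep = new float[nFrames][CEPSIZE];
            for (int i = 0; i < nFrames; i++)
                for (int j = 0; j < CEPSIZE; j++)
                    cep[i][j] = bb.getFloat();
            return cep;
        }
    }
}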
With all of the above in place we can post-process the segment information, the MFCC features, and the raw audio files.
Appendix
Functions used in the .c source:
fwrite
fseek
ftell
Android audio processing libraries:
https://www.cnblogs.com/lsjwzh/p/4361457.html
https://blog.csdn.net/tongsiw/article/details/51469686