HTK book中提到的参数有11种:
"LPC", "LPREFC", "LPCEPSTRA", "LPDELCEP", "IREFC", "MFCC", "FBANK", "MELSPEC","DISCRETE", "PLP","ANON"
但HTK3.4中是否都支持呢?
请看ConvertFrame()函数中的以下代码:
switch(btgt){
case LPC:
Wave2LPC(cf->s,cf->a,cf->k,&re,&te);
v = cf->a; bsize = cf->lpcOrder;
break;
case LPREFC:
Wave2LPC(cf->s,cf->a,cf->k,&re,&te);
v = cf->k; bsize = cf->lpcOrder;
break;
case LPCEPSTRA:
Wave2LPC(cf->s,cf->a,cf->k,&re,&te);
LPC2Cepstrum(cf->a,cf->c);
if (cf->cepLifter > 0)
WeightCepstrum(cf->c, 1, cf->numCepCoef, cf->cepLifter);
v = cf->c; bsize = cf->numCepCoef;
break;
case MELSPEC:
case FBANK:
Wave2FBank(cf->s, cf->fbank, rawE?NULL:&te, cf->fbInfo);
v = cf->fbank; bsize = cf->numChans;
break;
case MFCC:
Wave2FBank(cf->s, cf->fbank, rawE?NULL:&te, cf->fbInfo);
FBank2MFCC(cf->fbank, cf->c, cf->numCepCoef);
if (cf->cepLifter > 0)
WeightCepstrum(cf->c, 1, cf->numCepCoef, cf->cepLifter);
v = cf->c; bsize = cf->numCepCoef;
break;
case PLP:
Wave2FBank(cf->s, cf->fbank, rawE ? NULL : &te, cf->fbInfo);
FBank2ASpec(cf->fbank, cf->as, cf->eql, cf->compressFact, cf->fbInfo);
ASpec2LPCep(cf->as, cf->ac, cf->lp, cf->c, cf->cm);
if (cf->cepLifter > 0)
WeightCepstrum(cf->c, 1, cf->numCepCoef, cf->cepLifter);
v = cf->c;
bsize = cf->numCepCoef;
break;
default:
HError(6321,"ConvertFrame: target %s is not a parameterised form",
ParmKind2Str(cf->tgtPK,buf));
}
可以看出HTK3.4支持7种参数:LPC,LPREFC,LPCEPSTRA,MELSPEC,FBANK,MFCC,PLP。
参数转换顺序可以参照HTK book Fig.5.9:
IOConfigRec数据结构存放着特征提取过程中用到的众多配置参数:
typedef struct {
/* ------- Overrideable parameters ------- */
ParmKind srcPK; /* Source ParmKind */
FileFormat srcFF; /* Source File format */
HTime srcSampRate; /* Source Sample Rate */
Boolean zMeanSrc; /* Zero Mean the Source */
ParmKind tgtPK; /* Target ParmKind */
FileFormat tgtFF; /* Target File format */
......
}IOConfigRec;
ValidCodeParms()函数检查analysis.conf的参数是否合理。
/* ValidCodeParms: check to ensure reasonable wave->parm code params */
static void ValidCodeParms(IOConfig cf)
/* SetUpForCoding: set style, sizes and working storage */
static void SetUpForCoding(MemHeap *x, IOConfig cf, int frSize)
ValidConversion()函数检查原格式到目标格式的转换是否 可能完成。
/* EXPORT->ValidConversion: checks that src -> tgt conversion is possible */
Boolean ValidConversion (ParmKind src, ParmKind tgt)
TotalComps()函数返回 特征参数的 维度。
/* TotalComps: return the total number of components in a parameter vector
with nStatic components and ParmKind pk */
static int TotalComps(int nStatic, ParmKind pk)
在OpenAsChannel()函数中,计算特征参数所需的内存空间:
dBytes = cf->nCols * pbuf->main.maxRows * sizeof(float);
= 39 * 243 * 4 = 37908
在提取特征参数FillBufFromChannel()函数前,调用了StartBuffer()函数,那么StartBuffer()函数有什么作用呢?
/* EXPORT->StartBuffer: start audio and fill the buffer */
void StartBuffer(ParmBuf pbuf)
{
……
if (pbuf->status == PB_INIT) {
if (pbuf->cf->useSilDet) ChangeState(pbuf,PB_WAITING);
else ChangeState(pbuf,PB_FILLING);
}
……
}
/* PBStatus: the states a parameter buffer can be in.  The numeric values
   are written out explicitly; they are the same values the compiler
   assigns implicitly when the enumerators are listed in this order. */
typedef enum {
   PB_INIT     = 0,   /* initialised, nothing stored yet */
   PB_WAITING  = 1,   /* waiting for speech */
   PB_STOPPING = 2,   /* waiting for silence */
   PB_FILLING  = 3,   /* being filled */
   PB_STOPPED  = 4,   /* stopped, but still holds data */
   PB_CLEARED  = 5    /* all data has been taken out */
} PBStatus;
PBStatus status; /* status of this buffer */
通过ChangeState()函数可以看出,ParmBuf pbuf有一个状态标志PBStatus status。StartBuffer()函数的作用就是在缓冲区处于PB_INIT状态时修改这个标志:若启用了静音检测(useSilDet),则改为PB_WAITING,表示等待语音;否则改为PB_FILLING,表示正在填装数据。
FillBufFromChannel()函数提取wav数据的特征,FillBufFromChannel()函数在OpenAsChannel()函数中被调用。
/* OpenAsChannel: open and create an audio input buffer */
static ReturnStatus OpenAsChannel(ParmBuf pbuf, int maxObs,
char *fname, FileFormat ff,
TriState silMeasure)
{
……
if (maxObs==0) {
/* maxObs==0 indicates want a table straight away */
StartBuffer(pbuf);
while(pbuf->status<PB_STOPPED)
FillBufFromChannel(pbuf,MAX_INT);
}
……
}
FillBufFromChannel()函数中调用了 函数FramesInChannel()和函数GetFrameFromChannel()。
从FramesInChannel()函数注释看,好像是提取参数,但仔细一看,原来是返回可以读取的行数(wav语音窗数)。
/* Fill Buffer with converted static coef vectors */
newRows=FramesInChannel(pbuf,pbuf->chType);
/* Return number of frames that can be read without blocking */
/* -1 == Done, no more to read. */
/* 0 == May block on reading first frame. */
/* N == Can read N frames immediately without blocking. */
/* INT_MAX == Will not block. */
static int FramesInChannel(ParmBuf pbuf,int chType){
……
}
再来看看GetFrameFromChannel()函数,
/* Get a single frame from particular channel */
/* Return value indicates number of frames read okay */
static int GetFrameFromChannel(ParmBuf pbuf,int chType,void *vp)
FillBufFromChannel中逐窗提取语音特征参数的for循环:
/* Read the necessary frames */
for (i=0; i<newRows; i++) {
/* But have final check on read just in case */
if (pbuf->dShort) {
if (GetFrameFromChannel(pbuf,pbuf->chType,sp1)!=1) {
pbuf->chClear=TRUE;
break;
}
sp1 += cf->nCols;
}
else {
if (GetFrameFromChannel(pbuf,pbuf->chType,fp1)!=1) { //提取特征参数
pbuf->chClear=TRUE;
break;
}
fp1 += cf->nCols;
}
pbuf->inRow++;pbuf->main.nRows++;
}
//fp1表示存放 特征参数的buffer。Mfcc参数是float型。
// pbuf->main.data 的原型:void *data; /* parameterised data for this block */
fp1 = (float*) pbuf->main.data + pbuf->main.nRows*cf->nCols;
static void FillBufFromChannel(ParmBuf pbuf,int minRows)
{
……
for (i=0; i<newRows; i++) {
/* But have final check on read just in case */
if (pbuf->dShort) {
if (GetFrameFromChannel(pbuf,pbuf->chType,sp1)!=1) {
pbuf->chClear=TRUE;
break;
}
sp1 += cf->nCols;
}
else {
if (GetFrameFromChannel(pbuf,pbuf->chType,fp1)!=1) { //调用这里
pbuf->chClear=TRUE;
break;
}
fp1 += cf->nCols;
}
pbuf->inRow++;pbuf->main.nRows++;
}
……
}
GetFrameFromChannel()函数调用ConvertFrame()来将语音转换为特征参数。
/* Get a single frame from particular channel */
/* Return value indicates number of frames read okay */
static int GetFrameFromChannel(ParmBuf pbuf,int chType,void *vp)
{
……
/* Then convert it to a frame */
if (ConvertFrame(cf, (float *) vp) != cf->nCvrt)
……
}
ConvertFrame()函数是最直接的参数提取函数,原wav语音数据存放于cf->s 中,而cf->s 是Vector s类型,即float*类型。
如何将单声道16b的wav语音存放为float型呢?
HTK将样本点由short int强制转换为float型,这一步在GetWave()函数中实现。
/* EXPORT->GetWave: Get next nFrames from w and store in buf */
void GetWave(Wave w, int nFrames, float *buf)
{
.....
*buf++ = w->data[w->frIdx+k]; //将short int转换为float,存放于cf->s中
......
}
/* ConvertFrame: convert frame in cf->s and store in pbuf, return total
   parameters stored in pbuf.

   cf   - coding configuration; cf->s holds the waveform frame on entry and
          is modified in place (dither, zero-mean, pre-emphasis, Hamming).
          cf->curPK has HASZEROC / HASENERGY or-ed in when the matching
          component is emitted.
   pbuf - output area; receives bsize base coefficients, then an optional
          C0 term, then an optional log-energy term, in that order.

   Returns the number of floats written to pbuf.

   Vectors (cf->s, v) are accessed with indices 1..size throughout.

   NOTE(review): two statements were cut off in the extracted text (the
   v1Compat test on rawE and the log-energy assignment).  They are restored
   here to match HTK 3.4 HParm.c - confirm against the upstream source. */
static int ConvertFrame(IOConfig cf, float *pbuf)
{
   ParmKind btgt = cf->tgtPK&BASEMASK;   /* target kind masked with BASEMASK */
   float re,rawte=0.0,te,*p, cepScale = 1.0;
   int i,bsize=0;
   Vector v=NULL;
   char buf[50];
   Boolean rawE;

   p = pbuf;
   rawE = cf->rawEnergy;
   /* Restored line: raw energy is switched off for target kinds ordered
      before MFCC when v1Compat is set. */
   if (btgt<MFCC && cf->v1Compat)
      rawE = FALSE;

   /* Optional dither: perturb every sample by a RandomValue()-derived
      offset scaled by addDither. */
   if (cf->addDither!=0.0)
      for (i=1; i<=VectorSize(cf->s); i++)
         cf->s[i] += (RandomValue()*2.0 - 1.0)*cf->addDither;

   if (cf->zMeanSrc && !cf->v1Compat)
      ZeroMeanFrame(cf->s);

   /* Raw energy is the sum of squared samples, taken before pre-emphasis
      and windowing are applied below. */
   if ((cf->tgtPK&HASENERGY) && rawE){
      rawte = 0.0;
      for (i=1; i<=VectorSize(cf->s); i++)
         rawte += cf->s[i] * cf->s[i];
   }

   if (cf->preEmph>0.0)
      PreEmphasise(cf->s,cf->preEmph);
   if (cf->useHam) Ham(cf->s);

   /* Run the analysis chain for the base kind; v is the vector to copy out
      and bsize the number of base coefficients in it. */
   switch(btgt){
   case LPC:
      Wave2LPC(cf->s,cf->a,cf->k,&re,&te);
      v = cf->a; bsize = cf->lpcOrder;
      break;
   case LPREFC:
      Wave2LPC(cf->s,cf->a,cf->k,&re,&te);
      v = cf->k; bsize = cf->lpcOrder;
      break;
   case LPCEPSTRA:
      Wave2LPC(cf->s,cf->a,cf->k,&re,&te);
      LPC2Cepstrum(cf->a,cf->c);
      if (cf->cepLifter > 0)
         WeightCepstrum(cf->c, 1, cf->numCepCoef, cf->cepLifter);
      v = cf->c; bsize = cf->numCepCoef;
      break;
   case MELSPEC:
   case FBANK:
      /* te is only requested from Wave2FBank when raw energy is not used */
      Wave2FBank(cf->s, cf->fbank, rawE?NULL:&te, cf->fbInfo);
      v = cf->fbank; bsize = cf->numChans;
      break;
   case MFCC:
      Wave2FBank(cf->s, cf->fbank, rawE?NULL:&te, cf->fbInfo);
      FBank2MFCC(cf->fbank, cf->c, cf->numCepCoef);
      if (cf->cepLifter > 0)
         WeightCepstrum(cf->c, 1, cf->numCepCoef, cf->cepLifter);
      v = cf->c; bsize = cf->numCepCoef;
      break;
   case PLP:
      Wave2FBank(cf->s, cf->fbank, rawE ? NULL : &te, cf->fbInfo);
      FBank2ASpec(cf->fbank, cf->as, cf->eql, cf->compressFact, cf->fbInfo);
      ASpec2LPCep(cf->as, cf->ac, cf->lp, cf->c, cf->cm);
      if (cf->cepLifter > 0)
         WeightCepstrum(cf->c, 1, cf->numCepCoef, cf->cepLifter);
      v = cf->c;
      bsize = cf->numCepCoef;
      break;
   default:
      HError(6321,"ConvertFrame: target %s is not a parameterised form",
             ParmKind2Str(cf->tgtPK,buf));
   }

   /* Only PLP and MFCC outputs are scaled by cepScale; every other kind
      keeps the factor 1.0 (as does v1Compat mode). */
   if (btgt == PLP || btgt == MFCC)
      cepScale = (cf->v1Compat) ? 1.0 : cf->cepScale;
   for (i=1; i<=bsize; i++)
      *p++ = v[i] * cepScale;

   if (cf->tgtPK&HASZEROC){
      if (btgt == MFCC) {
         *p = FBank2C0(cf->fbank) * cepScale;
         if (cf->v1Compat) *p *= cf->eScale;
         ++p;
      }
      else      /* For PLP include gain as C0 */
         *p++ = v[bsize+1] * cepScale;
      cf->curPK|=HASZEROC ;
   }

   if (cf->tgtPK&HASENERGY) {
      if (rawE) te = rawte;
      /* Restored line: log energy, replaced by LZERO when te is below
         MINLARG so that log() is never taken of a too-small value. */
      *p++ = (te<MINLARG) ? LZERO : log(te);
      cf->curPK|=HASENERGY;
   }
   return p - pbuf;
}