MFCC特征提取(C语言版本)

        音频分析中,MFCC参数是经典参数之一。之前对于它的计算流程和原理,大体上是比较清楚的,所以仿真的时候,都是直接调用matlab的voicebox工具或者开发的时候直接调用第三方库。最近想整理一个纯C语言版本的MFCC函数,发现第三方开源的一部分是C++的,有些纯C的开源代码是针对语音固定了某些参数,不太灵活。干脆自己动手写一下,发现matlab写习惯了,都弱化了写C的思维,磕磕碰碰弄了2天,初版总算是完成了。

       计算的大体流程:预加重->分帧->加窗->FFT->能量->Mel滤波器组滤波->log->dct解卷积->倒谱提升->差分。

在编码的过程中碰到了一些小问题,跟大家分享一下。

问题1:预加重也就高通滤波,它是针对整段语音的。而如果先进行分帧,然后按帧预加重,这种做法比较符合实时流,一帧一帧的实时处理,感觉应该也是可以的。但是严格来说,分帧预加重和对整段语音预加重还是有区别的,例如两种做法的每帧第一样本点不一致。

此处针对整段语音做预加重,然后一帧一帧的输出特征。

问题2:Mel尺度转换,两个公式都可以。

问题3:能量

幅度谱和用幅度谱的平方都可以。

问题4:相关公式

DCT变换,有好几个形式,见wiki。

http://en.wikipedia.org/wiki/Discrete_cosine_transform。

此处用到:

Mel滤波器组: 三角滤波器

MFCC特征提取(C语言版本)

MFCC特征提取(C语言版本)

 

倒谱提升窗:

w=1+0.5*K*sin(pi*m/K);     1<=m<=K

差分计算: 也有不同的形式,但是基本上是一致的,只是系数倍数不同而已。

差分计算比较麻烦,涉及到前后好几帧,也有对此处简化的处理。

,有的此处分母是开方。

FFT变换:调用了第三方库FFTW,也可以直接使用轻量级的源码FFT计算方法。

 

头文件:

 

#ifndef  MFCC_H

#define  MFCC_H

#include <stdio.h>



typedef struct AudioInfo

{

	int sample_rate;

	int frame_len;

	int frame_shift;

	float* data_in;

	int data_len;	

	AudioInfo():sample_rate(0),frame_len(0),frame_shift(0),data_in(NULL),data_len(0){};//构造函数注释掉,变成纯C版本

};

typedef struct MelBankInfo

{

	float **filter;

	int     nfilters;

	int     nfft;

	int     low;

	int     high;

	MelBankInfo():filter(NULL),nfilters(0),nfft(0),low(0),high(0){};

};



typedef struct DctInfo

{

	float  **coeff;

	int     dctlen;

	DctInfo():coeff(NULL),dctlen(0){};

};



typedef enum MFCC_TYPE

{

	MFCC_STD,

	MFCC_DIFF_1,

	MFCC_DIFF_2,

};



typedef struct MfccInfo

{	  

	MelBankInfo   melbank;

	DctInfo       dct;



	int     nframes;

	int     out_nframes;//可输出的特征数

	float *frame_data;

	float *data_in;

	float  *window;

	float  *lift_window;	

	int     frame_len;

	int     frame_shift;

	float  *pre1;

	float  *pre2;

	float  *cur;

	float  *next1;

	float  *next2;



	float  *diff_pre1;

	float  *diff_pre2;

	float  *diff_cur;

	float  *diff_next1;

	float  *diff_next2;

	MFCC_TYPE m_type;

	MfccInfo():nframes(0),out_nframes(0),window(NULL),lift_window(NULL),cur(NULL),frame_len(0),pre1(NULL),pre2(NULL),next1(NULL),next2(NULL),m_type(MFCC_STD),frame_data(NULL),frame_shift(0){};

};



MfccInfo*  MfccInit(AudioInfo audioctx, int nfft, int low, int high, int nfilters, int ndcts, MFCC_TYPE type);

int        Mfcc_Frame_std(MfccInfo *p, int  iframe, float *out, int len);

int        Mfcc_Frame_diff1(MfccInfo *p, int iframe, float *out, int len);

int		   Mfcc_Frame_diff2(MfccInfo *p, int iframe, float *out, int len);

void       MfccDestroy(MfccInfo *data);



#endif

源文件:



 



 



#include  "mfcc.h"

#include  "Spl.h"

#include  "fftw3.h"

#include  <malloc.h>

#include  <math.h>

#include  <string.h>

#include  <assert.h>



#define  PI   3.1415926

#define  EPS  0.0000001

#pragma comment(lib, "libfftw3f-3.lib")



using namespace std;

void PreEmphasise(const float *data, int len, float *out, float preF)//预加重

{

	for(int i = len - 1; i <= 1; i--)

	{

		out[i] = data[i] - preF * data[i-1];

	}

	out[0] = data[0];

}



float HzToMel(float  f)

{

	return 1127*log(1.0 + f/700);

}

float MelToHz(float data)

{

	return  700 * (exp(data/1127) - 1);

}

int  HzToN(float f, int fs, int nfft)

{

	return  f/fs *nfft+1;

}

void MelBank( int fs, int nfft, int low, int high, int nfilters, float** coeff )//三角滤波器组。

{

	float  fre_bin = (float)fs / nfft;

	float  low_mel = HzToMel(low);

	float  high_mel = HzToMel(high);

	float  mel_bw  = (high_mel - low_mel)/(nfilters + 1);

	int  valid_nfft = nfft/2 + 1;

	



	for(int j = 1; j <= nfilters; j++)

	{	

		float  mel_cent  = j * mel_bw + low_mel;

		float  mel_left  = mel_cent - mel_bw;

		float  mel_right = mel_cent + mel_bw;

        float  freq_cent =  MelToHz(mel_cent);

		float  freq_left =  MelToHz(mel_left);

		float  freq_bw_left = freq_cent - freq_left;

		float  freq_right = MelToHz(mel_right);

		float  freq_bw_right = freq_right - freq_cent;

		for(int i = 1; i <= valid_nfft; i++)

		{			

			float freq = (i-1) * fre_bin ;

			if( freq > freq_left && freq < freq_right )

			{

				if( freq <= freq_cent)

				{

					coeff[j-1][i-1] = (freq - freq_left) / freq_bw_left;

				}

				else

				{

					coeff[j-1][i-1] = (freq_right - freq) / freq_bw_right;

				}



			}	

			



		}	

		

	}	

	

}



void DctCoeff( int m, int n, float** coeff )//标准DCT变换。

{

	for( int i = 1; i <= m; i++)

	{

		for(int j = 0; j < n; j++)

		{

			coeff[i-1][j] = cos( (2*j + 1) * i *PI / (2 * n));

		}

	}

}



void lift_window(float* p, int m)//倒谱提升窗归一化。

{

	float  max_value = 0.0f;

	for(int i = 1; i <= m; i++)

	{

		p[i-1] = 1+ 0.5 * m * sin( PI * i/m );

		if( p[i-1] > max_value)

		{

			max_value = p[i-1];

		}

	}

	for(int i = 1; i <= m; i++)

	{

		p[i-1] /= max_value;

	}

}



float Product(float *data1, float* data2, int len)

{

	float result = 0.0;

	for(int i = 0; i < len; i++)

	{

		result += data1[i] * data2[i];

	}

	return result;

}



float** MallocMatrix(int m, int n)

{

	float **in = (float**)malloc(m * sizeof(float*));

	float* data = (float*)malloc( m*n*sizeof(float));

	memset( data, 0, sizeof(float)*m*n );

	for(int i = 1; i <= m; i++)

	{

		in[i-1] = &data[(i-1)*n];

	}

	return in;

}

void FreeMatrix(float **in)

{  

	float *data = *in;

	if(data != NULL)

	{

		free(data);

	}

	if(in != NULL)

	{

		free(in);

	}





}

int Mfcc_Frame_diff1_temp(MfccInfo *p, int iframe, float *out, int len);



//初始化,预加重,获取滤波器组系数,DCT系数,倒谱提升窗系数等。

MfccInfo*  MfccInit(AudioInfo audioctx, int nfft, int low, int high, int nfilters, int ndcts, MFCC_TYPE type)

{	   	  

	MfccInfo*  p = (MfccInfo*)malloc(sizeof(MfccInfo));

	p->melbank.nfft = nfft;

	p->melbank.low  = low;

	p->melbank.high = high;

	p->melbank.nfilters = nfilters;

	p->dct.dctlen = ndcts;

	p->pre1 = NULL;

	p->pre2 = NULL;

	p->cur  = NULL;

	p->next1 = NULL;

	p->next2 = NULL;

	p->m_type = type;

	p->data_in = audioctx.data_in;//整段语音的数据流

	p->frame_shift = audioctx.frame_shift;

	int valid_nfft = nfft/2 + 1;

	p->melbank.filter = MallocMatrix( nfilters, valid_nfft);

	MelBank( audioctx.sample_rate, nfft, low, high, nfilters, p->melbank.filter);//Mel滤波器系数	   

	p->dct.coeff = MallocMatrix( ndcts, nfilters);

	DctCoeff( ndcts, nfilters, p->dct.coeff );//DCT系数



	float preF = 0.9375;

	//整段语音高通滤波,预加重 

	PreEmphasise( audioctx.data_in, audioctx.data_len, audioctx.data_in, preF);

	int nframes = (audioctx.data_len - audioctx.frame_len)/audioctx.frame_shift + 1;	

	p->nframes = nframes;

	p->out_nframes = nframes;

	p->frame_len = audioctx.frame_len;

	p->window  = (float*) malloc( audioctx.frame_len * sizeof(float));

	hamming( p->window, audioctx.frame_len);//加窗

	p->lift_window = (float*)malloc( ndcts * sizeof(float));

	lift_window(p->lift_window, ndcts);//倒谱提升窗

	int  buffer_len = audioctx.frame_len > nfft ? audioctx.frame_len:nfft;

	p->frame_data = (float*) malloc( buffer_len * sizeof(float));

	switch(type)

	{

		case MFCC_DIFF_1:

		{   

			p->out_nframes = nframes - 4;

			p->pre1 = (float*)malloc( ndcts*sizeof(float));

			p->pre2 = (float*)malloc( ndcts*sizeof(float));

			p->cur  = (float*)malloc( ndcts*sizeof(float));

			p->next1 = (float*)malloc( ndcts*sizeof(float));

			p->next2 = (float*)malloc( ndcts*sizeof(float));

			Mfcc_Frame_std(p, 1, p->pre1, ndcts);

			Mfcc_Frame_std(p, 2, p->pre2,  ndcts); 

			Mfcc_Frame_std(p, 3, p->cur, ndcts);

			Mfcc_Frame_std(p, 4, p->next1,  ndcts); //一阶差分需要相邻两帧数据			

			break;

		}

		case MFCC_DIFF_2:

		{

			p->out_nframes = nframes - 8;

			p->pre1 = (float*)malloc( ndcts*sizeof(float));

			p->pre2 = (float*)malloc( ndcts*sizeof(float));

			p->cur  = (float*)malloc( ndcts*sizeof(float));

			p->next1 = (float*)malloc( ndcts*sizeof(float));

			p->next2 = (float*)malloc( ndcts*sizeof(float));

			Mfcc_Frame_std(p, 1, p->pre1, ndcts);

			Mfcc_Frame_std(p, 2, p->pre2,  ndcts); 

			Mfcc_Frame_std(p, 3, p->cur, ndcts);

			Mfcc_Frame_std(p, 4, p->next1,  ndcts); //一阶差分需要相邻两帧数据	



			p->diff_pre1 = (float*)malloc( ndcts*sizeof(float));

			p->diff_pre2 = (float*)malloc( ndcts*sizeof(float));

			p->diff_cur  = (float*)malloc( ndcts*sizeof(float));

			p->diff_next1 = (float*)malloc( ndcts*sizeof(float));

			p->diff_next2 = (float*)malloc( ndcts*sizeof(float));

			Mfcc_Frame_diff1_temp(p,1,p->diff_pre1,ndcts);

			Mfcc_Frame_diff1_temp(p,2,p->diff_pre2,ndcts);

			Mfcc_Frame_diff1_temp(p,3,p->diff_cur,ndcts);

			Mfcc_Frame_diff1_temp(p,4,p->diff_next1,ndcts);//二阶差分需要相邻一阶差分数据	

		}

	}

	return p;

}





int Mfcc_Frame_std(MfccInfo *p, int  iframe, float *out, int len)//输出mfcc,任意帧输出

{			

	if(iframe > p->nframes)

	{

		return -1;

	}

	memcpy(p->frame_data, p->data_in + (iframe - 1) * p->frame_shift, sizeof(float) * p->frame_len);

	apply_window( p->frame_data, p->window, p->frame_len);



	int  nfft = p->melbank.nfft;

	int  valid_nfft = nfft/2 + 1;



	fftwf_plan r2cP;

	fftwf_complex* temp = (fftwf_complex*)fftwf_malloc(sizeof( fftwf_complex ) * valid_nfft);		

	r2cP = fftwf_plan_dft_r2c_1d( p->frame_len, p->frame_data, temp, FFTW_ESTIMATE ); //完成FFT运算

	fftwf_execute( r2cP );

	

	for (int j = 0; j < valid_nfft; ++j)

	{

		p->frame_data[j] = pow( temp[j][0], 2 ) + pow( temp[j][1], 2 );//平方能量值,也可以用谱幅度值

		

	}

	

	fftwf_destroy_plan( r2cP );

	

	for(int i = 1; i <= p->dct.dctlen; i++)

	{

		float temp = 0.0;

		for(int j = 1; j <= p->melbank.nfilters; j++)

		{

			//DCT变换,解卷积

			temp += p->dct.coeff[i-1][j-1] * log(Product(p->frame_data, p->melbank.filter[j-1], valid_nfft)+ EPS)/log(10.0);			

		}

		out[i-1] = temp * p->lift_window[i-1];//倒谱提升

	}

	

	fftwf_free(temp);



	return 0;

}



int Mfcc_Frame_diff1(MfccInfo *p, int iframe, float *out, int len)//标准一阶差分,输出 mfcc + 一阶差分。 逐帧输出

{

   assert(p->nframes >= 5 && iframe <= p->nframes -4 && p->m_type == MFCC_DIFF_1);

   int ret = Mfcc_Frame_std(p, iframe + 4, p->next2, len);   

   int dctlen = p->dct.dctlen;

   memcpy( out, p->cur, sizeof(float)* dctlen);//mfcc

   float  factor = sqrt(10.0);

   for(int i = 0; i < dctlen; i++)

   {

	   out[i + dctlen] = (2 * p->next2[i] + p->next1[i] - 2*p->pre1[i] - p->pre2[i])/factor ;//一阶差分

   }



   float *temp = p->pre1;

   p->pre1 = p->pre2;   

   p->pre2 = p->cur;

   p->cur  = p->next1;

   p->next1 = p->next2;

   p->next2 = temp;

   return ret;

}



int Mfcc_Frame_diff1_temp(MfccInfo *p, int iframe, float *out, int len)//输出一阶差分

{

	int ret = Mfcc_Frame_std(p, iframe + 4, p->next2, len);   

	int dctlen = p->dct.dctlen;

	float  factor = sqrt(10.0);

	for(int i = 0; i < dctlen; i++)

	{

		out[i] = (2 * p->next2[i] + p->next1[i] - 2*p->pre1[i] - p->pre2[i])/factor ;//一阶差分

	}



	float *temp = p->pre1;

	p->pre1 = p->pre2;   

	p->pre2 = p->cur;

	p->cur  = p->next1;

	p->next1 = p->next2;

	p->next2 = temp;

	return ret;

}



int Mfcc_Frame_diff2(MfccInfo *p, int iframe, float *out, int len)//输出mfcc+1+2

{

	assert(p->nframes >= 9 && iframe <= p->nframes -8 && p->m_type == MFCC_DIFF_2);



	int ret = Mfcc_Frame_diff1_temp(p, iframe + 8, p->diff_next2, len);  



	int dctlen = p->dct.dctlen;

	memcpy( out, p->next2, sizeof(float)* dctlen);//mfcc

	memcpy( out + dctlen, p->diff_cur, sizeof(float)* dctlen);//一阶差分

	float  factor = sqrt(10.0);

	for(int i = 0; i < dctlen; i++)

	{

		out[i + 2*dctlen] = (2 * p->diff_next2[i] + p->diff_next1[i] - 2*p->diff_pre1[i] - p->diff_pre2[i])/factor ;//二阶差分

	}



	float *temp = p->diff_pre1;

	p->diff_pre1 = p->diff_pre2;   

	p->diff_pre2 = p->diff_cur;

	p->diff_cur  = p->diff_next1;

	p->diff_next1 = p->diff_next2;

	p->diff_next2 = temp;

	return ret;

}







void MfccDestroy(MfccInfo *data)

{

	FreeMatrix(data->melbank.filter);

	FreeMatrix(data->dct.coeff);

	if(data->window)

	{

		free(data->window);

		data->window = NULL;

	}

	if(data->lift_window)

	{

		free(data->lift_window);

		data->lift_window = NULL;

	}

	if(data->pre1)

	{

		free(data->pre1);

		data->pre1 = NULL;

	}

	if(data->pre2)

	{

		free(data->pre2);

		data->pre2 = NULL;

	}

	if(data->cur)

	{

		free(data->cur);

		data->cur = NULL;

	}

	if(data->next1)

	{

		free(data->next1);

		data->next1 = NULL;

	}

	if(data->next2)

	{

		free(data->next2);

		data->next2 = NULL;

	}



	if(data->frame_data)

	{

		free(data->frame_data);

		data->frame_data = NULL;

	}



	if(data->diff_pre1)

	{

		free(data->pre1);

		data->pre1 = NULL;

	}

	if(data->diff_pre2)

	{

		free(data->pre2);

		data->pre2 = NULL;

	}



	if(data->diff_cur)

	{

		free(data->cur);

		data->cur = NULL;

	}

	if(data->diff_next1)

	{

		free(data->next1);

		data->next1 = NULL;

	}

	if(data->diff_next2)

	{

		free(data->next2);

		data->next2 = NULL;

	}

}



其它文件:



 



 



#include "Spl.h"

#include <math.h>

#define  TWOPI	 6.283185307179586



void hanning( float *win, int N)

{	

	int half = 0;

	if ( N % 2 == 0 )

	{

		half = N / 2;



		for (int i = 1; i <= half; ++i)

		{

			win[i - 1] = 0.5 - 0.5*cos(TWOPI*i / (N + 1.0));

		}



		int index = half + 1;

		for (int i = half; i >= 1; i--)

		{

			win[index - 1] = win[i - 1];

			index++;

		}



	}

	else

	{

		half = (N + 1) / 2;



		for (int i = 1; i <= half; ++i)

		{

			win[i - 1] = 0.5 - 0.5*cos(TWOPI*i / (N + 1.0));

		}



		int index = half + 1;

		for (int i = half-1; i >= 1; i--)

		{

			win[index - 1] = win[i - 1];

			index++;

		}



	}

}

void hamming( float *win, int N)

{

	int half = 0;

	if ( N % 2 == 0 )

	{

		half = N / 2;



		for (int i = 1; i <= half; ++i)

		{

			win[i - 1] = 0.54 - 0.46*cos(TWOPI*i / (N + 1.0));

		}



		int index = half + 1;

		for (int i = half; i >= 1; i--)

		{

			win[index - 1] = win[i - 1];

			index++;

		}



	}

	else

	{

		half = (N + 1) / 2;



		for (int i = 1; i <= half; ++i)

		{

			win[i - 1] = 0.54 - 0.46*cos(TWOPI*i / (N + 1.0));

		}



		int index = half + 1;

		for (int i = half-1; i >= 1; i--)

		{

			win[index - 1] = win[i - 1];

			index++;

		}



	}







}

void apply_window(float* data, float* window, int window_len)

{

	for(int i = 0; i< window_len; i++)

	{

		data[i] = data[i] * window[i];

	}



}





测试代码:



 



 



#include  "mfcc.h"

#include  <malloc.h>



int main(int argc, char* argv[])

{

	int samples = 1024*4;

	float *data = (float*)malloc(samples * sizeof(float));

	for(int i = 0; i < samples; i++)

	{

		data[i] = i;

	}

	AudioInfo audioctx;

	audioctx.data_in = data;

	audioctx.data_len = samples;

	audioctx.frame_len = 512;

	audioctx.frame_shift = 128;

	audioctx.sample_rate = 8000;



	int nfft = audioctx.frame_len; 

	int low = 0; 

	int high = audioctx.sample_rate/2; 

	int nfilters = 24;

	int ndcts = 12;

#if   0

	MFCC_TYPE type = MFCC_STD;

	MfccInfo* p = MfccInit( audioctx, nfft, low, high, nfilters, ndcts, type);

	int len = ndcts * 1;

	float *out = (float*) malloc( sizeof(float)* (len));



	for(int j = 0; j < p->out_nframes; j++)//能指定输出第i帧的mfcc

	{

		Mfcc_Frame_std(p, j+1, out, len);

		printf("第 %d 帧:\n", j+1);

		for(int i = 0; i < len; i++)

		{

			printf("%f ", out[i]);

		}

		printf("\n");

	}





#endif



#if   0

	MFCC_TYPE type = MFCC_DIFF_1;

	MfccInfo* p = MfccInit( audioctx, nfft, low, high, nfilters, ndcts, type);

	int len = ndcts * 2;

	float *out = (float*) malloc( sizeof(float)* (len));



	for(int j = 0; j < p->out_nframes; j++)//必须按顺序输出,不能直接获取指定帧的mfcc差分

	{

		Mfcc_Frame_diff1(p, j+1, out, len);

		printf("第 %d 帧:\n", j+1);

		for(int i = 0; i < len; i++)

		{

			printf("%f ", out[i]);

		}

		printf("\n");

	}





#endif



#if  1

	MFCC_TYPE type = MFCC_DIFF_2;

	MfccInfo* p = MfccInit( audioctx, nfft, low, high, nfilters, ndcts, type);

	int len = ndcts * 3;

	float *out = (float*) malloc( sizeof(float)* (len));



	for(int j = 0; j < p->out_nframes; j++)//必须按顺序输出,不能直接获取指定帧的mfcc差分

	{

		Mfcc_Frame_diff2(p, j+1, out, len);

		printf("第 %d 帧:\n", j+1);

		for(int i = 0; i < len; i++)

		{

			printf("%f ", out[i]);

		}

		printf("\n");

	}

#endif

	free(data);

	data = NULL;

	free(out);

	out = NULL;

	MfccDestroy(p);

	return 0;

}

 

  

你可能感兴趣的:(mfc)