MP3解码之DCT(32→64)快速算法的展开

MP3解码的最后一步是“多相合成滤波”,多相合成滤波算法见ISO/IEC 11172-3 ANNEX_B Figure 3-A.2,经过5个步骤,将输入序列X[0..31]的32个采样值,变换为32个PCM样本输出:

C代码

 

// ①Shift 64 to 1024 FIFO
for i = 64 to 1023
    V[i] = V[i-64]

// ②Calculate 64 values V[i] by matrixing
for i = 0 to 63
    for k = 0 to 31
        V[i] += N[i][k] * X[k]
// 其中 N[i][K]=cos((16+i)*(2*k+1)*PI/64)

// ③Building a 512 values vector U
for i = 0 to 7
    for j = 0 to 31 {
        U[64*i+j] += V[128*i+j]
        U[64*i+32+j] += V[128*i+96+j]
    }

// ④Multiply U vector by D window
for j = 0 to 511
    W[i] = U[i] * D[i]
// 其中D[i] 见ISO/IEC 11172-3,Table 3-B.3.

// ⑤Calculating 32 Samples
for i = 0 to 31 {
    double Si = 0
    for j = 0 to 15
        Si += W[i + 32 * j]
    //Output PCM Sample: PCMi = (short)(Si * 32768)
}

算法中第②步矩阵运算是DCT(32→64)运算,将输入序列的32个值X[k]变换为输出序列的64个值V[i],直接运算共64 * 32 = 2048次乘法。下面简要介绍快速算法。

 

用32点DCT-II代替DCT(32→64)

    余弦函数具有周期性和对称性,所以由DCT(32→32)得到输出序列的32个值,可以直接得到 DCT(32→64) 的64个输出值,方算法如下:

void in32out64(double in[32], double out[64])
{
	int i;
	for (i = 0; i < 16; i++)
		out[i] = in[i+16];
	out[16] = 0.0;
	for (i = 17; i < 48; i++)
		out[i] = -in[48-i];
	for (i = 48; i < 64; i++)
		out[i] = -in[i-48];
}


 

根据标准中对N[i][k]的规定知DCT(32→32)是32点DCT-II。DCT-II的快速算法采用:

将N点DCT-II表示为两个N/2点DCT-II之和  N点DCT-II可以表示为N/2点DCT-II和N/2点DCT-IV之和,而DCT-IV输出序列中的一个值可以用DCT-II输出序列中两个连续的值之和表示。

将N/2点DCT再表示为两个N/4点DCT之和,一直分解下去

可以将上述两步推导过程一直递归地进行下去,直到N=2。限于博客长度限制,并且输入公式不方便,这儿不写出具体推导过程。

 

为了方便查表求余弦值,先初始化N[i][k]:

#define PI 3.141592654    // 原文http://lfp001.iteye.com/
void init_N()
{
    int i,k;
    for(i = 0; i < 32; i++)
        for(k = 0; k < 32; k++)
            N[i][k] = cos(i * (2 * k + 1) * PI / 64);
}


 

2点DCT

N'[i][k] = cos(i * (2 * k + 1) * PI / 4);
out[i] += in[k] * N'[i][k];
程序代码为:

void dct2(double in[2], double out[2])
{
    int i,k;
    for(i = 0; i < 2; i++) {
        out[i] = 0;
        for(k = 0; k < 2; k++)
	out[i] += in[k] * N[16 * i][k];
    }
}


2点DCT快速算法直接展开运算,这3项N[i][k]不用查表:
N[0][k]=1,N[16][0]=cos(PI/4), N[16][1]=cos(3PI/4)=-cos(PI/4)
out[0] = in[0] + in[1];
out[1] = in[0] * N[16][0] + in[1] * N[16][1] = (in[0] - in[1]) * 0.7071067811866;

其算法如下:

void fast_dct2(double in[2], double out[2])
{
    out[0] = in[0] + in[1];
    out[1] = (in[0] - in[1]) * 0.7071067811866;
}


 

4点DCT
N'[i][k] = cos(i * (2 * k + 1) * PI / 8);
out[i] += in[k] * N'[i][k];

void dct4(double in[4], double out[4])
{
	int i,k;
	for(i = 0; i < 4; i++) {
		out[i] = 0;
		for(k = 0; k < 4; k++)
			out[i] += in[k] * N[8 * i][k];
	}
}


 

4点DCT快速算法: 分解为两个2点DCT

void fast_dct4(double in[4], double out[4])
{
	int i;
	double even_in[2], even_out[2];
	double odd_in[2], odd_out[2];

	for(i = 0; i < 2; i++) {
		even_in[i] = in[i] + in[3 - i];
		odd_in[i] = (in[i] - in[3 - i]) / (2 * N[8][i]);
	}

	fast_dct2(even_in, even_out);
	fast_dct2(odd_in, odd_out);

	out[0] = even_out[0];
	out[1] = odd_out[0] + odd_out[1];
	
	out[2] = even_out[1];
	out[3] = odd_out[1];
}


 

8点DCT
N'[i][k] = cos(i * (2 * k + 1) * PI / 16);
out[i] += in[k] * N'[i][k];

void dct8(double in[8], double out[8])
{
	int i,k;
	for(i = 0; i < 8; i++) {
		out[i] = 0;
		for(k = 0; k < 8; k++)
			out[i] += in[k] * N[4 * i][k];
	}
}


 

8点DCT快速算法: 分解为两个4点DCT,4点DCT用快速算法

N[4][i]=cos((2*i+1)/16)  i=0,1,2,3

void fast_dct8(double in[8], double out[8])
{
	int i;
	double even_in[4], even_out[4];
	double odd_in[4], odd_out[4];

	for(i = 0; i < 4; i++) {
		even_in[i] = in[i] + in[7 - i];
		odd_in[i] = (in[i] - in[7 - i]) / (2 * N[4][i]);
	}

	fast_dct4(even_in, even_out);	//直接产生out[0..7]的偶数项
	fast_dct4(odd_in, odd_out);		//间接产生out[0..7]的奇数项

	for (i = 0; i < 3; i++) {
		out[2*i] = even_out[i];
		out[2*i+1] = odd_out[i] + odd_out[i+1];	//!
	}
	out[6] = even_out[3];
	out[7] = odd_out[3];
}


 

16点DCT
N'[i][k] = cos(i * (2 * k + 1) * PI / 32);
out[i] += in[k] * N'[i][k];

void dct16(double in[16], double out[16])
{
	int i,k;
	for(i = 0; i < 16; i++) {
		out[i] = 0;
		for(k = 0; k < 16; k++)
			out[i] += in[k] * N[2 * i][k];
	}
}


 

16点DCT快速算法: 分解为两个8点DCT,8点DCT用快速算法

N[2][i]=cos((2*i+1)/32)  i=0,1,...7

void fast_dct16(double in[16], double out[16])
{
	int i;
	double even_in[8], even_out[8];
	double odd_in[8], odd_out[8];

	for(i = 0; i < 8; i++) {
		even_in[i] = in[i] + in[15 - i];
		odd_in[i] = (in[i] - in[15 - i]) / (2 * N[2][i]);  // 计算误差来源于此?
	}

	fast_dct8(even_in, even_out);    // 直接产生out[0..15]的偶数项
	fast_dct8(odd_in, odd_out);       // 间接产生out[0..15]的奇数项

	for (i = 0; i < 7; i++) {
		out[2*i] = even_out[i];
		out[2*i+1] = odd_out[i] + odd_out[i+1];	  //!
	}
	out[14] = even_out[7];
	out[15] = odd_out[7];
}


 

32点DCT
N[i][k] = cos(i * (2 * k + 1) * PI / 64);
out[i] += in[k] * N[i][k];

 

void dct32(double in[32], double out[32])
{
	int i,k;
	for(i = 0; i < 32; i++) {
		out[i] = 0;
		for(k = 0; k < 32; k++)
			out[i] += in[k] * N[i][k];
	}
}

32点DCT快速算法: 分解为两个16点DCT,用16点DCT采用快速算法.共16+2*32=80次乘法

N[1][i]=cos((2*i+1)/64)  i=0,1,...15

void fast_dct32(double in[32], double out[32])
{
	int i;
	double even_in[16], even_out[16];
	double odd_in[16], odd_out[16];

	for(i = 0; i < 16; i++) {
		even_in[i] = in[i] + in[31 - i];
		odd_in[i] = (in[i] - in[31 - i]) / (2 * N[1][i]);
	}

	fast_dct16(even_in, even_out);	//直接产生out[0..31]的偶数项
	fast_dct16(odd_in, odd_out);		//间接产生out[0..31]的奇数项

	for (i = 0; i < 15; i++) {
		out[2*i] = even_out[i];
		out[2*i+1] = odd_out[i] + odd_out[i+1];	//!
	}
	out[30] = even_out[15];
	out[31] = odd_out[15];
}


 

各点DCT快速算法用到的N[i][j]:
DCT32: N[1][i]=cos((2*i+1)/64)  i=0..15
DCT16: N[2][i]=cos((2*i+1)/32)  i=0..7
DCT8:   N[4][i]=cos((2*i+1)/16)  i=0..3
DCT4:   N[8][i]=cos((2*i+1)/8)    i=0,1
DCT2:   cos(PI/4) = 0.7071067811866
快速算法只需事先存储16+8+4+2+1=31个值到N[0..30],取代N[32][32]。

 

DCT-II(32→64)的快速算法的展开算法

      解码一帧双声道的MP3,共要调用ISO/IEC 11172-3 ANNEX_B Figure 3-A.2第二步给出的矩阵运算2*2*18=72次,进行一次矩阵运算要进行浮点乘法是2048次,采用快速算法降低到80次,快速算法很完美。为什么还要将快速算法展开呢?由于矩阵运算调用频度极高,是影响解码速度的关键模块。分析以上快速算法函数可以看出两个特点:使用迭代和函数内大量的循环语句,应用迭代这种规律使展开成为可能,使用展开方法可以去掉中间各点DCT快速算法用到的循环语句,所以可以使矩阵运算的速度进一步提高。通过我对比实测用展开方法解码速度提升10%以上。下面给出的DCT-II(32->64)展开的快速算法是从我前几年写的MP3解码程序中直接COPY过来的,有比较详细的注解,对比上文的各点DCT的快速算法函数,很容易看懂。

void dct32to64(double in32[32], double out64[64])
{
	double in0,in1,in2,in3,in4,in5,in6,in7,in8,in9,in10,in11,in12,in13,in14,in15;
	double out0,out1,out2,out3,out4,out5,out6,out7,out8,out9,out10,out11,out12,out13,out14,out15;
	double d8_0,d8_1,d8_2,d8_3,d8_4,d8_5,d8_6,d8_7;
	double ein0, ein1, oin0, oin1;

	//>>>>>>>>>>>>>>>>
	// 用DCT16计算DCT32输出[0..31]的偶数下标元素
	in0 = in32[0] + in32[31];
	in1 = in32[1] + in32[30];
	in2 = in32[2] + in32[29];
	in3 = in32[3] + in32[28];
	in4 = in32[4] + in32[27];
	in5 = in32[5] + in32[26];
	in6 = in32[6] + in32[25];
	in7 = in32[7] + in32[24];
	in8 = in32[8] + in32[23];
	in9 = in32[9] + in32[22];
	in10 = in32[10] + in32[21];
	in11 = in32[11] + in32[20];
	in12 = in32[12] + in32[19];
	in13 = in32[13] + in32[18];
	in14 = in32[14] + in32[17];
	in15 = in32[15] + in32[16];

	//DCT16
	{
		//>>>>>>>> 用DCT8计算DCT16输出[0..15]的偶数下标元素
		d8_0 = in0 + in15;
		d8_1 = in1 + in14;
		d8_2 = in2 + in13;
		d8_3 = in3 + in12;
		d8_4 = in4 + in11;
		d8_5 = in5 + in10;
		d8_6 = in6 + in9;
		d8_7 = in7 + in8;

		//DCT8. 加(减)法29,乘法12次
		{
			//>>>>e 用DCT4计算DCT8的输出[0..7]的偶数下标元素
			out1 = d8_0 + d8_7;
			out3 = d8_1 + d8_6;
			out5 = d8_2 + d8_5;
			out7 = d8_3 + d8_4;

			//>>e DCT2
			ein0 = out1 + out7;
			ein1 = out3 + out5;
			out64[48] =  -ein0 - ein1;
			out64[0] = (ein0 - ein1) * 0.70710678118654752;	// 0.5/cos(PI/4)

			//>>o DCT2
			oin0 = (out1 - out7) * 0.54119610014619698;		// 0.5/cos( PI/8)
			oin1 = (out3 - out5) * 1.30656296487637653;		// 0.5/cos(3PI/8)

			out2 =  oin0 + oin1;
			out12 = (oin0 - oin1) * 0.70710678118654752;	// cos(PI/4)

			out64[40] = out64[56] = -out2 - out12;
			out64[8] = out12;
			//<<<<e 完成计算DCT8的输出[0..7]的偶数下标元素

			//>>>>o 用DCT4计算DCT8的输出[0..7]的奇数下标元素
			//o DCT4 part1
			out1 = (d8_0 - d8_7) * 0.50979557910415917;		// 0.5/cos( PI/16)
			out3 = (d8_1 - d8_6) * 0.60134488693504528;		// 0.5/cos(3PI/16)
			out5 = (d8_2 - d8_5) * 0.89997622313641570;		// 0.5/cos(5PI/16)
			out7 = (d8_3 - d8_4) * 2.56291544774150618;		// 0.5/cos(7PI/16)

			//o DCT4 part2

			//e DCT2 part1
			ein0 = out1 + out7;
			ein1 = out3 + out5;

			//o DCT2 part1
			oin0 = (out1 - out7) * 0.54119610014619698;	// 0.5/cos(PI/8)
			oin1 = (out3 - out5) * 1.30656296487637653;	// 0.5/cos(3PI/8)

			//e DCT2 part2
			out1 =  ein0 + ein1;
			out5 = (ein0 - ein1) * 0.70710678118654752;	// cos(PI/4)

			//o DCT2 part2
			out3 = oin0 + oin1;
			out7 = (oin0 - oin1) * 0.70710678118654752;	// cos(PI/4)
			out3 += out7;

			//o DCT4 part3
			out64[44] = out64[52] = -out1 - out3;	//out1+=out3
			out64[36] = out64[60] = -out3 - out5;	//out3+=out5
			out64[4] = out5 + out7;					//out5+=out7
			out64[12] = out7;
			//<<<<o 完成计算DCT8的输出[0..7]的奇数下标元素
		}
		//<<<<<<<< 完成计算DCT16输出[0..15]的偶数下标元素

		//-------------------------------------------------------------------------

		//>>>>>>>> 用DCT8计算DCT16输出[0..15]的奇数下标元素
		d8_0 = (in0 - in15) * 0.50241928618815571;	// 0.5/cos( 1 * PI/32)
		d8_1 = (in1 - in14) * 0.52249861493968888;	// 0.5/cos( 3 * PI/32)
		d8_2 = (in2 - in13) * 0.56694403481635770;	// 0.5/cos( 5 * PI/32)
		d8_3 = (in3 - in12) * 0.64682178335999013;	// 0.5/cos( 7 * PI/32)
		d8_4 = (in4 - in11) * 0.78815462345125022;	// 0.5/cos( 9 * PI/32)
		d8_5 = (in5 - in10) * 1.06067768599034747;	// 0.5/cos(11 * PI/32)
		d8_6 = (in6 - in9) * 1.72244709823833393;	// 0.5/cos(13 * PI/32)
		d8_7 = (in7 - in8) * 5.10114861868916386;	// 0.5/cos(15 * PI/32)

		//DCT8
		{
			//>>>>e 用DCT4计算DCT8的输出[0..7]的偶数下标元素.
			out3 = d8_0 + d8_7;
			out7 = d8_1 + d8_6;
			out11 = d8_2 + d8_5;
			out15 = d8_3 + d8_4;

			//>>e DCT2
			ein0 = out3 + out15;
			ein1 = out7 + out11;
			out1 =  ein0 + ein1;
			out9 = (ein0 - ein1) * 0.70710678118654752;		// 0.5/cos(PI/4)

			//>>o DCT2
			oin0 = (out3 - out15) * 0.54119610014619698;	// 0.5/cos( PI/8)
			oin1 = (out7 - out11) * 1.30656296487637653;	// 0.5/cos(3PI/8)

			out5 =  oin0 + oin1;
			out13 = (oin0 - oin1) * 0.70710678118654752;	// cos(PI/4)

			out5 += out13;
			//<<<<e 完成计算DCT8的输出[0..7]的偶数下标元素

			//>>>>o 用DCT4计算DCT8的输出[0..7]的奇数下标元素
			//o DCT4 part1
			out3 = (d8_0 - d8_7) * 0.50979557910415917;		// 0.5/cos( PI/16)
			out7 = (d8_1 - d8_6) * 0.60134488693504528;		// 0.5/cos(3PI/16)
			out11 = (d8_2 - d8_5) * 0.89997622313641570;	// 0.5/cos(5PI/16)
			out15 = (d8_3 - d8_4) * 2.56291544774150618;	// 0.5/cos(7PI/16)

			//o DCT4 part2

			//e DCT2 part1
			ein0 = out3 + out15;
			ein1 = out7 + out11;

			//o DCT2 part1
			oin0 = (out3 - out15) * 0.54119610014619698;	// 0.5/cos(PI/8)
			oin1 = (out7 - out11) * 1.30656296487637653;	// 0.5/cos(3PI/8)

			//e DCT2 part2
			out3 =  ein0 + ein1;
			out11 = (ein0 - ein1) * 0.70710678118654752;	// cos(PI/4)

			//o DCT2 part2
			out7 = oin0 + oin1;
			out15 = (oin0 - oin1) * 0.70710678118654752;	// cos(PI/4)
			out7 += out15;

			//o DCT4 part3
			out3 += out7;
			out7 += out11;
			out11 += out15;
			//<<<<o 完成计算DCT8的输出[0..7]的奇数下标元素
		}

		out64[46] = out64[50] = -out1 - out3;	//out1 += out3
		out64[42] = out64[54] = -out3 - out5;	//out3 += out5
		out64[38] = out64[58] = -out5 - out7;	//out5 += out7
		out64[34] = out64[62] = -out7 - out9;	//out7 += out9
		out64[2] = out9 + out11;				//out9 += out11
		out64[6] = out11 + out13;				//out11 += out13
		out64[10] = out13 + out15;				//out13 += out15
		//<<<<<<<< 完成计算DCT16输出[0..15]的奇数下标元素
	}
	out64[14] = out15;	//out64[14]=out32[30]
	//<<<<<<<<<<<<<<<<
	// 完成计算DCT32输出[0..31]的偶数下标元素

//=============================================================================

	//>>>>>>>>>>>>>>>>
	// 用DCT16计算计算DCT32输出[0..31]的奇数下标元素
	in0 = (in32[0] - in32[31]) * 0.50060299823519630;	// 0.5/cos( 1 * PI/64)
	in1 = (in32[1] - in32[30]) * 0.50547095989754366;	// 0.5/cos( 3 * PI/64)
	in2 = (in32[2] - in32[29]) * 0.51544730992262455;	// 0.5/cos( 5 * PI/64)
	in3 = (in32[3] - in32[28]) * 0.53104259108978417;	// 0.5/cos( 7 * PI/64)
	in4 = (in32[4] - in32[27]) * 0.55310389603444453;	// 0.5/cos( 9 * PI/64)
	in5 = (in32[5] - in32[26]) * 0.58293496820613387;	// 0.5/cos(11 * PI/64)
	in6 = (in32[6] - in32[25]) * 0.62250412303566482;	// 0.5/cos(13 * PI/64)
	in7 = (in32[7] - in32[24]) * 0.67480834145500575;	// 0.5/cos(15 * PI/64)
	in8 = (in32[8] - in32[23]) * 0.74453627100229845;	// 0.5/cos(17 * PI/64)
	in9 = (in32[9] - in32[22]) * 0.83934964541552704;	// 0.5/cos(19 * PI/64)
	in10 = (in32[10] - in32[21]) * 0.97256823786196069;	// 0.5/cos(21 * PI/64)
	in11 = (in32[11] - in32[20]) * 1.16943993343288495;	// 0.5/cos(23 * PI/64)
	in12 = (in32[12] - in32[19]) * 1.48416461631416628;	// 0.5/cos(25 * PI/64)
	in13 = (in32[13] - in32[18]) * 2.05778100995341155;	// 0.5/cos(27 * PI/64)
	in14 = (in32[14] - in32[17]) * 3.40760841846871879;	// 0.5/cos(29 * PI/64)
	in15 = (in32[15] - in32[16]) * 10.1900081235480568;	// 0.5/cos(31 * PI/64)

	//DCT16
	{
		//>>>>>>>> 用DCT8计算DCT16输出[0..15]的偶数下标元素:
		d8_0 = in0 + in15;
		d8_1 = in1 + in14;
		d8_2 = in2 + in13;
		d8_3 = in3 + in12;
		d8_4 = in4 + in11;
		d8_5 = in5 + in10;
		d8_6 = in6 + in9;
		d8_7 = in7 + in8;

		//DCT8
		{
			//>>>>e 用DCT4计算DCT8的输出[0..7]的偶数下标元素
			out1 = d8_0 + d8_7;
			out3 = d8_1 + d8_6;
			out5 = d8_2 + d8_5;
			out7 = d8_3 + d8_4;

			//>>e DCT2
			ein0 = out1 + out7;
			ein1 = out3 + out5;
			out0 =  ein0 + ein1;
			out8 = (ein0 - ein1) * 0.70710678118654752;		// 0.5/cos(PI/4)

			//>>o DCT2
			oin0 = (out1 - out7) * 0.54119610014619698;		// 0.5/cos( PI/8)
			oin1 = (out3 - out5) * 1.30656296487637653;		// 0.5/cos(3PI/8)

			out4 =  oin0 + oin1;
			out12 = (oin0 - oin1) * 0.70710678118654752;	// cos(PI/4)

			out4 += out12;
			//<<<<e 完成计算DCT8的输出[0..7]的偶数下标元素

			//>>>>o 用DCT4计算DCT8的输出[0..7]的奇数下标元素
			//o DCT4 part1
			out1 = (d8_0 - d8_7) * 0.50979557910415917;		// 0.5/cos( PI/16)
			out3 = (d8_1 - d8_6) * 0.60134488693504528;		// 0.5/cos(3PI/16)
			out5 = (d8_2 - d8_5) * 0.89997622313641570;		// 0.5/cos(5PI/16)
			out7 = (d8_3 - d8_4) * 2.56291544774150618;		// 0.5/cos(7PI/16)

			//o DCT4 part2

			//e DCT2 part1
			ein0 = out1 + out7;
			ein1 = out3 + out5;

			//o DCT2 part1
			oin0 = (out1 - out7) * 0.54119610014619698;		// 0.5/cos(PI/8)
			oin1 = (out3 - out5) * 1.30656296487637653;		// 0.5/cos(3PI/8)

			//e DCT2 part2
			out2 =  ein0 + ein1;
			out10 = (ein0 - ein1) * 0.70710678118654752;	// cos(PI/4)

			//o DCT2 part2
			out6 = oin0 + oin1;
			out14 = (oin0 - oin1) * 0.70710678118654752;
			out6 += out14;

			//o DCT4 part3
			out2 += out6;
			out6 += out10;
			out10 += out14;
			//<<<<o 完成计算DCT8的输出[0..7]的奇数下标元素
		}
		//<<<<<<<< 完成计算DCT16输出[0..15]的偶数下标元素

		//-------------------------------------------------------------------------

		//>>>>>>>> 用DCT8计算DCT16输出[0..15]的奇数下标元素
		d8_0 = (in0 - in15) * 0.50241928618815571;	// 0.5/cos( 1 * PI/32)
		d8_1 = (in1 - in14) * 0.52249861493968888;	// 0.5/cos( 3 * PI/32)
		d8_2 = (in2 - in13) * 0.56694403481635770;	// 0.5/cos( 5 * PI/32)
		d8_3 = (in3 - in12) * 0.64682178335999013;	// 0.5/cos( 7 * PI/32)
		d8_4 = (in4 - in11) * 0.78815462345125022;	// 0.5/cos( 9 * PI/32)
		d8_5 = (in5 - in10) * 1.06067768599034747;	// 0.5/cos(11 * PI/32)
		d8_6 = (in6 - in9) * 1.72244709823833393;	// 0.5/cos(13 * PI/32)
		d8_7 = (in7 - in8) * 5.10114861868916386;	// 0.5/cos(15 * PI/32)

		//DCT8
		{
			//>>>>e 用DCT4计算DCT8的输出[0..7]的偶数下标元素.
			out1 = d8_0 + d8_7;
			out3 = d8_1 + d8_6;
			out5 = d8_2 + d8_5;
			out7 = d8_3 + d8_4;

			//>>e DCT2
			ein0 = out1 + out7;
			ein1 = out3 + out5;
			in0 =  ein0 + ein1;	//out0->in0,out4->in4
			in4 = (ein0 - ein1) * 0.70710678118654752;	// 0.5/cos(PI/4)

			//>>o DCT2
			oin0 = (out1 - out7) * 0.54119610014619698;	// 0.5/cos( PI/8)
			oin1 = (out3 - out5) * 1.30656296487637653;	// 0.5/cos(3PI/8)

			in2 =  oin0 + oin1;	//out2->in2,out6->in6
			in6 = (oin0 - oin1) * 0.70710678118654752;	// cos(PI/4)

			in2 += in6;
			//<<<<e 完成计算DCT8的输出[0..7]的偶数下标元素

			//>>>>o 用DCT4计算DCT8的输出[0..7]的奇数下标元素
			//o DCT4 part1
			out1 = (d8_0 - d8_7) * 0.50979557910415917;	// 0.5/cos( PI/16)
			out3 = (d8_1 - d8_6) * 0.60134488693504528;	// 0.5/cos(3PI/16)
			out5 = (d8_2 - d8_5) * 0.89997622313641570;	// 0.5/cos(5PI/16)
			out7 = (d8_3 - d8_4) * 2.56291544774150618;	// 0.5/cos(7PI/16)

			//o DCT4 part2

			//e DCT2 part1
			ein0 = out1 + out7;
			ein1 = out3 + out5;

			//o DCT2 part1
			oin0 = (out1 - out7) * 0.54119610014619698;	// 0.5/cos(PI/8)
			oin1 = (out3 - out5) * 1.30656296487637653;	// 0.5/cos(3PI/8)

			//e DCT2 part2
			out1 =  ein0 + ein1;
			out5 = (ein0 - ein1) * 0.70710678118654752;	// cos(PI/4)

			//o DCT2 part2
			out3 = oin0 + oin1;
			out15 = (oin0 - oin1) * 0.70710678118654752;
			out3 += out15;

			//o DCT4 part3
			out1 += out3;
			out3 += out5;
			out5 += out15;
			//<<<<o 完成计算DCT8的输出[0..7]的奇数下标元素
		}
								//out15=out7
		out13 = in6 + out15;	//out13=out6+ou7
		out11 = out5 + in6;		//out11=out5+ou6
		out9 = in4 + out5;		//out9 =out4+ou5
		out7 = out3 + in4;		//out7 =out3+ou4
		out5 = in2 + out3;		//out5 =out2+ou3
		out3 = out1 + in2;		//out3 =out1+ou2
		out1 += in0;			//out1 =out0+ou1
		//<<<<<<<< 完成计算DCT16输出[0..15]的奇数下标元素
	}

	//out32[i]=out[i]+out[i+1]; out32[31]=out[15]
	out64[47] = out64[49] = -out0 - out1;
	out64[45] = out64[51] = -out1 - out2;
	out64[43] = out64[53] = -out2 - out3;
	out64[41] = out64[55] = -out3 - out4;
	out64[39] = out64[57] = -out4 - out5;
	out64[37] = out64[59] = -out5 - out6;
	out64[35] = out64[61] = -out6 - out7;
	out64[33] = out64[63] = -out7 - out8;
	out64[1] = out8 + out9;
	out64[3] = out9 + out10;
	out64[5] = out10 + out11;
	out64[7] = out11 + out12;
	out64[9] = out12 + out13;
	out64[11] = out13 + out14;
	out64[13] = out14 + out15;
	out64[15] = out15;
	//<<<<<<<<<<<<<<<<

	out64[16] = 0;

	out64[17] = -out64[15];
	out64[18] = -out64[14];
	out64[19] = -out64[13];
	out64[20] = -out64[12];
	out64[21] = -out64[11];
	out64[22] = -out64[10];
	out64[23] = -out64[9];
	out64[24] = -out64[8];
	out64[25] = -out64[7];
	out64[26] = -out64[6];
	out64[27] = -out64[5];
	out64[28] = -out64[4];
	out64[29] = -out64[3];
	out64[30] = -out64[2];
	out64[31] = -out64[1];
	out64[32] = -out64[0];
}


 

本文的DCT-II(32→32)快速算法的作者是Michael Hipp,开源程序mpg123使用的是该算法。贴出mpg123的DCT快速算法代码:

void dct64(real *out0,real *out1,real *samples)
{
  real bufs[64];

 {
  register int i,j;
  register real *b1,*b2,*bs,*costab;

  b1 = samples;
  bs = bufs;
  costab = pnts[0]+16;
  b2 = b1 + 32;

  for(i=15;i>=0;i--)
    *bs++ = (*b1++ + *--b2); 
  for(i=15;i>=0;i--)
    *bs++ = REAL_MUL((*--b2 - *b1++), *--costab);

  b1 = bufs;
  costab = pnts[1]+8;
  b2 = b1 + 16;

  {
    for(i=7;i>=0;i--)
      *bs++ = (*b1++ + *--b2); 
    for(i=7;i>=0;i--)
      *bs++ = REAL_MUL((*--b2 - *b1++), *--costab);
    b2 += 32;
    costab += 8;
    for(i=7;i>=0;i--)
      *bs++ = (*b1++ + *--b2); 
    for(i=7;i>=0;i--)
      *bs++ = REAL_MUL((*b1++ - *--b2), *--costab);
    b2 += 32;
  }

  bs = bufs;
  costab = pnts[2];
  b2 = b1 + 8;

  for(j=2;j;j--)
  {
    for(i=3;i>=0;i--)
      *bs++ = (*b1++ + *--b2); 
    for(i=3;i>=0;i--)
      *bs++ = REAL_MUL((*--b2 - *b1++), costab[i]);
    b2 += 16;
    for(i=3;i>=0;i--)
      *bs++ = (*b1++ + *--b2); 
    for(i=3;i>=0;i--)
      *bs++ = REAL_MUL((*b1++ - *--b2), costab[i]);
    b2 += 16;
  }

  b1 = bufs;
  costab = pnts[3];
  b2 = b1 + 4;

  for(j=4;j;j--)
  {
    *bs++ = (*b1++ + *--b2); 
    *bs++ = (*b1++ + *--b2);
    *bs++ = REAL_MUL((*--b2 - *b1++), costab[1]);
    *bs++ = REAL_MUL((*--b2 - *b1++), costab[0]);
    b2 += 8;
    *bs++ = (*b1++ + *--b2); 
    *bs++ = (*b1++ + *--b2);
    *bs++ = REAL_MUL((*b1++ - *--b2), costab[1]);
    *bs++ = REAL_MUL((*b1++ - *--b2), costab[0]);
    b2 += 8;
  }
  bs = bufs;
  costab = pnts[4];

  for(j=8;j;j--)
  {
    real v0,v1;
    v0=*b1++; v1 = *b1++;
    *bs++ = (v0 + v1);
    *bs++ = REAL_MUL((v0 - v1), (*costab));
    v0=*b1++; v1 = *b1++;
    *bs++ = (v0 + v1);
    *bs++ = REAL_MUL((v1 - v0), (*costab));
  }
 }

 {
  register real *b1;
  register int i;

  for(b1=bufs,i=8;i;i--,b1+=4)
    b1[2] += b1[3];

  for(b1=bufs,i=4;i;i--,b1+=8)
  {
    b1[4] += b1[6];
    b1[6] += b1[5];
    b1[5] += b1[7];
  }

  for(b1=bufs,i=2;i;i--,b1+=16)
  {
    b1[8]  += b1[12];
    b1[12] += b1[10];
    b1[10] += b1[14];
    b1[14] += b1[9];
    b1[9]  += b1[13];
    b1[13] += b1[11];
    b1[11] += b1[15];
  }
 }

  out0[0x10*16] = bufs[0];
  out0[0x10*15] = bufs[16+0]  + bufs[16+8];
  out0[0x10*14] = bufs[8];
  out0[0x10*13] = bufs[16+8]  + bufs[16+4];
  out0[0x10*12] = bufs[4];
  out0[0x10*11] = bufs[16+4]  + bufs[16+12];
  out0[0x10*10] = bufs[12];
  out0[0x10* 9] = bufs[16+12] + bufs[16+2];
  out0[0x10* 8] = bufs[2];
  out0[0x10* 7] = bufs[16+2]  + bufs[16+10];
  out0[0x10* 6] = bufs[10];
  out0[0x10* 5] = bufs[16+10] + bufs[16+6];
  out0[0x10* 4] = bufs[6];
  out0[0x10* 3] = bufs[16+6]  + bufs[16+14];
  out0[0x10* 2] = bufs[14];
  out0[0x10* 1] = bufs[16+14] + bufs[16+1];
  out0[0x10* 0] = bufs[1];

  out1[0x10* 0] = bufs[1];
  out1[0x10* 1] = bufs[16+1]  + bufs[16+9];
  out1[0x10* 2] = bufs[9];
  out1[0x10* 3] = bufs[16+9]  + bufs[16+5];
  out1[0x10* 4] = bufs[5];
  out1[0x10* 5] = bufs[16+5]  + bufs[16+13];
  out1[0x10* 6] = bufs[13];
  out1[0x10* 7] = bufs[16+13] + bufs[16+3];
  out1[0x10* 8] = bufs[3];
  out1[0x10* 9] = bufs[16+3]  + bufs[16+11];
  out1[0x10*10] = bufs[11];
  out1[0x10*11] = bufs[16+11] + bufs[16+7];
  out1[0x10*12] = bufs[7];
  out1[0x10*13] = bufs[16+7]  + bufs[16+15];
  out1[0x10*14] = bufs[15];
  out1[0x10*15] = bufs[16+15];
}


 

本文的“MP3解码之DCT(32→64)快速算法的展开”的JAVA代码是jmp123开源项目的一部分。
【jmp123下载地址】http://jmp123.sourceforge.net/


 

你可能感兴趣的:(MP3解码之DCT(32→64)快速算法的展开)