SSE2实现HAAR小波变换(dwt2与idwt2)

wiki链接:http://en.wikipedia.org/wiki/Haar_wavelet

 

可用SSE2实现HAAR小波变换,达到实时,关于HAAR小波的介绍可参考以上维基链接

 参考MATLAB中dwt2与idwt2的函数原型,基于OpenCV的框架进行了汇编优化实现

HAAR小波也可用于图像压缩:将CH、CV、CD中绝对值小于某一阈值的分量置为0,这三个矩阵就会成为稀疏矩阵(Sparse Matrix);反变换后图像的质量取决于所选阈值的大小。

 

实际实现时,可选择使用浮点数(单精度或双精度)进行矩阵计算;使用整数计算速度更快,但无法实现完整(无损)的压缩与解压缩。

 

附代码:

 

// Forward Haar step for one pair of image rows (MSVC x86 inline assembly, SSE2).
//
// For every 2x2 pixel block {a0 a1 / b0 b1} (a* from row0, b* from row1) one
// double is written to each output. With h = the value loaded from g_halfd
// (defined outside this view; presumably 0.5 in both lanes - TODO confirm):
//     s  = (a + b) * h        d  = a - s            (vertical pass, per column)
//     ca = (s0 + s1) * h      ch = s0 - ca          (horizontal pass on s)
//     cv = (d0 + d1) * h      cd = d0 - cv          (horizontal pass on d)
//
// ca0/ch0/cv0/cd0 : output rows; each receives one double per processed
//                   pixel pair. Stores are unaligned (movupd/movsd).
// row0, row1      : the two input rows of 8-bit pixels.
// col             : number of pixels per input row. The main loop consumes 4
//                   columns per iteration, the tail consumes 2 more when at
//                   least 2 remain; a trailing single column is not processed.
//
// eax_ptr ... edi_ptr are not defined in this view; presumably aliases for the
// general-purpose registers (eax, ebx, ...) - TODO confirm.
// g_halfd is loaded with movapd, which requires a 16-byte aligned operand.
inline void dwt2_row(__out double* ca0,
					 __out double* ch0,
					 __out double* cv0,
					 __out double* cd0,
					 __in unsigned char* row0,
					 __in unsigned char* row1,
					 __in int col)
{
	__asm
	{
		// Load the four output pointers and the two input row pointers.
		mov			eax_ptr, ca0;
		mov			ebx_ptr, ch0;
		mov			ecx_ptr, cv0;
		mov			edx_ptr, cd0;
		mov			esi_ptr, row0;
		mov			edi_ptr, row1;
		// xmm3 = 0 (used to zero-extend bytes), xmm7 = scale factor h.
		pxor		xmm3, xmm3;
		movapd		xmm7, g_halfd;
		// col is biased by -4 so that the sign flag says whether a full
		// group of 4 columns is still available.
		sub			col, 4;
		jl			loop_2;
loop_4:
		// Load 4 pixels from each row and zero-extend bytes -> words -> dwords.
		movd		xmm1, [esi_ptr];
		movd		xmm5, [edi_ptr];
		punpcklbw	xmm1, xmm3;
		punpcklbw	xmm5, xmm3;
		punpcklwd	xmm1, xmm3;
		punpcklwd	xmm5, xmm3;
		// xmm0/xmm4 = columns 0,1 of row0/row1 as doubles.
		cvtdq2pd	xmm0, xmm1;
		cvtdq2pd	xmm4, xmm5;
		// Swap the 64-bit halves so columns 2,3 land in the low half, then
		// convert them: xmm1/xmm5 = columns 2,3 of row0/row1 as doubles.
		shufpd		xmm1, xmm1, 1;
		shufpd		xmm5, xmm5, 1;
		cvtdq2pd	xmm1, xmm1;
		cvtdq2pd	xmm5, xmm5;
		// Vertical pass: xmm4/xmm5 = s = (a + b) * h, xmm0/xmm1 = d = a - s.
		addpd		xmm4, xmm0;
		addpd		xmm5, xmm1;
		mulpd		xmm4, xmm7;
		mulpd		xmm5, xmm7;
		subpd		xmm0, xmm4;
		subpd		xmm1, xmm5;
		// De-interleave columns: xmm4 = {s0, s2}, xmm6 = {s1, s3},
		//                        xmm0 = {d0, d2}, xmm2 = {d1, d3}.
		movapd		xmm6, xmm4;
		movapd		xmm2, xmm0;
		shufpd		xmm4, xmm5, 0;
		shufpd		xmm6, xmm5, 3;
		shufpd		xmm0, xmm1, 0;
		shufpd		xmm2, xmm1, 3;
		// Horizontal pass: xmm6 = ca, xmm4 = ch, xmm2 = cv, xmm0 = cd
		// (two coefficients each).
		addpd		xmm6, xmm4;
		addpd		xmm2, xmm0;
		mulpd		xmm6, xmm7;
		mulpd		xmm2, xmm7;
		subpd		xmm4, xmm6;
		subpd		xmm0, xmm2;
		// Store two doubles per output.
		movupd		[eax_ptr], xmm6;
		movupd		[ebx_ptr], xmm4;
		movupd		[ecx_ptr], xmm2;
		movupd		[edx_ptr], xmm0;
		// Advance: 4 input bytes per row, 2 doubles (0x10 bytes) per output.
		add			esi_ptr, 4;
		add			edi_ptr, 4;
		add			eax_ptr, 0x10;
		add			ebx_ptr, 0x10;
		add			ecx_ptr, 0x10;
		add			edx_ptr, 0x10;
		sub			col, 4;
		jge			loop_4;
loop_2:
		// Tail: col now holds (remaining columns - 4); -2 or -1 means at
		// least 2 columns are left, anything lower means we are done.
		cmp			col, -2;
		jl			loop_end;
		// Load 2 pixels (one 16-bit word) from each row and widen to doubles.
		// Only the low two dwords are converted, so stale upper register
		// contents do not affect the result.
		pinsrw		xmm0, [esi_ptr], 0;
		pinsrw		xmm4, [edi_ptr], 0;
		punpcklbw	xmm0, xmm3;
		punpcklbw	xmm4, xmm3;
		punpcklwd	xmm0, xmm3;
		punpcklwd	xmm4, xmm3;
		cvtdq2pd	xmm0, xmm0;
		cvtdq2pd	xmm4, xmm4;
		// Vertical pass: xmm4 = {s0, s1}, xmm0 = {d0, d1}.
		addpd		xmm4, xmm0;
		mulpd		xmm4, xmm7;
		subpd		xmm0, xmm4;
		// Regroup so both horizontal passes run in one register pair:
		// xmm4 = {s0, d0}, xmm5 = {s1, d1}.
		movapd		xmm5, xmm4;
		shufpd		xmm4, xmm0, 0;
		shufpd		xmm5, xmm0, 3;
		// xmm5 = {ca, cv}, xmm4 = {ch, cd}.
		addpd		xmm5, xmm4;
		mulpd		xmm5, xmm7;
		subpd		xmm4, xmm5;
		// Store one double per output; the shufpd swaps bring the high
		// lane (cv / cd) down to the low lane for movsd.
		movsd		[eax_ptr], xmm5;
		shufpd		xmm5, xmm5, 1;
		movsd		[ebx_ptr], xmm4;
		shufpd		xmm4, xmm4, 1;
		movsd		[ecx_ptr], xmm5;
		movsd		[edx_ptr], xmm4;
loop_end:
	}
}

// Inverse Haar step for one pair of image rows (MSVC x86 inline assembly, SSE2).
//
// For every coefficient quadruple (ca, ch, cv, cd) a 2x2 pixel block
// {a0 a1 / b0 b1} is rebuilt (a* goes to row0, b* to row1):
//     s0 = ca + ch     s1 = 2*ca - s0 = ca - ch     (undo horizontal pass)
//     d0 = cv + cd     d1 = 2*cv - d0 = cv - cd
//     a  = s + d       b  = 2*s - a   = s - d       (undo vertical pass)
// Results are converted to integers with truncation (cvttpd2dq) and then
// saturated to 0..255 by packssdw/packuswb.
// NOTE(review): truncation means a reconstructed value such as 99.99 becomes
// 99; if coefficients are modified (e.g. thresholded) rounding via cvtpd2dq
// may be preferable - left unchanged here.
//
// row0, row1      : output rows of 8-bit pixels.
// ca0/ch0/cv0/cd0 : input coefficient rows (doubles, read unaligned).
// col             : number of pixels per output row. The main loop produces 4
//                   columns per iteration, the tail produces 2 more when at
//                   least 2 remain; a trailing single column is not written.
//
// eax_ptr ... edi_ptr are not defined in this view; presumably aliases for the
// general-purpose registers (eax, ebx, ...) - TODO confirm.
//
// Fix relative to the previous revision: the 2-column tail packed xmm5 and
// xmm1 directly, which left the row1 pixels in the *second* dword of the
// result, so the word stored to row1 was always 0. The tail now merges both
// rows into one register first, exactly like the 4-column loop does.
inline void idwt2_row(__out unsigned char* row0,
					  __out unsigned char* row1,
					  __in double* ca0,
					  __in double* ch0,
					  __in double* cv0,
					  __in double* cd0,
					  __in int col)
{
	__asm
	{
		// Load the four coefficient pointers and the two output row pointers.
		mov			eax_ptr, ca0;
		mov			ebx_ptr, ch0;
		mov			ecx_ptr, cv0;
		mov			edx_ptr, cd0;
		mov			esi_ptr, row0;
		mov			edi_ptr, row1;
		// col is biased by -4 so that the sign flag says whether a full
		// group of 4 columns is still to be produced.
		sub			col, 4;
		jl			loop_2;
loop_4:
		// Two coefficients from each plane.
		movupd		xmm0, [eax_ptr];
		movupd		xmm1, [ebx_ptr];
		movupd		xmm4, [ecx_ptr];
		movupd		xmm5, [edx_ptr];
		// Undo the horizontal pass:
		// xmm1 = {s0, s2}, xmm0 = {s1, s3}, xmm5 = {d0, d2}, xmm4 = {d1, d3}.
		addpd		xmm1, xmm0;
		addpd		xmm5, xmm4;
		addpd		xmm0, xmm0;
		addpd		xmm4, xmm4;
		subpd		xmm0, xmm1;
		subpd		xmm4, xmm5;
		// Re-interleave into column order:
		// xmm1 = {s0, s1}, xmm2 = {s2, s3}, xmm5 = {d0, d1}, xmm6 = {d2, d3}.
		movapd		xmm2, xmm1;
		movapd		xmm6, xmm5;
		shufpd		xmm1, xmm0, 0;
		shufpd		xmm2, xmm0, 3;
		shufpd		xmm5, xmm4, 0;
		shufpd		xmm6, xmm4, 3;
		// Undo the vertical pass:
		// xmm5/xmm6 = row0 columns 0,1 / 2,3; xmm1/xmm2 = row1 columns 0,1 / 2,3.
		addpd		xmm5, xmm1;
		addpd		xmm6, xmm2;
		addpd		xmm1, xmm1;
		addpd		xmm2, xmm2;
		subpd		xmm1, xmm5;
		subpd		xmm2, xmm6;
		// Doubles -> int32 (truncating); results sit in the low 64 bits,
		// the upper 64 bits are zeroed.
		cvttpd2dq	xmm5, xmm5;
		cvttpd2dq	xmm6, xmm6;
		cvttpd2dq	xmm1, xmm1;
		cvttpd2dq	xmm2, xmm2;
		// Merge halves: xmm5 = row0 as 4 dwords, xmm1 = row1 as 4 dwords.
		shufpd		xmm5, xmm6, 0;
		shufpd		xmm1, xmm2, 0;
		// Narrow with saturation: bytes 0..3 = row0, bytes 4..7 = row1.
		packssdw	xmm5, xmm1;
		packuswb	xmm5, xmm5;
		// Bring the row1 dword down to position 0 of xmm1 and store both rows.
		pshufd		xmm1, xmm5, 1;
		movd		[esi_ptr], xmm5;
		movd		[edi_ptr], xmm1;
		// Advance: 4 output bytes per row, 2 doubles (0x10 bytes) per plane.
		add			esi_ptr, 4;
		add			edi_ptr, 4;
		add			eax_ptr, 0x10;
		add			ebx_ptr, 0x10;
		add			ecx_ptr, 0x10;
		add			edx_ptr, 0x10;
		sub			col, 4;
		jge			loop_4;
loop_2:
		// Tail: col now holds (remaining columns - 4); -2 or -1 means at
		// least 2 columns are left, anything lower means we are done.
		cmp			col, -2;
		jl			loop_end;
		// One coefficient from each plane (movsd from memory zeroes the
		// high lane).
		movsd		xmm0, [eax_ptr];
		movsd		xmm1, [ebx_ptr];
		movsd		xmm4, [ecx_ptr];
		movsd		xmm5, [edx_ptr];
		// Undo the horizontal pass (low lane only is meaningful):
		// xmm1 = s0, xmm0 = s1, xmm5 = d0, xmm4 = d1.
		addpd		xmm1, xmm0;
		addpd		xmm5, xmm4;
		addpd		xmm0, xmm0;
		addpd		xmm4, xmm4;
		subpd		xmm0, xmm1;
		subpd		xmm4, xmm5;
		// xmm1 = {s0, s1}, xmm5 = {d0, d1}.
		shufpd		xmm1, xmm0, 0;
		shufpd		xmm5, xmm4, 0;
		// Undo the vertical pass: xmm5 = row0 {a0, a1}, xmm1 = row1 {b0, b1}.
		addpd		xmm5, xmm1;
		addpd		xmm1, xmm1;
		subpd		xmm1, xmm5;
		// Doubles -> int32 (truncating): xmm5 = {a0, a1, 0, 0},
		// xmm1 = {b0, b1, 0, 0} as dwords.
		cvttpd2dq	xmm5, xmm5;
		cvttpd2dq	xmm1, xmm1;
		// Merge both rows into one register BEFORE packing so the four
		// pixels end up adjacent: xmm5 = {a0, a1, b0, b1} as dwords.
		shufpd		xmm5, xmm1, 0;
		// Narrow with saturation: low dword of xmm5 = bytes a0, a1, b0, b1.
		packssdw	xmm5, xmm5;
		packuswb	xmm5, xmm5;
		// Low word -> row0, high word -> row1 (stosw stores ax at [edi]).
		movd		eax_ptr, xmm5;
		mov			[esi_ptr], ax;
		shr			eax_ptr, 16;
		stosw;
loop_end:
	}
}

// Single-level 2-D Haar decomposition of an 8-bit single-channel image.
//
// CA, CH, CV, CD : pre-allocated CV_64FC1 outputs, each at least
//                  (I.rows / 2) x (I.cols / 2). Only that top-left region is
//                  written. The function does not allocate them.
// I              : CV_8UC1 input image.
//
// The function returns without touching the outputs when a type or size
// requirement is not met (same silent-return convention as before).
//
// Changes relative to the previous revision:
//  * rows are addressed through each Mat's step instead of assuming that
//    consecutive rows are `cols` elements apart, so ROIs / padded Mats work;
//  * an odd trailing image row is skipped instead of being paired with
//    memory past the end of the image;
//  * the input depth and the output sizes are validated, so an undersized
//    output can no longer be overrun.
// A trailing odd column is ignored by dwt2_row.
inline void dwt2(__out cv::Mat& CA,
				 __out cv::Mat& CH,
				 __out cv::Mat& CV,
				 __out cv::Mat& CD,
				 __in cv::Mat const& I)
{
	if(CA.type() != CV_64FC1 || CH.type() != CV_64FC1 || CV.type() != CV_64FC1 || CD.type() != CV_64FC1 || I.type() != CV_8UC1)
		return;

	// Number of coefficient rows / columns produced per output plane.
	const int halfRows = I.rows / 2;
	const int halfCols = I.cols / 2;

	if(CA.rows < halfRows || CA.cols < halfCols ||
	   CH.rows < halfRows || CH.cols < halfCols ||
	   CV.rows < halfRows || CV.cols < halfCols ||
	   CD.rows < halfRows || CD.cols < halfCols)
		return;

	const size_t imageStep = I.step[0];	// bytes between consecutive image rows

	for(int r = 0; r < halfRows; ++r)
	{
		// dwt2_row takes non-const pixel pointers but only reads from them;
		// I.data is a plain uchar* even on a const Mat.
		unsigned char* upper = I.data + static_cast<size_t>(r) * 2 * imageStep;
		unsigned char* lower = upper + imageStep;

		dwt2_row(CA.ptr<double>(r),
				 CH.ptr<double>(r),
				 CV.ptr<double>(r),
				 CD.ptr<double>(r),
				 upper,
				 lower,
				 I.cols);
	}
}

// Single-level 2-D inverse Haar transform into an 8-bit single-channel image.
//
// I              : pre-allocated CV_8UC1 output image; its size determines
//                  how many coefficients are consumed. The function does not
//                  allocate it.
// CA, CH, CV, CD : CV_64FC1 coefficient planes, each at least
//                  (I.rows / 2) x (I.cols / 2).
//
// The function returns without touching I when a type or size requirement is
// not met (same silent-return convention as before).
//
// Changes relative to the previous revision:
//  * rows are addressed through each Mat's step instead of assuming that
//    consecutive rows are `cols` elements apart, so ROIs / padded Mats work;
//  * an odd trailing image row is left untouched instead of causing a write
//    to the row past the end of the image;
//  * the output depth and the coefficient plane sizes are validated, so
//    undersized planes can no longer be over-read.
// A trailing odd column is not written by idwt2_row.
inline void idwt2(__out cv::Mat& I,
				  __in cv::Mat const& CA,
				  __in cv::Mat const& CH,
				  __in cv::Mat const& CV,
				  __in cv::Mat const& CD)
{
	if(CA.type() != CV_64FC1 || CH.type() != CV_64FC1 || CV.type() != CV_64FC1 || CD.type() != CV_64FC1 || I.type() != CV_8UC1)
		return;

	// Number of coefficient rows / columns consumed per plane.
	const int halfRows = I.rows / 2;
	const int halfCols = I.cols / 2;

	if(CA.rows < halfRows || CA.cols < halfCols ||
	   CH.rows < halfRows || CH.cols < halfCols ||
	   CV.rows < halfRows || CV.cols < halfCols ||
	   CD.rows < halfRows || CD.cols < halfCols)
		return;

	for(int r = 0; r < halfRows; ++r)
	{
		// idwt2_row takes non-const coefficient pointers but only reads
		// through them, hence the const_casts.
		idwt2_row(I.ptr<unsigned char>(2 * r),
				  I.ptr<unsigned char>(2 * r + 1),
				  const_cast<double*>(CA.ptr<double>(r)),
				  const_cast<double*>(CH.ptr<double>(r)),
				  const_cast<double*>(CV.ptr<double>(r)),
				  const_cast<double*>(CD.ptr<double>(r)),
				  I.cols);
	}
}


版权归作者所有,转载请注明出处!

 

你可能感兴趣的:(Algorithm,Algorithm,matlab,matlab,matlab,opencv,opencv,opencv,opencv,opencv,SIMD,SSE2)