MMX and SSE asm routines for per-pixel alpha-blending

http://www.softkam.ro/index.php?pid=alphablend
//  Copyright (C) 2005 SoftKAM. All Rights Reserved.
//
//  AlphaBlend is free software; you can redistribute it and/or modify
//  it under the terms of the GNU General Public License as published by
//  the Free Software Foundation; either version 2 of the License, or
//  (at your option) any later version.
//
//  This program is distributed in the hope that it will be useful,
//  but WITHOUT ANY WARRANTY; without even the implied warranty of
//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//  GNU General Public License for more details.
//
//  You should have received a copy of the GNU General Public License
//  along with this program; if not, write to the Free Software
//  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307,
//  USA.

// alpha.h - public interface of the per-pixel alpha-blending routines.
#ifndef _ALPHA_H_INCLUDED
#define _ALPHA_H_INCLUDED

// Three entry points sharing one signature:
//   dst  - destination pixel buffer, modified in place
//   src  - source pixel buffer
//   w, h - image dimensions
// NOTE(review): presumably w and h are in pixels and both buffers hold
// w * h pixels of 4 bytes each - confirm against the implementation.
void AlphaBlt(unsigned char *dst, unsigned char *src, int w, int h);		// plain C version
void AlphaBltMMX(unsigned char *dst, unsigned char *src, int w, int h);	// MMX version (by name)
void AlphaBltSSE(unsigned char *dst, unsigned char *src, int w, int h);	// SSE version (by name)

#endif // _ALPHA_H_INCLUDED

//  Copyright (C) 2005 SoftKAM. All Rights Reserved.
//
//  AlphaBlend is free software; you can redistribute it and/or modify
//  it under the terms of the GNU General Public License as published by
//  the Free Software Foundation; either version 2 of the License, or
//  (at your option) any later version.
//
//  This program is distributed in the hope that it will be useful,
//  but WITHOUT ANY WARRANTY; without even the implied warranty of
//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//  GNU General Public License for more details.
//
//  You should have received a copy of the GNU General Public License
//  along with this program; if not, write to the Free Software
//  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307,
//  USA.

#include "alpha.h"

/******************************************************************************/
// AlphaBlt - portable C per-pixel alpha blend of src onto dst, in place.
//
// Both buffers are read as w * h consecutive pixels of 4 bytes each; byte 3
// of every pixel is its alpha (0 - 255). For each of the first three bytes:
//     dst = (src_alpha * (src - dst) + dst * 256) / 256
// and the destination alpha byte is overwritten with the source alpha.
// (With alpha == 255 the result is (255 * src + dst) / 256, i.e. very close
// to, but not always exactly, src.)
//
//   dst  - destination pixels, updated in place
//   src  - source pixels (read only)
//   w, h - image size in pixels; nothing is done unless both are positive
void AlphaBlt(unsigned char *dst, unsigned char *src, int w, int h)
{
	int x, y;
	int alpha;

	// Reject empty and negative sizes up front. Walking the buffers row by
	// row also avoids forming w * h * 4, which could overflow a signed int
	// (and would be positive when both dimensions are negative).
	if (w <= 0 || h <= 0) return;

	for (y = 0; y < h; y++)
	{
		for (x = 0; x < w; x++)
		{
			alpha = src[3];
			// The numerator is never negative: its minimum is the old dst
			// value, so the integer division behaves like a shift by 8.
			dst[0] = (unsigned char)((alpha * (src[0] - dst[0]) + dst[0] * 256) / 256);
			dst[1] = (unsigned char)((alpha * (src[1] - dst[1]) + dst[1] * 256) / 256);
			dst[2] = (unsigned char)((alpha * (src[2] - dst[2]) + dst[2] * 256) / 256);
			dst[3] = (unsigned char)alpha;
			src += 4;
			dst += 4;
		}
	}
}

/******************************************************************************/
// AlphaBltSSE - per-pixel alpha blend of src onto dst using MMX registers
// plus the PSHUFW instruction (MSVC-style 32-bit x86 inline assembly).
//
// Pixels are 4 bytes each with alpha in byte 3. Every byte lane of a pixel,
// including the alpha byte itself, is computed as
//     dst = (src_alpha * (src - dst) + dst * 256) / 256
// using 16-bit lanes; the sum always fits in an unsigned 16-bit word, so the
// wrap-around arithmetic of PSUBW/PMULLW/PADDW yields the exact result.
// NOTE(review): the destination alpha is blended here rather than copied
// from the source - confirm which behaviour callers expect.
//
//   dst  - destination pixels, updated in place
//   src  - source pixels (read only)
//   w, h - image size in pixels; nothing is done unless both are positive
//
// Scanlines are processed two pixels at a time; when w is odd the last pixel
// of each scanline is blended on its own. EMMS is executed before returning
// so that x87 floating-point code can run afterwards.
void AlphaBltSSE(unsigned char *dst, unsigned char *src, int w, int h)
{
	int	wmul4;		// bytes per scanline
	int	pairs;		// number of 2-pixel groups per scanline
	int	tail;		// 1 when w is odd (one leftover pixel per scanline)

	// A zero iteration count would make the counted loops below wrap around
	// and run ~2^32 times, so empty/negative sizes must never reach them.
	if (w <= 0 || h <= 0) return;
	wmul4 = w << 2;
	pairs = w >> 1;
	tail = w & 1;
	_asm {
// Register roles: esi = src scanline, edi = dst scanline, edx = scanlines left,
// ecx = pixel pairs left, ebx = pair index, mm6/mm7 = zero (for unpacking)
		mov			edi,dst
		mov			esi,src
		mov			edx,h
		pxor		mm6,mm6
		pxor		mm7,mm7
scan_loop:
		mov			ecx,pairs
		xor			ebx,ebx
		test		ecx,ecx				// w == 1: no full pair in this scanline
		jz			tail_pixel
pix_loop:
		movq		mm4,[esi+ebx*8]		// mm4 = two src pixels
		movq		mm5,[edi+ebx*8]		// mm5 = two dst pixels
// FIRST PIXEL (low 4 bytes)
		movq		mm0,mm4				// mm0 = src pair
		movq		mm1,mm5				// mm1 = dst pair
		punpcklbw	mm0,mm6				// mm0 = src pixel 0, bytes widened to words
		punpcklbw	mm1,mm7				// mm1 = dst pixel 0, bytes widened to words
		pshufw		mm2,mm0,0ffh		// mm2 = src alpha (word 3) in all 4 words
		movq		mm3,mm1				// mm3 = dst
		psubw		mm0,mm1				// mm0 = src - dst
		psllw		mm3,8				// mm3 = dst * 256
		pmullw		mm0,mm2				// mm0 = (src - dst) * alpha
		paddw		mm0,mm3				// mm0 = (src - dst) * alpha + dst * 256
		psrlw		mm0,8				// mm0 = ((src - dst) * alpha + dst * 256) / 256
// SECOND PIXEL (high 4 bytes)
		punpckhbw	mm5,mm7				// mm5 = dst pixel 1, bytes widened to words
		punpckhbw	mm4,mm6				// mm4 = src pixel 1, bytes widened to words
		movq		mm3,mm5				// mm3 = dst
		pshufw		mm2,mm4,0ffh		// mm2 = src alpha in all 4 words
		psllw		mm3,8				// mm3 = dst * 256
		psubw		mm4,mm5				// mm4 = src - dst
		pmullw		mm4,mm2				// mm4 = (src - dst) * alpha
		paddw		mm4,mm3				// mm4 = (src - dst) * alpha + dst * 256
		psrlw		mm4,8				// mm4 = ((src - dst) * alpha + dst * 256) / 256
		packuswb	mm0,mm4				// mm0 = both result pixels, words back to bytes
		movq		[edi+ebx*8],mm0		// store two dst pixels
		inc			ebx
		dec			ecx
		jnz			pix_loop
tail_pixel:
// Odd width: ebx == pairs here, so ebx*8 is the byte offset of the last pixel
		mov			eax,tail
		test		eax,eax
		jz			next_scan
		movd		mm0,dword ptr [esi+ebx*8]	// mm0 = last src pixel
		movd		mm1,dword ptr [edi+ebx*8]	// mm1 = last dst pixel
		punpcklbw	mm0,mm6				// widen src bytes to words
		punpcklbw	mm1,mm7				// widen dst bytes to words
		pshufw		mm2,mm0,0ffh		// mm2 = src alpha in all 4 words
		movq		mm3,mm1				// mm3 = dst
		psubw		mm0,mm1				// mm0 = src - dst
		psllw		mm3,8				// mm3 = dst * 256
		pmullw		mm0,mm2				// mm0 = (src - dst) * alpha
		paddw		mm0,mm3				// mm0 = (src - dst) * alpha + dst * 256
		psrlw		mm0,8				// mm0 = ((src - dst) * alpha + dst * 256) / 256
		packuswb	mm0,mm6				// low dword = result pixel
		movd		dword ptr [edi+ebx*8],mm0	// store one dst pixel
next_scan:
		mov			ebx, wmul4
		add			esi, ebx			// advance both pointers one scanline
		add			edi, ebx
		dec			edx
		jnz			scan_loop
		emms							// leave MMX state so x87 code works again
	}
}

/******************************************************************************/
// AlphaBltMMX - per-pixel alpha blend of src onto dst using only baseline MMX
// instructions (MSVC-style 32-bit x86 inline assembly).
//
// Pixels are 4 bytes each with alpha in byte 3. Every byte lane of a pixel,
// including the alpha byte itself, is computed as
//     dst = (src_alpha * (src - dst) + dst * 256) / 256
// using 16-bit lanes; the sum always fits in an unsigned 16-bit word, so the
// wrap-around arithmetic of PSUBW/PMULLW/PADDW yields the exact result.
// NOTE(review): the destination alpha is blended here rather than copied
// from the source - confirm which behaviour callers expect.
//
//   dst  - destination pixels, updated in place
//   src  - source pixels (read only)
//   w, h - image size in pixels; nothing is done unless both are positive
//
// Scanlines are processed two pixels at a time; when w is odd the last pixel
// of each scanline is blended on its own. EMMS is executed before returning
// so that x87 floating-point code can run afterwards.
void AlphaBltMMX(unsigned char *dst, unsigned char *src, int w, int h)
{
	int	wmul4;		// bytes per scanline
	int	pairs;		// number of 2-pixel groups per scanline
	int	tail;		// 1 when w is odd (one leftover pixel per scanline)

	// A zero iteration count would make the counted loops below wrap around
	// and run ~2^32 times, so empty/negative sizes must never reach them.
	if (w <= 0 || h <= 0) return;
	wmul4 = w << 2;
	pairs = w >> 1;
	tail = w & 1;
	_asm {
// Register roles: esi = src scanline, edi = dst scanline, edx = scanlines left,
// ecx = pixel pairs left, ebx = pair index, eax = scratch (alpha / tail flag),
// mm6/mm7 = zero (for unpacking)
		mov			edi,dst
		mov			esi,src
		mov			edx,h
		pxor		mm6,mm6
		pxor		mm7,mm7
scan_loop:
		mov			ecx,pairs
		xor			ebx,ebx
		test		ecx,ecx				// w == 1: no full pair in this scanline
		jz			tail_pixel
pix_loop:
		movq		mm4,[esi+ebx*8]		// mm4 = two src pixels
		movq		mm5,[edi+ebx*8]		// mm5 = two dst pixels
// FIRST PIXEL (low 4 bytes)
		movq		mm0,mm4				// mm0 = src pair
		movq		mm1,mm5				// mm1 = dst pair
		punpcklbw	mm0,mm6				// mm0 = src pixel 0, bytes widened to words
		movzx		eax,byte ptr [esi+ebx*8+3]	// eax = src pixel 0 alpha (0 - 255)
		punpcklbw	mm1,mm7				// mm1 = dst pixel 0, bytes widened to words
		movd		mm2,eax				// word 0 = alpha
		movq		mm3,mm1				// mm3 = dst
		punpcklwd	mm2,mm2				// words 0-1 = alpha
		psubw		mm0,mm1				// mm0 = src - dst
		punpckldq	mm2,mm2				// words 0-3 = alpha
		psllw		mm3,8				// mm3 = dst * 256
		pmullw		mm0,mm2				// mm0 = (src - dst) * alpha
		paddw		mm0,mm3				// mm0 = (src - dst) * alpha + dst * 256
		psrlw		mm0,8				// mm0 = ((src - dst) * alpha + dst * 256) / 256
		packuswb	mm0,mm6				// low dword = result pixel 0
// SECOND PIXEL (high 4 bytes)
		punpckhbw	mm4,mm6				// mm4 = src pixel 1, bytes widened to words
		movzx		eax,byte ptr [esi+ebx*8+7]	// eax = src pixel 1 alpha (0 - 255)
		punpckhbw	mm5,mm7				// mm5 = dst pixel 1, bytes widened to words
		movd		mm2,eax				// word 0 = alpha
		movq		mm3,mm5				// mm3 = dst
		punpcklwd	mm2,mm2				// words 0-1 = alpha
		psubw		mm4,mm5				// mm4 = src - dst
		punpckldq	mm2,mm2				// words 0-3 = alpha
		psllw		mm3,8				// mm3 = dst * 256
		pmullw		mm4,mm2				// mm4 = (src - dst) * alpha
		paddw		mm4,mm3				// mm4 = (src - dst) * alpha + dst * 256
		psrlw		mm4,8				// mm4 = ((src - dst) * alpha + dst * 256) / 256
		packuswb	mm4,mm6				// low dword = result pixel 1
		punpckldq	mm0,mm4				// mm0 = result pixel 0 (low), pixel 1 (high)
		movq		[edi+ebx*8],mm0		// store two dst pixels
		inc			ebx
// REPEAT
		dec			ecx
		jnz			pix_loop
tail_pixel:
// Odd width: ebx == pairs here, so ebx*8 is the byte offset of the last pixel
		mov			eax,tail
		test		eax,eax
		jz			next_scan
		movd		mm0,dword ptr [esi+ebx*8]	// mm0 = last src pixel
		movd		mm1,dword ptr [edi+ebx*8]	// mm1 = last dst pixel
		punpcklbw	mm0,mm6				// widen src bytes to words
		movzx		eax,byte ptr [esi+ebx*8+3]	// eax = last src pixel alpha
		punpcklbw	mm1,mm7				// widen dst bytes to words
		movd		mm2,eax				// word 0 = alpha
		movq		mm3,mm1				// mm3 = dst
		punpcklwd	mm2,mm2				// words 0-1 = alpha
		psubw		mm0,mm1				// mm0 = src - dst
		punpckldq	mm2,mm2				// words 0-3 = alpha
		psllw		mm3,8				// mm3 = dst * 256
		pmullw		mm0,mm2				// mm0 = (src - dst) * alpha
		paddw		mm0,mm3				// mm0 = (src - dst) * alpha + dst * 256
		psrlw		mm0,8				// mm0 = ((src - dst) * alpha + dst * 256) / 256
		packuswb	mm0,mm6				// low dword = result pixel
		movd		dword ptr [edi+ebx*8],mm0	// store one dst pixel
next_scan:
		mov			ebx, wmul4
		add			esi, ebx			// advance both pointers one scanline
		add			edi, ebx
		dec			edx
		jnz			scan_loop
		emms							// leave MMX state so x87 code works again
	}
}



你可能感兴趣的:(MMX and SSE asm routines for per-pixel alpha-blending)