图像处理的简单优化-06: SIMD

软件方面且先到此为止。再试试硬件的威力。这里只考虑Intel的CPU。

关于SIMD及为CPU做优化的指引文章请参考这里。


内存的特殊要求

SIMD使用的内存必须在16字节边界对齐,使用_mm_malloc()函数很容易就能实现。另一个问题是,如果扫描线长度不是16字节的整数倍,下一条扫描线就不会从16字节边界开始,从而导致要求对齐访问的_mm_xx_xxx()函数出错。所以,需要把扫描线长度补齐到16字节的整数倍。这会稍微浪费一点内存,但相对于性能的提升,还是值得的。

先看内存分配(__USE_SSE__是自己定义的开关):

        virtual bool create(unsigned int width, unsigned int height)
		{
			if (this->close())
			{
				m_width = width;
				m_height = height;
				m_stride = width * BYTES_PER_PIXEL;
#ifdef __USE_SSE__
				m_data = (unsigned char *)_mm_malloc(m_height * (m_stride + 16 - (m_stride & 3)), 16);
#else
				m_data = (unsigned char *) malloc(m_height * m_stride);
#endif
				if (m_data != NULL)
				{
					return true;
				}
			}

			return false;
		}

相应地,释放内存也要调用_mm_free():

        // Releases the pixel buffer (if one is held) with the deallocator that
        // matches the allocator selected by __USE_SSE__, then zeroes the image
        // dimensions. Always returns true.
        virtual bool close()
		{
			if (m_data != NULL)
			{
#ifndef __USE_SSE__
				free(m_data);
#else
				// Memory obtained from _mm_malloc() must go back through _mm_free().
				_mm_free(m_data);
#endif
				m_data = NULL;
			}

			m_height = 0;
			m_width = m_height;
			return true;
		}

使用SIMD进行灰度处理的类

该类的定义如下:

// Effect subclass that carries the SIMD grayscale implementation.
// This listing is abbreviated: "...." marks members and parameters that the
// article leaves out.
class SIMD_Grayscale: public Effect
{
....
public:
    // Entry point of the effect; returns bool. The parameter list is elided
    // here - the definition shown in the article takes (buffer, width, height).
    virtual bool execute(....);

....
}

SIMD的灰度算法

    // Converts two pixels, already widened to eight 16-bit lanes in memory
    // order (B, G, R, X, B, G, R, X), to gray.
    //
    // factor holds the per-channel 8.8 fixed-point weights in the same lane
    // order; redMask / greenMask / blueMask each select the high byte of one
    // channel's product. The result has the gray value (0..255) in the B, G
    // and R lanes of each pixel and 0 in the X lane.
    static inline __m128i grayscalePixelPair(const __m128i& pixels16, const __m128i& factor,
			const __m128i& redMask, const __m128i& greenMask, const __m128i& blueMask)
	{
		// Fixed-point multiply: every lane becomes channel * weight (fits in 16 bits).
		const __m128i weighted = _mm_mullo_epi16(pixels16, factor);

		// Isolate the high byte of each channel's product.
		const __m128i red = _mm_and_si128(weighted, redMask);
		__m128i green = _mm_and_si128(weighted, greenMask);
		__m128i blue = _mm_and_si128(weighted, blueMask);

		// Red sits in lane 2 of each pixel; green (lane 1) and blue (lane 0)
		// are moved up to that lane. A left shift moves towards higher lanes.
		green = _mm_slli_epi64(green, 16);
		blue = _mm_slli_epi64(blue, 32);

		// Sum the three contributions and undo the 8.8 fixed-point scaling.
		// The masked products add up to at most 0xF800, so no lane wraps.
		__m128i gray = _mm_add_epi16(_mm_add_epi16(red, green), blue);
		gray = _mm_srli_epi16(gray, 8);

		// Copy the gray value from the red lane down into the green and blue lanes.
		const __m128i grayInGreen = _mm_srli_epi64(gray, 16);
		const __m128i grayInBlue = _mm_srli_epi64(gray, 32);
		return _mm_or_si128(gray, _mm_or_si128(grayInGreen, grayInBlue));
	}

    // Converts a 32-bit-per-pixel image to grayscale in place using SSE2,
    // four pixels (16 bytes) per iteration.
    //
    // Pixel layout: each pixel is the 32-bit value 0x00RRGGBB, i.e. bytes
    // B, G, R, 0 in memory. gray = (R*0.21 + G*0.71 + B*0.07) in 8.8 fixed
    // point; it is written to all three colour bytes and the fourth byte is
    // written as 0.
    //
    // Preconditions:
    //  - buffer is 16-byte aligned (aligned loads/stores are used);
    //  - buffer holds at least height * (width / 4 + 1) * 16 bytes, because
    //    every scan line is processed as width / 4 + 1 consecutive 16-byte
    //    chunks, including one spare chunk when width is a multiple of 4.
    //
    // Returns true on success, false when buffer is null and there is work to do.
    virtual bool SIMD_Grayscale::execute(unsigned char* buffer, unsigned int width, unsigned int height)
	{
		if (height == 0)
		{
			return true;	// nothing to convert
		}
		if (!buffer)
		{
			return false;
		}

		// Number of 16-byte chunks per scan line. Same value the former
		// (width * 4 + 16 - (width & 3)) / 16 produced, without the
		// intermediate multiplication.
		const unsigned int chunksPerRow = width / 4 + 1;

		// _mm_setr_epi16 takes its arguments in memory order (lane 0 first),
		// which is the order the unpacked channels appear in: B, G, R, X.
		// (_mm_set_epi16 takes them highest lane first, which made the
		// weights, masks and shifts disagree with each other.)
		const short weightBlue = static_cast<short>(0.07 * 256);
		const short weightGreen = static_cast<short>(0.71 * 256);
		const short weightRed = static_cast<short>(0.21 * 256);
		const __m128i factor = _mm_setr_epi16(weightBlue, weightGreen, weightRed, 0,
				weightBlue, weightGreen, weightRed, 0);

		// High-byte selectors for one channel in each of the two pixels.
		const short highByte = static_cast<short>(0xFF00);
		const __m128i blueMask = _mm_setr_epi16(highByte, 0, 0, 0, highByte, 0, 0, 0);
		const __m128i greenMask = _mm_setr_epi16(0, highByte, 0, 0, 0, highByte, 0, 0);
		const __m128i redMask = _mm_setr_epi16(0, 0, highByte, 0, 0, 0, highByte, 0);

		const __m128i zero = _mm_setzero_si128();
		__m128i* chunk = reinterpret_cast<__m128i*>(buffer);

		for (unsigned int h = 0; h < height; h++)
		{
			for (unsigned int c = 0; c < chunksPerRow; c++)
			{
				const __m128i packed = _mm_load_si128(chunk);

				// Widen the 8-bit channels to 16 bits: low half = pixels 0 and 1,
				// high half = pixels 2 and 3.
				const __m128i gray01 = grayscalePixelPair(_mm_unpacklo_epi8(packed, zero),
						factor, redMask, greenMask, blueMask);
				const __m128i gray23 = grayscalePixelPair(_mm_unpackhi_epi8(packed, zero),
						factor, redMask, greenMask, blueMask);

				// Narrow back to bytes. Unsigned saturation is required (the
				// signed variant clamps every gray above 127), and the first
				// operand lands in the low 8 bytes, so pixels 0/1 go first.
				_mm_store_si128(chunk, _mm_packus_epi16(gray01, gray23));
				++chunk;
			} // for chunks in row
		} // for height

		return true;
	} // func

结果统计

.....
11ms, 11ms, 11ms, 11ms, 11ms, 11ms, 11ms, 12ms, 
11ms, 12ms, 11ms, 12ms, 11ms, 11ms, 11ms, 12ms, 
11ms, 11ms, 11ms, 11ms, 12ms, 11ms, 11ms, 11ms, 
12ms, 12ms, 11ms, 11ms, 11ms, 12ms, 12ms, 11ms, 
11ms, 12ms, 12ms, 12ms, 12ms, 11ms, 11ms, 12ms, 
11ms, 12ms, 11ms, 11ms, 11ms, 12ms, 11ms, 12ms, 
11ms, 12ms, 11ms, 11ms, 11ms, 11ms, 11ms, 11ms, 
12ms, 11ms, 11ms, 11ms, 11ms, 11ms, 11ms, 12ms, 
Max:13.14 Min:10.97
Average:11.71
Done.

可以看到,已经很接近10毫秒了。还尝试了每次处理2组8个像素、4组16个像素以及8组32个像素,性能与每次处理1组4个像素相差不大,都在11至12毫秒之间,始终不能突破10毫秒这个关口。

可惜的是,无论在这里花费多少精力,提升多少性能,最终放到整个运行环境中时,随便一个其它耗时的操作都可以浪费你的努力。比如,将其放到网络环境下,随便一个HTTP请求都会消耗掉几十到数百毫秒。凡此种种,会让人觉得这种努力毫无用处。




你可能感兴趣的:(C++,优化,SIMD,图像处理)