ISPC官方例程里有很多ISPC和MSVC编译器生成代码性能对比的例子, 这次用个自己写的简单的图像旋转的算法来试试。
当图形软件包仅提供绕坐标系原点旋转的函数时,我们可以通过下列“平移-旋转-平移”操作序列来实现绕任意选定的基准点 (xr, yr) 的旋转:先平移对象,使基准点移到坐标原点;再绕坐标原点旋转;最后反向平移,使基准点回到原来的位置。
按照图里的公式,用旋转后(目标图像)的像素坐标乘以这个复合变换矩阵,就可以得到它在旋转前(源图像)里对应的采样点坐标。
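这个变换序列如图所示。利用矩阵合并可以得到该序列的复合变换矩阵方程:
$$
\begin{bmatrix} 1 & 0 & x_r \\ 0 & 1 & y_r \\ 0 & 0 & 1 \end{bmatrix}
\begin{bmatrix} \cos\theta & -\sin\theta & 0 \\ \sin\theta & \cos\theta & 0 \\ 0 & 0 & 1 \end{bmatrix}
\begin{bmatrix} 1 & 0 & -x_r \\ 0 & 1 & -y_r \\ 0 & 0 & 1 \end{bmatrix}
=
\begin{bmatrix} \cos\theta & -\sin\theta & x_r(1-\cos\theta) + y_r\sin\theta \\ \sin\theta & \cos\theta & y_r(1-\cos\theta) - x_r\sin\theta \\ 0 & 0 & 1 \end{bmatrix}
$$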
可以使用下列形式表示:
$$
T(x_r, y_r) \cdot R(\theta) \cdot T(-x_r, -y_r) = R(x_r, y_r, \theta)
$$
因为矩阵乘法里包含三角函数,计算得到的采样点坐标一般带有小数部分。这时候要基于采样点四周的四个像素的颜色做双线性插值滤波,计算出采样点处的颜色。
// *srcImg, 输入灰度图像的像素buffer, 颜色为8bit灰度图像
// *dstImg, 旋转后图像的buffer,8bit灰度,图像的长宽和srcImg相等
// center_x, center_y, 旋转点的坐标位置
// Width, Height, srcImg和dstImg的长宽
// float RotationAngle 旋转角度
void image_rotate_double(const unsigned char *srcImg, unsigned char *dstImg, float center_x, float center_y, int Width, int Height, float RotationAngle)
{
double angle = (double)RotationAngle*M_PI / 180.0;
double alpha = cos(angle);
double beta = sin(angle);
double m[6];
m[0] = alpha;
m[1] = -beta;
m[2] = (1.0 - alpha) * (double)center_x + beta * (double)center_y;
m[3] = beta;
m[4] = alpha;
m[5] = (1.0 - alpha) * (double)center_y - beta * (double)center_x;
for (int row = 0; row < Height; row++)
for (int col = 0; col < Width; col++) {
double x, y;
int leftX, rightX, topY, bottomY;
double w00, w01, w10, w11;
double fxy;
//计算采样点坐标
x = m[0] * (double)col + m[1] * (double)row + m[2];
y = m[3] * (double)col + m[4] * (double)row + m[5];
leftX = floor(x);
topY = floor(y);
rightX = leftX + 1.0;
bottomY = topY + 1.0;
//采样点四周四个点的权重
w11 = fabs(x - leftX)*fabs(y - topY);
w01 = fabs(1.0 - (x - leftX))*fabs(y - topY);
w10 = fabs(x - leftX)*fabs(1 - (y - topY));
w00 = fabs(1.0 - (x - leftX))*fabs(1.0 - (y - topY));
//判断采样点是不是在srcImg的有效范围里
if ((int)leftX >= 0 && (int)rightX < Width && (int)topY >= 0 && (int)bottomY < Height) {
fxy = (double)srcImg[topY*Width + leftX] * w00 +
(double)srcImg[bottomY*Width + leftX] * w01 +
(double)srcImg[topY*Width + rightX] * w10 +
(double)srcImg[bottomY*Width + rightX] * w11;
//计算结束后把像素clip到[0,255]之间
fxy = round(fxy);
if (fxy < 0)
fxy = 0;
if (fxy > 255)
fxy = 255;
dstImg[row*Width + col] = (unsigned char)(fxy);
}
else
dstImg[row*Width + col] = 0;
};
};
运行一下: 在外面加一个 for 循环执行 180 次,旋转角度从 0 度开始,每次增加 1 度,一直转到 180 度。
MSVC Win64 Release模式下耗时: 4301ms
#define M_PI_D 3.14159265358979323846d

// Rotates an 8-bit grayscale image about an arbitrary pivot, using inverse
// mapping and bilinear interpolation, with all arithmetic done in double.
// Rows are walked by a uniform loop; the columns of each row are spread
// across program instances by foreach.
//
// For every destination pixel (col, row) the sampling position in srcImg is
//   x = cos(a)*col - sin(a)*row + (1 - cos(a))*center_x + sin(a)*center_y
//   y = sin(a)*col + cos(a)*row + (1 - cos(a))*center_y - sin(a)*center_x
// where a is RotateDegree converted to radians. Destination pixels whose
// sampling position lies outside [0, Width-1] x [0, Height-1] are written as 0.
//
// srcImg             input pixel buffer, 8-bit grayscale, indexed as row*Width + col
// dstImg             output buffer, 8-bit grayscale, same width and height as srcImg
// center_x, center_y coordinates of the rotation pivot
// Width, Height      dimensions shared by srcImg and dstImg
// RotateDegree       rotation angle in degrees
export void image_rotate_double_ispc(uniform const uint8 srcImg[], uniform uint8 dstImg[], uniform float center_x, uniform float center_y, uniform int Width, uniform int Height, uniform float RotateDegree)
{
    uniform double angle = (double)RotateDegree * M_PI_D / 180.0;
    uniform double alpha = cos(angle);
    uniform double beta = sin(angle);

    // First two rows of the composite matrix T(center) * R(angle) * T(-center).
    uniform double m[6];
    m[0] = alpha;
    m[1] = -beta;
    m[2] = (1.0 - alpha) * (double)center_x + beta * (double)center_y;
    m[3] = beta;
    m[4] = alpha;
    m[5] = (1.0 - alpha) * (double)center_y - beta * (double)center_x;

    // Largest sampling coordinates that still fall on a source pixel.
    uniform double maxX = (double)(Width - 1);
    uniform double maxY = (double)(Height - 1);

    for (uniform int row = 0; row < Height; row++) {
        foreach (col = 0 ... Width) {
            // Sampling position in the source image.
            double x = m[0] * (double)col + m[1] * (double)row + m[2];
            double y = m[3] * (double)col + m[4] * (double)row + m[5];

            // Range-check in floating point before converting to int, so
            // out-of-range coordinates never reach the index computation.
            if (x >= 0.0 && x <= maxX && y >= 0.0 && y <= maxY) {
                int leftX = (int)floor(x);
                int topY = (int)floor(y);
                int rightX = leftX + 1;
                int bottomY = topY + 1;

                // A sample exactly on the last column/row has no right/bottom
                // neighbour; its fractional part is 0 there, so reusing the
                // edge pixel keeps the read in bounds without affecting the result.
                if (rightX > Width - 1)
                    rightX = Width - 1;
                if (bottomY > Height - 1)
                    bottomY = Height - 1;

                // Fractional offsets inside the pixel cell, both in [0, 1].
                double fx = x - (double)leftX;
                double fy = y - (double)topY;

                // Bilinear weights of the four neighbours.
                double w00 = (1.0 - fx) * (1.0 - fy);   // top-left
                double w01 = (1.0 - fx) * fy;           // bottom-left
                double w10 = fx * (1.0 - fy);           // top-right
                double w11 = fx * fy;                   // bottom-right

                double fxy = (double)srcImg[topY * Width + leftX] * w00 +
                             (double)srcImg[bottomY * Width + leftX] * w01 +
                             (double)srcImg[topY * Width + rightX] * w10 +
                             (double)srcImg[bottomY * Width + rightX] * w11;

                // Round to nearest and clip to [0, 255] before narrowing.
                fxy = round(fxy);
                if (fxy < 0)
                    fxy = 0;
                if (fxy > 255)
                    fxy = 255;
                dstImg[row * Width + col] = (uint8)(fxy);
            }
            else
                dstImg[row * Width + col] = 0;
        }
    }
}
ISPC 版代码下耗时: 1148ms
对于比较复杂的算法,MSVC 的自动矢量化往往会失败,只能退回到 SISD (单指令单数据)的处理方式。对于支持 AVX2 的机器来说,一个 ymm 寄存器的宽度是 256bit,一个 double 数据类型是 64bit,所以使用 AVX2 SIMD 来处理 double 数据,理论上可以得到 256/64=4X 的性能提升。
用了ISPC编译以后,虽然代码还是以单指令单数据的方式写的,但是生成的是SIMD版本,性能的提升是
4301ms/1148ms = 3.74X
我已经非常满意了,没有写任何SIMD汇编,仅仅C代码级的重编译就可以得到3.74X的性能提升, 已经非常接近于4X的理论性能提升了。