这次performance的实验质量一般,尤其是第二个,有点脑筋急转弯的感觉。可能也是因此,新的CSAPP已经取消了此lab
第一个旋转(rotate)实验本质上是优化如下的赋值语句
dst[RIDX(dim-1-j, i, dim)] = src[RIDX(i, j, dim)];
dst的行变化时对应src的列变化
原始代码为
/*
 * naive_rotate - baseline rotation of a dim x dim image.
 *
 * Every source pixel at (row, col) is copied to destination position
 * (dim-1-col, row).  The source is traversed with row as the outer index
 * and col as the inner index, so each inner-loop step targets a different
 * first index in dst.
 * NOTE(review): RIDX is defined outside this block; presumably it maps
 * (i, j, n) to the row-major offset i*n + j - confirm.
 */
void naive_rotate(int dim, pixel *src, pixel *dst)
{
    int row = 0;

    while (row < dim) {
        int col;

        for (col = 0; col < dim; col++) {
            dst[RIDX(dim - 1 - col, row, dim)] = src[RIDX(row, col, dim)];
        }
        row++;
    }
}
这种写法按行顺序访问src,可以使得src的访问命中率达到最优;但对于dst,同一个i下的每一个j都落在不同的行上,内存跨度大,命中率低。
如果交换内外两层循环(外层遍历j,内层遍历i),则与之前恰好相反:这样可以使得dst的访问命中率达到最优,但对于src,同一个j下的每一个i都要按不同行去遍历,内存跨度大,命中率低。
两者本质上相同,换汤不换药
由题目可知,图片均为正方形格式,且长宽为128 256 512等,为了增强命中率,可以每次按4、8、16大小的块来处理方程。这样跨度不会过大,最多也就是左右(上下) 4、8、16列(行)
/*
 * rotate - tiled version of the image rotation.
 *
 * Performs the same assignment as the naive version,
 *     dst[RIDX(dim-1-j, i, dim)] = src[RIDX(i, j, dim)],
 * but walks the image in batchSize x batchSize tiles so that the indices
 * touched in src and dst stay within a batchSize-wide band per tile.
 *
 * dim: side length of the square image
 * src: input image, dim*dim pixels
 * dst: output image, dim*dim pixels
 *
 * The end of each tile is clamped to dim, so a dim that is not a multiple
 * of batchSize is handled without indexing past the image.
 */
void rotate(int dim, pixel *src, pixel *dst)
{
    int i, j, ii, jj;
    int i_end, j_end;
    int sdim = dim - 1;
    int batchSize = 8;

    for (i = 0; i < dim; i += batchSize)
    {
        /* Last tile row may be shorter than batchSize. */
        i_end = (i + batchSize < dim) ? (i + batchSize) : dim;

        for (j = 0; j < dim; j += batchSize)
        {
            /* Last tile column may be narrower than batchSize. */
            j_end = (j + batchSize < dim) ? (j + batchSize) : dim;

            for (ii = i; ii < i_end; ii++)
            {
                for (jj = j; jj < j_end; jj++)
                {
                    dst[RIDX(sdim - jj, ii, dim)] = src[RIDX(ii, jj, dim)];
                }
            }
        }
    }
}
第二个平滑(smooth)实验,我愿称之为脑筋急转弯…原始代码为
/*
 * naive_smooth - baseline smoothing pass over a dim x dim image.
 *
 * For every position (row, col) the result of avg(dim, row, col, src) is
 * stored into dst at the same position.  avg is defined outside this
 * block.
 */
void naive_smooth(int dim, pixel *src, pixel *dst)
{
    int row, col;

    for (row = 0; row < dim; row++) {
        for (col = 0; col < dim; col++) {
            pixel averaged = avg(dim, row, col, src);

            dst[RIDX(row, col, dim)] = averaged;
        }
    }
}
主要的问题出在 avg函数中,avg函数因为是对周围一圈的像素做average,所以对于边界条件需要判断。边界的四个边在整个中占极小部分,却占用了一大部分的分支预测。故优化代码的思路就是简单的把各个角与边界的情形写出,然后把非边界的情形也写出…
/*
 * smooth - smoothing pass with the boundary cases written out explicitly.
 *
 * Each destination pixel is the per-channel integer average of the source
 * pixel and its neighbours that lie inside the image:
 *   - the four corners average 4 pixels,
 *   - the four edges (corners excluded) average 6 pixels,
 *   - every interior pixel averages the full 3x3 window (9 pixels).
 * Handling each region in its own straight-line section removes the
 * per-pixel boundary checks from the hot interior loop.
 *
 * dim: side length of the square image
 * src: input image, dim*dim pixels
 * dst: output image, dim*dim pixels
 *
 * Requires dim >= 2: the corner sections read row/column 1 (or dim-2)
 * unconditionally.
 */
void smooth(int dim, pixel *src, pixel *dst)
{
    int i, j;
    pixel current_pixel;
    /* Scratch pixel that is filled channel by channel, then copied to dst. */
    pixel *pcurrent_pixel = &current_pixel;

    /* Top-left corner */
    pcurrent_pixel->red =
        (unsigned short)(((int)(src[RIDX(0, 0, dim)].red + src[RIDX(0, 1, dim)].red +
                                src[RIDX(1, 0, dim)].red + src[RIDX(1, 1, dim)].red)) /
                         4);
    pcurrent_pixel->green =
        (unsigned short)(((int)(src[RIDX(0, 0, dim)].green + src[RIDX(0, 1, dim)].green +
                                src[RIDX(1, 0, dim)].green + src[RIDX(1, 1, dim)].green)) /
                         4);
    pcurrent_pixel->blue =
        (unsigned short)(((int)(src[RIDX(0, 0, dim)].blue + src[RIDX(0, 1, dim)].blue +
                                src[RIDX(1, 0, dim)].blue + src[RIDX(1, 1, dim)].blue)) /
                         4);
    dst[RIDX(0, 0, dim)] = current_pixel;

    /* Top-right corner */
    i = 0;
    j = dim - 1;
    pcurrent_pixel->red =
        (unsigned short)(((int)(src[RIDX(i, j, dim)].red + src[RIDX(i + 1, j, dim)].red +
                                src[RIDX(i, j - 1, dim)].red + src[RIDX(i + 1, j - 1, dim)].red)) /
                         4);
    pcurrent_pixel->green =
        (unsigned short)(((int)(src[RIDX(i, j, dim)].green + src[RIDX(i + 1, j, dim)].green +
                                src[RIDX(i, j - 1, dim)].green + src[RIDX(i + 1, j - 1, dim)].green)) /
                         4);
    pcurrent_pixel->blue =
        (unsigned short)(((int)(src[RIDX(i, j, dim)].blue + src[RIDX(i + 1, j, dim)].blue +
                                src[RIDX(i, j - 1, dim)].blue + src[RIDX(i + 1, j - 1, dim)].blue)) /
                         4);
    dst[RIDX(i, j, dim)] = current_pixel;

    /* Bottom-left corner */
    i = dim - 1;
    j = 0;
    pcurrent_pixel->red =
        (unsigned short)(((int)(src[RIDX(i, j, dim)].red + src[RIDX(i - 1, j, dim)].red +
                                src[RIDX(i, j + 1, dim)].red + src[RIDX(i - 1, j + 1, dim)].red)) /
                         4);
    pcurrent_pixel->green =
        (unsigned short)(((int)(src[RIDX(i, j, dim)].green + src[RIDX(i - 1, j, dim)].green +
                                src[RIDX(i, j + 1, dim)].green + src[RIDX(i - 1, j + 1, dim)].green)) /
                         4);
    pcurrent_pixel->blue =
        (unsigned short)(((int)(src[RIDX(i, j, dim)].blue + src[RIDX(i - 1, j, dim)].blue +
                                src[RIDX(i, j + 1, dim)].blue + src[RIDX(i - 1, j + 1, dim)].blue)) /
                         4);
    dst[RIDX(i, j, dim)] = current_pixel;

    /* Bottom-right corner */
    i = dim - 1;
    j = dim - 1;
    pcurrent_pixel->red =
        (unsigned short)(((int)(src[RIDX(i, j, dim)].red + src[RIDX(i - 1, j, dim)].red +
                                src[RIDX(i, j - 1, dim)].red + src[RIDX(i - 1, j - 1, dim)].red)) /
                         4);
    pcurrent_pixel->green =
        (unsigned short)(((int)(src[RIDX(i, j, dim)].green + src[RIDX(i - 1, j, dim)].green +
                                src[RIDX(i, j - 1, dim)].green + src[RIDX(i - 1, j - 1, dim)].green)) /
                         4);
    pcurrent_pixel->blue =
        (unsigned short)(((int)(src[RIDX(i, j, dim)].blue + src[RIDX(i - 1, j, dim)].blue +
                                src[RIDX(i, j - 1, dim)].blue + src[RIDX(i - 1, j - 1, dim)].blue)) /
                         4);
    dst[RIDX(i, j, dim)] = current_pixel;

    /* First column (corners excluded): neighbours exist only to the right */
    j = 0;
    for (i = 1; i < dim - 1; i++)
    {
        pcurrent_pixel->red =
            (unsigned short)(((int)(src[RIDX(i - 1, j, dim)].red + src[RIDX(i - 1, j + 1, dim)].red +
                                    src[RIDX(i, j, dim)].red + src[RIDX(i, j + 1, dim)].red +
                                    src[RIDX(i + 1, j, dim)].red + src[RIDX(i + 1, j + 1, dim)].red)) /
                             6);
        pcurrent_pixel->green =
            (unsigned short)(((int)(src[RIDX(i - 1, j, dim)].green + src[RIDX(i - 1, j + 1, dim)].green +
                                    src[RIDX(i, j, dim)].green + src[RIDX(i, j + 1, dim)].green +
                                    src[RIDX(i + 1, j, dim)].green + src[RIDX(i + 1, j + 1, dim)].green)) /
                             6);
        pcurrent_pixel->blue =
            (unsigned short)(((int)(src[RIDX(i - 1, j, dim)].blue + src[RIDX(i - 1, j + 1, dim)].blue +
                                    src[RIDX(i, j, dim)].blue + src[RIDX(i, j + 1, dim)].blue +
                                    src[RIDX(i + 1, j, dim)].blue + src[RIDX(i + 1, j + 1, dim)].blue)) /
                             6);
        dst[RIDX(i, j, dim)] = current_pixel;
    }

    /* Last row (corners excluded): neighbours exist only above */
    i = dim - 1;
    for (j = 1; j < dim - 1; j++)
    {
        pcurrent_pixel->red =
            (unsigned short)(((int)(src[RIDX(i, j, dim)].red + src[RIDX(i - 1, j, dim)].red +
                                    src[RIDX(i, j - 1, dim)].red + src[RIDX(i - 1, j - 1, dim)].red +
                                    src[RIDX(i, j + 1, dim)].red + src[RIDX(i - 1, j + 1, dim)].red)) /
                             6);
        pcurrent_pixel->green =
            (unsigned short)(((int)(src[RIDX(i, j, dim)].green + src[RIDX(i - 1, j, dim)].green +
                                    src[RIDX(i, j - 1, dim)].green + src[RIDX(i - 1, j - 1, dim)].green +
                                    src[RIDX(i, j + 1, dim)].green + src[RIDX(i - 1, j + 1, dim)].green)) /
                             6);
        pcurrent_pixel->blue =
            (unsigned short)(((int)(src[RIDX(i, j, dim)].blue + src[RIDX(i - 1, j, dim)].blue +
                                    src[RIDX(i, j - 1, dim)].blue + src[RIDX(i - 1, j - 1, dim)].blue +
                                    src[RIDX(i, j + 1, dim)].blue + src[RIDX(i - 1, j + 1, dim)].blue)) /
                             6);
        dst[RIDX(i, j, dim)] = current_pixel;
    }

    /* Last column (corners excluded): neighbours exist only to the left */
    j = dim - 1;
    for (i = 1; i < dim - 1; i++)
    {
        pcurrent_pixel->red =
            (unsigned short)(((int)(src[RIDX(i - 1, j, dim)].red + src[RIDX(i - 1, j - 1, dim)].red +
                                    src[RIDX(i, j, dim)].red + src[RIDX(i, j - 1, dim)].red +
                                    src[RIDX(i + 1, j, dim)].red + src[RIDX(i + 1, j - 1, dim)].red)) /
                             6);
        pcurrent_pixel->green =
            (unsigned short)(((int)(src[RIDX(i - 1, j, dim)].green + src[RIDX(i - 1, j - 1, dim)].green +
                                    src[RIDX(i, j, dim)].green + src[RIDX(i, j - 1, dim)].green +
                                    src[RIDX(i + 1, j, dim)].green + src[RIDX(i + 1, j - 1, dim)].green)) /
                             6);
        pcurrent_pixel->blue =
            (unsigned short)(((int)(src[RIDX(i - 1, j, dim)].blue + src[RIDX(i - 1, j - 1, dim)].blue +
                                    src[RIDX(i, j, dim)].blue + src[RIDX(i, j - 1, dim)].blue +
                                    src[RIDX(i + 1, j, dim)].blue + src[RIDX(i + 1, j - 1, dim)].blue)) /
                             6);
        dst[RIDX(i, j, dim)] = current_pixel;
    }

    /* First row (corners excluded): neighbours exist only below */
    i = 0;
    for (j = 1; j < dim - 1; j++)
    {
        pcurrent_pixel->red =
            (unsigned short)(((int)(src[RIDX(i, j, dim)].red + src[RIDX(i + 1, j, dim)].red +
                                    src[RIDX(i, j - 1, dim)].red + src[RIDX(i + 1, j - 1, dim)].red +
                                    src[RIDX(i, j + 1, dim)].red + src[RIDX(i + 1, j + 1, dim)].red)) /
                             6);
        pcurrent_pixel->green =
            (unsigned short)(((int)(src[RIDX(i, j, dim)].green + src[RIDX(i + 1, j, dim)].green +
                                    src[RIDX(i, j - 1, dim)].green + src[RIDX(i + 1, j - 1, dim)].green +
                                    src[RIDX(i, j + 1, dim)].green + src[RIDX(i + 1, j + 1, dim)].green)) /
                             6);
        pcurrent_pixel->blue =
            (unsigned short)(((int)(src[RIDX(i, j, dim)].blue + src[RIDX(i + 1, j, dim)].blue +
                                    src[RIDX(i, j - 1, dim)].blue + src[RIDX(i + 1, j - 1, dim)].blue +
                                    src[RIDX(i, j + 1, dim)].blue + src[RIDX(i + 1, j + 1, dim)].blue)) /
                             6);
        dst[RIDX(i, j, dim)] = current_pixel;
    }

    /* Interior pixels: full 3x3 window, no boundary checks needed */
    for (i = 1; i < dim - 1; i++)
    {
        for (j = 1; j < dim - 1; j++)
        {
            pcurrent_pixel->red =
                (unsigned short)(((int)(src[RIDX(i + 1, j, dim)].red + src[RIDX(i + 1, j - 1, dim)].red +
                                        src[RIDX(i, j, dim)].red + src[RIDX(i - 1, j, dim)].red +
                                        src[RIDX(i, j - 1, dim)].red + src[RIDX(i - 1, j - 1, dim)].red +
                                        src[RIDX(i, j + 1, dim)].red + src[RIDX(i - 1, j + 1, dim)].red +
                                        src[RIDX(i + 1, j + 1, dim)].red)) /
                                 9);
            pcurrent_pixel->green =
                (unsigned short)(((int)(src[RIDX(i + 1, j, dim)].green + src[RIDX(i + 1, j - 1, dim)].green +
                                        src[RIDX(i, j, dim)].green + src[RIDX(i - 1, j, dim)].green +
                                        src[RIDX(i, j - 1, dim)].green + src[RIDX(i - 1, j - 1, dim)].green +
                                        src[RIDX(i, j + 1, dim)].green + src[RIDX(i - 1, j + 1, dim)].green +
                                        src[RIDX(i + 1, j + 1, dim)].green)) /
                                 9);
            pcurrent_pixel->blue =
                (unsigned short)(((int)(src[RIDX(i + 1, j, dim)].blue + src[RIDX(i + 1, j - 1, dim)].blue +
                                        src[RIDX(i, j, dim)].blue + src[RIDX(i - 1, j, dim)].blue +
                                        src[RIDX(i, j - 1, dim)].blue + src[RIDX(i - 1, j - 1, dim)].blue +
                                        src[RIDX(i, j + 1, dim)].blue + src[RIDX(i - 1, j + 1, dim)].blue +
                                        src[RIDX(i + 1, j + 1, dim)].blue)) /
                                 9);
            dst[RIDX(i, j, dim)] = current_pixel;
        }
    }
}
快是快了,但好啰嗦啊…