CSAPP: 极度舒适的Perflab

最近的lab和pj太多了5555
每天写一点叭…要加油鸭!!

Perflab

共两个函数 rotate & smooth
需要优化性能 CPE越低越好
but竟然没有评分标准的嘛…

5.15:rotate优化至13.9

原始版本:
CSAPP: 极度舒适的Perflab_第1张图片
性能:
CSAPP: 极度舒适的Perflab_第2张图片

改进1:32*32块划分
注意这里调换了原本的 RIDX(dim-1-j, i, dim) 的 i/j 顺序,性能提升不少
CSAPP: 极度舒适的Perflab_第3张图片
性能:
CSAPP: 极度舒适的Perflab_第4张图片
改进2:循环4路展开
CSAPP: 极度舒适的Perflab_第5张图片
性能:
CSAPP: 极度舒适的Perflab_第6张图片

改进3:32*32块+指针
but为什么用指针更好呢?应该是避免重复计算地址叭~
这里是我自己的构思,分32*32块
CSAPP: 极度舒适的Perflab_第7张图片
CSAPP: 极度舒适的Perflab_第8张图片

性能:
CSAPP: 极度舒适的Perflab_第9张图片

改进4:32*dim+指针
这是参考大佬的,32*dim一块,直接整行拷走
CSAPP: 极度舒适的Perflab_第10张图片
CSAPP: 极度舒适的Perflab_第11张图片

今日最佳:
CSAPP: 极度舒适的Perflab_第12张图片

几个很奇怪的地方:
1.同样的代码,大佬们跑出来能到20+??(电脑的问题嘛…
2.指针一次取6个字节,不知道取内存时是不是4字节对齐性能更好,明天继续冲冲冲~

5.17 完结撒花~

发现大家得到的基准都不一样,所以应该是看加速比哒

首先对rotate做了最后的优化,即把所有dp++改为++dp(如果是dp++在汇编中会多用一个寄存器保存加之前的值)
性能:
CSAPP: 极度舒适的Perflab_第13张图片
然后是smooth函数:
原始版本:

/*
 * naive_smooth - baseline smoothing kernel.
 *
 * Visits every (row, col) position of the dim x dim image in row-major
 * order and stores the value returned by avg() for that position into
 * dst.  avg() and RIDX are defined elsewhere in the project.
 */
void naive_smooth(int dim, pixel *src, pixel *dst) 
{
    int row = 0;

    while (row < dim) {
        int col = 0;

        while (col < dim) {
            dst[RIDX(row, col, dim)] = avg(dim, row, col, src);
            col++;
        }
        row++;
    }
}

性能:
CSAPP: 极度舒适的Perflab_第14张图片

改进1:减少对函数的调用,改为define

/*
 * Two-argument minimum / maximum.
 *
 * Every parameter and the whole expansion are parenthesized so that an
 * argument containing a lower-precedence operator (for example `a & b`
 * or `c ? x : y`) is grouped the way the caller wrote it.  Each argument
 * is still evaluated more than once, so do not pass expressions with
 * side effects.
 */
#define min(a, b)  ((a) < (b) ? (a) : (b))
#define max(a, b)  ((a) > (b) ? (a) : (b))

/*
 * smooth_inline - smoothing kernel with the per-pixel averaging written
 * inside the loop body instead of going through a separate avg() call.
 *
 * For each (row, col) the window is the 3x3 neighbourhood clamped to the
 * image: rows max(row-1, 0)..min(row+1, dim-1) and the same for columns.
 * Every pixel in the window is fed to accumulate_sum(), and the result
 * produced by assign_sum_to_pixel() is stored into dst.
 */
void smooth_inline(int dim, pixel *src, pixel *dst) 
{
    int row, col;

    for (row = 0; row < dim; row++) {
        for (col = 0; col < dim; col++) {
            pixel_sum acc;
            pixel result;
            int r, c;
            int r_lo, r_hi, c_lo, c_hi;

            initialize_pixel_sum(&acc);

            /* Clamp the window to the image bounds. */
            r_lo = max(row-1, 0);
            c_lo = max(col-1, 0);
            r_hi = min(row+1, dim-1);
            c_hi = min(col+1, dim-1);

            for (r = r_lo; r <= r_hi; r++) {
                for (c = c_lo; c <= c_hi; c++) {
                    accumulate_sum(&acc, src[RIDX(r, c, dim)]);
                }
            }

            assign_sum_to_pixel(&result, acc);
            dst[RIDX(row, col, dim)] = result;
        }
    }
}

性能忘记截图了…改进不是很大

改进2:重写avg函数,不调用原始函数,按像素位置分别计算:

/*
 * smooth_avg - smoothing kernel with the averaging written out by hand.
 *
 * Pixels are handled by position in the dim x dim image (flat, row-major
 * indexing): the four corners average 4 source pixels (shift by 2), the
 * remaining border pixels average 6, and all other pixels average 9.
 * Each channel is computed from src and stored at the same flat index
 * in dst.
 *
 * The corner code reads src[1], src[dim] and src[dim+1] unconditionally,
 * so the function is written for dim >= 2.
 */
void smooth_avg(int dim, pixel *src, pixel *dst) 
{
	const int top_right = dim - 1;           /* flat index of the upper-right corner */
	const int bottom_left = dim * (dim - 1); /* flat index of the lower-left corner  */
	const int bottom_right = dim * dim - 1;  /* flat index of the lower-right corner */
	const pixel *c;                          /* current centre pixel in src          */
	pixel *out;                              /* matching output pixel in dst         */
	int k, row, col, base;

	/* upper left corner */
	c = src;
	out = dst;
	out->red = (c[0].red + c[1].red + c[dim].red + c[dim+1].red) >> 2;
	out->blue = (c[0].blue + c[1].blue + c[dim].blue + c[dim+1].blue) >> 2;
	out->green = (c[0].green + c[1].green + c[dim].green + c[dim+1].green) >> 2;

	/* upper right corner */
	c = src + top_right;
	out = dst + top_right;
	out->red = (c[0].red + c[-1].red + c[dim].red + c[dim-1].red) >> 2;
	out->blue = (c[0].blue + c[-1].blue + c[dim].blue + c[dim-1].blue) >> 2;
	out->green = (c[0].green + c[-1].green + c[dim].green + c[dim-1].green) >> 2;

	/* lower left corner */
	c = src + bottom_left;
	out = dst + bottom_left;
	out->red = (c[0].red + c[1].red + c[-dim].red + c[-dim+1].red) >> 2;
	out->blue = (c[0].blue + c[1].blue + c[-dim].blue + c[-dim+1].blue) >> 2;
	out->green = (c[0].green + c[1].green + c[-dim].green + c[-dim+1].green) >> 2;

	/* lower right corner */
	c = src + bottom_right;
	out = dst + bottom_right;
	out->red = (c[0].red + c[-1].red + c[-dim].red + c[-dim-1].red) >> 2;
	out->blue = (c[0].blue + c[-1].blue + c[-dim].blue + c[-dim-1].blue) >> 2;
	out->green = (c[0].green + c[-1].green + c[-dim].green + c[-dim-1].green) >> 2;

	/* upper side: this row and the one below it */
	for (k = 1; k < dim - 1; k++) {
		c = src + k;
		out = dst + k;
		out->red = (c[0].red + c[-1].red + c[1].red + c[dim].red + c[dim-1].red + c[dim+1].red) / 6;
		out->blue = (c[0].blue + c[-1].blue + c[1].blue + c[dim].blue + c[dim-1].blue + c[dim+1].blue) / 6;
		out->green = (c[0].green + c[-1].green + c[1].green + c[dim].green + c[dim-1].green + c[dim+1].green) / 6;
	}

	/* lower side: this row and the one above it */
	for (k = bottom_left + 1; k < bottom_right; k++) {
		c = src + k;
		out = dst + k;
		out->red = (c[0].red + c[-1].red + c[1].red + c[-dim].red + c[-dim-1].red + c[-dim+1].red) / 6;
		out->blue = (c[0].blue + c[-1].blue + c[1].blue + c[-dim].blue + c[-dim-1].blue + c[-dim+1].blue) / 6;
		out->green = (c[0].green + c[-1].green + c[1].green + c[-dim].green + c[-dim-1].green + c[-dim+1].green) / 6;
	}

	/* left side: this column and the one to its right */
	for (k = dim; k < bottom_left; k += dim) {
		c = src + k;
		out = dst + k;
		out->red = (c[0].red + c[-dim].red + c[dim].red + c[1].red + c[-dim+1].red + c[dim+1].red) / 6;
		out->blue = (c[0].blue + c[-dim].blue + c[dim].blue + c[1].blue + c[-dim+1].blue + c[dim+1].blue) / 6;
		out->green = (c[0].green + c[-dim].green + c[dim].green + c[1].green + c[-dim+1].green + c[dim+1].green) / 6;
	}

	/* right side: this column and the one to its left */
	for (k = 2 * dim - 1; k < bottom_right; k += dim) {
		c = src + k;
		out = dst + k;
		out->red = (c[0].red + c[-1].red + c[-dim].red + c[dim].red + c[-dim-1].red + c[dim-1].red) / 6;
		out->blue = (c[0].blue + c[-1].blue + c[-dim].blue + c[dim].blue + c[-dim-1].blue + c[dim-1].blue) / 6;
		out->green = (c[0].green + c[-1].green + c[-dim].green + c[dim].green + c[-dim-1].green + c[dim-1].green) / 6;
	}

	/* everything else: full 3x3 neighbourhood */
	for (row = 1; row < top_right; row++) {
		base = row * dim;
		for (col = 1; col < dim - 1; col++) {
			c = src + base + col;
			out = dst + base + col;
			out->red = (c[0].red + c[-1].red + c[1].red + c[-dim].red + c[-dim-1].red + c[-dim+1].red + c[dim].red + c[dim-1].red + c[dim+1].red) / 9;
			out->blue = (c[0].blue + c[-1].blue + c[1].blue + c[-dim].blue + c[-dim-1].blue + c[-dim+1].blue + c[dim].blue + c[dim-1].blue + c[dim+1].blue) / 9;
			out->green = (c[0].green + c[-1].green + c[1].green + c[-dim].green + c[-dim-1].green + c[-dim+1].green + c[dim].green + c[dim-1].green + c[dim+1].green) / 9;
		}
	}
}

性能有大幅提升啦!最后用点小聪明
改进3:中间部分像素2*2划分块,循环展开

/*
 * smooth_interior_px - store the 3x3 average for one interior pixel.
 *
 * k is the flat (row-major) index of a pixel that has all eight
 * neighbours, so indices k-dim-1 .. k+dim+1 are inside the image.
 * The sums are formed directly from the channel fields, in the order
 * centre row, row above, row below.
 */
static inline void smooth_interior_px(int dim, const pixel *src, pixel *dst, int k)
{
	const pixel *mid = src + k;
	const pixel *up = mid - dim;
	const pixel *down = mid + dim;

	dst[k].red = (mid[0].red + mid[-1].red + mid[1].red + up[0].red + up[-1].red + up[1].red + down[0].red + down[-1].red + down[1].red) / 9;
	dst[k].blue = (mid[0].blue + mid[-1].blue + mid[1].blue + up[0].blue + up[-1].blue + up[1].blue + down[0].blue + down[-1].blue + down[1].blue) / 9;
	dst[k].green = (mid[0].green + mid[-1].green + mid[1].green + up[0].green + up[-1].green + up[1].green + down[0].green + down[-1].green + down[1].green) / 9;
}

/*
 * smooth - smoothing kernel: corners average 4 source pixels, border
 * pixels average 6, interior pixels average 9.  The interior is walked
 * two rows and two columns per step.
 *
 * dim is the side length of the square image; src and dst are flat
 * row-major arrays of dim*dim pixels.
 *
 * Stepping the interior by 2 only covers every pixel when dim is even.
 * For odd dim the trailing interior column and row are handled by the
 * explicit remainder steps below, so every dst pixel is written for any
 * dim >= 2.  dim == 1 copies the single pixel (its clamped window holds
 * only itself); dim <= 0 does nothing.
 */
void smooth(int dim, pixel *src, pixel *dst) 
{
	int i, j, tmp=dim+1, tmp2=2*dim-1, tmp3=dim*(dim-2), urc=dim-1, llc=dim*(dim-1), lrc=dim*dim-1, i1=0;

	/* The corner code below reads src[1] and src[dim]; guard tiny images. */
	if (dim < 2) {
		if (dim == 1)
			dst[0] = src[0];
		return;
	}

	/* upper left corner */
	dst[0].red=(src[0].red+src[1].red+src[dim].red+src[tmp].red)>>2;
	dst[0].blue=(src[0].blue+src[1].blue+src[dim].blue+src[tmp].blue)>>2;
	dst[0].green=(src[0].green+src[1].green+src[dim].green+src[tmp].green)>>2;
	/* upper right corner */
	dst[urc].red=(src[urc].red+src[urc-1].red+src[tmp2].red+src[tmp2-1].red)>>2;
	dst[urc].blue=(src[urc].blue+src[urc-1].blue+src[tmp2].blue+src[tmp2-1].blue)>>2;
	dst[urc].green=(src[urc].green+src[urc-1].green+src[tmp2].green+src[tmp2-1].green)>>2;
	/* lower left corner */
	dst[llc].red=(src[llc].red+src[llc+1].red+src[tmp3].red+src[tmp3+1].red)>>2;
	dst[llc].blue=(src[llc].blue+src[llc+1].blue+src[tmp3].blue+src[tmp3+1].blue)>>2;
	dst[llc].green=(src[llc].green+src[llc+1].green+src[tmp3].green+src[tmp3+1].green)>>2;
	/* lower right corner (llc-1 is the pixel directly above lrc) */
	dst[lrc].red=(src[lrc].red+src[lrc-1].red+src[llc-1].red+src[llc-2].red)>>2;
	dst[lrc].blue=(src[lrc].blue+src[lrc-1].blue+src[llc-1].blue+src[llc-2].blue)>>2;
	dst[lrc].green=(src[lrc].green+src[lrc-1].green+src[llc-1].green+src[llc-2].green)>>2;

	/* upper side */
	for(j=1; j<dim-1; ++j) {
		dst[j].red=(src[j].red+src[j-1].red+src[j+1].red+src[j+dim].red+src[j+dim-1].red+src[j+dim+1].red)/6;
		dst[j].blue=(src[j].blue+src[j-1].blue+src[j+1].blue+src[j+dim].blue+src[j+dim-1].blue+src[j+dim+1].blue)/6;
		dst[j].green=(src[j].green+src[j-1].green+src[j+1].green+src[j+dim].green+src[j+dim-1].green+src[j+dim+1].green)/6;
	}

	/* lower side */
	for(j=llc+1; j<lrc; ++j) {
		dst[j].red=(src[j].red+src[j-1].red+src[j+1].red+src[j-dim].red+src[j-dim-1].red+src[j-dim+1].red)/6;
		dst[j].blue=(src[j].blue+src[j-1].blue+src[j+1].blue+src[j-dim].blue+src[j-dim-1].blue+src[j-dim+1].blue)/6;
		dst[j].green=(src[j].green+src[j-1].green+src[j+1].green+src[j-dim].green+src[j-dim-1].green+src[j-dim+1].green)/6;
	}

	/* left side */
	for(j=dim; j<llc; j+=dim) {
		dst[j].red=(src[j].red+src[j-dim].red+src[j+dim].red+src[j+1].red+src[j-dim+1].red+src[j+dim+1].red)/6;
		dst[j].blue=(src[j].blue+src[j-dim].blue+src[j+dim].blue+src[j+1].blue+src[j-dim+1].blue+src[j+dim+1].blue)/6;
		dst[j].green=(src[j].green+src[j-dim].green+src[j+dim].green+src[j+1].green+src[j-dim+1].green+src[j+dim+1].green)/6;
	}

	/* right side */
	for(j=2*dim-1; j<lrc; j+=dim) {
		dst[j].red=(src[j].red+src[j-1].red+src[j-dim].red+src[j+dim].red+src[j-dim-1].red+src[j+dim-1].red)/6;
		dst[j].blue=(src[j].blue+src[j-1].blue+src[j-dim].blue+src[j+dim].blue+src[j-dim-1].blue+src[j+dim-1].blue)/6;
		dst[j].green=(src[j].green+src[j-1].green+src[j-dim].green+src[j+dim].green+src[j-dim-1].green+src[j+dim-1].green)/6;
	}

	/* interior: two rows per outer step, two columns per inner step */
	for(i=1; i<urc-1; i+=2) {
		i1=i*dim;
		for(j=1; j<dim-2; j+=2) {
			smooth_interior_px(dim, src, dst, i1+j);
			smooth_interior_px(dim, src, dst, i1+j+1);
		}
		/* odd dim: one interior column (dim-2) is left over */
		if(j<dim-1)
			smooth_interior_px(dim, src, dst, i1+j);

		i1=(i+1)*dim;
		for(j=1; j<dim-2; j+=2) {
			smooth_interior_px(dim, src, dst, i1+j);
			smooth_interior_px(dim, src, dst, i1+j+1);
		}
		if(j<dim-1)
			smooth_interior_px(dim, src, dst, i1+j);
	}

	/* odd dim: one interior row (dim-2) is left over */
	if(i<urc) {
		i1=i*dim;
		for(j=1; j<dim-1; ++j)
			smooth_interior_px(dim, src, dst, i1+j);
	}
}

性能:
CSAPP: 极度舒适的Perflab_第15张图片

最终结果
在这里插入图片描述

这种没有明确评分的lab…就很虚( •̥́ ˍ •̀ू )
助教手下留情~
相比起前两个lab,感觉这个回到了上学期还在学基础理论的时候…
谁让机智的老师跳着上呢~
加油冲冲冲!

你可能感兴趣的:(CSAPP: 极度舒适的Perflab)