需要优化的函数有两个:rotate 和 smooth。CPE 值越低越好,mean 综合评分越高越好。
本次实验中,图像以一个二维矩阵 M 来表示,并以 Mi,j 来标记(i,j)位置的像素值。像素值是由红、绿、蓝(RGB)三个值构成的三元组,仅考虑方形图像。以 N 来表示图像的行数(同时也是列数)。行和列均以 C 风格进行编号——从 0 到 N - 1 。
在这种表示方法之下,rotate 操作可以借由以下两种矩阵操作的结合来简单实现:
第一步:转置:对于每个(i,j),交换 Mi,j 与 Mj,i
第二步:行交换:交换第 i 行与第 N - 1 - i 行
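smooth 操作可以通过求每个像素与周围像素(最多是以该像素为中心的3×3的九宫格)的均值来实现。详见图2,像素 M2[1][1] 与 M2[N - 1][N - 1] 由下式给出:
$$M2[1][1] = \frac{\sum_{i=0}^{2}\sum_{j=0}^{2} M1[i][j]}{9}, \qquad M2[N-1][N-1] = \frac{\sum_{i=N-2}^{N-1}\sum_{j=N-2}^{N-1} M1[i][j]}{4}$$
其中 M1 为源图像,M2 为平滑后的图像。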
参考实现
RGB 值拥有 16位的表示(16位色彩)。图像以一个一维的像素数组表示,(i,j)位置的像素表示为 I[RIDX(i,j,n)]。此处 n 表示图像矩阵的大小,RIDX 是一个宏,定义如下(像素的定义可查看defs.h的相关代码):
#define RIDX(i,j,n) ((i)*(n)+(j))
Rotate
kernels.c的函数naive_rotate用于计算源图像 src 旋转90°后的结果,并将结果保存在目标图像 dst 中。dim 表示图像的大小。
该函数逐行扫描源图像,将元素拷贝至目标图像的列(代码见下文的初始版本)。
本函数是正确的,本次实验的任务是优化这段代码,通过 代码移动(code motion),循环展开(loop unrolling),分块技术(blocking)等技巧使其尽可能加速运行。
smooth
kernels.c的函数naive_smooth传入源图像 src 作为参数,并以目标图像 dst 的形式返回平滑化的结果。此处是部分实现(其中avg 函数用于返回(i,j)位置周围像素的均值):
本函数是正确的,本次实验的任务是优化此函数(以及 avg 函数),使其尽可能加速运行。(avg 是一个本地函数,你可以完全弃之不用,以其他方法实现 smooth)
附上初始版本代码
/********************************************************
* Kernels to be optimized for the CS:APP Performance Lab
********************************************************/
#include <stdio.h>
#include <stdlib.h>
#include "defs.h"
/*
 * Please fill in the following team struct.
 * Every field below is a placeholder; replace it with your own details.
 * The email placeholder had been mangled into "[email protected]" by web
 * email obfuscation and is restored here.
 */
team_t team = {
    "bovik",             /* Team name */
    "Harry Q. Bovik",    /* First member full name */
    "bovik@nowhere.edu", /* First member email address */
    "",                  /* Second member full name (leave blank if none) */
    ""                   /* Second member email addr (leave blank if none) */
};
/***************
* ROTATE KERNEL
***************/
/******************************************************
* Your different versions of the rotate kernel go here
******************************************************/
/*
 * naive_rotate - Baseline rotate kernel used as the reference version.
 *
 * Walks src in row-major order and stores source pixel (row, col) at
 * position (dim-1-col, row) of dst, so each source row becomes a
 * destination column.
 */
char naive_rotate_descr[] = "naive_rotate: Naive baseline implementation";
void naive_rotate(int dim, pixel *src, pixel *dst)
{
    int row, col;

    for (row = 0; row < dim; row++) {
        for (col = 0; col < dim; col++) {
            pixel *out = &dst[RIDX(dim - 1 - col, row, dim)];

            *out = src[RIDX(row, col, dim)];
        }
    }
}
/*
 * rotate - Your current working version of rotate
 * IMPORTANT: This is the version you will be graded on
 *
 * At present this simply forwards its arguments to naive_rotate, so it
 * produces the same result as the baseline.
 */
char rotate_descr[] = "rotate: Current working version";
void rotate(int dim, pixel *src, pixel *dst)
{
naive_rotate(dim, src, dst);
}
/*********************************************************************
 * register_rotate_functions - Hand every rotate variant to the driver.
 *
 * Each variant is registered through add_rotate_function() together
 * with its description string. When the driver program runs, it tests
 * and reports the performance of each registered function.
 *********************************************************************/
void register_rotate_functions()
{
    /* A function name already converts to a function pointer; no '&' needed. */
    add_rotate_function(naive_rotate, naive_rotate_descr);
    add_rotate_function(rotate, rotate_descr);
    /* ... Register additional test functions here */
}
/***************
* SMOOTH KERNEL
**************/
/***************************************************************
* Various typedefs and helper functions for the smooth function
* You may modify these any way you like.
**************************************************************/
/* Running per-channel totals used while averaging a group of pixels */
typedef struct {
    int red, green, blue; /* accumulated colour components */
    int num;              /* how many pixels have been accumulated */
} pixel_sum;
/* min - Returns the smaller of a and b (b when they are equal) */
static int min(int a, int b)
{
    if (a < b)
        return a;
    return b;
}

/* max - Returns the larger of a and b (b when they are equal) */
static int max(int a, int b)
{
    if (a > b)
        return a;
    return b;
}
/*
 * initialize_pixel_sum - Resets the three colour totals and the pixel
 * count of *sum to zero.
 */
static void initialize_pixel_sum(pixel_sum *sum)
{
    sum->red = 0;
    sum->green = 0;
    sum->blue = 0;
    sum->num = 0;
}
/*
 * accumulate_sum - Adds the colour components of pixel p to the matching
 * totals in *sum and counts one more accumulated pixel.
 */
static void accumulate_sum(pixel_sum *sum, pixel p)
{
    sum->red = sum->red + (int) p.red;
    sum->green = sum->green + (int) p.green;
    sum->blue = sum->blue + (int) p.blue;
    sum->num += 1;
}
/*
 * assign_sum_to_pixel - Stores in *current_pixel the per-channel average
 * described by sum: each colour total divided by sum.num (integer
 * division), converted to unsigned short.
 */
static void assign_sum_to_pixel(pixel *current_pixel, pixel_sum sum)
{
    int count = sum.num; /* divisor shared by all three channels */

    current_pixel->red = (unsigned short) (sum.red / count);
    current_pixel->green = (unsigned short) (sum.green / count);
    current_pixel->blue = (unsigned short) (sum.blue / count);
}
/*
 * avg - Returns the averaged pixel value at (i,j).
 *
 * Averages every pixel of src in the window of rows i-1..i+1 and columns
 * j-1..j+1, with the window clipped to the image bounds [0, dim-1].
 *
 *   dim - image side length
 *   i,j - row and column of the pixel being smoothed
 *   src - source image, indexed through RIDX
 */
static pixel avg(int dim, int i, int j, pixel *src)
{
    int ii, jj;
    pixel_sum sum;
    pixel current_pixel;

    initialize_pixel_sum(&sum);
    for (ii = max(i-1, 0); ii <= min(i+1, dim-1); ii++)
        for (jj = max(j-1, 0); jj <= min(j+1, dim-1); jj++)
            accumulate_sum(&sum, src[RIDX(ii, jj, dim)]);

    /* The argument had been garbled to "¤t_pixel" (a decoded "&curren"). */
    assign_sum_to_pixel(&current_pixel, sum);
    return current_pixel;
}
/******************************************************
* Your different versions of the smooth kernel go here
******************************************************/
/*
 * naive_smooth - Baseline smooth kernel used as the reference version.
 *
 * Visits every position of the dim x dim image in row-major order and
 * stores in dst the value avg() returns for that position of src.
 */
char naive_smooth_descr[] = "naive_smooth: Naive baseline implementation";
void naive_smooth(int dim, pixel *src, pixel *dst)
{
    int row, col;

    for (row = 0; row < dim; row++) {
        for (col = 0; col < dim; col++) {
            dst[RIDX(row, col, dim)] = avg(dim, row, col, src);
        }
    }
}
/*
 * smooth - Your current working version of smooth.
 * IMPORTANT: This is the version you will be graded on
 *
 * At present this simply forwards its arguments to naive_smooth, so it
 * produces the same result as the baseline.
 */
char smooth_descr[] = "smooth: Current working version";
void smooth(int dim, pixel *src, pixel *dst)
{
naive_smooth(dim, src, dst);
}
/*********************************************************************
 * register_smooth_functions - Hand every smooth variant to the driver.
 *
 * Each variant is registered through add_smooth_function() together
 * with its description string. When the driver program runs, it tests
 * and reports the performance of each registered function.
 *********************************************************************/
void register_smooth_functions()
{
    /* A function name already converts to a function pointer; no '&' needed. */
    add_smooth_function(smooth, smooth_descr);
    add_smooth_function(naive_smooth, naive_smooth_descr);
    /* ... Register additional test functions here */
}
基础操作流程和分析
第一步:必须把 team 结构体中的值改为自己的信息(队名、姓名和邮箱)。
第二步:make 之后运行 ./driver 获取评分信息时,要加上 -g 选项(脱离远程服务器,默认测试 smooth 函数和 rotate 函数)。
/*
 * Team identification; every field is a placeholder to be replaced with
 * your own details. The email placeholder had been mangled into
 * "[email protected]" by web email obfuscation and is restored here.
 */
team_t team = {
    "bovik",             /* Team name */
    "Harry Q. Bovik",    /* First member full name */
    "bovik@nowhere.edu", /* First member email address */
    "",                  /* Second member full name (leave blank if none) */
    ""                   /* Second member email addr (leave blank if none) */
};
第三步:检查 Makefile 文件是否正确。我在一次操作过程中发现 Makefile 少包含了一个库,导致卡死。
# Student's Makefile for the CS:APP Performance Lab
TEAM = bovik
VERSION = 1
HANDINDIR =

CC = gcc
CFLAGS = -Wall -O2 -m32
LIBS = -lm

OBJS = driver.o kernels.o fcyc.o clock.o

# These targets are commands, not files to be built.
.PHONY: all handin clean

all: driver

# Every recipe line below must start with a TAB character, otherwise make
# stops with "missing separator".
driver: $(OBJS) fcyc.h clock.h defs.h config.h
	$(CC) $(CFLAGS) $(OBJS) $(LIBS) -o driver

handin:
	cp kernels.c $(HANDINDIR)/$(TEAM)-$(VERSION)-kernels.c

clean:
	-rm -f $(OBJS) driver core *~ *.o
解题过程分析
原始版本
/*
 * naive_rotate - original version.
 * Reads src row by row; source pixel (row, col) is written to
 * dst position (dim-1-col, row).
 */
void naive_rotate(int dim, pixel *src, pixel *dst)
{
    int row, col;

    for (row = 0; row < dim; row++) {
        for (col = 0; col < dim; col++) {
            dst[RIDX(dim - 1 - col, row, dim)] = src[RIDX(row, col, dim)];
        }
    }
}
减少循环内部的操作次数
/*
 * naive_rotate - variant that walks dst in row-major order.
 * dst(row, col) is filled from src(col, dim-1-row), which is the same
 * mapping as the baseline expressed from the destination side.
 */
void naive_rotate(int dim, pixel *src, pixel *dst)
{
    int row, col;

    for (row = 0; row < dim; row++) {
        for (col = 0; col < dim; col++) {
            dst[RIDX(row, col, dim)] = src[RIDX(col, dim - 1 - row, dim)];
        }
    }
}
优化过程中会产生多余的地址计算;如果寻址更加连续,速率应该会更快
/*
 * Rotate with the destination index computed incrementally.
 * Source pixel (i, j) belongs at dst[(dim-1-j)*dim + i]. The earlier form
 * wrote dst[(dim-1-i)*dim + i] for every j, so each source row overwrote
 * a single destination cell.
 */
int i,j,t,n;
n=dim-1;
for(i=0;i<dim;i++){
    t=n*dim+i;      /* dst index for j == 0: last row, column i */
    for(j=0;j<dim;j++){
        dst[t]=src[RIDX(i,j,dim)];
        t-=dim;     /* next j lands one destination row higher */
    }
}
矩阵如果分块为8x8的块,可以实现更加迅速的翻转,对各个分块矩阵依次操作即可得到正确的结果
/*
 * Rotate processed in 8x8 tiles: (bi, bj) is the top-left corner of the
 * current tile and (r, c) walks the pixels inside it.
 * NOTE(review): the inner bounds bi+8 / bj+8 are not clipped to dim, so
 * this assumes dim is a multiple of 8 - confirm against the lab's sizes.
 */
int bi, bj, r, c;
for (bi = 0; bi < dim; bi += 8) {
    for (bj = 0; bj < dim; bj += 8) {
        for (r = bi; r < bi + 8; r++) {
            for (c = bj; c < bj + 8; c++) {
                dst[RIDX(dim - 1 - c, r, dim)] = src[RIDX(r, c, dim)];
            }
        }
    }
}
原始版本
/* Original smooth body: every output pixel is computed by a call to avg(). */
int row, col;
for (row = 0; row < dim; row++) {
    for (col = 0; col < dim; col++) {
        dst[RIDX(row, col, dim)] = avg(dim, row, col, src);
    }
}
跑分
/*
 * min / max as macros. Each argument is parenthesized so that operands
 * containing lower-precedence operators (e.g. min(x & 1, 2)) group as
 * intended. Arguments are still evaluated more than once, so do not pass
 * expressions with side effects.
 */
#define min(a,b) ((a)<(b)?(a):(b))
#define max(a,b) ((a)>(b)?(a):(b))
对于四个顶点,取邻近四个像素的平均值;边界点取六个,其他像素点取九个。
这样除数都是常量,除法过程全部得到简化。
/* Mean of the 2x2 window whose rows start at a and b. */
static void smooth_mean_2x2(pixel *out, const pixel *a, const pixel *b)
{
    out->red   = (a[0].red   + a[1].red   + b[0].red   + b[1].red)   / 4;
    out->green = (a[0].green + a[1].green + b[0].green + b[1].green) / 4;
    out->blue  = (a[0].blue  + a[1].blue  + b[0].blue  + b[1].blue)  / 4;
}

/* Mean of the 2-row by 3-column window whose rows start at a and b. */
static void smooth_mean_2x3(pixel *out, const pixel *a, const pixel *b)
{
    out->red   = (a[0].red   + a[1].red   + a[2].red
                + b[0].red   + b[1].red   + b[2].red)   / 6;
    out->green = (a[0].green + a[1].green + a[2].green
                + b[0].green + b[1].green + b[2].green) / 6;
    out->blue  = (a[0].blue  + a[1].blue  + a[2].blue
                + b[0].blue  + b[1].blue  + b[2].blue)  / 6;
}

/* Mean of the 3-row by 2-column window whose rows start at a, b and c. */
static void smooth_mean_3x2(pixel *out, const pixel *a, const pixel *b,
                            const pixel *c)
{
    out->red   = (a[0].red   + a[1].red   + b[0].red   + b[1].red
                + c[0].red   + c[1].red)   / 6;
    out->green = (a[0].green + a[1].green + b[0].green + b[1].green
                + c[0].green + c[1].green) / 6;
    out->blue  = (a[0].blue  + a[1].blue  + b[0].blue  + b[1].blue
                + c[0].blue  + c[1].blue)  / 6;
}

/* Mean of the 3x3 window whose rows start at a, b and c. */
static void smooth_mean_3x3(pixel *out, const pixel *a, const pixel *b,
                            const pixel *c)
{
    out->red   = (a[0].red   + a[1].red   + a[2].red
                + b[0].red   + b[1].red   + b[2].red
                + c[0].red   + c[1].red   + c[2].red)   / 9;
    out->green = (a[0].green + a[1].green + a[2].green
                + b[0].green + b[1].green + b[2].green
                + c[0].green + c[1].green + c[2].green) / 9;
    out->blue  = (a[0].blue  + a[1].blue  + a[2].blue
                + b[0].blue  + b[1].blue  + b[2].blue
                + c[0].blue  + c[1].blue  + c[2].blue)  / 9;
}

/*
 * naive_smooth - smooth with the border cases peeled out of the loops.
 *
 * Corner pixels average a 2x2 window, edge pixels a 2x3 or 3x2 window and
 * interior pixels the full 3x3 window, so every divisor is a constant
 * (4, 6 or 9).
 *
 *   dim - image side length
 *   src - source image, row-major, dim*dim pixels
 *   dst - destination image, same layout as src
 *
 * Images smaller than 2x2 have no 2x2 window: a 1x1 image is copied
 * unchanged (the average of a single pixel is that pixel) and dim <= 0
 * writes nothing.
 */
void naive_smooth(int dim, pixel *src, pixel *dst)
{
    int i, j;
    int last = dim - 1;     /* index of the last row / column */
    const pixel *up, *mid, *down;
    pixel *out;

    if (dim < 2) {
        if (dim == 1)
            dst[0] = src[0];
        return;
    }

    /* Top row: left corner, edge pixels, right corner. */
    up = src;
    mid = src + dim;
    smooth_mean_2x2(&dst[0], up, mid);
    for (j = 1; j < last; j++)
        smooth_mean_2x3(&dst[j], up + j - 1, mid + j - 1);
    smooth_mean_2x2(&dst[last], up + last - 1, mid + last - 1);

    /* Middle rows: left edge, interior pixels, right edge. */
    for (i = 1; i < last; i++) {
        /* Row base pointers are computed once per row, not once per term. */
        up = src + (i - 1) * dim;
        mid = up + dim;
        down = mid + dim;
        out = dst + i * dim;

        smooth_mean_3x2(&out[0], up, mid, down);
        for (j = 1; j < last; j++)
            smooth_mean_3x3(&out[j], up + j - 1, mid + j - 1, down + j - 1);
        smooth_mean_3x2(&out[last], up + last - 1, mid + last - 1,
                        down + last - 1);
    }

    /* Bottom row: left corner, edge pixels, right corner. */
    up = src + (last - 1) * dim;
    mid = up + dim;
    out = dst + last * dim;
    smooth_mean_2x2(&out[0], up, mid);
    for (j = 1; j < last; j++)
        smooth_mean_2x3(&out[j], up + j - 1, mid + j - 1);
    smooth_mean_2x2(&out[last], up + last - 1, mid + last - 1);
}
通过阅读程序的汇编代码进行性能优化,补上了程序的短板。
虽然在过程中遇到了不少问题,但是最终的优化效果还是不错的。
在 T2 的优化 2 中,为不同区块编写表时遇到了一些问题,但是最终得以解决。
性能得到了很大的提升。
基本的优化方式,包括减少循环次数、减少变量、减少重复的内存访问等办法,都进行了尝试和实践。