csapp——perflab解题报告

csapp——perflab解题报告

需求分析:

需要优化的函数有两个 rotate 和 smooth,CPE 值越低越好,mean综合评分越高越好

基本的程序优化方法:

  1. 避免循环内部的乘(除)法以及冗余计算
  2. 避免循环内部有过多依赖和跳转,使cpu能流水起来
  3. 空间换时间:最经典的就是查表法。某些计算相当耗时,但其自变量的取值范围比较有限,这种情况下可以预先计算好每个自变量对应的函数值,存在一个表格中,每次根据自变量的值去索引对应的函数值即可(强烈推荐)
  4. 预分配内存

实验题目中文翻译版本

本次实验中,图像以一个二维矩阵 $M$ 来表示,并以 $M_{i,j}$ 来标记 $(i,j)$ 位置的像素值。像素值是由红、绿、蓝(RGB)三个值构成的三元组,仅考虑方形图像。以 $N$ 来表示图像的行数(同时也是列数)。行和列均以 C 风格进行编号——从 $0$ 到 $N-1$。
在这种表示方法之下,rotate 操作可以借由以下两种矩阵操作的结合来简单实现:
第一步:转置:对于每个 $(i,j)$,交换 $M_{i,j}$ 与 $M_{j,i}$
第二步:行交换:交换第 $i$ 行与第 $N-1-i$ 行

csapp——perflab解题报告_第1张图片
smooth 操作可以通过求每个像素与周围像素(最多是以该像素为中心的3×3的九宫格)的均值来实现。详见图2,像素 M2[1][1] 与 M2[N - 1][N - 1] 由下式给出
csapp——perflab解题报告_第2张图片
csapp——perflab解题报告_第3张图片
参考实现
csapp——perflab解题报告_第4张图片
RGB 值拥有 16位的表示(16位色彩)。图像以一个一维的像素数组表示,(i,j)位置的像素表示为 I[RIDX(i,j,n)]。此处 n 表示图像矩阵的大小,RIDX 是一个宏,定义如下(像素的定义可查看defs.h的相关代码):
#define RIDX(i,j,n) ((i)*(n)+(j))
Rotate
kernels.c的函数naive_rotate用于计算源图像 src 旋转90°后的结果,并将结果保存在目标图像 dst 中。dim 表示图像的大小。
csapp——perflab解题报告_第5张图片
以上代码逐行扫描源图像,将元素拷贝至目标图像的列。
本函数是正确的,本次实验的任务是优化这段代码,通过 代码移动(code motion),循环展开(loop unrolling),分块技术(blocking)等技巧使其尽可能加速运行。

smooth
kernels.c的函数naive_smooth传入源图像 src 作为参数,并以目标图像 dst 的形式返回平滑化的结果。此处是部分实现(其中avg 函数用于返回(i,j)位置周围像素的均值):
csapp——perflab解题报告_第6张图片
本函数是正确的,本次实验的任务是优化此函数(以及avg)函数,使其尽可能加速运行。(avg 是一个本地函数,你可以完全弃之不用,以其他方法实现smooth)

附上初始版本代码

/********************************************************
 * Kernels to be optimized for the CS:APP Performance Lab
 ********************************************************/

#include <stdio.h>
#include <stdlib.h>
#include "defs.h"

/* 
 * Please fill in the following team struct 
 *
 * The values below are the handout placeholders and are meant to be
 * replaced with your own team details.
 * NOTE(review): the e-mail literal looks like a redaction placeholder
 * rather than a real address -- confirm before reusing it.
 */
team_t team = {
    "bovik",              /* Team name */

    "Harry Q. Bovik",     /* First member full name */
    "[email protected]",  /* First member email address */

    "",                   /* Second member full name (leave blank if none) */
    ""                    /* Second member email addr (leave blank if none) */
};

/***************
 * ROTATE KERNEL
 ***************/

/******************************************************
 * Your different versions of the rotate kernel go here
 ******************************************************/

/*
 * naive_rotate - Baseline rotate kernel.
 *
 * Walks src row by row and copies the pixel at (row, col) into dst at
 * (dim-1-col, row), so each source row ends up as a destination column.
 *
 * dim: number of rows (and columns) of the square image
 * src: source image, dim*dim pixels addressed through RIDX
 * dst: destination image, same layout as src
 */
char naive_rotate_descr[] = "naive_rotate: Naive baseline implementation";
void naive_rotate(int dim, pixel *src, pixel *dst)
{
    int row, col;

    for (row = 0; row < dim; row++) {
        for (col = 0; col < dim; col++) {
            pixel *out = &dst[RIDX(dim - 1 - col, row, dim)];

            *out = src[RIDX(row, col, dim)];
        }
    }
}

/* 
 * rotate - Your current working version of rotate
 * IMPORTANT: This is the version you will be graded on
 *
 * At present this simply forwards its three arguments to naive_rotate(),
 * so it does exactly the same work as the baseline.
 *
 * dim: image dimension, passed through unchanged
 * src: source image, passed through unchanged
 * dst: destination image, passed through unchanged
 */
char rotate_descr[] = "rotate: Current working version";
void rotate(int dim, pixel *src, pixel *dst) 
{
    naive_rotate(dim, src, dst);
}

/*********************************************************************
 * register_rotate_functions - Hand every rotate variant to the driver.
 *
 *     Each variant is registered with add_rotate_function() together
 *     with its description string, in the order listed below. The
 *     driver then tests and reports the performance of each registered
 *     function when it is run.
 *********************************************************************/

void register_rotate_functions() 
{
    /* A function name already decays to a function pointer. */
    add_rotate_function(naive_rotate, naive_rotate_descr);
    add_rotate_function(rotate, rotate_descr);
    /* ... Register additional test functions here */
}


/***************
 * SMOOTH KERNEL
 **************/

/***************************************************************
 * Various typedefs and helper functions for the smooth function
 * You may modify these any way you like.
 **************************************************************/

/*
 * Accumulator used while averaging pixels: one running total per colour
 * channel plus the number of pixels that have been added.
 */
typedef struct pixel_sum {
    int red;    /* running total of red components */
    int green;  /* running total of green components */
    int blue;   /* running total of blue components */
    int num;    /* how many pixels have been accumulated */
} pixel_sum;

/* min - Returns the smaller of a and b (b when they are equal). */
static int min(int a, int b)
{
    if (a < b) {
        return a;
    }
    return b;
}

/* max - Returns the larger of a and b (b when they are equal). */
static int max(int a, int b)
{
    if (a > b) {
        return a;
    }
    return b;
}

/*
 * initialize_pixel_sum - Resets an accumulator: the three channel totals
 * and the pixel count are all set to 0.
 *
 * sum: accumulator to clear (written through the pointer)
 */
static void initialize_pixel_sum(pixel_sum *sum)
{
    sum->num = 0;
    sum->blue = 0;
    sum->green = 0;
    sum->red = 0;
}

/*
 * accumulate_sum - Adds one pixel to an accumulator.
 *
 * Each colour component of p is widened to int and added to the matching
 * total in sum, and the pixel count is bumped by one.
 *
 * sum: accumulator to update (written through the pointer)
 * p:   pixel to add, passed by value
 */
static void accumulate_sum(pixel_sum *sum, pixel p)
{
    sum->num += 1;
    sum->red = sum->red + (int) p.red;
    sum->green = sum->green + (int) p.green;
    sum->blue = sum->blue + (int) p.blue;
}

/*
 * assign_sum_to_pixel - Writes the per-channel average into a pixel.
 *
 * Every channel total in sum is divided (integer division) by sum.num
 * and stored, cast to unsigned short, in the matching field of
 * *current_pixel. The caller must make sure sum.num is not zero.
 *
 * current_pixel: pixel that receives the averages
 * sum:           accumulator holding totals and count, passed by value
 */
static void assign_sum_to_pixel(pixel *current_pixel, pixel_sum sum)
{
    const int count = sum.num;

    current_pixel->red = (unsigned short) (sum.red / count);
    current_pixel->green = (unsigned short) (sum.green / count);
    current_pixel->blue = (unsigned short) (sum.blue / count);
}

/*
 * avg - Average of the pixels around (i,j).
 *
 * The window spans rows i-1..i+1 and columns j-1..j+1, clipped to the
 * range 0..dim-1, so pixels on the border average over fewer neighbours.
 *
 * dim: image dimension
 * i:   row of the centre pixel
 * j:   column of the centre pixel
 * src: source image addressed through RIDX
 *
 * Returns the averaged pixel by value.
 */
static pixel avg(int dim, int i, int j, pixel *src)
{
    /* The clip bounds do not change while iterating, so compute them once. */
    const int row_first = max(i - 1, 0);
    const int row_last = min(i + 1, dim - 1);
    const int col_first = max(j - 1, 0);
    const int col_last = min(j + 1, dim - 1);
    pixel_sum acc;
    pixel result;
    int r, c;

    initialize_pixel_sum(&acc);
    for (r = row_first; r <= row_last; r++) {
        for (c = col_first; c <= col_last; c++) {
            accumulate_sum(&acc, src[RIDX(r, c, dim)]);
        }
    }

    assign_sum_to_pixel(&result, acc);
    return result;
}

/******************************************************
 * Your different versions of the smooth kernel go here
 ******************************************************/

/*
 * naive_smooth - Baseline smooth kernel.
 *
 * For every position (row, col) of the image, stores avg() of that
 * position in src into the same position of dst.
 *
 * dim: image dimension
 * src: source image
 * dst: destination image
 */
char naive_smooth_descr[] = "naive_smooth: Naive baseline implementation";
void naive_smooth(int dim, pixel *src, pixel *dst)
{
    int row, col;

    for (row = 0; row < dim; row++) {
        for (col = 0; col < dim; col++) {
            const pixel averaged = avg(dim, row, col, src);

            dst[RIDX(row, col, dim)] = averaged;
        }
    }
}

/*
 * smooth - Your current working version of smooth. 
 * IMPORTANT: This is the version you will be graded on
 *
 * At present this simply forwards its three arguments to naive_smooth(),
 * so it does exactly the same work as the baseline.
 *
 * dim: image dimension, passed through unchanged
 * src: source image, passed through unchanged
 * dst: destination image, passed through unchanged
 */
char smooth_descr[] = "smooth: Current working version";
void smooth(int dim, pixel *src, pixel *dst) 
{
    naive_smooth(dim, src, dst);
}


/*********************************************************************
 * register_smooth_functions - Hand every smooth variant to the driver.
 *
 *     Each variant is registered with add_smooth_function() together
 *     with its description string, in the order listed below. The
 *     driver then tests and reports the performance of each registered
 *     function when it is run.
 *********************************************************************/

void register_smooth_functions() {
    /* A function name already decays to a function pointer. */
    add_smooth_function(smooth, smooth_descr);
    add_smooth_function(naive_smooth, naive_smooth_descr);
    /* ... Register additional test functions here */
}


基础操作流程和分析
第一个操作:必须把 team 结构体中的值改为自己的信息(队名、姓名和邮箱)。
第二个操作:make 之后运行 ./driver 获取评分信息时,要加上 -g 选项;该选项不依赖远程服务器,默认只测试 smooth 函数和 rotate 函数。

/*
 * Team information. These are the handout placeholders and are meant to
 * be replaced with your own team details.
 * NOTE(review): the e-mail literal looks like a redaction placeholder
 * rather than a real address -- confirm before reusing it.
 */
team_t team = {
    "bovik",              /* Team name */

    "Harry Q. Bovik",     /* First member full name */
    "[email protected]",  /* First member email address */

    "",                   /* Second member full name (leave blank if none) */
    ""                    /* Second member email addr (leave blank if none) */
};

第三个操作:检查一下 Makefile 是否正确。我在某次操作过程中发现 Makefile 少链接了一个库,导致卡死。

# Student's Makefile for the CS:APP Performance Lab
#
# Targets:
#   all / driver - compile the objects and link the driver executable
#   handin       - copy kernels.c into HANDINDIR under a team/version name
#   clean        - delete objects, the driver binary and editor backups
TEAM = bovik
VERSION = 1
HANDINDIR = 

CC = gcc
CFLAGS = -Wall -O2 -m32
LIBS = -lm

OBJS = driver.o kernels.o fcyc.o clock.o

# These targets name actions, not files; declaring them phony keeps them
# working even if a file called "all", "handin" or "clean" exists.
.PHONY: all handin clean

all: driver

driver: $(OBJS) fcyc.h clock.h defs.h config.h
	$(CC) $(CFLAGS) $(OBJS) $(LIBS) -o driver

handin:
	cp kernels.c $(HANDINDIR)/$(TEAM)-$(VERSION)-kernels.c

clean: 
	-rm -f $(OBJS) driver core *~ *.o

解题过程分析

1.rotate函数优化

原始版本

/*
 * naive_rotate - Baseline rotate kernel.
 *
 * Walks src row by row and copies the pixel at (row, col) into dst at
 * (dim-1-col, row), so each source row ends up as a destination column.
 */
void naive_rotate(int dim, pixel *src, pixel *dst)
{
    int row, col;

    for (row = 0; row < dim; row++) {
        for (col = 0; col < dim; col++) {
            pixel *out = &dst[RIDX(dim - 1 - col, row, dim)];

            *out = src[RIDX(row, col, dim)];
        }
    }
}



跑分基础得分
csapp——perflab解题报告_第7张图片
1.优化1

减少循环内部的操作次数

/*
 * naive_rotate (optimization 1) - Rotate with dst written in row order.
 *
 * dst is filled one row at a time, left to right; the pixel stored at
 * (out_row, out_col) is read from src at (out_col, dim-out_row-1), i.e.
 * each destination row is taken from one source column.
 */
void naive_rotate(int dim, pixel *src, pixel *dst)
{
    int out_row, out_col;

    for (out_row = 0; out_row < dim; out_row++) {
        /* Source column feeding this destination row. */
        const int src_col = dim - out_row - 1;

        for (out_col = 0; out_col < dim; out_col++) {
            dst[RIDX(out_row, out_col, dim)] = src[RIDX(out_col, src_col, dim)];
        }
    }
}

性能
csapp——perflab解题报告_第8张图片
第二次优化

优化过程中产生了多余的地址计算;如果寻址更加连续,速度应该会更快

    /*
     * Rotate with the destination index maintained incrementally instead
     * of recomputing RIDX(dim-1-j, i, dim) for every pixel.
     *
     * For source row i, pixel (i, j) belongs at dst (dim-1-j, i). The
     * index therefore starts at row dim-1, column i (for j == 0) and
     * moves up one row, i.e. back by dim elements, each time j advances.
     * The destination index has to change with j; an index that depends
     * only on i would write a single dst element per source row.
     */
    int i,j,t,n;
    n=dim-1;
    for(i=0;i<dim;i++){
        t=n*dim+i;                 /* dst index of (dim-1, i) */
        for(j=0;j<dim;j++){
            dst[t]=src[RIDX(i,j,dim)];
            t-=dim;                /* one row up in dst */
        }
    }

跑分
csapp——perflab解题报告_第9张图片
第三次优化:矩阵分块

如果把矩阵分块为 8x8 的小块,可以更快地完成旋转;对各个分块依次操作即可得到正确的结果

    /*
     * Blocked rotate: the image is processed in 8x8 tiles, and inside
     * each tile the pixel at (r, c) is copied to dst (dim-1-c, r).
     * The two inner loops always cover a full 8x8 tile, so a dim that is
     * not a multiple of 8 would index outside the dim x dim range.
     */
    int tile_row, tile_col, r, c;
    for (tile_row = 0; tile_row < dim; tile_row += 8) {
        for (tile_col = 0; tile_col < dim; tile_col += 8) {
            for (r = tile_row; r < tile_row + 8; r++) {
                for (c = tile_col; c < tile_col + 8; c++) {
                    dst[RIDX(dim - 1 - c, r, dim)] = src[RIDX(r, c, dim)];
                }
            }
        }
    }

跑分
csapp——perflab解题报告_第10张图片

2.Smooth函数优化

原始版本

    /* Baseline smooth: every dst pixel is avg() of the same position in src. */
    int row, col;

    for (row = 0; row < dim; row++) {
        for (col = 0; col < dim; col++) {
            dst[RIDX(row, col, dim)] = avg(dim, row, col, src);
        }
    }


跑分

csapp——perflab解题报告_第11张图片
优化1:减少对函数的调用更改为define

/*
 * Macro versions of min/max, used to avoid a function call.
 *
 * Every argument and the whole expansion are parenthesized so that
 * operands containing lower-precedence operators (e.g. min(a | b, c))
 * are compared as written. Each argument may be evaluated twice, so do
 * not pass expressions with side effects such as i++.
 */
#define min(a,b) ((a) < (b) ? (a) : (b))
#define max(a,b) ((a) > (b) ? (a) : (b))

跑分
csapp——perflab解题报告_第12张图片
优化2:对于不同功能的部分打不同的表进行查表法处理

对于四个角上的像素,取包含自身在内的相邻四个像素的平均值;边上的像素取六个;其余内部像素取九个。

这样每个像素的除数都是常数(4、6 或 9),除法过程得以简化。

/*
 * naive_smooth (optimization 2) - Smooth with every case written out.
 *
 * Instead of calling avg() per pixel, the image is split by position and
 * each case sums a fixed set of neighbours and divides by a constant:
 *   - the four corners average 4 pixels,
 *   - the other pixels on the border average 6 pixels,
 *   - interior pixels average the full 3x3 window of 9 pixels.
 * Pixels are addressed as row*dim + column directly.
 *
 * The code reads src[1] and src[dim+1] unconditionally, so it needs
 * dim >= 2.
 *
 * dim: image dimension
 * src: source image
 * dst: destination image
 */
void naive_smooth(int dim, pixel *src, pixel *dst) 
{
    // Reference implementation this function replaces:
    // int i, j;

    // for (i = 0; i < dim; i++)
	// for (j = 0; j < dim; j++)
	//     dst[RIDX(i, j, dim)] = avg(dim, i, j, src);

    
	int i=1,j=0;
	// Top-left corner (0,0): 2x2 window.
	dst[0].red=(src[0].red+src[1].red+src[dim].red+src[dim+1].red)/4;
	dst[0].green=(src[0].green+src[1].green+src[dim].green+src[dim+1].green)/4;
	dst[0].blue=(src[0].blue+src[1].blue+src[dim].blue+src[dim+1].blue)/4;
	// Top edge, columns 1..dim-2: rows 0 and 1, three columns.
	for(j=1; j<dim-1; j++) {
		dst[j].red=(src[j-1].red+src[j].red+src[j+1].red+src[dim+j-1].red+src[dim+j].red+src[dim+j+1].red)/6;
		dst[j].green=(src[j-1].green+src[j].green+src[j+1].green+src[dim+j-1].green+src[dim+j].green+src[dim+j+1].green)/6;
		dst[j].blue=(src[j-1].blue+src[j].blue+src[j+1].blue+src[dim+j-1].blue+src[dim+j].blue+src[dim+j+1].blue)/6;
	}
	// Top-right corner: the loop above leaves j == dim-1.
	dst[j].red=(src[j].red+src[j-1].red+src[dim+j].red+src[dim+j-1].red)/4;
	dst[j].green=(src[j].green+src[j-1].green+src[dim+j].green+src[dim+j-1].green)/4;
	dst[j].blue=(src[j].blue+src[j-1].blue+src[dim+j].blue+src[dim+j-1].blue)/4;
	// Middle rows 1..dim-2 (i starts at 1 from its initializer).
	for(; i<dim-1; i++) {
		// Left edge (i,0): three rows, columns 0 and 1.
		dst[i*dim].red=(src[(i-1)*dim].red+src[(i-1)*dim+1].red+src[i*dim].red+src[i*dim+1].red+src[(i+1)*dim].red+src[(i+1)*dim+1].red)/6;
		dst[i*dim].green=(src[(i-1)*dim].green+src[(i-1)*dim+1].green+src[i*dim].green+src[i*dim+1].green+src[(i+1)*dim].green+src[(i+1)*dim+1].green)/6;
		dst[i*dim].blue=(src[(i-1)*dim].blue+src[(i-1)*dim+1].blue+src[i*dim].blue+src[i*dim+1].blue+src[(i+1)*dim].blue+src[(i+1)*dim+1].blue)/6;
		// Interior pixels: full 3x3 window.
		for(j=1; j<dim-1; j++) {
			dst[i*dim+j].red=(src[(i-1)*dim+j-1].red+src[(i-1)*dim+j].red+src[(i-1)*dim+j+1].red+src[i*dim+j-1].red+src[i*dim+j].red+src[i*dim+j+1].red+src[(i+1)*dim+j-1].red+src[(i+1)*dim+j].red+src[(i+1)*dim+j+1].red)/9;
			dst[i*dim+j].green=(src[(i-1)*dim+j-1].green+src[(i-1)*dim+j].green+src[(i-1)*dim+j+1].green+src[i*dim+j-1].green+src[i*dim+j].green+src[i*dim+j+1].green+src[(i+1)*dim+j-1].green+src[(i+1)*dim+j].green+src[(i+1)*dim+j+1].green)/9;
			dst[i*dim+j].blue=(src[(i-1)*dim+j-1].blue+src[(i-1)*dim+j].blue+src[(i-1)*dim+j+1].blue+src[i*dim+j-1].blue+src[i*dim+j].blue+src[i*dim+j+1].blue+src[(i+1)*dim+j-1].blue+src[(i+1)*dim+j].blue+src[(i+1)*dim+j+1].blue)/9;
		}
		// Right edge (i,dim-1): j == dim-1 here; three rows, columns j-1 and j.
		dst[i*dim+j].red=(src[(i-1)*dim+j-1].red+src[(i-1)*dim+j].red+src[i*dim+j-1].red+src[i*dim+j].red+src[(i+1)*dim+j-1].red+src[(i+1)*dim+j].red)/6;
		dst[i*dim+j].green=(src[(i-1)*dim+j-1].green+src[(i-1)*dim+j].green+src[i*dim+j-1].green+src[i*dim+j].green+src[(i+1)*dim+j-1].green+src[(i+1)*dim+j].green)/6;
		dst[i*dim+j].blue=(src[(i-1)*dim+j-1].blue+src[(i-1)*dim+j].blue+src[i*dim+j-1].blue+src[i*dim+j].blue+src[(i+1)*dim+j-1].blue+src[(i+1)*dim+j].blue)/6;
	}
	// Bottom-left corner: the loop above leaves i == dim-1.
	dst[i*dim].red=(src[(i-1)*dim].red+src[(i-1)*dim+1].red+src[i*dim].red+src[i*dim+1].red)/4;
	dst[i*dim].green=(src[(i-1)*dim].green+src[(i-1)*dim+1].green+src[i*dim].green+src[i*dim+1].green)/4;
	dst[i*dim].blue=(src[(i-1)*dim].blue+src[(i-1)*dim+1].blue+src[i*dim].blue+src[i*dim+1].blue)/4;
	// Bottom edge, columns 1..dim-2: rows i-1 and i, three columns.
	for(j=1; j<dim-1; j++) {
		dst[i*dim+j].red=(src[(i-1)*dim+j-1].red+src[(i-1)*dim+j].red+src[(i-1)*dim+j+1].red+src[i*dim+j-1].red+src[i*dim+j].red+src[i*dim+j+1].red)/6;
		dst[i*dim+j].green=(src[(i-1)*dim+j-1].green+src[(i-1)*dim+j].green+src[(i-1)*dim+j+1].green+src[i*dim+j-1].green+src[i*dim+j].green+src[i*dim+j+1].green)/6;
		dst[i*dim+j].blue=(src[(i-1)*dim+j-1].blue+src[(i-1)*dim+j].blue+src[(i-1)*dim+j+1].blue+src[i*dim+j-1].blue+src[i*dim+j].blue+src[i*dim+j+1].blue)/6;
	}
    // Bottom-right corner: i == dim-1 and j == dim-1.
	dst[i*dim+j].red=(src[(i-1)*dim+j-1].red+src[(i-1)*dim+j].red+src[i*dim+j-1].red+src[i*dim+j].red)/4;
	dst[i*dim+j].green=(src[(i-1)*dim+j-1].green+src[(i-1)*dim+j].green+src[i*dim+j-1].green+src[i*dim+j].green)/4;
	dst[i*dim+j].blue=(src[(i-1)*dim+j-1].blue+src[(i-1)*dim+j].blue+src[i*dim+j-1].blue+src[i*dim+j].blue)/4;	
}

跑分
csapp——perflab解题报告_第13张图片

实验结论或体会

通过读取程序汇编代码进行性能优化补全了程序的短板
虽然在过程中遇到了不少的问题,但是最终优化效果还是不错。
在T2的优化2中对于不同区块的表的编写遇到了一些问题,但是最终解决。
性能得到了非常高的优化。
基本的优化方式,包括减少循环次数、减少变量、减少重复的内存访问等办法,都进行了尝试和实践。

你可能感兴趣的:(嵌入式开发,csapp,c语言,linux)