《基于 CUDA 的 GPU 并行程序开发指南》中 imrotateMC 的 Rotate7 是怎么做优化的

目录

    • imrotateMC Rotate7 是怎么做优化的

书名:基于 CUDA 的 GPU 并行程序开发指南

imrotateMC Rotate7 是怎么做优化的

书里面没有详细说明 Rotate7 相对于 Rotate6 到底优化了什么,下面直接对比这两个函数的源码:

/*
 * Rotate6 -- pthread worker that rotates one horizontal band of TheImage by
 * RotAngle about the image centre and writes the pixels into CopyImage.
 *
 * tid  points to an int holding this thread's number.
 *
 * Each thread handles ip.Vpixels/NumThreads consecutive rows, starting at
 * (thread number)*(ip.Vpixels/NumThreads).  Everything that does not depend
 * on the pixel is computed once per call (scale factor, scaled sine and
 * cosine) or once per row (SRAYS, CRAYS), so the inner loop only performs the
 * two multiplications by X.
 *
 * Does not return normally: the thread ends through pthread_exit(NULL).
 */
void *Rotate6(void* tid)
{
    long tn;                        // My thread number (ID) is stored here
    long RowsPerThread, FirstRow, LastRow;
    int row, col, c, h, v, hp3;
    int NewRow, NewCol;
    double X, Y, newX, newY, ScaleFactor;
    double Diagonal, H, V;
    double CRA, SRA, CRAS, SRAS, SRAYS, CRAYS;

    tn = *((int *) tid);            // Calculate my Thread ID
    RowsPerThread = ip.Vpixels/NumThreads;
    FirstRow = tn*RowsPerThread;
    LastRow = FirstRow+RowsPerThread;
    // The integer division above leaves ip.Vpixels%NumThreads rows unassigned
    // when the height is not a multiple of the thread count; the
    // highest-numbered thread picks them up so no row is skipped.
    // NOTE(review): assumes thread numbers run 0..NumThreads-1 -- confirm
    // against the code that creates the threads.
    if(tn == NumThreads-1) LastRow = ip.Vpixels;

    // Scale by (shorter side)/(diagonal) so the rotated image fits in the box
    H = (double)ip.Hpixels;
    V = (double)ip.Vpixels;
    Diagonal = sqrt(H*H+V*V);
    ScaleFactor = (ip.Hpixels>ip.Vpixels) ? V/Diagonal : H/Diagonal;

    // Scaled rotation-matrix coefficients, constant for the whole image
    CRA = cos(RotAngle);    CRAS = ScaleFactor*CRA;
    SRA = sin(RotAngle);    SRAS = ScaleFactor*SRA;

    h = ip.Hpixels/2;   v = ip.Vpixels/2;   // integer div
    hp3 = ip.Hpixels*3;                     // row length in bytes (3 per pixel)

    for(row=FirstRow; row<LastRow; row++){
        // Y and the two products that contain it are constant along a row
        Y = (double)v-(double)row;
        SRAYS = SRAS*Y;     CRAYS = CRAS*Y;

        // col is the byte offset inside the row, c the matching pixel index
        for(col=0, c=0; col<hp3; col+=3, c++){
            // transpose image coordinates to Cartesian coordinates
            X = (double)c-(double)h;

            // pixel rotation matrix (scaling already folded into CRAS/SRAS)
            newX=CRAS*X-SRAYS;
            newY=SRAS*X+CRAYS;

            // convert back from Cartesian to image coordinates
            NewCol = ((int) newX+h);
            NewRow = v-(int)newY;
            if((NewCol>=0) && (NewRow>=0) && (NewCol<ip.Hpixels) && (NewRow<ip.Vpixels)){
                NewCol *= 3;
                CopyImage[NewRow][NewCol]   = TheImage[row][col];
                CopyImage[NewRow][NewCol+1] = TheImage[row][col+1];
                CopyImage[NewRow][NewCol+2] = TheImage[row][col+2];
            }
        }
    }
    pthread_exit(NULL);
}
/*
 * Rotate7 -- pthread worker that rotates one horizontal band of TheImage by
 * RotAngle about the image centre and writes the pixels into CopyImage.
 *
 * tid  points to an int holding this thread's number.
 *
 * Each thread handles ip.Vpixels/NumThreads consecutive rows, starting at
 * (thread number)*(ip.Vpixels/NumThreads).
 *
 * The inner loop contains no multiplication.  With c the pixel index in the
 * row and X = c-h, the rotated coordinates are
 *     newX = CRAS*X - SRAYS = CRAS*c - (CRAS*h + SRAYS) = cc - k1
 *     newY = SRAS*X + CRAYS = SRAS*c - (SRAS*h - CRAYS) = ss - k2
 * k1 and k2 are computed once per row; cc and ss are running sums that grow
 * by CRAS and SRAS for every pixel, standing in for CRAS*c and SRAS*c.
 *
 * Does not return normally: the thread ends through pthread_exit(NULL).
 */
void *Rotate7(void* tid)
{
    long tn;                        // My thread number (ID) is stored here
    long RowsPerThread, FirstRow, LastRow;
    int row, col, h, v, hp3;
    double cc, ss, k1, k2;
    int NewRow, NewCol;
    double Y, newX, newY, ScaleFactor;
    double Diagonal, H, V;
    double CRA, SRA, CRAS, SRAS, SRAYS, CRAYS;

    tn = *((int *) tid);            // Calculate my Thread ID
    RowsPerThread = ip.Vpixels/NumThreads;
    FirstRow = tn*RowsPerThread;
    LastRow = FirstRow+RowsPerThread;
    // The integer division above leaves ip.Vpixels%NumThreads rows unassigned
    // when the height is not a multiple of the thread count; the
    // highest-numbered thread picks them up so no row is skipped.
    // NOTE(review): assumes thread numbers run 0..NumThreads-1 -- confirm
    // against the code that creates the threads.
    if(tn == NumThreads-1) LastRow = ip.Vpixels;

    // Scale by (shorter side)/(diagonal) so the rotated image fits in the box
    H = (double)ip.Hpixels;
    V = (double)ip.Vpixels;
    Diagonal = sqrt(H*H+V*V);
    ScaleFactor = (ip.Hpixels>ip.Vpixels) ? V/Diagonal : H/Diagonal;

    // Scaled rotation-matrix coefficients, constant for the whole image
    CRA = cos(RotAngle);    CRAS = ScaleFactor*CRA;
    SRA = sin(RotAngle);    SRAS = ScaleFactor*SRA;

    h = ip.Hpixels/2;   v = ip.Vpixels/2;   // integer div
    hp3 = ip.Hpixels*3;                     // row length in bytes (3 per pixel)

    for(row=FirstRow; row<LastRow; row++){
        // Per-row constants: Y, its two products, and the offsets k1/k2
        Y = (double)v-(double)row;
        SRAYS = SRAS*Y;     CRAYS = CRAS*Y;
        k1 = CRAS*(double)h + SRAYS;
        k2 = SRAS*(double)h - CRAYS;

        // Running sums restart at the first pixel of every row
        cc = 0.00;
        ss = 0.00;

        // col is the byte offset inside the row
        for(col=0; col<hp3; col+=3){
            // pixel rotation matrix, reduced to two subtractions
            newX=cc-k1;
            newY=ss-k2;

            // convert back from Cartesian to image coordinates
            NewCol = ((int) newX+h);
            NewRow = v-(int)newY;
            if((NewCol>=0) && (NewRow>=0) && (NewCol<ip.Hpixels) && (NewRow<ip.Vpixels)){
                NewCol *= 3;
                CopyImage[NewRow][NewCol]   = TheImage[row][col];
                CopyImage[NewRow][NewCol+1] = TheImage[row][col+1];
                CopyImage[NewRow][NewCol+2] = TheImage[row][col+2];
            }

            // advance to the next pixel: one addition instead of one multiply
            cc += CRAS;
            ss += SRAS;
        }
    }
    pthread_exit(NULL);
}

对比可见,Rotate7 相比 Rotate6 只是多定义了下面这几个变量:

double cc, ss, k1, k2;

下面看看它们是怎么用的。

可见,Rotate7 把 X 这个变量消掉了。在 Rotate6 中,X 是由整数列号 c 转换得到的浮点数(X=(double)c-(double)h),每处理一个像素,计算 newX、newY 都要各做一次浮点乘法:

newX=CRAS*X-SRAYS;
newY=SRAS*X+CRAYS;

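把 X=c-h 代入上式并展开:newX = CRAS*c - (CRAS*h + SRAYS) = cc - k1,newY = SRAS*c - (SRAS*h - CRAYS) = ss - k2。其中 cc、ss 每处理一个像素就分别累加 CRAS、SRAS,以累加的形式替代了 CRAS、SRAS 与列计数变量 c 的乘积;k1、k2 只与当前行有关,每行计算一次,相当于存储偏置。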

这样内层循环里的两次浮点乘法(以及一次整数到浮点的转换)就被两次浮点加法取代了,所以速度就上来了。

你可能感兴趣的:(CUDA,c++,CUDA)