矩阵乘的不同算法

这里涉及五种:普通矩阵乘、循环展开、循环分块、手工向量化(标量尾循环、矢量尾循环(只写了尾巴为16的情况))。

#include <immintrin.h>
#include <stdio.h>
#include <sys/time.h>
/* Matrix dimension: every matrix handled by the routines below is N x N floats. */
#define N 624

/*
 * Plain triple-loop matrix multiply.
 *
 * For every output cell c[row][col] this adds a[row][inner] * b[inner][col]
 * for inner = 0 .. N-1. The result is accumulated into c rather than
 * assigned, so c must already hold zeros for the output to be the bare
 * product a * b.
 */
void matrix_multiply(float a[N][N], float b[N][N], float c[N][N]){
    for (int row = 0; row < N; ++row) {
        for (int col = 0; col < N; ++col) {
            for (int inner = 0; inner < N; ++inner) {
                c[row][col] = c[row][col] + a[row][inner] * b[inner][col];
            }
        }
    }
}

/*
 * Matrix multiply with the output-column loop unrolled four times.
 *
 * For each (i, j) pair the product a[i][j] * b[j][k] is added to c[i][k] for
 * every column k. Columns are processed in two phases:
 *   1. the first N % 4 columns one at a time (the "head" remainder);
 *   2. the remaining columns four per iteration.
 * The result is accumulated into c, so c must be zeroed by the caller to
 * obtain the bare product a * b.
 */
void matrix_unroll4(float a[N][N], float b[N][N], float c[N][N]){
    const int head = N % 4;  /* columns that do not fill a group of four */

    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            int k;

            /* Head remainder: columns 0 .. head-1. */
            for (k = 0; k < head; k++) {
                c[i][k] += a[i][j] * b[j][k];
            }

            /* N - head is a multiple of 4, so k + 3 never exceeds N - 1. */
            for (k = head; k < N; k += 4) {
                c[i][k]     += a[i][j] * b[j][k];
                c[i][k + 1] += a[i][j] * b[j][k + 1];
                c[i][k + 2] += a[i][j] * b[j][k + 2];
                c[i][k + 3] += a[i][j] * b[j][k + 3];
            }
        }
    }
}

/*
 * Matrix multiply with the reduction index j split into strips (loop
 * blocking).
 *
 * The j range is cut into strips of N / 4 columns. For each strip, every row
 * i of a is combined with rows j of b that fall inside the strip, adding
 * a[i][j] * b[j][k] to c[i][k]. The result is accumulated into c, so c must
 * be zeroed by the caller to obtain the bare product a * b.
 */
void matrix_block(float a[N][N], float b[N][N], float c[N][N]){
    /* Strip width; forced to at least 1 so the strip loop always advances. */
    const int block = (N / 4 > 0) ? N / 4 : 1;

    for (int l = 0; l < N; l += block) {
        /* Clamp the last strip so it stays inside the matrix when N is not
         * an exact multiple of the strip width. */
        const int j_end = (l + block < N) ? l + block : N;

        for (int i = 0; i < N; i++) {
            for (int j = l; j < j_end; j++) {
                for (int k = 0; k < N; k++) {
                    c[i][k] += a[i][j] * b[j][k];
                }
            }
        }
    }
}

/*
 * Hand-vectorised (AVX) row-by-row dot products with a scalar tail loop.
 *
 * For every (i, j) this adds the dot product of row i of a and row j of d
 * to c[i][j]. Because d is read as d[j][k], passing the transpose of a
 * matrix b as d yields the product a * b. The result is accumulated into c,
 * so c must be zeroed by the caller to obtain the bare product.
 *
 * Each row pair is processed in two phases:
 *   1. full steps of 32 floats (four 8-float AVX vectors), reduced to a
 *      scalar after every step;
 *   2. the remaining N % 32 elements one at a time.
 */
void matrix_scalar(float a[N][N], float d[N][N], float c[N][N]){
    const int vec_end = (N / 32) * 32;  /* first index not covered by a full 32-float step */
    float lanes[8];                     /* scratch buffer for reading vector lanes */

    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            float vec_sum = 0.0f;  /* contribution of the vectorised part */

            for (int k = 0; k < vec_end; k += 32) {
                /* Element-wise products of four consecutive 8-float chunks. */
                __m256 prod0 = _mm256_mul_ps(_mm256_loadu_ps(&a[i][k]),
                                             _mm256_loadu_ps(&d[j][k]));
                __m256 prod1 = _mm256_mul_ps(_mm256_loadu_ps(&a[i][k + 8]),
                                             _mm256_loadu_ps(&d[j][k + 8]));
                __m256 prod2 = _mm256_mul_ps(_mm256_loadu_ps(&a[i][k + 16]),
                                             _mm256_loadu_ps(&d[j][k + 16]));
                __m256 prod3 = _mm256_mul_ps(_mm256_loadu_ps(&a[i][k + 24]),
                                             _mm256_loadu_ps(&d[j][k + 24]));

                /* Fold the four product vectors into one. */
                __m256 sum = _mm256_add_ps(_mm256_add_ps(prod0, prod1),
                                           _mm256_add_ps(prod2, prod3));

                /* Two horizontal adds leave the sum of lanes 0..3 in lane 0
                 * and the sum of lanes 4..7 in lane 4. Using the vector
                 * itself as the second operand avoids loading an unrelated
                 * filler vector. */
                __m256 pairs = _mm256_hadd_ps(sum, sum);
                __m256 quads = _mm256_hadd_ps(pairs, pairs);
                _mm256_storeu_ps(lanes, quads);

                vec_sum = vec_sum + lanes[0] + lanes[4];
            }

            /* Scalar tail: the elements that do not fill a 32-float step. */
            for (int k = vec_end; k < N; k++) {
                c[i][j] += a[i][k] * d[j][k];
            }

            c[i][j] += vec_sum;
        }
    }
}




/*
 * Hand-vectorised (AVX) row-by-row dot products with a vectorised tail.
 *
 * For every (i, j) this adds the dot product of row i of a and row j of d
 * to c[i][j]. Because d is read as d[j][k], passing the transpose of a
 * matrix b as d yields the product a * b. The result is accumulated into c,
 * so c must be zeroed by the caller to obtain the bare product.
 *
 * Each row pair is processed in up to four phases:
 *   1. full steps of 32 floats (four 8-float AVX vectors);
 *   2. one 16-float step if at least 16 elements remain;
 *   3. one 8-float step if at least 8 elements remain;
 *   4. any leftover elements one at a time.
 * Phases 3 and 4 only run when N % 32 is not 0 or 16; they keep every load
 * inside the row for such sizes.
 */
void matrix_vector(float a[N][N], float d[N][N], float c[N][N]){
    const int vec_end = (N / 32) * 32;  /* first index not covered by a full 32-float step */
    float lanes[8];                     /* scratch buffer for reading vector lanes */

    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            float vec_sum = 0.0f;  /* contribution of the 32-float steps */
            int k;

            for (k = 0; k < vec_end; k += 32) {
                /* Element-wise products of four consecutive 8-float chunks. */
                __m256 prod0 = _mm256_mul_ps(_mm256_loadu_ps(&a[i][k]),
                                             _mm256_loadu_ps(&d[j][k]));
                __m256 prod1 = _mm256_mul_ps(_mm256_loadu_ps(&a[i][k + 8]),
                                             _mm256_loadu_ps(&d[j][k + 8]));
                __m256 prod2 = _mm256_mul_ps(_mm256_loadu_ps(&a[i][k + 16]),
                                             _mm256_loadu_ps(&d[j][k + 16]));
                __m256 prod3 = _mm256_mul_ps(_mm256_loadu_ps(&a[i][k + 24]),
                                             _mm256_loadu_ps(&d[j][k + 24]));

                __m256 sum = _mm256_add_ps(_mm256_add_ps(prod0, prod1),
                                           _mm256_add_ps(prod2, prod3));

                /* Two horizontal adds leave the sum of lanes 0..3 in lane 0
                 * and the sum of lanes 4..7 in lane 4. */
                __m256 pairs = _mm256_hadd_ps(sum, sum);
                __m256 quads = _mm256_hadd_ps(pairs, pairs);
                _mm256_storeu_ps(lanes, quads);

                vec_sum = vec_sum + lanes[0] + lanes[4];
            }

            /* Vector tail, 16 floats: fewer than 32 elements remain here,
             * so this runs at most once. */
            if (k + 16 <= N) {
                __m256 prod0 = _mm256_mul_ps(_mm256_loadu_ps(&a[i][k]),
                                             _mm256_loadu_ps(&d[j][k]));
                __m256 prod1 = _mm256_mul_ps(_mm256_loadu_ps(&a[i][k + 8]),
                                             _mm256_loadu_ps(&d[j][k + 8]));
                __m256 sum   = _mm256_add_ps(prod0, prod1);
                __m256 pairs = _mm256_hadd_ps(sum, sum);
                __m256 quads = _mm256_hadd_ps(pairs, pairs);
                _mm256_storeu_ps(lanes, quads);

                c[i][j] += lanes[0] + lanes[4];
                k += 16;
            }

            /* Vector tail, 8 floats. */
            if (k + 8 <= N) {
                __m256 prod  = _mm256_mul_ps(_mm256_loadu_ps(&a[i][k]),
                                             _mm256_loadu_ps(&d[j][k]));
                __m256 pairs = _mm256_hadd_ps(prod, prod);
                __m256 quads = _mm256_hadd_ps(pairs, pairs);
                _mm256_storeu_ps(lanes, quads);

                c[i][j] += lanes[0] + lanes[4];
                k += 8;
            }

            /* Whatever is left is shorter than one vector. */
            for (; k < N; k++) {
                c[i][j] += a[i][k] * d[j][k];
            }

            c[i][j] += vec_sum;
        }
    }
}

/* Sets every element of the N x N matrix c to zero. */
void reset(float c[N][N]){
    for (int row = 0; row < N; ++row) {
        float *cells = c[row];
        for (int col = 0; col < N; ++col) {
            cells[col] = 0.0f;
        }
    }
}

int main(){
    int i, j;
    float t = 1.0;
    float a[N][N] = {0};
    float b[N][N] = {0};
    float c[N][N] = {0};
    float d[N][N] = {0};

    //a, b数组赋值
    for(i = 0; i < N; i++){
        for(j = 0; j < N; j++){
            a[i][j] = t;
            b[i][j] = t;
        }
        t = t + 1.0;
    }
    
    //矩阵乘循环
	struct timeval starttime,endtime;
	gettimeofday(&starttime,0);
	matrix_multiply(a, b, c);
	gettimeofday(&endtime,0);
	int timeuse  = 1000000*(endtime.tv_sec - starttime.tv_sec) + endtime.tv_usec - starttime.tv_usec;
	printf("matrix_multiply time = %d us\n", timeuse);
	reset(c);
   //四层循环展开
	struct timeval starttim,endtim;
	gettimeofday(&starttim,0);
	matrix_unroll4(a, b, c);
	gettimeofday(&endtim,0);
	int timeus  = 1000000*(endtim.tv_sec - starttim.tv_sec) + endtim.tv_usec - starttim.tv_usec;
	printf("matrix_unroll4  time = %d us\n", timeus);
	reset(c);
    //循环分块
	struct timeval startti,endti;
	gettimeofday(&startti,0);
	matrix_block(a, b, c);
	gettimeofday(&endti,0);
	int timeu  = 1000000*(endti.tv_sec - startti.tv_sec) + endti.tv_usec - startti.tv_usec;
	printf("matrix_block    time = %d us\n", timeu);
	reset(c);
   
    for(i=0;i<N;i++){
        for(j=0;j<N;j++){
            d[i][j]=b[j][i];
        }
    }//b矩阵的转置
    
    //标量尾循环
	struct timeval startt,endt;
	gettimeofday(&startt,0);
	matrix_scalar(a, d, c);
	gettimeofday(&endt,0);
	int time  = 1000000*(endt.tv_sec - startt.tv_sec) + endt.tv_usec - startt.tv_usec;
	printf("matrix_s_tail   time = %d us\n", time);
	reset(c);

    //向量尾循环
	struct timeval start,end;
	gettimeofday(&start,0);
	matrix_vector(a, d, c);
	gettimeofday(&end,0);
	int tim  = 1000000*(end.tv_sec - start.tv_sec) + end.tv_usec - start.tv_usec;
	printf("matrix_v_tail   time = %d us\n", tim);
	reset(c);


}

运行结果:
矩阵乘的不同算法_第1张图片
补充:下面是使用 malloc 动态分配内存的普通矩阵乘版本(M = 1000):

#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>

/* Matrix dimension: the routines below treat every matrix as M x M floats. */
const int M = 1000;

/*
 * Plain matrix multiply on row-pointer matrices.
 *
 * Adds a[row][mid] * b[mid][col] to c[row][col] for every combination of
 * row, mid and col in 0 .. M-1, with col as the innermost loop. The result
 * is accumulated into c, so c must hold zeros beforehand for the output to
 * be the bare product a * b.
 */
void matrix(float **a, float **b, float **c){
	for (int row = 0; row < M; ++row) {
		float *out = c[row];
		for (int mid = 0; mid < M; ++mid) {
			const float *rhs = b[mid];
			for (int col = 0; col < M; ++col) {
				out[col] = out[col] + a[row][mid] * rhs[col];
			}
		}
	}
}
/*
 * Fills the M x M matrix a with consecutive integers in row-major order,
 * starting at 1 (a[0][0] = 1, a[0][1] = 2, ...).
 */
void value(float **a){
	int next = 1;

	for (int row = 0; row < M; ++row) {
		float *cells = a[row];
		for (int col = 0; col < M; ++col) {
			cells[col] = next;
			++next;
		}
	}
}

/*
 * Writes the M x M matrix a to standard output, one matrix row per line,
 * each value with two decimals followed by a tab.
 */
void print(float **a){
	for (int row = 0; row < M; ++row) {
		const float *cells = a[row];
		for (int col = 0; col < M; ++col) {
			printf("%.2f	", cells[col]);
		}
		printf("\n");
	}
}
/* Sets every element of the M x M matrix a to zero. */
void reset(float **a){
	for (int row = 0; row < M; ++row) {
		float *cells = a[row];
		for (int col = 0; col < M; ++col) {
			cells[col] = 0.0f;
		}
	}
}

/*
 * Benchmark driver for the heap-allocated variant.
 *
 * Each matrix is one contiguous M * M block of floats plus an array of M
 * row pointers into that block. a and b are filled by value(), c is zeroed,
 * and the time taken by matrix() is printed in microseconds.
 *
 * Returns EXIT_SUCCESS after a completed run, EXIT_FAILURE if any
 * allocation fails.
 */
int main(void){
	int status = EXIT_FAILURE;

	/* Row-pointer tables. */
	float **a = malloc(sizeof *a * M);
	float **b = malloc(sizeof *b * M);
	float **c = malloc(sizeof *c * M);

	/* Contiguous element storage, one block per matrix. */
	float *a_cells = malloc(sizeof *a_cells * M * M);
	float *b_cells = malloc(sizeof *b_cells * M * M);
	float *c_cells = malloc(sizeof *c_cells * M * M);

	if (a && b && c && a_cells && b_cells && c_cells) {
		/* Point row i at offset i * M of the contiguous block. */
		for (int i = 0; i < M; i++) {
			a[i] = a_cells + (size_t)i * M;
			b[i] = b_cells + (size_t)i * M;
			c[i] = c_cells + (size_t)i * M;
		}

		value(a);
		value(b);
		reset(c);

		struct timeval starttime, endtime;
		gettimeofday(&starttime, NULL);
		matrix(a, b, c);
		gettimeofday(&endtime, NULL);
		int timeuse = (int)(1000000L * (long)(endtime.tv_sec - starttime.tv_sec)
				+ (long)(endtime.tv_usec - starttime.tv_usec));
		printf(" time = %d us\n", timeuse);

		status = EXIT_SUCCESS;
	} else {
		fprintf(stderr, "matrix allocation failed\n");
	}

	/* free(NULL) is a no-op, so this is safe after a partial failure. */
	free(a_cells);
	free(b_cells);
	free(c_cells);
	free(a);
	free(b);
	free(c);

	return status;
}

你可能感兴趣的:(矩阵乘的不同算法)