本文涉及五种矩阵乘法实现:普通矩阵乘、循环展开、循环分块,以及两种手工向量化(标量尾循环、矢量尾循环;其中矢量尾循环只处理了尾部长度为 16 的情况)。
#include <immintrin.h>
#include <stdio.h>
#include <sys/time.h>
#define N 624
// Baseline (naive) matrix multiply.
//
// Accumulates the product a * b into c: for every (row, col) pair,
// c[row][col] is increased by the sum over `inner` of
// a[row][inner] * b[inner][col]. c is not cleared here, so the caller
// decides its starting contents.
void matrix_multiply(float a[N][N], float b[N][N], float c[N][N]){
    int row = 0;
    while (row < N) {
        int col = 0;
        while (col < N) {
            // Dot product of row `row` of a with column `col` of b.
            for (int inner = 0; inner < N; ++inner) {
                c[row][col] += a[row][inner] * b[inner][col];
            }
            ++col;
        }
        ++row;
    }
}
// Matrix multiply with the innermost loop unrolled 4 ways.
//
// Accumulates a * b into c (c is not cleared first). For each (i, j) the
// scalar a[i][j] is multiplied into row j of b and added to row i of c.
// The first N % 4 columns are handled one at a time so that the remaining
// column count is an exact multiple of 4 for the unrolled loop.
//
// a, b and c are expected to be distinct matrices: a[i][j] is read once
// per (i, j) and reused for the whole row update.
void matrix_unroll4(float a[N][N], float b[N][N], float c[N][N]){
    const int head = N % 4;   // columns processed before the unrolled part

    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            const float aij = a[i][j];   // invariant across the k loops
            int k;

            // Leading remainder: columns [0, head).
            for (k = 0; k < head; k++) {
                c[i][k] += aij * b[j][k];
            }

            // Unrolled body: columns [head, N), four per iteration.
            for (k = head; k < N; k += 4) {
                c[i][k]     += aij * b[j][k];
                c[i][k + 1] += aij * b[j][k + 1];
                c[i][k + 2] += aij * b[j][k + 2];
                c[i][k + 3] += aij * b[j][k + 3];
            }
        }
    }
}
// Matrix multiply with the reduction dimension split into blocks.
//
// Accumulates a * b into c (c is not cleared first). The j (reduction)
// range is cut into strips of N / 4 entries; for each strip every row of c
// is updated using only the rows of b that belong to that strip.
//
// The last strip is clamped to N so that sizes that are not a multiple of
// 4 never index past the end of a row, and the strip width is forced to at
// least 1 so the outer loop always advances.
void matrix_block(float a[N][N], float b[N][N], float c[N][N]){
    int block = N / 4;
    if (block < 1) {
        block = 1;   // N < 4: avoid a zero step (infinite loop)
    }

    for (int l = 0; l < N; l += block) {
        int j_end = l + block;
        if (j_end > N) {
            j_end = N;   // partial final strip
        }

        for (int i = 0; i < N; i++) {
            for (int j = l; j < j_end; j++) {
                for (int k = 0; k < N; k++) {
                    c[i][k] += a[i][j] * b[j][k];
                }
            }
        }
    }
}
// Hand-vectorized (AVX) matrix multiply with a scalar tail loop.
//
// Accumulates into c[i][j] the dot product of row i of a with row j of d.
// The caller passes d as the transpose of the right-hand matrix so that
// both operands are read along contiguous rows. c is not cleared first.
//
// Each row pair is processed 32 floats at a time (four 8-lane multiplies
// summed into one vector, then reduced horizontally); the remaining
// N % 32 elements are handled by a plain scalar loop.
void matrix_scalar(float a[N][N], float d[N][N], float c[N][N]){
    const int vec_end = (N / 32) * 32;        // end of the 32-wide region
    const __m256 zero = _mm256_setzero_ps();  // filler operand for hadd
    float lanes[8];                           // spill area for the reduced vector

    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            float vec_sum = 0.0f;   // total of all vectorized chunks for (i, j)
            int k;

            for (k = 0; k < vec_end; k += 32) {
                // Unaligned loads: rows of a float[N][N] are not guaranteed
                // to be 32-byte aligned.
                __m256 prod0 = _mm256_mul_ps(_mm256_loadu_ps(&a[i][k]),
                                             _mm256_loadu_ps(&d[j][k]));
                __m256 prod1 = _mm256_mul_ps(_mm256_loadu_ps(&a[i][k + 8]),
                                             _mm256_loadu_ps(&d[j][k + 8]));
                __m256 prod2 = _mm256_mul_ps(_mm256_loadu_ps(&a[i][k + 16]),
                                             _mm256_loadu_ps(&d[j][k + 16]));
                __m256 prod3 = _mm256_mul_ps(_mm256_loadu_ps(&a[i][k + 24]),
                                             _mm256_loadu_ps(&d[j][k + 24]));

                __m256 total = _mm256_add_ps(_mm256_add_ps(prod0, prod1),
                                             _mm256_add_ps(prod2, prod3));

                // Two horizontal adds leave the sum of lanes 0-3 in element 0
                // and the sum of lanes 4-7 in element 4. The second operand
                // only fills elements that are never read, so zero is used.
                __m256 folded = _mm256_hadd_ps(_mm256_hadd_ps(total, zero), zero);
                _mm256_storeu_ps(lanes, folded);

                vec_sum = vec_sum + lanes[0] + lanes[4];
            }

            // Scalar tail: elements left over after the 32-wide chunks.
            for (k = vec_end; k < N; k++) {
                c[i][j] += a[i][k] * d[j][k];
            }

            c[i][j] += vec_sum;
        }
    }
}
// Hand-vectorized (AVX) matrix multiply with a vectorized tail loop.
//
// Accumulates into c[i][j] the dot product of row i of a with row j of d.
// The caller passes d as the transpose of the right-hand matrix so that
// both operands are read along contiguous rows. c is not cleared first.
//
// Each row pair is processed in three stages:
//   1. 32 floats per iteration (four 8-lane multiplies, reduced horizontally);
//   2. 16 floats per iteration for whatever full 16-wide chunks remain;
//   3. a scalar loop for any elements left after that, so no load ever
//      reaches past the end of a row.
void matrix_vector(float a[N][N], float d[N][N], float c[N][N]){
    const int vec_end = (N / 32) * 32;        // end of the 32-wide region
    const __m256 zero = _mm256_setzero_ps();  // filler operand for hadd
    float lanes[8];                           // spill area for the reduced vector

    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            float vec_sum = 0.0f;   // total of all 32-wide chunks for (i, j)
            int k;

            for (k = 0; k < vec_end; k += 32) {
                __m256 prod0 = _mm256_mul_ps(_mm256_loadu_ps(&a[i][k]),
                                             _mm256_loadu_ps(&d[j][k]));
                __m256 prod1 = _mm256_mul_ps(_mm256_loadu_ps(&a[i][k + 8]),
                                             _mm256_loadu_ps(&d[j][k + 8]));
                __m256 prod2 = _mm256_mul_ps(_mm256_loadu_ps(&a[i][k + 16]),
                                             _mm256_loadu_ps(&d[j][k + 16]));
                __m256 prod3 = _mm256_mul_ps(_mm256_loadu_ps(&a[i][k + 24]),
                                             _mm256_loadu_ps(&d[j][k + 24]));

                __m256 total = _mm256_add_ps(_mm256_add_ps(prod0, prod1),
                                             _mm256_add_ps(prod2, prod3));

                // After two horizontal adds, element 0 holds the sum of lanes
                // 0-3 and element 4 the sum of lanes 4-7. The second operand
                // only fills elements that are never read.
                __m256 folded = _mm256_hadd_ps(_mm256_hadd_ps(total, zero), zero);
                _mm256_storeu_ps(lanes, folded);

                vec_sum += lanes[0] + lanes[4];
            }

            // Vector tail: full 16-float chunks only (k + 16 <= N keeps both
            // 8-lane loads inside the row).
            for (; k + 16 <= N; k += 16) {
                __m256 prod0 = _mm256_mul_ps(_mm256_loadu_ps(&a[i][k]),
                                             _mm256_loadu_ps(&d[j][k]));
                __m256 prod1 = _mm256_mul_ps(_mm256_loadu_ps(&a[i][k + 8]),
                                             _mm256_loadu_ps(&d[j][k + 8]));

                __m256 pair = _mm256_add_ps(prod0, prod1);
                __m256 folded = _mm256_hadd_ps(_mm256_hadd_ps(pair, zero), zero);
                _mm256_storeu_ps(lanes, folded);

                c[i][j] += lanes[0] + lanes[4];
            }

            // Scalar remainder: fewer than 16 elements left.
            for (; k < N; k++) {
                c[i][j] += a[i][k] * d[j][k];
            }

            c[i][j] += vec_sum;
        }
    }
}
// Clear a result matrix: every element of c is set to zero.
void reset(float c[N][N]){
    for (int r = 0; r < N; ++r) {
        float *row = c[r];
        int col = 0;
        while (col < N) {
            row[col] = 0;
            ++col;
        }
    }
}
int main(){
int i, j;
float t = 1.0;
float a[N][N] = {0};
float b[N][N] = {0};
float c[N][N] = {0};
float d[N][N] = {0};
//a, b数组赋值
for(i = 0; i < N; i++){
for(j = 0; j < N; j++){
a[i][j] = t;
b[i][j] = t;
}
t = t + 1.0;
}
//矩阵乘循环
struct timeval starttime,endtime;
gettimeofday(&starttime,0);
matrix_multiply(a, b, c);
gettimeofday(&endtime,0);
int timeuse = 1000000*(endtime.tv_sec - starttime.tv_sec) + endtime.tv_usec - starttime.tv_usec;
printf("matrix_multiply time = %d us\n", timeuse);
reset(c);
//四层循环展开
struct timeval starttim,endtim;
gettimeofday(&starttim,0);
matrix_unroll4(a, b, c);
gettimeofday(&endtim,0);
int timeus = 1000000*(endtim.tv_sec - starttim.tv_sec) + endtim.tv_usec - starttim.tv_usec;
printf("matrix_unroll4 time = %d us\n", timeus);
reset(c);
//循环分块
struct timeval startti,endti;
gettimeofday(&startti,0);
matrix_block(a, b, c);
gettimeofday(&endti,0);
int timeu = 1000000*(endti.tv_sec - startti.tv_sec) + endti.tv_usec - startti.tv_usec;
printf("matrix_block time = %d us\n", timeu);
reset(c);
for(i=0;i<N;i++){
for(j=0;j<N;j++){
d[i][j]=b[j][i];
}
}//b矩阵的转置
//标量尾循环
struct timeval startt,endt;
gettimeofday(&startt,0);
matrix_scalar(a, d, c);
gettimeofday(&endt,0);
int time = 1000000*(endt.tv_sec - startt.tv_sec) + endt.tv_usec - startt.tv_usec;
printf("matrix_s_tail time = %d us\n", time);
reset(c);
//向量尾循环
struct timeval start,end;
gettimeofday(&start,0);
matrix_vector(a, d, c);
gettimeofday(&end,0);
int tim = 1000000*(end.tv_sec - start.tv_sec) + end.tv_usec - start.tv_usec;
printf("matrix_v_tail time = %d us\n", tim);
reset(c);
}
#include
#include
#include
const int M = 1000;
// Accumulates the M x M product a * b into c (c is not cleared here).
// Each argument is an array of M row pointers, each row holding M floats.
// The loops run in i-k-j order, so the innermost loop walks one row of b
// and one row of c from left to right.
void matrix(float **a, float **b, float **c){
    for (int row = 0; row < M; ++row) {
        float *out = c[row];
        const float *lhs = a[row];
        for (int mid = 0; mid < M; ++mid) {
            const float *rhs = b[mid];
            int col = 0;
            while (col < M) {
                out[col] += lhs[mid] * rhs[col];
                ++col;
            }
        }
    }
}
// Fills the M x M matrix a with consecutive values 1, 2, 3, ... in
// row-major order (a[0][0] = 1, a[0][1] = 2, and so on).
void value(float **a){
    int next = 1;   // integer counter, converted to float on each store
    for (int row = 0; row < M; ++row) {
        float *cells = a[row];
        for (int col = 0; col < M; ++col) {
            cells[col] = next;
            next++;
        }
    }
}
// Prints the M x M matrix a to stdout: one text line per matrix row, each
// element written with two decimals and followed by a single space.
void print(float **a){
    int row = 0;
    while (row < M) {
        const float *cells = a[row];
        for (int col = 0; col < M; ++col) {
            printf("%.2f ", cells[col]);
        }
        printf("\n");
        ++row;
    }
}
// Sets every element of the M x M matrix a to zero.
void reset(float **a){
    for (int row = 0; row < M; ++row) {
        float *cells = a[row];
        int col = 0;
        while (col < M) {
            cells[col] = 0.0;
            ++col;
        }
    }
}
// Benchmark driver for the heap-allocated M x M multiply.
//
// Each matrix is one contiguous block of M * M floats plus a table of M
// row pointers into that block. Returns EXIT_SUCCESS after printing the
// elapsed time, or EXIT_FAILURE if any allocation fails.
int main(void){
    const size_t cells = (size_t)M * M;
    int status = EXIT_FAILURE;
    struct timeval starttime, endtime;
    int timeuse;
    int i;

    // Contiguous element storage for each matrix.
    float *a_data = malloc(cells * sizeof *a_data);
    float *b_data = malloc(cells * sizeof *b_data);
    float *c_data = malloc(cells * sizeof *c_data);

    // Row-pointer tables.
    float **a = malloc(M * sizeof *a);
    float **b = malloc(M * sizeof *b);
    float **c = malloc(M * sizeof *c);

    if (!a_data || !b_data || !c_data || !a || !b || !c) {
        fprintf(stderr, "out of memory\n");
        goto cleanup;
    }

    // Point row i at offset i * M inside the contiguous block.
    for (i = 0; i < M; i++) {
        a[i] = a_data + (size_t)i * M;
        b[i] = b_data + (size_t)i * M;
        c[i] = c_data + (size_t)i * M;
    }

    value(a);
    value(b);
    reset(c);   // malloc'd memory is uninitialized; the kernel accumulates into c

    gettimeofday(&starttime, NULL);
    matrix(a, b, c);
    gettimeofday(&endtime, NULL);

    timeuse = (int)(1000000 * (endtime.tv_sec - starttime.tv_sec)
                    + endtime.tv_usec - starttime.tv_usec);
    printf(" time = %d us\n", timeuse);
    //print(a);

    status = EXIT_SUCCESS;

cleanup:
    // free(NULL) is a no-op, so this is safe after a partial allocation.
    free(a_data);
    free(b_data);
    free(c_data);
    free(a);
    free(b);
    free(c);
    return status;
}