算法描述如下:
将待相乘的矩阵A和B分成p个方块Ai,j和Bi,j(0≤i,j≤√p-1),每块大小为(n/√p)×(n/√p),并将它们分配给√p×√p个处理器。开始时处理器Pi,j存放有Ai,j和Bi,j,并负责计算块Ci,j。然后Fox算法执行以下√p次迭代,即可完成:
①选中对角块Ai,i,并将其向所在行的√p-1个处理器进行一到多播送;
②各处理器将所收到的A阵的块和B阵原有的块进行乘加运算;
③B阵的块向上循环1步;
④如果Ai,j是本次播送的块,则下次应选块Ai,(j+1)mod√p向同行的√p-1个处理器播送,然后转第②步。
如上所述,算法需要多次用到各种数组,为编程方便,定义如下结构体:
/*
 * One square block (sub-matrix) of a distributed matrix, stored row-major
 * in a fixed-capacity array.
 *
 *   Order(A)      - lvalue: the order of the block pointed to by A.
 *   Entry(A,i,j)  - lvalue: element (i, j) of the block pointed to by A,
 *                   located at entries[n_bar*i + j].
 *
 * Both macros take a POINTER to the block.  Entry() evaluates A twice, so do
 * not pass an expression with side effects.
 */
typedef struct {
    int n_bar;              /* order of this block */
#define Order(A) ((A)->n_bar)
    float entries[MAX];     /* only the first n_bar*n_bar slots are used */
#define Entry(A,i,j) (*(((A)->entries) + ((A)->n_bar)*(i) + (j)))
} LOCAL_MATRIX_T;
同样,由于矩阵划分成子矩阵进行并行相乘时,会用到很多通信子域,这里也定义如下结构体
/* Description of the square process grid and this process's place in it. */
typedef struct {
int p; /* Total number of processes */
MPI_Comm comm; /* Communicator for entire grid */
MPI_Comm row_comm; /* Communicator for my row */
MPI_Comm col_comm; /* Communicator for my col */
int q; /* Order of grid */
int my_row; /* My row number */
int my_col; /* My column number */
int my_rank; /* My rank in the grid comm */
} GRID_INFO_T;
整个算法主要用到读取数据、划分数据、子矩阵运算、写入结果等部分,因此定义如下函数:
/* Allocate one LOCAL_MATRIX_T on the heap. */
LOCAL_MATRIX_T* Local_matrix_allocate(int n_bar);
/* Release the block that *local_A points to. */
void Free_local_matrix(LOCAL_MATRIX_T** local_A);
/* Grid rank 0 prints prompt and generates the n x n matrix row by row
 * (rand() % 10), sending each row segment to the process that owns it;
 * every other process receives the rows of its own block. */
void Read_matrix(char* prompt, LOCAL_MATRIX_T* local_A,
GRID_INFO_T* grid, int n);
/* Grid rank 0 prints title and then the whole n x n matrix to stdout,
 * receiving row segments from the other processes. */
void Print_matrix(char* title, LOCAL_MATRIX_T* local_A,
GRID_INFO_T* grid, int n);
/* Like Print_matrix, but without a title and written to the global
 * stream fp. */
void write_matrix(LOCAL_MATRIX_T* local_A,
GRID_INFO_T* grid, int n);
/* Set every entry of the block to 0.0. */
void Set_to_zero(LOCAL_MATRIX_T* local_A);
/* Accumulate the block product: local_C += local_A * local_B. */
void Local_matrix_multiply(LOCAL_MATRIX_T* local_A,
LOCAL_MATRIX_T* local_B, LOCAL_MATRIX_T* local_C);
/* Build and commit local_matrix_mpi_t from the layout of local_A. */
void Build_matrix_type(LOCAL_MATRIX_T* local_A);
/* MPI datatype covering n_bar plus the n_bar*n_bar used entries of a block. */
MPI_Datatype local_matrix_mpi_t;
/* Receive buffer used by Print_local_matrices on grid rank 0. */
LOCAL_MATRIX_T* temp_mat;
/* Grid rank 0 prints its own block and then the block of every other
 * process, each preceded by the owner's rank and grid coordinates. */
void Print_local_matrices(char* title, LOCAL_MATRIX_T* local_A,
GRID_INFO_T* grid);
完整代码如下:
/* fox.c - uses Fox's algorithm to multiply two square matrices
*
* Created by Wangcan
*/
#include "math.h"
#include "stdio.h"
#include "stdlib.h"
#include "time.h"
#include "mpi.h"
/*
 * Description of the process grid and of this process's position in it.
 * All fields are filled in by Setup_grid().
 */
typedef struct {
int p; /* Total number of processes (size of MPI_COMM_WORLD) */
MPI_Comm comm; /* Cartesian communicator for the entire grid */
MPI_Comm row_comm; /* Sub-communicator containing my grid row */
MPI_Comm col_comm; /* Sub-communicator containing my grid column */
int q; /* Order of the grid, computed as sqrt(p) */
int my_row; /* My row coordinate in the grid */
int my_col; /* My column coordinate in the grid */
int my_rank; /* My rank in the grid communicator comm */
} GRID_INFO_T;
/* Capacity, in floats, of one local block: blocks up to 256 x 256 fit. */
#define MAX 65536

/*
 * One square block (sub-matrix) of a distributed matrix, stored row-major
 * in a fixed-capacity array.
 *
 *   Order(A)      - lvalue: the order of the block pointed to by A.
 *   Entry(A,i,j)  - lvalue: element (i, j) of the block pointed to by A,
 *                   located at entries[n_bar*i + j].
 *
 * Both macros take a POINTER to the block.  Entry() evaluates A twice, so do
 * not pass an expression with side effects.
 */
typedef struct {
    int n_bar;              /* order of this block */
#define Order(A) ((A)->n_bar)
    float entries[MAX];     /* only the first n_bar*n_bar slots are used */
#define Entry(A,i,j) (*(((A)->entries) + ((A)->n_bar)*(i) + (j)))
} LOCAL_MATRIX_T;
/* Output stream shared by main (open/close) and write_matrix (writes). */
FILE* fp;
/* Function Declarations */
/* Allocate one LOCAL_MATRIX_T on the heap. */
LOCAL_MATRIX_T* Local_matrix_allocate(int n_bar);
/* Release the block that *local_A points to. */
void Free_local_matrix(LOCAL_MATRIX_T** local_A);
/* Grid rank 0 prints prompt and generates the n x n matrix row by row
 * (rand() % 10), sending each row segment to the process that owns it;
 * every other process receives the rows of its own block. */
void Read_matrix(char* prompt, LOCAL_MATRIX_T* local_A,
GRID_INFO_T* grid, int n);
/* Grid rank 0 prints title and then the whole n x n matrix to stdout,
 * receiving row segments from the other processes. */
void Print_matrix(char* title, LOCAL_MATRIX_T* local_A,
GRID_INFO_T* grid, int n);
/* Like Print_matrix, but without a title and written to fp. */
void write_matrix(LOCAL_MATRIX_T* local_A,
GRID_INFO_T* grid, int n);
/* Set every entry of the block to 0.0. */
void Set_to_zero(LOCAL_MATRIX_T* local_A);
/* Accumulate the block product: local_C += local_A * local_B. */
void Local_matrix_multiply(LOCAL_MATRIX_T* local_A,
LOCAL_MATRIX_T* local_B, LOCAL_MATRIX_T* local_C);
/* Build and commit local_matrix_mpi_t from the layout of local_A. */
void Build_matrix_type(LOCAL_MATRIX_T* local_A);
/* MPI datatype covering n_bar plus the n_bar*n_bar used entries of a block. */
MPI_Datatype local_matrix_mpi_t;
/* Receive buffer used by Print_local_matrices on grid rank 0. */
LOCAL_MATRIX_T* temp_mat;
/* Grid rank 0 prints its own block and then the block of every other
 * process, each preceded by the owner's rank and grid coordinates. */
void Print_local_matrices(char* title, LOCAL_MATRIX_T* local_A,
GRID_INFO_T* grid);
/***********************************************/
main(int argc, char* argv[]) {
int p;
int my_rank;
GRID_INFO_T grid;
LOCAL_MATRIX_T* local_A;
LOCAL_MATRIX_T* local_B;
LOCAL_MATRIX_T* local_C;
int n;
int n_bar;
double start,finish,s_t,f_t;
void Setup_grid(GRID_INFO_T* grid);
void Fox(int n, GRID_INFO_T* grid, LOCAL_MATRIX_T* local_A,
LOCAL_MATRIX_T* local_B, LOCAL_MATRIX_T* local_C);
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
Setup_grid(&grid);
if (my_rank == 0) {
printf("What's the order of the matrices?\n");
scanf("%d", &n);
}
MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);
n_bar = n/grid.q;
srand((unsigned) time(NULL));
local_A = Local_matrix_allocate(n_bar);
Order(local_A) = n_bar;
Read_matrix("Enter A", local_A, &grid, n);
// Print_matrix(“We read A =”, local_A, &grid, n);
fp = fopen(“/home/guest/16011011/commonpart/question3/dataInA.txt”,”w”);
write_matrix(local_A,&grid,n);
fclose(fp);
local_B = Local_matrix_allocate(n_bar);
Order(local_B) = n_bar;
Read_matrix("Enter B", local_B, &grid, n);
// Print_matrix(“We read B =”, local_B, &grid, n);
fp = fopen(“/home/guest/16011011/commonpart/question3/dataInB.txt”,”w”);
write_matrix(local_B,&grid,n);
fclose(fp);
Build_matrix_type(local_A);
temp_mat = Local_matrix_allocate(n_bar);
local_C = Local_matrix_allocate(n_bar);
Order(local_C) = n_bar;
start=MPI_Wtime();
Fox(n, &grid, local_A, local_B, local_C);
finish=MPI_Wtime();
MPI_Reduce(&start,&s_t,1,MPI_DOUBLE,MPI_MIN,0,MPI_COMM_WORLD);
MPI_Reduce(&finish,&f_t,1,MPI_DOUBLE,MPI_MIN,0,MPI_COMM_WORLD);
fp = fopen("/home/guest/SC16011011/commonpart/question3/dataOutC.txt","w");
write_matrix(local_C,&grid,n);
fclose(fp);
if(my_rank==0)
printf("time= %f\n",f_t-s_t);
// Print_matrix(“The product is”, local_C, &grid, n);
Free_local_matrix(&local_A);
Free_local_matrix(&local_B);
Free_local_matrix(&local_C);
MPI_Finalize();
} /* main */
/***********************************************/
/*
 * Build the q x q Cartesian process grid and fill in *grid.
 *
 * grid (out): receives the process count p, the grid order q, the grid
 *             communicator, this process's rank and (row, column) coordinates
 *             in it, and one sub-communicator for its row and its column.
 *
 * The number of processes must be a perfect square.  If it is not, the job is
 * aborted with a message: MPI_Cart_create would otherwise hand MPI_COMM_NULL
 * to the surplus processes and the calls below would fail on them.
 */
void Setup_grid(
        GRID_INFO_T* grid /* out */) {
    int old_rank;
    int dimensions[2];
    int wrap_around[2];
    int coordinates[2];
    int free_coords[2];

    /* Set up Global Grid Information */
    MPI_Comm_size(MPI_COMM_WORLD, &(grid->p));
    MPI_Comm_rank(MPI_COMM_WORLD, &old_rank);

    /* Round before truncating so a slightly-low sqrt() cannot lose 1. */
    grid->q = (int) (sqrt((double) grid->p) + 0.5);
    if (grid->q * grid->q != grid->p) {
        if (old_rank == 0)
            fprintf(stderr,
                "Setup_grid: the number of processes (%d) must be a "
                "perfect square\n", grid->p);
        MPI_Abort(MPI_COMM_WORLD, 1);
    }
    dimensions[0] = dimensions[1] = grid->q;

    /* Both dimensions are made periodic. */
    wrap_around[0] = wrap_around[1] = 1;
    /* reorder = 1: ranks in grid->comm may differ from MPI_COMM_WORLD ranks */
    MPI_Cart_create(MPI_COMM_WORLD, 2, dimensions,
        wrap_around, 1, &(grid->comm));
    MPI_Comm_rank(grid->comm, &(grid->my_rank));
    MPI_Cart_coords(grid->comm, grid->my_rank, 2,
        coordinates);
    grid->my_row = coordinates[0];
    grid->my_col = coordinates[1];

    /* Set up row communicators: keep dimension 1 (the column index varies) */
    free_coords[0] = 0;
    free_coords[1] = 1;
    MPI_Cart_sub(grid->comm, free_coords,
        &(grid->row_comm));

    /* Set up column communicators: keep dimension 0 (the row index varies) */
    free_coords[0] = 1;
    free_coords[1] = 0;
    MPI_Cart_sub(grid->comm, free_coords,
        &(grid->col_comm));
} /* Setup_grid */
/***********************************************/
/*
 * Multiply two distributed n x n matrices with Fox's algorithm.
 *
 * n       (in):  order of the global matrices.
 * grid    (in):  process grid; its row and column communicators are used.
 * local_A (in):  this process's block of A.
 * local_B (in):  this process's block of B.  It is shifted through the grid
 *                column in place, once per stage.
 * local_C (out): this process's block of the product; zeroed first.
 *
 * Each of the q stages broadcasts one block of A along every grid row,
 * accumulates its product with the current block of B into local_C, and then
 * passes the B blocks one step along the grid column.
 *
 * Build_matrix_type() must have committed local_matrix_mpi_t beforehand.
 */
void Fox(
        int n /* in */,
        GRID_INFO_T* grid /* in */,
        LOCAL_MATRIX_T* local_A /* in */,
        LOCAL_MATRIX_T* local_B /* in */,
        LOCAL_MATRIX_T* local_C /* out */) {
    LOCAL_MATRIX_T* temp_A;      /* Receives the block of A broadcast by  */
                                 /* another process in my row             */
    LOCAL_MATRIX_T* bcast_block; /* Block of A used in the current stage  */
    int stage;
    int bcast_root;
    int n_bar; /* n/sqrt(p) */
    int source;
    int dest;
    MPI_Status status;

    n_bar = n/grid->q;
    Set_to_zero(local_C);

    /* Calculate addresses for circular shift of B */
    source = (grid->my_row + 1) % grid->q;
    dest = (grid->my_row + grid->q - 1) % grid->q;

    /* Set aside storage for the broadcast block of A */
    temp_A = Local_matrix_allocate(n_bar);
    if (temp_A == NULL) {
        fprintf(stderr, "Fox: cannot allocate the broadcast buffer\n");
        MPI_Abort(grid->comm, 1);
    }

    for (stage = 0; stage < grid->q; stage++) {
        bcast_root = (grid->my_row + stage) % grid->q;
        /* The root sends its own block of A; everyone else receives into
         * temp_A so that their own block of A stays intact. */
        bcast_block = (bcast_root == grid->my_col) ? local_A : temp_A;
        MPI_Bcast(bcast_block, 1, local_matrix_mpi_t,
            bcast_root, grid->row_comm);
        Local_matrix_multiply(bcast_block, local_B,
            local_C);
        MPI_Sendrecv_replace(local_B, 1, local_matrix_mpi_t,
            dest, 0, source, 0, grid->col_comm, &status);
    } /* for */

    /* The buffer was leaked on every call before. */
    Free_local_matrix(&temp_A);
} /* Fox */
/***********************************************/
LOCAL_MATRIX_T* Local_matrix_allocate(int local_order) {
LOCAL_MATRIX_T* temp;
temp = (LOCAL_MATRIX_T*) malloc(sizeof(LOCAL_MATRIX_T));
return temp;
} /* Local_matrix_allocate */
/***********************************************/
/*
 * Release a block obtained from Local_matrix_allocate().
 *
 * local_A_ptr (in/out): address of the caller's pointer to the block.  The
 *     block is freed and the caller's pointer is reset to NULL, so a repeated
 *     call or a later accidental use cannot touch freed memory.  Passing NULL,
 *     or the address of a NULL pointer, is a no-op.
 */
void Free_local_matrix(
        LOCAL_MATRIX_T** local_A_ptr /* in/out */) {
    if (local_A_ptr == NULL)
        return;
    free(*local_A_ptr);     /* free(NULL) is defined to do nothing */
    *local_A_ptr = NULL;
} /* Free_local_matrix */
/***********************************************/
/*
 * Generate and distribute a matrix.
 *
 * Grid rank 0 prints the prompt and then walks the n global rows; within a
 * row it visits the grid columns left to right and produces one segment of
 * Order(local_A) values (rand() % 10) per grid column.  A segment that
 * belongs to rank 0 is stored straight into local_A; any other segment is
 * sent to its owner.  Every other process simply receives the
 * Order(local_A) rows of its block, in order.
 *
 * Order(local_A) must already be set on every process.
 */
void Read_matrix(
        char* prompt /* in */,
        LOCAL_MATRIX_T* local_A /* out */,
        GRID_INFO_T* grid /* in */,
        int n /* in */) {
    const int order = Order(local_A);
    int global_row, block_col, k;
    int owner;
    int cart_pos[2];
    float* segment;
    float* fill;
    MPI_Status recv_status;

    /* Receivers: one message per local row, then done. */
    if (grid->my_rank != 0) {
        for (global_row = 0; global_row < order; global_row++)
            MPI_Recv(&Entry(local_A, global_row, 0), order,
                MPI_FLOAT, 0, 0, grid->comm, &recv_status);
        return;
    }

    /* Producer (grid rank 0). */
    segment = (float*) malloc(order*sizeof(float));
    printf("%s\n", prompt);
    fflush(stdout);

    for (global_row = 0; global_row < n; global_row++) {
        cart_pos[0] = global_row/order;
        for (block_col = 0; block_col < grid->q; block_col++) {
            cart_pos[1] = block_col;
            MPI_Cart_rank(grid->comm, cart_pos, &owner);

            /* Rank 0's own rows go directly into its block; here the
             * global row index is also the local row index. */
            if (owner == 0)
                fill = (local_A->entries) + global_row*order;
            else
                fill = segment;

            for (k = 0; k < order; k++)
                fill[k] = rand()%10;

            if (owner != 0)
                MPI_Send(segment, order, MPI_FLOAT, owner, 0,
                    grid->comm);
        }
    }
    free(segment);
} /* Read_matrix */
/***********************************************/
/*
 * Print a distributed n x n matrix on stdout.
 *
 * Grid rank 0 prints the title and then one text line per global row.  For
 * each grid column of that row it takes the row segment either from its own
 * block or from a message sent by the owning process, and prints the values
 * with the format "%4.1f ".  Every other process sends the rows of its block
 * to rank 0, one message per row.
 */
void Print_matrix(
        char* title /* in */,
        LOCAL_MATRIX_T* local_A /* in */,
        GRID_INFO_T* grid /* in */,
        int n /* in */) {
    int global_row, block_col, k;
    int owner;
    int cart_pos[2];
    float* row_buf;
    MPI_Status recv_status;

    /* Senders: ship every local row to rank 0 and return. */
    if (grid->my_rank != 0) {
        for (global_row = 0; global_row < Order(local_A); global_row++)
            MPI_Send(&Entry(local_A, global_row, 0), Order(local_A),
                MPI_FLOAT, 0, 0, grid->comm);
        return;
    }

    /* Printer (grid rank 0). */
    row_buf = (float*) malloc(Order(local_A)*sizeof(float));
    printf("%s\n", title);
    for (global_row = 0; global_row < n; global_row++) {
        cart_pos[0] = global_row/Order(local_A);
        for (block_col = 0; block_col < grid->q; block_col++) {
            const float* segment;

            cart_pos[1] = block_col;
            MPI_Cart_rank(grid->comm, cart_pos, &owner);
            if (owner != 0) {
                MPI_Recv(row_buf, Order(local_A), MPI_FLOAT, owner, 0,
                    grid->comm, &recv_status);
                segment = row_buf;
            } else {
                segment = &Entry(local_A, global_row, 0);
            }
            for (k = 0; k < Order(local_A); k++)
                printf("%4.1f ", segment[k]);
        }
        printf("\n");
    }
    free(row_buf);
} /* Print_matrix */
/***********************************************/
void write_matrix(
LOCAL_MATRIX_T* local_A,/in/
GRID_INFO_T* grid, /in/
int n /in/){
int mat_row, mat_col;
int grid_row, grid_col;
int source;
int coords[2];
float* temp;
MPI_Status status;
if (grid->my_rank == 0) {
temp = (float*) malloc(Order(local_A)*sizeof(float));
for (mat_row = 0; mat_row < n; mat_row++) {
grid_row = mat_row/Order(local_A);
coords[0] = grid_row;
for (grid_col = 0; grid_col < grid->q; grid_col++) {
coords[1] = grid_col;
MPI_Cart_rank(grid->comm, coords, &source);
if (source == 0) {
for(mat_col = 0; mat_col < Order(local_A); mat_col++)
fprintf(fp,”%4.1f “, Entry(local_A, mat_row, mat_col));
} else {
MPI_Recv(temp, Order(local_A), MPI_FLOAT, source, 0,
grid->comm, &status);
for(mat_col = 0; mat_col < Order(local_A); mat_col++)
fprintf(fp,”%4.1f “, temp[mat_col]);
}
}
fprintf(fp,”\n”);
}
free(temp);
} else {
for (mat_row = 0; mat_row < Order(local_A); mat_row++)
MPI_Send(&Entry(local_A, mat_row, 0), Order(local_A),
MPI_FLOAT, 0, 0, grid->comm);
}
}/*write_matrix*/
/***********************************************/
/*
 * Clear a block: every one of its Order x Order entries becomes 0.0.
 * The order itself is left untouched.
 */
void Set_to_zero(
        LOCAL_MATRIX_T* local_A /* out */) {
    const int order = Order(local_A);
    int row, col;
    float* cell;

    for (row = 0; row < order; row++) {
        cell = &Entry(local_A, row, 0);     /* start of this row */
        for (col = 0; col < order; col++)
            *cell++ = 0.0;
    }
} /* Set_to_zero */
/***********************************************/
/*
 * Create and commit the global MPI datatype local_matrix_mpi_t, which lets a
 * whole block travel in a single message.
 *
 * local_A (in): a block whose order is already set.  Its n_bar fixes how many
 *     floats of entries[] the datatype covers (n_bar * n_bar), and its member
 *     addresses give the displacements, which are relative to the start of
 *     the struct.
 *
 * The datatype describes two pieces: the int n_bar and the first
 * n_bar * n_bar floats of entries[].
 *
 * Uses MPI_Get_address / MPI_Type_create_struct; their MPI-1 predecessors
 * MPI_Address / MPI_Type_struct were removed from the standard in MPI-3.0.
 */
void Build_matrix_type(
        LOCAL_MATRIX_T* local_A /* in */) {
    MPI_Datatype temp_mpi_t;
    int block_lengths[2];
    MPI_Aint displacements[2];
    MPI_Datatype typelist[2];
    MPI_Aint start_address;
    MPI_Aint address;

    /* The used part of entries[] as one contiguous run of floats */
    MPI_Type_contiguous(Order(local_A)*Order(local_A),
        MPI_FLOAT, &temp_mpi_t);

    block_lengths[0] = block_lengths[1] = 1;
    typelist[0] = MPI_INT;
    typelist[1] = temp_mpi_t;

    MPI_Get_address(local_A, &start_address);
    MPI_Get_address(&(local_A->n_bar), &address);
    displacements[0] = address - start_address;
    MPI_Get_address(local_A->entries, &address);
    displacements[1] = address - start_address;

    MPI_Type_create_struct(2, block_lengths, displacements,
        typelist, &local_matrix_mpi_t);
    MPI_Type_commit(&local_matrix_mpi_t);

    /* Types built from temp_mpi_t are not affected by freeing it. */
    MPI_Type_free(&temp_mpi_t);
} /* Build_matrix_type */
/***********************************************/
/*
 * Accumulate a block product: local_C += local_A * local_B.
 *
 * local_C is NOT cleared here; the caller decides what it starts from.
 * Rows and columns of the result run over Order(local_A), the summation
 * index over Order(local_B).
 */
void Local_matrix_multiply(
        LOCAL_MATRIX_T* local_A /* in */,
        LOCAL_MATRIX_T* local_B /* in */,
        LOCAL_MATRIX_T* local_C /* in/out */) {
    const int outer = Order(local_A);
    const int inner = Order(local_B);
    int row, col, term;

    for (row = 0; row < outer; row++) {
        for (col = 0; col < outer; col++) {
            for (term = 0; term < inner; term++) {
                Entry(local_C, row, col) +=
                    Entry(local_A, row, term)*Entry(local_B, term, col);
            }
        }
    }
} /* Local_matrix_multiply */
/***********************************************/
/* Print every row of one block on stdout, one text line per row. */
static void Print_block_rows(LOCAL_MATRIX_T* block) {
    int row, col;

    for (row = 0; row < Order(block); row++) {
        for (col = 0; col < Order(block); col++)
            printf("%4.1f ", Entry(block, row, col));
        printf("\n");
    }
} /* Print_block_rows */

/*
 * Print the block held by each process, one process after another.
 *
 * Grid rank 0 prints the title, then its own block, then - for ranks
 * 1 .. p-1 in that order - the block received from that rank.  Each block is
 * preceded by a line naming the owner's rank and grid coordinates.  Every
 * other process sends its block to rank 0 as one local_matrix_mpi_t message.
 *
 * The global temp_mat is the receive buffer, so it must be allocated and
 * local_matrix_mpi_t must be committed before the call.
 */
void Print_local_matrices(
        char* title /* in */,
        LOCAL_MATRIX_T* local_A /* in */,
        GRID_INFO_T* grid /* in */) {
    int sender_pos[2];
    int sender;
    MPI_Status recv_status;

    /* Everyone but rank 0 only contributes its block. */
    if (grid->my_rank != 0) {
        MPI_Send(local_A, 1, local_matrix_mpi_t, 0, 0, grid->comm);
        return;
    }

    printf("%s\n", title);
    printf("Process %d > grid_row = %d, grid_col = %d\n",
        grid->my_rank, grid->my_row, grid->my_col);
    Print_block_rows(local_A);

    for (sender = 1; sender < grid->p; sender++) {
        MPI_Recv(temp_mat, 1, local_matrix_mpi_t, sender, 0,
            grid->comm, &recv_status);
        MPI_Cart_coords(grid->comm, sender, 2, sender_pos);
        printf("Process %d > grid_row = %d, grid_col = %d\n",
            sender, sender_pos[0], sender_pos[1]);
        Print_block_rows(temp_mat);
    }
    fflush(stdout);
} /* Print_local_matrices */
*
参考代码