FOX算法的MPI实现

算法描述如下:
将待相乘的矩阵A和B各分成p个方块Ai,j和Bi,j(0≤i,j≤√p-1),每块大小为(n/√p)×(n/√p),并将它们分配给√p×√p个处理器。开始时处理器Pi,j存放有Ai,j和Bi,j,并负责计算块Ci,j。然后Fox算法执行以下√p次迭代,即可完成:
① 选中对角块Ai,i,并将其向所在行的√p-1个处理器进行一到多播送;
② 各处理器将所收到的A阵的块和B阵原有的块进行乘加运算;
③ B阵的块向上循环1步;
④ 如果Ai,j是本次播送的块,则下次应选块Ai,(j+1) mod √p向同行的√p-1个处理器播送,然后转第②步。
如上所述,算法需要多次用到各种数组,为编程方便,定义如下结构体:
/*
 * One square block of a distributed matrix, stored row-major in a
 * fixed-capacity array.
 *
 *   Order(A)     -- lvalue: the order (rows == columns) of block A
 *   Entry(A,i,j) -- lvalue: element (i,j) of block A, i.e.
 *                   A->entries[A->n_bar * i + j]
 *
 * Both macros take a POINTER to a LOCAL_MATRIX_T.  Entry() must expand
 * to a dereference so that it can be assigned to and have its address
 * taken (the program uses both "Entry(...) = x" and "&Entry(...)").
 */
typedef struct {
    int n_bar;            /* order of this block */
#define Order(A) ((A)->n_bar)
    float entries[MAX];   /* n_bar*n_bar values, row-major; MAX is the capacity */
#define Entry(A,i,j) (*(((A)->entries) + ((A)->n_bar)*(i) + (j)))
} LOCAL_MATRIX_T;
同样,由于把矩阵划分成子矩阵进行并行相乘时会用到多个通信子(communicator),这里也定义如下结构体:
/* Per-process description of the square process grid; filled in by Setup_grid(). */
typedef struct {
int p; /* Total number of processes (size of MPI_COMM_WORLD) */
MPI_Comm comm; /* Cartesian communicator for the entire grid */
MPI_Comm row_comm; /* Communicator for my row of the grid */
MPI_Comm col_comm; /* Communicator for my column of the grid */
int q; /* Order of the grid, computed as sqrt(p) */
int my_row; /* My row coordinate in the grid */
int my_col; /* My column coordinate in the grid */
int my_rank; /* My rank in the grid communicator comm */
} GRID_INFO_T;
整个算法主要用到读取数据、划分数据、子矩阵运算、写入结果等部分,因此定义如下函数:
/* Allocate storage for one local block; returns the malloc'ed pointer. */
LOCAL_MATRIX_T* Local_matrix_allocate(int n_bar);
/* Release the block that *local_A points to. */
void Free_local_matrix(LOCAL_MATRIX_T** local_A);
/* Grid rank 0 prints prompt, generates the n x n matrix row by row and
 * sends each row segment to the process that owns it. */
void Read_matrix(char* prompt, LOCAL_MATRIX_T* local_A,
GRID_INFO_T* grid, int n);
/* Grid rank 0 collects the matrix row by row and prints it to stdout. */
void Print_matrix(char* title, LOCAL_MATRIX_T* local_A,
GRID_INFO_T* grid, int n);
/* Same gathering scheme as Print_matrix, but the output goes to a file stream. */
void write_matrix(LOCAL_MATRIX_T* local_A,
GRID_INFO_T* grid, int n);
/* Set every entry of the block to 0.0. */
void Set_to_zero(LOCAL_MATRIX_T* local_A);
/* Accumulate local_C += local_A * local_B. */
void Local_matrix_multiply(LOCAL_MATRIX_T* local_A,
LOCAL_MATRIX_T* local_B, LOCAL_MATRIX_T* local_C);
/* Build and commit local_matrix_mpi_t from the layout of local_A. */
void Build_matrix_type(LOCAL_MATRIX_T* local_A);
/* MPI datatype covering n_bar plus the used entries of a LOCAL_MATRIX_T. */
MPI_Datatype local_matrix_mpi_t;

/* Receive buffer used by Print_local_matrices on grid rank 0. */
LOCAL_MATRIX_T* temp_mat;
/* Grid rank 0 prints every process's block together with its grid coordinates. */
void Print_local_matrices(char* title, LOCAL_MATRIX_T* local_A,
GRID_INFO_T* grid);
完整代码如下:

/* fox.c -- uses Fox's algorithm to multiply two square matrices
 *
 * Created by Wangcan
 */
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include <mpi.h>

/*
 * Per-process description of the square process grid.  All fields are
 * filled in by Setup_grid().
 */
typedef struct {
int p; /* Total number of processes (size of MPI_COMM_WORLD) */
MPI_Comm comm; /* Cartesian communicator for the entire grid */
MPI_Comm row_comm; /* Communicator for my row of the grid */
MPI_Comm col_comm; /* Communicator for my column of the grid */
int q; /* Order of the grid, computed as sqrt(p) */
int my_row; /* My row coordinate in the grid */
int my_col; /* My column coordinate in the grid */
int my_rank; /* My rank in the grid communicator comm */
} GRID_INFO_T;

/* Capacity, in floats, of one local block: n_bar*n_bar must not exceed MAX. */
#define MAX 65536

/*
 * One square block of a distributed matrix, stored row-major in a
 * fixed-capacity array.
 *
 *   Order(A)     -- lvalue: the order (rows == columns) of block A
 *   Entry(A,i,j) -- lvalue: element (i,j) of block A, i.e.
 *                   A->entries[A->n_bar * i + j]
 *
 * Both macros take a POINTER to a LOCAL_MATRIX_T.  Entry() must expand
 * to a dereference so that it can be assigned to and have its address
 * taken (the program uses both "Entry(...) = x" and "&Entry(...)").
 */
typedef struct {
    int n_bar;            /* order of this block */
#define Order(A) ((A)->n_bar)
    float entries[MAX];   /* n_bar*n_bar values, row-major */
#define Entry(A,i,j) (*(((A)->entries) + ((A)->n_bar)*(i) + (j)))
} LOCAL_MATRIX_T;
/* Output stream that write_matrix prints to; main opens and closes it
 * around each write_matrix call. */
FILE* fp;
/* Function Declarations */
/* Allocate storage for one local block; returns the malloc'ed pointer. */
LOCAL_MATRIX_T* Local_matrix_allocate(int n_bar);
/* Release the block that *local_A points to. */
void Free_local_matrix(LOCAL_MATRIX_T** local_A);
/* Grid rank 0 prints prompt, generates the n x n matrix row by row and
 * sends each row segment to the process that owns it. */
void Read_matrix(char* prompt, LOCAL_MATRIX_T* local_A,
GRID_INFO_T* grid, int n);
/* Grid rank 0 collects the matrix row by row and prints it to stdout. */
void Print_matrix(char* title, LOCAL_MATRIX_T* local_A,
GRID_INFO_T* grid, int n);
/* Same gathering scheme as Print_matrix, but the output goes to fp. */
void write_matrix(LOCAL_MATRIX_T* local_A,
GRID_INFO_T* grid, int n);
/* Set every entry of the block to 0.0. */
void Set_to_zero(LOCAL_MATRIX_T* local_A);
/* Accumulate local_C += local_A * local_B. */
void Local_matrix_multiply(LOCAL_MATRIX_T* local_A,
LOCAL_MATRIX_T* local_B, LOCAL_MATRIX_T* local_C);
/* Build and commit local_matrix_mpi_t from the layout of local_A. */
void Build_matrix_type(LOCAL_MATRIX_T* local_A);
/* MPI datatype covering n_bar plus the used entries of a LOCAL_MATRIX_T. */
MPI_Datatype local_matrix_mpi_t;

/* Receive buffer used by Print_local_matrices on grid rank 0. */
LOCAL_MATRIX_T* temp_mat;
/* Grid rank 0 prints every process's block together with its grid coordinates. */
void Print_local_matrices(char* title, LOCAL_MATRIX_T* local_A,
GRID_INFO_T* grid);

/***********************************************/
main(int argc, char* argv[]) {
int p;
int my_rank;
GRID_INFO_T grid;
LOCAL_MATRIX_T* local_A;
LOCAL_MATRIX_T* local_B;
LOCAL_MATRIX_T* local_C;
int n;
int n_bar;
double start,finish,s_t,f_t;

void Setup_grid(GRID_INFO_T*  grid);
void Fox(int n, GRID_INFO_T* grid, LOCAL_MATRIX_T* local_A,
         LOCAL_MATRIX_T* local_B, LOCAL_MATRIX_T* local_C);

MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

Setup_grid(&grid);
if (my_rank == 0) {
    printf("What's the order of the matrices?\n");
    scanf("%d", &n);
}

MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);
n_bar = n/grid.q;
srand((unsigned) time(NULL));
local_A = Local_matrix_allocate(n_bar);
Order(local_A) = n_bar;
Read_matrix("Enter A", local_A, &grid, n);

// Print_matrix(“We read A =”, local_A, &grid, n);
fp = fopen(“/home/guest/16011011/commonpart/question3/dataInA.txt”,”w”);
write_matrix(local_A,&grid,n);
fclose(fp);

local_B = Local_matrix_allocate(n_bar);
Order(local_B) = n_bar;
Read_matrix("Enter B", local_B, &grid, n);

// Print_matrix(“We read B =”, local_B, &grid, n);
fp = fopen(“/home/guest/16011011/commonpart/question3/dataInB.txt”,”w”);
write_matrix(local_B,&grid,n);
fclose(fp);

Build_matrix_type(local_A);
temp_mat = Local_matrix_allocate(n_bar);

local_C = Local_matrix_allocate(n_bar);
Order(local_C) = n_bar;
start=MPI_Wtime();
Fox(n, &grid, local_A, local_B, local_C);
finish=MPI_Wtime();
MPI_Reduce(&start,&s_t,1,MPI_DOUBLE,MPI_MIN,0,MPI_COMM_WORLD);
MPI_Reduce(&finish,&f_t,1,MPI_DOUBLE,MPI_MIN,0,MPI_COMM_WORLD);
fp = fopen("/home/guest/SC16011011/commonpart/question3/dataOutC.txt","w");
write_matrix(local_C,&grid,n);
fclose(fp);
if(my_rank==0)
    printf("time= %f\n",f_t-s_t);

// Print_matrix(“The product is”, local_C, &grid, n);

Free_local_matrix(&local_A);
Free_local_matrix(&local_B);
Free_local_matrix(&local_C);

MPI_Finalize();

} /* main */

/***********************************************/
/*
 * Fill in *grid: build a q x q Cartesian communicator over
 * MPI_COMM_WORLD (q = sqrt(p)), record this process's rank and
 * coordinates in it, and derive one communicator per grid row and one
 * per grid column.
 *
 * The number of processes must be a perfect square; otherwise the run
 * is aborted, because processes left outside the grid would receive
 * MPI_COMM_NULL and fail in the calls below.
 */
void Setup_grid(
    GRID_INFO_T* grid /* out */) {
    int old_rank;
    int dimensions[2];
    int wrap_around[2];
    int coordinates[2];
    int free_coords[2];

    /* Set up Global Grid Information */
    MPI_Comm_size(MPI_COMM_WORLD, &(grid->p));
    MPI_Comm_rank(MPI_COMM_WORLD, &old_rank);

    /* Round before truncating, then verify that p really is q*q. */
    grid->q = (int) (sqrt((double) grid->p) + 0.5);
    if (grid->q * grid->q != grid->p) {
        if (old_rank == 0)
            fprintf(stderr,
                "Setup_grid: the number of processes (%d) must be a "
                "perfect square\n", grid->p);
        MPI_Abort(MPI_COMM_WORLD, 1);
    }
    dimensions[0] = dimensions[1] = grid->q;

    /* Both dimensions are created periodic.  Ranks may be reordered
     * (reorder = 1), so grid->my_rank can differ from the world rank. */
    wrap_around[0] = wrap_around[1] = 1;
    MPI_Cart_create(MPI_COMM_WORLD, 2, dimensions,
        wrap_around, 1, &(grid->comm));
    MPI_Comm_rank(grid->comm, &(grid->my_rank));
    MPI_Cart_coords(grid->comm, grid->my_rank, 2,
        coordinates);
    grid->my_row = coordinates[0];
    grid->my_col = coordinates[1];

    /* Set up row communicators: keep dimension 1 (the column index varies) */
    free_coords[0] = 0;
    free_coords[1] = 1;
    MPI_Cart_sub(grid->comm, free_coords,
        &(grid->row_comm));

    /* Set up column communicators: keep dimension 0 (the row index varies) */
    free_coords[0] = 1;
    free_coords[1] = 0;
    MPI_Cart_sub(grid->comm, free_coords,
        &(grid->col_comm));

} /* Setup_grid */

/***********************************************/
/*
 * Fox's algorithm: compute local_C = (A * B) restricted to this
 * process's block.
 *
 * In each of the q stages, the process in column (my_row + stage) mod q
 * of every grid row broadcasts its block of A along the row, every
 * process accumulates that block times its current block of B into
 * local_C, and the blocks of B are then shifted one step up their grid
 * column (sent to row my_row-1, received from row my_row+1, both mod q).
 *
 * local_B is overwritten by the shifts; after q shifts it holds its
 * original contents again.  local_matrix_mpi_t must already have been
 * committed (see Build_matrix_type).
 */
void Fox(
    int n /* in */,
    GRID_INFO_T* grid /* in */,
    LOCAL_MATRIX_T* local_A /* in */,
    LOCAL_MATRIX_T* local_B /* in */,
    LOCAL_MATRIX_T* local_C /* out */) {

    LOCAL_MATRIX_T*  temp_A;     /* Storage for the sub-    */
                                 /* matrix of A used during */
                                 /* the current stage       */
    LOCAL_MATRIX_T*  a_block;    /* block of A to multiply  */
                                 /* with in this stage      */
    int              stage;
    int              bcast_root;
    int              n_bar;      /* n/sqrt(p)               */
    int              source;
    int              dest;
    MPI_Status       status;

    n_bar = n/grid->q;
    Set_to_zero(local_C);

    /* Calculate addresses for circular shift of B */
    source = (grid->my_row + 1) % grid->q;
    dest = (grid->my_row + grid->q - 1) % grid->q;

    /* Set aside storage for the broadcast block of A */
    temp_A = Local_matrix_allocate(n_bar);
    if (temp_A == NULL) {
        fprintf(stderr, "Fox: out of memory\n");
        MPI_Abort(grid->comm, 1);
    }

    for (stage = 0; stage < grid->q; stage++) {
        bcast_root = (grid->my_row + stage) % grid->q;

        /* The root broadcasts its own block; everyone else receives
         * into the scratch block.  The received data includes n_bar. */
        a_block = (bcast_root == grid->my_col) ? local_A : temp_A;
        MPI_Bcast(a_block, 1, local_matrix_mpi_t,
            bcast_root, grid->row_comm);
        Local_matrix_multiply(a_block, local_B, local_C);

        MPI_Sendrecv_replace(local_B, 1, local_matrix_mpi_t,
            dest, 0, source, 0, grid->col_comm, &status);
    } /* for */

    /* The scratch block was leaked on every call before. */
    Free_local_matrix(&temp_A);

} /* Fox */

/***********************************************/
LOCAL_MATRIX_T* Local_matrix_allocate(int local_order) {
LOCAL_MATRIX_T* temp;

temp = (LOCAL_MATRIX_T*) malloc(sizeof(LOCAL_MATRIX_T));
return temp;

} /* Local_matrix_allocate */

/***********************************************/
/*
 * Release the block *local_A_ptr and reset the caller's pointer to NULL
 * so that it cannot be used or freed a second time by accident.
 * A NULL local_A_ptr, or a NULL *local_A_ptr, is accepted and ignored.
 */
void Free_local_matrix(
    LOCAL_MATRIX_T** local_A_ptr /* in/out */) {
    if (local_A_ptr == NULL)
        return;
    free(*local_A_ptr);
    *local_A_ptr = NULL;
} /* Free_local_matrix */

/***********************************************/
/***********************************************/
/* Generate and distribute matrix:
 *     foreach global row of the matrix,
 *         foreach grid column
 *             produce a block of n_bar floats on grid rank 0
 *             and send them to the appropriate process.
 *
 * Despite the name, nothing is read: each value is rand() % 10, so the
 * caller is expected to have seeded rand().  The prompt is still
 * printed on grid rank 0.  Order(local_A) must already hold n_bar.
 *
 * Every other process receives its n_bar rows of n_bar floats from
 * grid rank 0, in row order.
 */
void Read_matrix(
    char* prompt /* in */,
    LOCAL_MATRIX_T* local_A /* out */,
    GRID_INFO_T* grid /* in */,
    int n /* in */) {

    int        mat_row, mat_col;
    int        grid_row, grid_col;
    int        dest;
    int        coords[2];
    float*     temp;
    MPI_Status status;

    if (grid->my_rank == 0) {
        temp = malloc(Order(local_A)*sizeof *temp);
        if (temp == NULL) {
            fprintf(stderr, "Read_matrix: out of memory\n");
            MPI_Abort(grid->comm, 1);
        }
        printf("%s\n", prompt);
        fflush(stdout);
        for (mat_row = 0;  mat_row < n; mat_row++) {
            grid_row = mat_row/Order(local_A);
            coords[0] = grid_row;
            for (grid_col = 0; grid_col < grid->q; grid_col++) {
                coords[1] = grid_col;
                MPI_Cart_rank(grid->comm, coords, &dest);
                if (dest == 0) {
                    /* Own block: store directly.  dest is 0 only for
                     * rows of this process's block, so mat_row is also
                     * the local row index here. */
                    for (mat_col = 0; mat_col < Order(local_A); mat_col++) {
                        *((local_A->entries)+mat_row*Order(local_A)+mat_col)
                            = rand()%10;
                    }
                } else {
                    for (mat_col = 0; mat_col < Order(local_A); mat_col++) {
                        *(temp+mat_col) = rand()%10;
                    }
                    MPI_Send(temp, Order(local_A), MPI_FLOAT, dest, 0,
                        grid->comm);
                }
            }
        }
        free(temp);
    } else {
        for (mat_row = 0; mat_row < Order(local_A); mat_row++) {
            MPI_Recv(&Entry(local_A, mat_row, 0), Order(local_A),
                MPI_FLOAT, 0, 0, grid->comm, &status);
        }
    }

} /* Read_matrix */

/***********************************************/
/*
 * Print the distributed n x n matrix to stdout, preceded by title.
 *
 * Grid rank 0 walks the global rows; for every grid column it either
 * prints the matching row of its own block or receives that row
 * segment from the owning process and prints it.  All other processes
 * send their block to grid rank 0 one row at a time.
 */
void Print_matrix(
    char* title /* in */,
    LOCAL_MATRIX_T* local_A /* out */,
    GRID_INFO_T* grid /* in */,
    int n /* in */) {
    const int   order = Order(local_A);   /* block order, fixed for the call */
    int         row, col, block_col;
    int         owner;
    int         coords[2];
    float*      row_buf;
    MPI_Status  status;

    /* Senders: hand over the block row by row and leave. */
    if (grid->my_rank != 0) {
        for (row = 0; row < order; row++) {
            MPI_Send(&Entry(local_A, row, 0), order,
                MPI_FLOAT, 0, 0, grid->comm);
        }
        return;
    }

    /* Printer: one receive buffer, reused for every remote row segment. */
    row_buf = (float*) malloc(order*sizeof(float));
    printf("%s\n", title);
    for (row = 0; row < n; row++) {
        coords[0] = row/order;
        for (block_col = 0; block_col < grid->q; block_col++) {
            coords[1] = block_col;
            MPI_Cart_rank(grid->comm, coords, &owner);
            if (owner != 0) {
                MPI_Recv(row_buf, order, MPI_FLOAT, owner, 0,
                    grid->comm, &status);
                for (col = 0; col < order; col++) {
                    printf("%4.1f ", row_buf[col]);
                }
            } else {
                for (col = 0; col < order; col++) {
                    printf("%4.1f ", Entry(local_A, row, col));
                }
            }
        }
        printf("\n");
    }
    free(row_buf);

} /* Print_matrix */

/***********************************************/
void write_matrix(
LOCAL_MATRIX_T* local_A,/in/
GRID_INFO_T* grid, /in/
int n /in/){
int mat_row, mat_col;
int grid_row, grid_col;
int source;
int coords[2];
float* temp;
MPI_Status status;
if (grid->my_rank == 0) {
temp = (float*) malloc(Order(local_A)*sizeof(float));
for (mat_row = 0; mat_row < n; mat_row++) {
grid_row = mat_row/Order(local_A);
coords[0] = grid_row;
for (grid_col = 0; grid_col < grid->q; grid_col++) {
coords[1] = grid_col;
MPI_Cart_rank(grid->comm, coords, &source);
if (source == 0) {
for(mat_col = 0; mat_col < Order(local_A); mat_col++)
fprintf(fp,”%4.1f “, Entry(local_A, mat_row, mat_col));
} else {
MPI_Recv(temp, Order(local_A), MPI_FLOAT, source, 0,
grid->comm, &status);
for(mat_col = 0; mat_col < Order(local_A); mat_col++)
fprintf(fp,”%4.1f “, temp[mat_col]);
}
}
fprintf(fp,”\n”);
}
free(temp);
} else {
for (mat_row = 0; mat_row < Order(local_A); mat_row++)
MPI_Send(&Entry(local_A, mat_row, 0), Order(local_A),
MPI_FLOAT, 0, 0, grid->comm);
}

  }/*write_matrix*/

/***********************************************/
/*
 * Clear a local block: every entry (i,j) with 0 <= i,j < Order(local_A)
 * is set to 0.0.  Entries beyond the block's order are left untouched.
 */
void Set_to_zero(
    LOCAL_MATRIX_T* local_A /* out */) {

    int row = 0;

    while (row < Order(local_A)) {
        float* row_start = &Entry(local_A, row, 0);
        int    col = 0;

        while (col < Order(local_A)) {
            row_start[col] = 0.0;
            col++;
        }
        row++;
    }

} /* Set_to_zero */

/***********************************************/
/*
 * Build and commit the global datatype local_matrix_mpi_t, which
 * describes one LOCAL_MATRIX_T for communication: the int n_bar
 * followed by the first Order(local_A)^2 floats of entries, each at
 * its real offset inside the struct.
 *
 * Order(local_A) must already be set; the resulting type is only valid
 * for blocks of that order.  The displacements are relative to the
 * start of the struct, so the type can be used with any LOCAL_MATRIX_T*.
 *
 * Uses MPI_Get_address / MPI_Type_create_struct; the older
 * MPI_Address / MPI_Type_struct were removed in MPI-3.0.
 */
void Build_matrix_type(
    LOCAL_MATRIX_T* local_A /* in */) {
    MPI_Datatype temp_mpi_t;
    int block_lengths[2];
    MPI_Aint displacements[2];
    MPI_Datatype typelist[2];
    MPI_Aint start_address;
    MPI_Aint address;

    /* All used entries as one contiguous run of floats. */
    MPI_Type_contiguous(Order(local_A)*Order(local_A),
        MPI_FLOAT, &temp_mpi_t);

    block_lengths[0] = block_lengths[1] = 1;

    typelist[0] = MPI_INT;
    typelist[1] = temp_mpi_t;

    MPI_Get_address(local_A, &start_address);
    MPI_Get_address(&(local_A->n_bar), &address);
    displacements[0] = address - start_address;

    MPI_Get_address(local_A->entries, &address);
    displacements[1] = address - start_address;

    MPI_Type_create_struct(2, block_lengths, displacements,
        typelist, &local_matrix_mpi_t);
    MPI_Type_commit(&local_matrix_mpi_t);

    /* The struct type keeps its own reference to the building block,
     * so the intermediate handle can be released right away. */
    MPI_Type_free(&temp_mpi_t);

} /* Build_matrix_type */

/***********************************************/
/*
 * Accumulate the product of two local blocks: local_C += local_A * local_B.
 *
 * local_C is not cleared here; each product term is added to the stored
 * entry, in increasing order of the inner index.  Rows and columns run
 * up to Order(local_A), the inner index up to Order(local_B).
 */
void Local_matrix_multiply(
    LOCAL_MATRIX_T* local_A /* in */,
    LOCAL_MATRIX_T* local_B /* in */,
    LOCAL_MATRIX_T* local_C /* out */) {
    int row, col, inner;

    for (row = 0; row < Order(local_A); row++) {
        for (col = 0; col < Order(local_A); col++) {
            float* c_entry = &Entry(local_C, row, col);

            for (inner = 0; inner < Order(local_B); inner++) {
                *c_entry += Entry(local_A, row, inner)
                          * Entry(local_B, inner, col);
            }
        }
    }

} /* Local_matrix_multiply */

/***********************************************/
/* Print one block to stdout, preceded by its owner's rank and grid
 * coordinates; one block row per output line. */
static void Print_one_local_matrix(int rank, int grid_row, int grid_col,
                                   LOCAL_MATRIX_T* block) {
    int r, c;

    printf("Process %d > grid_row = %d, grid_col = %d\n",
        rank, grid_row, grid_col);
    for (r = 0; r < Order(block); r++) {
        for (c = 0; c < Order(block); c++) {
            printf("%4.1f ", Entry(block, r, c));
        }
        printf("\n");
    }
}

/*
 * Print every process's local block, block by block, on grid rank 0.
 *
 * Grid rank 0 prints title and its own block, then receives the block
 * of each grid rank 1..p-1 in turn into the global temp_mat and prints
 * it with that process's grid coordinates.  Every other process sends
 * its whole block as one local_matrix_mpi_t to grid rank 0.
 */
void Print_local_matrices(
    char* title /* in */,
    LOCAL_MATRIX_T* local_A /* in */,
    GRID_INFO_T* grid /* in */) {

    int         sender;
    int         sender_coords[2];
    MPI_Status  status;

    if (grid->my_rank != 0) {
        MPI_Send(local_A, 1, local_matrix_mpi_t, 0, 0, grid->comm);
        return;
    }

    printf("%s\n", title);
    Print_one_local_matrix(grid->my_rank, grid->my_row, grid->my_col,
        local_A);

    for (sender = 1; sender < grid->p; sender++) {
        MPI_Recv(temp_mat, 1, local_matrix_mpi_t, sender, 0,
            grid->comm, &status);
        MPI_Cart_coords(grid->comm, sender, 2, sender_coords);
        Print_one_local_matrix(sender, sender_coords[0], sender_coords[1],
            temp_mat);
    }
    fflush(stdout);

} /* Print_local_matrices */
*

参考代码

你可能感兴趣的:(算法实现)