/*
 * Notes:
 *   1. The number of processes, p, must be a perfect square.
 *   2. The order of the matrices must be evenly divisible by sqrt(p).
 */
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "mpi.h"

#define DEBUG       1
#define SYNCHRONIZE 1

typedef struct {
    int      p;         /* Total number of processes        */
    MPI_Comm comm;      /* Communicator for entire grid     */
    MPI_Comm row_comm;  /* Communicator for my row          */
    MPI_Comm col_comm;  /* Communicator for my col          */
    int      q;         /* Order of grid                    */
    int      my_row;    /* My row number                    */
    int      my_col;    /* My column number                 */
    int      my_rank;   /* My rank in the grid communicator */
} GRID_INFO_TYPE;

/*
 * A local block of a matrix.  The entries are stored inside the struct
 * (rather than behind a separately malloc'ed pointer) so that the
 * displacement of mat from the start of a LOCAL_MATRIX_TYPE is the same
 * for every block on every process; Build_derived_type below relies on
 * that when it describes the whole struct with one derived datatype.
 */
#define MAX_ORDER 64            /* largest supported block order */

typedef struct {
    int   order;                     /* order of this block, n/q */
    float mat[MAX_ORDER*MAX_ORDER];  /* row-major block entries  */
} LOCAL_MATRIX_TYPE;

/* Macros to access members of LOCAL_MATRIX_TYPE */
#define Entry(A,i,j)  (*( ((A)->mat) + (i)*((A)->order) + (j) ))
#define Order(A)      ((A)->order)
#define Matrix(A)     ((A)->mat)

MPI_Datatype DERIVED_LOCAL_MATRIX;

void Setup_grid(GRID_INFO_TYPE* grid) {
    int old_rank;
    int dimensions[2];
    int periods[2];
    int coordinates[2];
    int varying_coords[2];

    /* Set up global grid information */
    MPI_Comm_size(MPI_COMM_WORLD, &(grid->p));
    MPI_Comm_rank(MPI_COMM_WORLD, &old_rank);
    grid->q = (int) sqrt((double) grid->p);
    dimensions[0] = dimensions[1] = grid->q;
    periods[0] = periods[1] = 1;
    MPI_Cart_create(MPI_COMM_WORLD, 2, dimensions, periods, 1, &(grid->comm));
    MPI_Comm_rank(grid->comm, &(grid->my_rank));
    MPI_Cart_coords(grid->comm, grid->my_rank, 2, coordinates);
    grid->my_row = coordinates[0];
    grid->my_col = coordinates[1];

    /* Set up row and column communicators */
    varying_coords[0] = 0;
    varying_coords[1] = 1;
    MPI_Cart_sub(grid->comm, varying_coords, &(grid->row_comm));
    varying_coords[0] = 1;
    varying_coords[1] = 0;
    MPI_Cart_sub(grid->comm, varying_coords, &(grid->col_comm));
} /* Setup_grid */

LOCAL_MATRIX_TYPE* Local_matrix_allocate(int size) {
    LOCAL_MATRIX_TYPE* temp;

    temp = (LOCAL_MATRIX_TYPE*) malloc(sizeof(LOCAL_MATRIX_TYPE));
    temp->order = size;    /* caller must ensure size <= MAX_ORDER */
    return temp;
} /* Local_matrix_allocate */

void Set_to_zero(LOCAL_MATRIX_TYPE* X) {
    int i;

    for (i = 0; i < Order(X)*Order(X); i++)
        X->mat[i] = 0.0;
} /* Set_to_zero */

void Local_matrix_multiply(LOCAL_MATRIX_TYPE* A, LOCAL_MATRIX_TYPE* B,
                           LOCAL_MATRIX_TYPE* C) {
    int i, j, k;

    for (i = 0; i < Order(A); i++)
        for (j = 0; j < Order(A); j++)
            for (k = 0; k < Order(A); k++)
                Entry(C,i,j) += Entry(A,i,k)*Entry(B,k,j);
} /* Local_matrix_multiply */

/*
 * Describe a LOCAL_MATRIX_TYPE with a single derived datatype: one int
 * (the order) followed, at the displacement of mat within the struct, by
 * Order(X)*Order(X) contiguous floats.
 */
void Build_derived_type(LOCAL_MATRIX_TYPE* X) {
    int          block_lengths[2];
    MPI_Aint     addresses[3];
    MPI_Aint     displacements[2];
    MPI_Datatype typelist[2];
    MPI_Datatype temp_type;

    MPI_Type_contiguous(Order(X)*Order(X), MPI_FLOAT, &temp_type);
    MPI_Type_commit(&temp_type);

    typelist[0] = MPI_INT;
    typelist[1] = temp_type;
    block_lengths[0] = 1;
    block_lengths[1] = 1;

    MPI_Get_address(X, &addresses[0]);
    MPI_Get_address(&(X->order), &addresses[1]);
    MPI_Get_address(&(X->mat), &addresses[2]);
    displacements[0] = addresses[1] - addresses[0];
    displacements[1] = addresses[2] - addresses[0];

    MPI_Type_create_struct(2, block_lengths, displacements, typelist,
                           &DERIVED_LOCAL_MATRIX);
    MPI_Type_commit(&DERIVED_LOCAL_MATRIX);
} /* Build_derived_type */
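/*
 * Torus() below carries out Fox's algorithm (broadcast, multiply, roll) on
 * the q x q process grid.  In step 'step', the process in column
 * (my_row + step) mod q broadcasts its block of A along its row; every
 * process multiplies the received A block by the B block it currently
 * holds and adds the result into its C block; then every column circularly
 * shifts its B blocks up one row.
 *
 * Worked schedule for q = 2 (a sketch; any perfect-square p behaves the
 * same way), writing A(r,c) and B(r,c) for the blocks initially owned by
 * the process in grid row r, column c:
 *
 *   step 0: process (i,j) computes C(i,j) += A(i, i)       * B(i, j)
 *   step 1: process (i,j) computes C(i,j) += A(i, (i+1)%2) * B((i+1)%2, j)
 *
 * After the q steps each process holds its block of C = A * B.
 */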
void Torus(int my_rank, int n, GRID_INFO_TYPE* grid,
           LOCAL_MATRIX_TYPE* local_A, LOCAL_MATRIX_TYPE* local_B,
           LOCAL_MATRIX_TYPE* local_C) {
    LOCAL_MATRIX_TYPE* temp_A;
    LOCAL_MATRIX_TYPE* TEMP;
    int step;
    int bcast_root;
    int n_bar;             /* order of block submatrix = n/q */
    int source;
    int dest;
    int tag = 43;
    int retval;
    MPI_Status status;
    int order;

    if (DEBUG) fprintf(stderr, "Clone %d zeroing C\n", my_rank);
    if (SYNCHRONIZE) MPI_Barrier(MPI_COMM_WORLD);

    n_bar = n/grid->q;
    order = Order(local_A);
    Set_to_zero(local_C);

    /* Ranks within col_comm for the circular shift of B: receive from the
       process one row below, send to the process one row above. */
    source = (grid->my_row + 1) % grid->q;
    dest   = (grid->my_row + grid->q - 1) % grid->q;
    if (DEBUG) fprintf(stderr, "Clone %d's source = %d dest = %d\n",
                       my_rank, source, dest);

    if (DEBUG) fprintf(stderr, "Clone %d allocating temp_A\n", my_rank);
    if (SYNCHRONIZE) MPI_Barrier(MPI_COMM_WORLD);

    /* Set aside storage for the broadcast block of A */
    temp_A = Local_matrix_allocate(n_bar);

    if (DEBUG) fprintf(stderr, "Clone %d commencing loop\n", my_rank);
    if (SYNCHRONIZE) MPI_Barrier(MPI_COMM_WORLD);

    for (step = 0; step < grid->q; step++) {
        if (DEBUG) fprintf(stderr, "Clone %d commencing loop iteration %d\n",
                           my_rank, step);
        if (SYNCHRONIZE) MPI_Barrier(MPI_COMM_WORLD);

        /* In this step the process in column (my_row + step) mod q owns the
           A block used by its whole row.  The root broadcasts out of
           local_A; everyone else receives into temp_A. */
        bcast_root = (grid->my_row + step) % grid->q;
        TEMP = ((bcast_root == grid->my_col) ? local_A : temp_A);

        if (DEBUG) fprintf(stderr,
                "Clone %d in loop iteration %d broadcasting A\n",
                my_rank, step);
        if (SYNCHRONIZE) MPI_Barrier(MPI_COMM_WORLD);

        MPI_Bcast(TEMP, 1, DERIVED_LOCAL_MATRIX, bcast_root, grid->row_comm);

        if (DEBUG) fprintf(stderr,
                "Clone %d in loop iteration %d multiplying\n", my_rank, step);
        if (SYNCHRONIZE) MPI_Barrier(MPI_COMM_WORLD);

        Local_matrix_multiply(TEMP, local_B, local_C);

        if (DEBUG) fprintf(stderr, "Clone %d passing B up\n", my_rank);
        if (SYNCHRONIZE) MPI_Barrier(MPI_COMM_WORLD);

        /* Circular shift of B up one process row.  Every process sends
           before it receives, so this relies on the MPI implementation
           buffering the (small) blocks; MPI_Sendrecv_replace would make
           the shift safe for blocks of any size. */
        retval = MPI_Send(local_B, 1, DERIVED_LOCAL_MATRIX, dest, tag,
                          grid->col_comm);

        if (DEBUG) fprintf(stderr, "Clone %d B send returned %d\n",
                           my_rank, retval);
        if (SYNCHRONIZE) MPI_Barrier(MPI_COMM_WORLD);

        if (DEBUG) fprintf(stderr, "Clone %d receiving B from below\n",
                           my_rank);
        if (SYNCHRONIZE) MPI_Barrier(MPI_COMM_WORLD);

        retval = MPI_Recv(local_B, 1, DERIVED_LOCAL_MATRIX, source, tag,
                          grid->col_comm, &status);

        if (DEBUG) fprintf(stderr, "Clone %d receive of B returned %d\n",
                           my_rank, retval);
        if (SYNCHRONIZE) MPI_Barrier(MPI_COMM_WORLD);
    } /* for */
} /* Torus */

void Initialize_A(GRID_INFO_TYPE* grid, LOCAL_MATRIX_TYPE* local_A) {
    int i, j;
    int order;

    /* Global entry A[gi][gj] = gi + gj, where gi = order*my_row + i and
       gj = order*my_col + j are the global row and column indices. */
    order = Order(local_A);
    for (i = 0; i < Order(local_A); i++)
        for (j = 0; j < Order(local_A); j++)
            Entry(local_A,i,j) = order*grid->my_row + order*grid->my_col + i + j;
} /* Initialize_A */

void Initialize_B(LOCAL_MATRIX_TYPE* local_B) {
    int i, j;

    for (i = 0; i < Order(local_B); i++)
        for (j = 0; j < Order(local_B); j++)
            Entry(local_B,i,j) = 1.0;
} /* Initialize_B */
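/*
 * Optional debugging aid, included only as a sketch: the name
 * Print_local_block is an assumption and nothing else in this file calls
 * it.  It dumps one local block to stderr in the same "Clone %d" style as
 * the other DEBUG messages, which is handy for checking the block
 * distribution produced by Initialize_A and the circular shifts of B.
 */
void Print_local_block(const char* label, int my_rank, LOCAL_MATRIX_TYPE* X) {
    int i, j;

    fprintf(stderr, "Clone %d %s (order %d):\n", my_rank, label, Order(X));
    for (i = 0; i < Order(X); i++) {
        for (j = 0; j < Order(X); j++)
            fprintf(stderr, " %6.1f", Entry(X,i,j));
        fprintf(stderr, "\n");
    }
} /* Print_local_block */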
int main(int argc, char** argv) {
    int my_rank;
    int p;
    int n;
    int n_bar;
    LOCAL_MATRIX_TYPE* local_A;
    LOCAL_MATRIX_TYPE* local_B;
    LOCAL_MATRIX_TYPE* local_C;
    GRID_INFO_TYPE grid_struct;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &p);

    if (DEBUG) fprintf(stderr, "Clone %d setting up the grid\n", my_rank);
    Setup_grid(&grid_struct);

    n = 8;                    /* order of the full matrices          */
    n_bar = n/grid_struct.q;  /* order of each process's local block */

    if (DEBUG) fprintf(stderr, "Clone %d allocating matrices\n", my_rank);
    if (SYNCHRONIZE) MPI_Barrier(MPI_COMM_WORLD);
    local_A = Local_matrix_allocate(n_bar);
    local_B = Local_matrix_allocate(n_bar);
    local_C = Local_matrix_allocate(n_bar);

    if (DEBUG) fprintf(stderr, "Clone %d initializing A\n", my_rank);
    if (SYNCHRONIZE) MPI_Barrier(MPI_COMM_WORLD);
    Initialize_A(&grid_struct, local_A);

    if (DEBUG) fprintf(stderr, "Clone %d initializing B\n", my_rank);
    if (SYNCHRONIZE) MPI_Barrier(MPI_COMM_WORLD);
    Initialize_B(local_B);

    if (DEBUG) fprintf(stderr, "Clone %d building derived type\n", my_rank);
    if (SYNCHRONIZE) MPI_Barrier(MPI_COMM_WORLD);
    Build_derived_type(local_A);

    if (DEBUG) fprintf(stderr, "Clone %d executing torus\n", my_rank);
    if (SYNCHRONIZE) MPI_Barrier(MPI_COMM_WORLD);
    Torus(my_rank, n, &grid_struct, local_A, local_B, local_C);

    MPI_Finalize();
    return 0;
} /* main */
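/*
 * Build and run sketch (assumes the usual mpicc/mpirun wrappers and a
 * source file name of fox_torus.c, both of which are guesses):
 *
 *     mpicc -o fox_torus fox_torus.c -lm
 *     mpirun -np 4 ./fox_torus
 *
 * p = 4 satisfies both notes at the top of the file: 4 is a perfect
 * square (q = 2) and the hard-coded order n = 8 is divisible by q.
 */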