/*
 * Notes:
 *   1. The number of processes, p, must be a perfect square.
 *   2. The order of the matrices must be evenly divisible by sqrt(p).
 */
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "mpi.h"

#define DEBUG       1
#define SYNCHRONIZE 1

typedef struct {
    int      p;         /* Total number of processes        */
    MPI_Comm comm;      /* Communicator for entire grid     */
    MPI_Comm row_comm;  /* Communicator for my row          */
    MPI_Comm col_comm;  /* Communicator for my col          */
    int      q;         /* Order of grid                    */
    int      my_row;    /* My row number                    */
    int      my_col;    /* My column number                 */
    int      my_rank;   /* My rank in the grid communicator */
} GRID_INFO_TYPE;

/*
 * A local block of a matrix.  The entries are stored inside the struct
 * (rather than behind a separately malloc'ed pointer) so that the
 * displacement of mat from the start of a LOCAL_MATRIX_TYPE is the same
 * for every block on every process; Build_derived_type below relies on
 * that when it describes the whole struct with one derived datatype.
 */
#define MAX_ORDER 64            /* largest supported block order */

typedef struct {
    int   order;                     /* order of this block, n/q */
    float mat[MAX_ORDER*MAX_ORDER];  /* row-major block entries  */
} LOCAL_MATRIX_TYPE;

/* Macros to access members of LOCAL_MATRIX_TYPE */
#define Entry(A,i,j)  (*( ((A)->mat) + (i)*((A)->order) + (j) ))
#define Order(A)      ((A)->order)
#define Matrix(A)     ((A)->mat)

MPI_Datatype DERIVED_LOCAL_MATRIX;

void Setup_grid(GRID_INFO_TYPE* grid) {
    int old_rank;
    int dimensions[2];
    int periods[2];
    int coordinates[2];
    int varying_coords[2];

    /* Set up global grid information */
    MPI_Comm_size(MPI_COMM_WORLD, &(grid->p));
    MPI_Comm_rank(MPI_COMM_WORLD, &old_rank);
    grid->q = (int) sqrt((double) grid->p);
    dimensions[0] = dimensions[1] = grid->q;
    periods[0] = periods[1] = 1;
    MPI_Cart_create(MPI_COMM_WORLD, 2, dimensions, periods, 1, &(grid->comm));
    MPI_Comm_rank(grid->comm, &(grid->my_rank));
    MPI_Cart_coords(grid->comm, grid->my_rank, 2, coordinates);
    grid->my_row = coordinates[0];
    grid->my_col = coordinates[1];

    /* Set up row and column communicators */
    varying_coords[0] = 0;
    varying_coords[1] = 1;
    MPI_Cart_sub(grid->comm, varying_coords, &(grid->row_comm));
    varying_coords[0] = 1;
    varying_coords[1] = 0;
    MPI_Cart_sub(grid->comm, varying_coords, &(grid->col_comm));
} /* Setup_grid */

LOCAL_MATRIX_TYPE* Local_matrix_allocate(int size) {
    LOCAL_MATRIX_TYPE* temp;

    temp = (LOCAL_MATRIX_TYPE*) malloc(sizeof(LOCAL_MATRIX_TYPE));
    temp->order = size;    /* caller must ensure size <= MAX_ORDER */
    return temp;
} /* Local_matrix_allocate */

void Set_to_zero(LOCAL_MATRIX_TYPE* X) {
    int i;

    for (i = 0; i < Order(X)*Order(X); i++)
        X->mat[i] = 0.0;
} /* Set_to_zero */

void Local_matrix_multiply(LOCAL_MATRIX_TYPE* A, LOCAL_MATRIX_TYPE* B,
                           LOCAL_MATRIX_TYPE* C) {
    int i, j, k;

    for (i = 0; i < Order(A); i++)
        for (j = 0; j < Order(A); j++)
            for (k = 0; k < Order(A); k++)
                Entry(C,i,j) += Entry(A,i,k)*Entry(B,k,j);
} /* Local_matrix_multiply */

/*
 * Describe a LOCAL_MATRIX_TYPE with a single derived datatype: one int
 * (the order) followed, at the displacement of mat within the struct, by
 * Order(X)*Order(X) contiguous floats.
 */
void Build_derived_type(LOCAL_MATRIX_TYPE* X) {
    int          block_lengths[2];
    MPI_Aint     addresses[3];
    MPI_Aint     displacements[2];
    MPI_Datatype typelist[2];
    MPI_Datatype temp_type;

    MPI_Type_contiguous(Order(X)*Order(X), MPI_FLOAT, &temp_type);
    MPI_Type_commit(&temp_type);

    typelist[0] = MPI_INT;
    typelist[1] = temp_type;
    block_lengths[0] = 1;
    block_lengths[1] = 1;

    MPI_Get_address(X, &addresses[0]);
    MPI_Get_address(&(X->order), &addresses[1]);
    MPI_Get_address(&(X->mat), &addresses[2]);
    displacements[0] = addresses[1] - addresses[0];
    displacements[1] = addresses[2] - addresses[0];

    MPI_Type_create_struct(2, block_lengths, displacements, typelist,
                           &DERIVED_LOCAL_MATRIX);
    MPI_Type_commit(&DERIVED_LOCAL_MATRIX);
} /* Build_derived_type */
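/*
 * Torus() below carries out Fox's algorithm (broadcast, multiply, roll) on
 * the q x q process grid.  In step 'step', the process in column
 * (my_row + step) mod q broadcasts its block of A along its row; every
 * process multiplies the received A block by the B block it currently
 * holds and adds the result into its C block; then every column circularly
 * shifts its B blocks up one row.
 *
 * Worked schedule for q = 2 (a sketch; any perfect-square p behaves the
 * same way), writing A(r,c) and B(r,c) for the blocks initially owned by
 * the process in grid row r, column c:
 *
 *   step 0: process (i,j) computes C(i,j) += A(i, i)       * B(i, j)
 *   step 1: process (i,j) computes C(i,j) += A(i, (i+1)%2) * B((i+1)%2, j)
 *
 * After the q steps each process holds its block of C = A * B.
 */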
void Torus(int my_rank, int n, GRID_INFO_TYPE* grid,
           LOCAL_MATRIX_TYPE* local_A, LOCAL_MATRIX_TYPE* local_B,
           LOCAL_MATRIX_TYPE* local_C) {
    LOCAL_MATRIX_TYPE* temp_A;
    LOCAL_MATRIX_TYPE* TEMP;
    int step;
    int bcast_root;
    int n_bar;             /* order of block submatrix = n/q */
    int source;
    int dest;
    int tag = 43;
    int retval;
    MPI_Status status;
    int order;

    if (DEBUG) fprintf(stderr, "Clone %d zeroing C\n", my_rank);
    if (SYNCHRONIZE) MPI_Barrier(MPI_COMM_WORLD);

    n_bar = n/grid->q;
    order = Order(local_A);
    Set_to_zero(local_C);

    /* Ranks within col_comm for the circular shift of B: receive from the
       process one row below, send to the process one row above. */
    source = (grid->my_row + 1) % grid->q;
    dest   = (grid->my_row + grid->q - 1) % grid->q;
    if (DEBUG) fprintf(stderr, "Clone %d's source = %d dest = %d\n",
                       my_rank, source, dest);

    if (DEBUG) fprintf(stderr, "Clone %d allocating temp_A\n", my_rank);
    if (SYNCHRONIZE) MPI_Barrier(MPI_COMM_WORLD);

    /* Set aside storage for the broadcast block of A */
    temp_A = Local_matrix_allocate(n_bar);

    if (DEBUG) fprintf(stderr, "Clone %d commencing loop\n", my_rank);
    if (SYNCHRONIZE) MPI_Barrier(MPI_COMM_WORLD);

    for (step = 0; step < grid->q; step++) {
        if (DEBUG) fprintf(stderr, "Clone %d commencing loop iteration %d\n",
                           my_rank, step);
        if (SYNCHRONIZE) MPI_Barrier(MPI_COMM_WORLD);

        /* In this step the process in column (my_row + step) mod q owns the
           A block used by its whole row.  The root broadcasts out of
           local_A; everyone else receives into temp_A. */
        bcast_root = (grid->my_row + step) % grid->q;
        TEMP = ((bcast_root == grid->my_col) ? local_A : temp_A);

        if (DEBUG) fprintf(stderr,
                "Clone %d in loop iteration %d broadcasting A\n",
                my_rank, step);
        if (SYNCHRONIZE) MPI_Barrier(MPI_COMM_WORLD);

        MPI_Bcast(TEMP, 1, DERIVED_LOCAL_MATRIX, bcast_root, grid->row_comm);

        if (DEBUG) fprintf(stderr,
                "Clone %d in loop iteration %d multiplying\n", my_rank, step);
        if (SYNCHRONIZE) MPI_Barrier(MPI_COMM_WORLD);

        Local_matrix_multiply(TEMP, local_B, local_C);

        if (DEBUG) fprintf(stderr, "Clone %d passing B up\n", my_rank);
        if (SYNCHRONIZE) MPI_Barrier(MPI_COMM_WORLD);

        /* Circular shift of B up one process row.  Every process sends
           before it receives, so this relies on the MPI implementation
           buffering the (small) blocks; MPI_Sendrecv_replace would make
           the shift safe for blocks of any size. */
        retval = MPI_Send(local_B, 1, DERIVED_LOCAL_MATRIX, dest, tag,
                          grid->col_comm);

        if (DEBUG) fprintf(stderr, "Clone %d B send returned %d\n",
                           my_rank, retval);
        if (SYNCHRONIZE) MPI_Barrier(MPI_COMM_WORLD);

        if (DEBUG) fprintf(stderr, "Clone %d receiving B from below\n",
                           my_rank);
        if (SYNCHRONIZE) MPI_Barrier(MPI_COMM_WORLD);

        retval = MPI_Recv(local_B, 1, DERIVED_LOCAL_MATRIX, source, tag,
                          grid->col_comm, &status);

        if (DEBUG) fprintf(stderr, "Clone %d receive of B returned %d\n",
                           my_rank, retval);
        if (SYNCHRONIZE) MPI_Barrier(MPI_COMM_WORLD);
    } /* for */
} /* Torus */

void Initialize_A(GRID_INFO_TYPE* grid, LOCAL_MATRIX_TYPE* local_A) {
    int i, j;
    int order;

    /* Global entry A[gi][gj] = gi + gj, where gi = order*my_row + i and
       gj = order*my_col + j are the global row and column indices. */
    order = Order(local_A);
    for (i = 0; i < Order(local_A); i++)
        for (j = 0; j < Order(local_A); j++)
            Entry(local_A,i,j) = order*grid->my_row + order*grid->my_col + i + j;
} /* Initialize_A */

void Initialize_B(LOCAL_MATRIX_TYPE* local_B) {
    int i, j;

    for (i = 0; i < Order(local_B); i++)
        for (j = 0; j < Order(local_B); j++)
            Entry(local_B,i,j) = 1.0;
} /* Initialize_B */
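/*
 * Optional debugging aid, included only as a sketch: the name
 * Print_local_block is an assumption and nothing else in this file calls
 * it.  It dumps one local block to stderr in the same "Clone %d" style as
 * the other DEBUG messages, which is handy for checking the block
 * distribution produced by Initialize_A and the circular shifts of B.
 */
void Print_local_block(const char* label, int my_rank, LOCAL_MATRIX_TYPE* X) {
    int i, j;

    fprintf(stderr, "Clone %d %s (order %d):\n", my_rank, label, Order(X));
    for (i = 0; i < Order(X); i++) {
        for (j = 0; j < Order(X); j++)
            fprintf(stderr, " %6.1f", Entry(X,i,j));
        fprintf(stderr, "\n");
    }
} /* Print_local_block */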
int main(int argc, char** argv) {
    int my_rank;
    int p;
    int n;
    int n_bar;
    LOCAL_MATRIX_TYPE* local_A;
    LOCAL_MATRIX_TYPE* local_B;
    LOCAL_MATRIX_TYPE* local_C;
    GRID_INFO_TYPE grid_struct;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &p);

    if (DEBUG) fprintf(stderr, "Clone %d setting up the grid\n", my_rank);
    Setup_grid(&grid_struct);

    n = 8;                    /* order of the full matrices          */
    n_bar = n/grid_struct.q;  /* order of each process's local block */

    if (DEBUG) fprintf(stderr, "Clone %d allocating matrices\n", my_rank);
    if (SYNCHRONIZE) MPI_Barrier(MPI_COMM_WORLD);
    local_A = Local_matrix_allocate(n_bar);
    local_B = Local_matrix_allocate(n_bar);
    local_C = Local_matrix_allocate(n_bar);

    if (DEBUG) fprintf(stderr, "Clone %d initializing A\n", my_rank);
    if (SYNCHRONIZE) MPI_Barrier(MPI_COMM_WORLD);
    Initialize_A(&grid_struct, local_A);

    if (DEBUG) fprintf(stderr, "Clone %d initializing B\n", my_rank);
    if (SYNCHRONIZE) MPI_Barrier(MPI_COMM_WORLD);
    Initialize_B(local_B);

    if (DEBUG) fprintf(stderr, "Clone %d building derived type\n", my_rank);
    if (SYNCHRONIZE) MPI_Barrier(MPI_COMM_WORLD);
    Build_derived_type(local_A);

    if (DEBUG) fprintf(stderr, "Clone %d executing torus\n", my_rank);
    if (SYNCHRONIZE) MPI_Barrier(MPI_COMM_WORLD);
    Torus(my_rank, n, &grid_struct, local_A, local_B, local_C);

    MPI_Finalize();
    return 0;
} /* main */
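/*
 * Build and run sketch (assumes the usual mpicc/mpirun wrappers and a
 * source file name of fox_torus.c, both of which are guesses):
 *
 *     mpicc -o fox_torus fox_torus.c -lm
 *     mpirun -np 4 ./fox_torus
 *
 * p = 4 satisfies both notes at the top of the file: 4 is a perfect
 * square (q = 2) and the hard-coded order n = 8 is divisible by q.
 */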