
COSC330/530 Parallel and Distributed Computing

Lecture 16 - Advanced Communication and Performance Analysis in MPI

Dr. Mitchell Welch


Reading


Summary


Collective Communication


int MPI_Allreduce(void *sndbuf, void *rcvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm);
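A minimal usage sketch (assuming MPI has been initialised and my_rank holds this process's rank; the local_sum and global_sum names are illustrative): every process contributes a value and, unlike MPI_Reduce, the reduced result arrives on every process.

float local_sum = my_rank;  /* stand-in for a locally computed partial result */
float global_sum;
/* Sum the local_sum values from all processes; every process receives the total */
MPI_Allreduce(&local_sum, &global_sum, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD);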

Collective Communication

int MPI_Allgather(void *sndbuf,
                  int sndcnt,
                  MPI_Datatype sndtyp,
                  void *rcvbuf,
                  int rcvcnt,
                  MPI_Datatype rcvtyp,
                  MPI_Comm comm);
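A minimal usage sketch (assuming comm_sz processes, with my_rank holding this process's rank; the variable names are illustrative): each process contributes one int and every process receives the full gathered array. Note that rcvcnt is the count received from each process, not the total.

int my_val = my_rank;   /* this process's contribution */
int all_vals[comm_sz];  /* receive buffer: one slot per process */
MPI_Allgather(&my_val, 1, MPI_INT, all_vals, 1, MPI_INT, MPI_COMM_WORLD);
/* all_vals now holds {0, 1, ..., comm_sz - 1} on every process */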



Derived Datatypes

float a; //start of interval
float b; //end of interval
int n;   //number of trapezoids
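Broadcasting these from the root without a derived datatype takes three separate collective calls, one per variable (a minimal sketch, assuming process 0 has read the values):

MPI_Bcast(&a, 1, MPI_FLOAT, 0, MPI_COMM_WORLD);
MPI_Bcast(&b, 1, MPI_FLOAT, 0, MPI_COMM_WORLD);
MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);

Each broadcast carries its own communication overhead; derived datatypes let us ship all three values in a single message.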


Derived Datatypes

typedef struct {
  float a;
  float b;
  int   n;
} ParTrapData;

printf("Enter a, b, and n\n");
scanf("%f %f %d", a_ptr, b_ptr, n_ptr);
ParTrapData ptd = { *a_ptr, *b_ptr, *n_ptr };
/* ParTrapData is a C type, not an MPI_Datatype: before this broadcast can
   work, a matching MPI derived datatype (mpi_ptdatatype) must be built and
   committed, as shown on the following slides. */
MPI_Bcast(&ptd, 1, mpi_ptdatatype, 0, MPI_COMM_WORLD);

Derived Datatypes

#include "mpi.h"
int MPI_Type_create_struct(int count, 
                    int blocklens[], 
               MPI_Aint indices[], 
           MPI_Datatype old_types[], 
           MPI_Datatype *newtype )

Derived Datatypes

MPI_Datatype mpi_ptdatatype;

int count = 3;
int blocklens[3] = {1, 1, 1};   /* one element in each block */
MPI_Aint indices[3];            /* byte displacements, filled in below */
MPI_Datatype old_types[3] = {MPI_FLOAT, MPI_FLOAT, MPI_INT};

printf("Enter a, b, and n\n");
scanf("%f %f %d", a_ptr, b_ptr, n_ptr);
ParTrapData ptd = { *a_ptr, *b_ptr, *n_ptr };

Derived Datatypes

MPI_Aint addresses[count + 1];
MPI_Get_address(&ptd, &addresses[0]);      /* base address of the struct */
MPI_Get_address(&(ptd.a), &addresses[1]);
MPI_Get_address(&(ptd.b), &addresses[2]);
MPI_Get_address(&(ptd.n), &addresses[3]);
indices[0] = addresses[1] - addresses[0];  /* displacement of a */
indices[1] = addresses[2] - addresses[0];  /* displacement of b */
indices[2] = addresses[3] - addresses[0];  /* displacement of n */
MPI_Type_create_struct(count, blocklens, indices, old_types, &mpi_ptdatatype);

Derived Datatypes

#include "mpi.h"
int MPI_Type_commit(MPI_Datatype *datatype);
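A minimal sketch of committing and using the type built above (MPI_Type_free is the standard cleanup call once the type is no longer needed):

MPI_Type_commit(&mpi_ptdatatype);  /* must be called before the type is used */
MPI_Bcast(&ptd, 1, mpi_ptdatatype, 0, MPI_COMM_WORLD);
/* ... compute with ptd ... */
MPI_Type_free(&mpi_ptdatatype);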

Derived Datatypes

#define BUFF 100
char buff[BUFF];
int position = 0;

/* position advances past each packed item */
MPI_Pack(a_ptr, 1, MPI_FLOAT, buff, BUFF, &position, MPI_COMM_WORLD);
MPI_Pack(b_ptr, 1, MPI_FLOAT, buff, BUFF, &position, MPI_COMM_WORLD);
MPI_Pack(n_ptr, 1, MPI_INT, buff, BUFF, &position, MPI_COMM_WORLD);


Derived Datatypes

MPI_Bcast(buff, BUFF, MPI_PACKED, 0, MPI_COMM_WORLD);
/* On receiving processes, reset position to 0 before unpacking,
   and unpack in the same order the data was packed. */
MPI_Unpack(buff, BUFF, &position, a_ptr, 1, MPI_FLOAT, MPI_COMM_WORLD);
MPI_Unpack(buff, BUFF, &position, b_ptr, 1, MPI_FLOAT, MPI_COMM_WORLD);
MPI_Unpack(buff, BUFF, &position, n_ptr, 1, MPI_INT, MPI_COMM_WORLD);
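Putting the two slides together, a sketch of the complete flow (assuming my_rank is set and a_ptr, b_ptr and n_ptr point at valid storage on every process):

int position = 0;
if (my_rank == 0) {  /* root packs the three values */
  MPI_Pack(a_ptr, 1, MPI_FLOAT, buff, BUFF, &position, MPI_COMM_WORLD);
  MPI_Pack(b_ptr, 1, MPI_FLOAT, buff, BUFF, &position, MPI_COMM_WORLD);
  MPI_Pack(n_ptr, 1, MPI_INT, buff, BUFF, &position, MPI_COMM_WORLD);
}
MPI_Bcast(buff, BUFF, MPI_PACKED, 0, MPI_COMM_WORLD);
if (my_rank != 0) {  /* everyone else unpacks, in the same order */
  position = 0;
  MPI_Unpack(buff, BUFF, &position, a_ptr, 1, MPI_FLOAT, MPI_COMM_WORLD);
  MPI_Unpack(buff, BUFF, &position, b_ptr, 1, MPI_FLOAT, MPI_COMM_WORLD);
  MPI_Unpack(buff, BUFF, &position, n_ptr, 1, MPI_INT, MPI_COMM_WORLD);
}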



Timings in MPI

double start, finish;

start = MPI_Wtime();

/* Something time-consuming */

finish = MPI_Wtime();
printf("Proc %d > Elapsed time = %e seconds\n", my_rank, finish - start);



Timings in MPI

#include "timer.h"

double start, finish;
GET_TIME(start);

/* Something time-consuming*/

GET_TIME(finish);

finish = MPI_Wtime(void);
printf("Proc %d > Elapsed time = %e seconds\n",my_rank, finish-start);

Timings in MPI


double local_start, local_finish, local_elapsed, elapsed;
...
MPI_Barrier(comm);
local_start = MPI_Wtime();
/* Code to be timed */
...
local_finish = MPI_Wtime();
local_elapsed = local_finish - local_start;
MPI_Reduce(&local_elapsed, &elapsed, 1, MPI_DOUBLE, MPI_MAX, 0, comm);
if (my_rank == 0) {
    printf("Elapsed time = %e seconds\n", elapsed);
}



Performance Evaluation of MPI Applications

[cosc330@bourbaki examples] $ mpiexec -n 1 mat_vect_mult_time
Enter the number of rows
1024
Enter the number of columns
1024
Elapsed time = 3.496170e-03
[cosc330@bourbaki examples] $ mpiexec -n 2 mat_vect_mult_time
Enter the number of rows
1024
Enter the number of columns
1024
Elapsed time = 1.714945e-03


Performance Evaluation of MPI Applications

Run-times of the parallel matrix-vector multiplication program (times in milliseconds):

Processes   Order 1024   Order 2048   Order 4096   Order 8192   Order 16384
    1           3.43        13.21        53.62       211.94        837.77
    2           1.78         6.91        26.83       107.39        422.23
    4           1.03         3.83        16.88        66.63        233.58
    8           0.76         2.04         9.03        34.59        169.47
   16           0.71         1.71         6.33        24.70         94.06
   32           0.80         1.92         6.61        32.94         91.35

Performance Evaluation of MPI Applications

When graphed by matrix order:

[Figure: run-time versus number of processes, one curve per matrix order]


Performance Evaluation of MPI Applications

\( S = \frac{T_{serial}}{T_{parallel}}\)
\( E = \frac{S}{p} = \frac{\big(\frac{T_{serial}}{T_{parallel}}\big)}{p} = \frac{T_{serial}}{p \cdot T_{parallel}}\)

where \(E\) is the efficiency, \(S\) is the speedup, \(p\) is the number of processes, \(T_{serial}\) is the serial run-time of the program, and \(T_{parallel}\) is the parallel run-time.
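For example, taking the run-times tabulated earlier for the order 16384 matrix on 16 processes: \( S = \frac{837.77}{94.06} \approx 8.9 \) and \( E = \frac{8.9}{16} \approx 0.56 \), matching the entries in the speedup and efficiency tables below.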


Performance Evaluation of MPI Applications

Speedups relative to the single-process run-time:

Processes   Order 1024   Order 2048   Order 4096   Order 8192   Order 16384
    1           1.00         1.00         1.00         1.00         1.00
    2           1.92         1.91         1.99         1.97         1.98
    4           3.33         3.44         3.17         3.18         3.58
    8           4.51         6.47         5.93         6.12         4.94
   16           4.83         7.72         8.46         8.58         8.90
   32           4.28         6.88         8.11         6.43         9.17

Performance Evaluation of MPI Applications

Efficiencies (\(E = S/p\)):

Processes   Order 1024   Order 2048   Order 4096   Order 8192   Order 16384
    1           1.00         1.00         1.00         1.00         1.00
    2           0.96         0.96         1.00         0.99         0.99
    4           0.83         0.86         0.79         0.80         0.90
    8           0.56         0.81         0.74         0.77         0.62
   16           0.30         0.48         0.53         0.54         0.56
   32           0.13         0.22         0.25         0.20         0.29



Summary


Reading