// version 0.6 by ian a. mason // june 1 @ u.n.e // // row i located on machine bi in the area // /disks/bi/homes/courses/comp309 // in files A.*.* B.*.* and C.*.* // parse_args check that the paths are // ok relative to DATAPATH with no .i.j // extension numbers. these files should // be created by mkBlockRows #include #include #include #include #include #include #include #include #include "mmlib.h" #include "errlog.h" #define CLUSTER 1 #define VERSION "mm.0.6" // data is stored on each node with relative path // /tmp/comp309/ // with absolute names: #define DATAPATH "/homes/localhost/comp309/%s" #define DATA "/homes/localhost/comp309/%s.%d.%d" //#define DATAPATH "/homes/localhost/homes/courses/comp309/%s" //#define DATA "/homes/localhost/homes/courses/comp309/%s.%d.%d" #define ENCODING PvmDataRaw //PvmDataDefault #define MAXNTIDS 1025 #define MAXROW 100 #define ATAG 2 #define BTAG 3 #define DEBUG 0 #define PERMS 0666 #define ERROR_BUFF 100 int parse_args(int, char*[], int*, int*, int *, int []); int main(int argc, char* argv[]){ int ntask, info, mytid, mygid; int i, j, m, blksize, matrix_size; int row, col, up, down; int child[MAXNTIDS-1]; int myrow[MAXROW]; char hostname[100]; // paths // A = P[0] B = P[1] C = P[2] // char P[3][PATH_MAX]; // file descriptors // A = F[0] B = F[1] C = F[2] // int F[3]; // data // A = M[0] B = M[1] C = M[2] TMP = M[3] // int *M[4]; //note real time int start_time = time(NULL); //check everything if(parse_args(argc, argv, &m, &blksize, &ntask, F) < 0) exit(0); //enroll in pvm daemon & note id if((mytid = pvm_mytid()) < 0){ errlog("pvm_mytid failed"); return -1; } //tinkerville //too many files? pvm_setopt(PvmRoute, PvmRouteDirect); //determine my machine gethostname(hostname, 64); //join the group if((mygid = pvm_joingroup("mmult")) < 0){ errlog("pvm_joingroup failed"); pvm_exit(); return -1; } if(DEBUG) fprintf(stderr, "Im on %s and my gid is %d\n", hostname, mygid); //spawn the torus if ((mygid == 0) && (ntask > 1)) { //tailored to the cluster? if(CLUSTER){ char nodes[MAXROW][64]; for(i = 0; i < m; i++) sprintf(nodes[i], "o%d", (i%16) + 1); info = 0; for(i = 0; i < m; i++){ int ns; ns = pvm_spawn(VERSION, &argv[1], PvmTaskHost, nodes[i], ((i == 0) ? m-1 : m), &child[((i > 0) ? m*i - 1 : 0)]); //wait until the spawned tasks have joined the group while(pvm_gsize("mmult") < (m*(i+1))); info += ns; if(DEBUG){ fprintf(stderr, "spawned %d tasks for row %d on %s with tids:\n", ns, i, nodes[i]); for(j = 0; j < ((i == 0) ? m-1 : m); j++) fprintf(stderr, "\t%d\n", child[((i > 0) ? m*i - 1 : 0) + j]); } } } else { info = pvm_spawn(VERSION, &argv[1], PvmTaskDefault, (char*)0, ntask-1, child); } //check torus if(info != ntask-1){ errlog("spawn failed"); fprintf(stderr, "only spawned %d of the %d requested tasks\n", info, ntask-1); pvm_lvgroup("mmult"); pvm_exit(); return -1; } //what about the orphans? //not my bug, their bug } // wait at barrier if((info = pvm_barrier("mmult",ntask)) < 0) errlog("pvm_barrier failed"); //figure out my row for (i = 0; i < m; i++) if((myrow[i] = pvm_gettid("mmult", (mygid/m)*m + i)) < 0){ errlog("pvm_gettid failed, tid requested has gid=%d\n",(mygid/m)*m + i); } //allocate space for the blocks for(i = 0; i < 4; i++) M[i] = (int*)calloc(sizeof(int), blksize*blksize); //bail if calloc fails if ((M[0] == NULL) || (M[1] == NULL) || (M[2] == NULL) || (M[3] == NULL)) { fprintf(stderr, "%s: out of memory!\n", argv[0]); for(i = 0; i < 4; i++)free(M[i]); pvm_lvgroup("mmult"); pvm_exit(); return -1; } //determine my coordinates on the torus row = mygid/m; col = mygid % m; //make the file names for(i = 0; i < 3; i++){ sprintf(P[i], DATA, argv[i+1], row, col); //open them if((F[i] = open(P[i], ( i < 2 ? O_RDONLY : O_WRONLY)) ) == -1){ errlog("open(P[%d]) failed",i); } } //size of virtual matrix matrix_size = m * blksize; //load in the data for(j = 0; j < 2; j++){ if(get_block_row(F[j], blksize, 0, 0, 0, blksize*blksize, M[j])<0){ fprintf(stderr,"failed to load data on %s for filename %s\n",hostname,P[j]); pvm_lvgroup("mmult"); pvm_exit(); return -1; } } if(DEBUG) fprintf(stderr,"%d loaded data\n",mygid); //find out who is above me on torus if((up = pvm_gettid("mmult", ((row)? (row-1): (m-1))*m+col)) < 0){ errlog("pvm_gettid failed for up gid=%d",((row)? (row-1): (m-1))*m+col); } //find out who is below me if((down = pvm_gettid("mmult", ((row == (m-1))? col: (row+1)*m+col))) < 0){ errlog("pvm_gettid failed for down gid=%d\n", ((row == (m-1))? col: (row+1)*m+col)); } if(DEBUG) fprintf(stderr,"%d starting calculations\n", mygid); //broadcast A's along, and rotate B's up //incrementing answer as we go. for (i = 0; i < m; i++) { if (col == (row + i)%m) { if(DEBUG) fprintf(stderr,"%d beginning loop %d\n", mygid, i); if(pvm_initsend(ENCODING) < 0){ errlog("pvm_initsend for A failed"); //need to bail here! } if(DEBUG) fprintf(stderr,"%d pvm_initsend completed %d\n", mygid, i); if(pvm_pkint(M[0], blksize*blksize, 1) < 0){ errlog("pvm_pkint for A failed"); //need to bail here! } if(DEBUG) fprintf(stderr,"%d pvm_pkint completed %d\n", mygid, i); if(pvm_mcast(myrow, m, (i+1)*ATAG)<0) errlog("pvm_mcast failed"); if(DEBUG) fprintf(stderr,"%d broadcast of A completed %d\n", mygid,i); block_mult(M[2],M[0],M[1],blksize); if(DEBUG) fprintf(stderr,"%d block_mult completed %d\n", mygid,i); } else { int tid = pvm_gettid("mmult", row*m + (row +i)%m); if(tid < 0){ errlog("pvm_gettid failed for gid=%d",row*m + (row +i)%m); } if(DEBUG) fprintf(stderr,"%d receiving A %d\n", mygid,i); if(pvm_recv(tid, (i+1)*ATAG)<0) errlog("Receive of A failed"); if(DEBUG) fprintf(stderr,"%d receiving A completed %d\n", mygid,i); if(pvm_upkint(M[3], blksize*blksize, 1) < 0){ errlog("pvm_upkint failed\n"); } if(DEBUG) fprintf(stderr,"%d upkint completed %d\n", mygid,i); block_mult(M[2],M[3],M[1],blksize); if(DEBUG) fprintf(stderr,"%d block_mult completed %d\n", mygid,i); } if(pvm_initsend(ENCODING) < 0){ errlog("pvm_initsend failed\n"); } if(DEBUG) fprintf(stderr,"%d pvm_initsend completed %d\n", mygid, i); if(pvm_pkint(M[1], blksize*blksize, 1) < 0){ errlog("pvm_pkint failed\n"); } if(DEBUG) fprintf(stderr,"%d pvm_pkint completed %d\n", mygid, i); if(pvm_send(up, (i+1)*BTAG) < 0) errlog("Send failed"); if(DEBUG) fprintf(stderr,"%d passing B up %d\n", mygid, i); if(pvm_recv(down, (i+1)*BTAG) < 0) errlog("Receive failed"); if(DEBUG) fprintf(stderr, "%d receiving B completed %d\n", mygid, i); if(pvm_upkint(M[1], blksize*blksize, 1) < 0){ errlog("pvm_upkint failed\n"); } if(DEBUG) fprintf(stderr,"%d pvm_upkint completed %d\n", mygid,i); } if(DEBUG) fprintf(stderr,"%d finished\n", mygid); if((info = pvm_barrier("mmult",ntask)) < 0) errlog("pvm_barrier failed"); //look right (A == C) ? for (i = 0 ; i < blksize*blksize; i++) if (M[0][i] != M[2][i]) printf("Error a[%d] (%d) != c[%d] (%d) \n", i, M[0][i], i, M[2][i]); //close A and B for(i = 0; i < 2; i++) close(F[i]); //write out my answer set_block_row(F[2], blksize, 0, 0, 0, blksize*blksize, M[2]); //log the details printf("%s %d %d task %d managing block [%d,%d] on %s done successfully after %ld seconds.\n", VERSION, m, blksize, mytid, row, col, hostname, time(NULL) - start_time); //shut up shop for(i = 0; i < 4; i++) free(M[i]); close(F[2]); pvm_lvgroup("mmult"); pvm_exit(); return 0; } int parse_args(int argc, char *argv[], int *m, int *blksize, int *ntask, int F[]){ int i, j; char P[3][PATH_MAX]; char hostname[100]; gethostname(hostname, 64); if ((argc != 6) || ((*m = atoi(argv[4])) <= 0) || ((*blksize = atoi(argv[5])) <= 0)){ fprintf(stderr, "Usage: %s matrixA matrixB matrixC m blk\n", argv[0]); return(-1); }; if(*m > MAXROW){ fprintf(stderr, "m = %d not valid.\n", *m); return(-1); }; *ntask = (*m)*(*m); if ((*ntask < 1) || (*ntask >= MAXNTIDS)) { fprintf(stderr, "ntask in parse_args = %d not valid.\n", *ntask); pvm_exit(); return(-1); }; for(i = 0; i < 3; i++) sprintf(P[i], DATAPATH, //hostname, argv[i+1]); for(j = 0; j < 3; j++) if ((F[j] = open(P[j], ((j < 2) ? O_RDONLY : O_WRONLY))) == -1) break; if(j < 3){ fprintf(stderr, "Data not detected at fd %d : %s\n", j, P[j]); for(i = 0; i < j; i++)close(F[i]); return(-1); } for(i = 0; i < 3; i++)close(F[i]); return(0); }