// version 0.7 by ian a. mason // june 1 @ u.n.e // // row i located on machine bi in the area // /disks/bi/homes/courses/comp309 // in files A.*.* B.*.* and C.*.* // parse_args check that the paths are // ok relative to DATAPATH with no .i.j // extension numbers. these files should // be created by mkBlockRows #include #include #include #include #include #include #include #include #include #include "mmlib.h" #include "errlog.h" #define CLUSTER 1 #define CLUST_SIZE 16 #define VERSION "mm.0.7" #define WORKER_GRP "worker" // data is stored on each node with relative path // /tmp/comp309/ // with absolute names: #define DATAPATH "/homes/localhost/comp309/%s" #define DATA "/homes/localhost/comp309/%s.%d.%d" #define ENCODING PvmDataRaw //PvmDataDefault #define MAXNTIDS 1025 #define MAXROW 100 #define ATAG 2 #define BTAG 3 #define DEBUG 0 #define PERMS 0666 #define ERROR_BUFF 100 int parse_args(int, char*[], int*, int*, int *, int []); void shutdown(int, int*); void signal_h(int); int finished = 0; int main(int argc, char* argv[]){ int ntask, info, mytid, mygid; int i, j, m, blksize, matrix_size; int row, col, up, down; int child[MAXNTIDS-1]; int myrow[MAXROW]; char hostname[100]; // paths // A = P[0] B = P[1] C = P[2] // char P[3][PATH_MAX]; // file descriptors // A = F[0] B = F[1] C = F[2] // int F[3]; // data // A = M[0] B = M[1] C = M[2] TMP = M[3] // int *M[4]; //note real time int start_time = time(NULL); //check everything if(parse_args(argc, argv, &m, &blksize, &ntask, F) < 0) exit(0); //enroll in pvm daemon & note id if((mytid = pvm_mytid()) < 0){ errlog("pvm_mytid failed"); return -1; } //tinkerville //too many files? pvm_setopt(PvmRoute, PvmRouteDirect); //determine my machine gethostname(hostname, 64); //join the group //if((mygid = pvm_joingroup(WORKER_GRP)) < 0){ // errlog("pvm_joingroup failed"); // pvm_exit(); // return -1; //} //spawn the torus if ((pvm_parent() == PvmNoParent) && (ntask > 1)) { struct sigaction newact; sigset_t blockset; sigset_t emptyset; newact.sa_handler = signal_h; newact.sa_flags = 0; sigemptyset(&newact.sa_mask); sigemptyset(&emptyset); sigemptyset(&blockset); sigaddset(&blockset, SIGUSR1); sigaddset(&blockset, SIGUSR2); sigaddset(&newact.sa_mask, SIGUSR1); sigaddset(&newact.sa_mask, SIGUSR2); if(sigaction(SIGUSR1, &newact, NULL) < 0){ fprintf(stderr, "Failed to install SIGUSR1 handler!\n"); pvm_exit(); exit(0); } if(sigaction(SIGUSR2, &newact, NULL) < 0){ fprintf(stderr, "Failed to install SIGUSR2 handler!\n"); pvm_exit(); exit(0); } //tailored to the cluster? if(CLUSTER){ char nodes[MAXROW][64]; for(i = 0; i < m; i++) sprintf(nodes[i], "o%d", (i%CLUST_SIZE) + 1); info = 0; for(i = 0; i < m; i++){ int ns; ns = pvm_spawn(VERSION, &argv[1], PvmTaskHost, nodes[i], m, &child[((i > 0) ? m*i - 1 : 0)]); if(ns < m){ errlog("Failed to spawn the correct number of tasks on %s, bailing out", nodes[i]); shutdown(info,child); } //wait until the spawned tasks have joined the group while(pvm_gsize(WORKER_GRP) < (m*(i+1))); info += ns; if(DEBUG){ fprintf(stderr, "spawned %d tasks for row %d on %s with tids:\n", ns, i, nodes[i]); for(j = 0; j < m; j++) fprintf(stderr, "\t%d\n", child[((i > 0) ? m*i - 1 : 0) + j]); } } } else { info = pvm_spawn(VERSION, &argv[1], PvmTaskDefault, (char*)0, ntask-1, child); } //check torus if(info != ntask){ errlog("spawn failed, bailing out"); fprintf(stderr, "only spawned %d of the %d requested tasks\n", info, ntask); shutdown(info,child); } if(sigprocmask(SIG_BLOCK, &blockset, NULL) < 0){ shutdown(info, child); } while(!finished){ sigsuspend(&emptyset); } if(finished) shutdown(info, child); } else{ int parent_tid = pvm_parent(); if(parent_tid < 0){ errlog("pvm_parent() failed!"); pvm_exit(); } //Join the worker group if((mygid = pvm_joingroup(WORKER_GRP)) < 0){ errlog("pvm_joingroup failed"); pvm_exit(); return -1; } if(DEBUG) fprintf(stderr, "Im on %s and my gid is %d\n", hostname, mygid); // wait at barrier if((info = pvm_barrier(WORKER_GRP,ntask)) < 0){ errlog("pvm_barrier failed"); pvm_sendsig(parent_tid, SIGUSR1); } //figure out my row for (i = 0; i < m; i++) if((myrow[i] = pvm_gettid(WORKER_GRP, (mygid/m)*m + i)) < 0){ errlog("pvm_gettid failed, tid requested has gid=%d\n",(mygid/m)*m + i); pvm_sendsig(parent_tid, SIGUSR1); } //allocate space for the blocks for(i = 0; i < 4; i++) M[i] = (int*)calloc(sizeof(int), blksize*blksize); //bail if calloc fails if ((M[0] == NULL) || (M[1] == NULL) || (M[2] == NULL) || (M[3] == NULL)) { fprintf(stderr, "%s: out of memory!\n", argv[0]); for(i = 0; i < 4; i++)free(M[i]); pvm_sendsig(parent_tid, SIGUSR1); pvm_lvgroup(WORKER_GRP); pvm_exit(); return -1; } //determine my coordinates on the torus row = mygid/m; col = mygid % m; //make the file names for(i = 0; i < 3; i++){ sprintf(P[i], DATA, argv[i+1], row, col); //open them if((F[i] = open(P[i], ( i < 2 ? O_RDONLY : O_WRONLY)) ) == -1){ errlog("open(P[%d]) failed",i); pvm_sendsig(parent_tid, SIGUSR1); } } //size of virtual matrix matrix_size = m * blksize; //load in the data for(j = 0; j < 2; j++){ if(get_block_row(F[j], blksize, 0, 0, 0, blksize*blksize, M[j])<0){ fprintf(stderr,"failed to load data on %s for filename %s\n",hostname,P[j]); pvm_sendsig(parent_tid, SIGUSR1); pvm_lvgroup(WORKER_GRP); pvm_exit(); return -1; } } if(DEBUG) fprintf(stderr,"%d loaded data\n",mygid); //find out who is above me on torus if((up = pvm_gettid(WORKER_GRP, ((row)? (row-1): (m-1))*m+col)) < 0){ errlog("pvm_gettid failed for up gid=%d",((row)? (row-1): (m-1))*m+col); pvm_sendsig(parent_tid, SIGUSR1); } //find out who is below me if((down = pvm_gettid(WORKER_GRP, ((row == (m-1))? col: (row+1)*m+col))) < 0){ errlog("pvm_gettid failed for down gid=%d\n", ((row == (m-1))? col: (row+1)*m+col)); pvm_sendsig(parent_tid, SIGUSR1); } if(DEBUG) fprintf(stderr,"%d starting calculations\n", mygid); //broadcast A's along, and rotate B's up //incrementing answer as we go. for (i = 0; i < m; i++) { if (col == (row + i)%m) { if(DEBUG) fprintf(stderr,"%d beginning loop %d\n", mygid, i); if(pvm_initsend(ENCODING) < 0){ errlog("pvm_initsend for A failed"); //need to bail here! pvm_sendsig(parent_tid, SIGUSR1); } if(DEBUG) fprintf(stderr,"%d pvm_initsend completed %d\n", mygid, i); if(pvm_pkint(M[0], blksize*blksize, 1) < 0){ errlog("pvm_pkint for A failed"); //need to bail here! pvm_sendsig(parent_tid, SIGUSR1); } if(DEBUG) fprintf(stderr,"%d pvm_pkint completed %d\n", mygid, i); if(pvm_mcast(myrow, m, (i+1)*ATAG)<0){ errlog("pvm_mcast failed"); pvm_sendsig(parent_tid, SIGUSR1); } if(DEBUG) fprintf(stderr,"%d broadcast of A completed %d\n", mygid,i); block_mult(M[2],M[0],M[1],blksize); if(DEBUG) fprintf(stderr,"%d block_mult completed %d\n", mygid,i); } else { int tid = pvm_gettid(WORKER_GRP, row*m + (row +i)%m); if(tid < 0){ errlog("pvm_gettid failed for gid=%d",row*m + (row +i)%m); pvm_sendsig(parent_tid, SIGUSR1); } if(DEBUG) fprintf(stderr,"%d receiving A %d\n", mygid,i); if(pvm_recv(tid, (i+1)*ATAG)<0){ errlog("Receive of A failed"); pvm_sendsig(parent_tid, SIGUSR1); } if(DEBUG) fprintf(stderr,"%d receiving A completed %d\n", mygid,i); if(pvm_upkint(M[3], blksize*blksize, 1) < 0){ errlog("pvm_upkint failed\n"); pvm_sendsig(parent_tid, SIGUSR1); } if(DEBUG) fprintf(stderr,"%d upkint completed %d\n", mygid,i); block_mult(M[2],M[3],M[1],blksize); if(DEBUG) fprintf(stderr,"%d block_mult completed %d\n", mygid,i); } if(pvm_initsend(ENCODING) < 0){ errlog("pvm_initsend failed\n"); pvm_sendsig(parent_tid, SIGUSR1); } if(DEBUG) fprintf(stderr,"%d pvm_initsend completed %d\n", mygid, i); if(pvm_pkint(M[1], blksize*blksize, 1) < 0){ errlog("pvm_pkint failed\n"); pvm_sendsig(parent_tid, SIGUSR1); } if(DEBUG) fprintf(stderr,"%d pvm_pkint completed %d\n", mygid, i); if(pvm_send(up, (i+1)*BTAG) < 0){ errlog("Send failed"); pvm_sendsig(parent_tid, SIGUSR1); } if(DEBUG) fprintf(stderr,"%d passing B up %d\n", mygid, i); if(pvm_recv(down, (i+1)*BTAG) < 0){ errlog("Receive failed"); pvm_sendsig(parent_tid, SIGUSR1); } if(DEBUG) fprintf(stderr, "%d receiving B completed %d\n", mygid, i); if(pvm_upkint(M[1], blksize*blksize, 1) < 0){ errlog("pvm_upkint failed\n"); pvm_sendsig(parent_tid, SIGUSR1); } if(DEBUG) fprintf(stderr,"%d pvm_upkint completed %d\n", mygid,i); } if(DEBUG) fprintf(stderr,"%d finished\n", mygid); if((info = pvm_barrier(WORKER_GRP,ntask)) < 0){ errlog("pvm_barrier failed"); pvm_sendsig(parent_tid, SIGUSR1); } //look right (A == C) ? for (i = 0 ; i < blksize*blksize; i++) if (M[0][i] != M[2][i]) printf("Error a[%d] (%d) != c[%d] (%d) \n", i, M[0][i], i, M[2][i]); //close A and B for(i = 0; i < 2; i++) close(F[i]); //write out my answer set_block_row(F[2], blksize, 0, 0, 0, blksize*blksize, M[2]); //log the details printf("%s %d %d task %d managing block [%d,%d] on %s done successfully after %ld seconds.\n", VERSION, m, blksize, mytid, row, col, hostname, time(NULL) - start_time); //shut up shop for(i = 0; i < 4; i++) free(M[i]); close(F[2]); pvm_lvgroup(WORKER_GRP); pvm_sendsig(parent_tid, SIGUSR2); pvm_exit(); } return 0; } int parse_args(int argc, char *argv[], int *m, int *blksize, int *ntask, int F[]){ int i, j; char P[3][PATH_MAX]; char hostname[100]; gethostname(hostname, 64); if ((argc != 6) || ((*m = atoi(argv[4])) <= 0) || ((*blksize = atoi(argv[5])) <= 0)){ fprintf(stderr, "Usage: %s matrixA matrixB matrixC m blk\n", argv[0]); return(-1); }; if(*m > MAXROW){ fprintf(stderr, "m = %d not valid.\n", *m); return(-1); }; *ntask = (*m)*(*m); if ((*ntask < 1) || (*ntask >= MAXNTIDS)) { fprintf(stderr, "ntask in parse_args = %d not valid.\n", *ntask); pvm_exit(); return(-1); }; for(i = 0; i < 3; i++) sprintf(P[i], DATAPATH, argv[i+1]); for(j = 0; j < 3; j++) if ((F[j] = open(P[j], ((j < 2) ? O_RDONLY : O_WRONLY))) == -1) break; if(j < 3){ fprintf(stderr, "Data not detected at fd %d : %s\n", j, P[j]); for(i = 0; i < j; i++)close(F[i]); return(-1); } for(i = 0; i < 3; i++)close(F[i]); return(0); } void shutdown(int ntasks, int *tids){ int i, info; //Let the slaughter begin! for(i = 0; i < ntasks; i++){ if((info = pvm_kill(tids[i])) < 0){ errlog("Failed to kill tid %d, maybe its already dead?", tids[i]); } } pvm_exit(); exit(0); } void signal_h(int signo){ if(signo == SIGUSR1){ fprintf(stderr, "Got a SIGUSR1 from a child, preparing to shutdown\n"); finished = 1; } else if(signo == SIGUSR2){ //If all workers have finished, the group will no longer exist if(pvm_gsize(WORKER_GRP) <= 0){ fprintf(stderr, "All children have completed, shutting down\n"); finished = 1; } } }