/*
 * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

/*
 * This sample implements a simple task consumer using threads and streams
 * with all data in Unified Memory, and tasks consumed by both host and device.
 */

// system includes
#include <cstdio>
#include <ctime>
#include <vector>
#include <algorithm>
#ifdef USE_PTHREADS
#include <pthread.h>
#else
#include <omp.h>
#endif
#include <stdlib.h>

// cuBLAS
#include <cublas_v2.h>

// utilities
#include <helper_cuda.h>

#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
// srand48 and drand48 don't exist on Windows; these are equivalent functions
void srand48(long seed) { srand((unsigned int)seed); }
double drand48() { return double(rand()) / RAND_MAX; }
#endif

const char *sSDKname = "UnifiedMemoryStreams";

// simple task
template <typename T>
struct Task {
  unsigned int size, id;
  T *data;
  T *result;
  T *vector;

  Task() : size(0), id(0), data(NULL), result(NULL), vector(NULL) {}

  Task(unsigned int s) : size(s), id(0), data(NULL), result(NULL), vector(NULL) {
    // allocate unified memory -- the operation performed in this example will be a DGEMV
    checkCudaErrors(cudaMallocManaged(&data, sizeof(T) * size * size));
    checkCudaErrors(cudaMallocManaged(&result, sizeof(T) * size));
    checkCudaErrors(cudaMallocManaged(&vector, sizeof(T) * size));
    checkCudaErrors(cudaDeviceSynchronize());
  }

  ~Task() {
    // ensure all memory is deallocated before freeing
    checkCudaErrors(cudaDeviceSynchronize());
    checkCudaErrors(cudaFree(data));
    checkCudaErrors(cudaFree(result));
    checkCudaErrors(cudaFree(vector));
  }

  void allocate(const unsigned int s, const unsigned int unique_id) {
    // allocate unified memory outside of constructor
    id = unique_id;
    size = s;
    checkCudaErrors(cudaMallocManaged(&data, sizeof(T) * size * size));
    checkCudaErrors(cudaMallocManaged(&result, sizeof(T) * size));
    checkCudaErrors(cudaMallocManaged(&vector, sizeof(T) * size));
    checkCudaErrors(cudaDeviceSynchronize());

    // populate data with random elements
    for (unsigned int i = 0; i < size * size; i++) {
      data[i] = drand48();
    }

    for (unsigned int i = 0; i < size; i++) {
      result[i] = 0.;
      vector[i] = drand48();
    }
  }
};

#ifdef USE_PTHREADS
struct threadData_t {
  int tid;
  Task<double> *TaskListPtr;
  cudaStream_t *streams;
  cublasHandle_t *handles;
  int taskSize;
};

typedef struct threadData_t threadData;
#endif

// simple host dgemv: assume data is in row-major format and square
template <typename T>
void gemv(int m, int n, T alpha, T *A, T *x, T beta, T *result) {
  // rows
  for (int i = 0; i < n; i++) {
    result[i] *= beta;

    // columns
    for (int j = 0; j < n; j++) {
      result[i] += A[i * n + j] * x[j];
    }
  }
}

// execute a single task on either host or device, depending on size
#ifdef USE_PTHREADS
void *execute(void *inpArgs) {
  threadData *dataPtr = (threadData *)inpArgs;
  cudaStream_t *stream = dataPtr->streams;
  cublasHandle_t *handle = dataPtr->handles;
  int tid = dataPtr->tid;

  for (int i = 0; i < dataPtr->taskSize; i++) {
    Task<double> &t = dataPtr->TaskListPtr[i];

    if (t.size < 100) {
      // perform on host
      printf("Task [%d], thread [%d] executing on host (%d)\n", t.id, tid, t.size);

      // attach managed memory to a (dummy) stream to allow host access while the device is running
      checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost));
      checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost));
      checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost));
      // necessary to ensure the async cudaStreamAttachMemAsync calls have finished
      checkCudaErrors(cudaStreamSynchronize(stream[0]));
      // call the host operation
      gemv(t.size, t.size, 1.0, t.data, t.vector, 0.0, t.result);
    } else {
      // perform on device
      printf("Task [%d], thread [%d] executing on device (%d)\n", t.id, tid, t.size);
      double one = 1.0;
      double zero = 0.0;

      // attach managed memory to my stream
      checkCudaErrors(cublasSetStream(handle[tid + 1], stream[tid + 1]));
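      // NOTE: cudaMemAttachSingle scopes each allocation to this thread's
      // stream, so other threads (and the host) can keep touching their own
      // managed allocations while this stream's work runs, without requiring
      // a device-wide synchronization.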
      checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.data, 0, cudaMemAttachSingle));
      checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.vector, 0, cudaMemAttachSingle));
      checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.result, 0, cudaMemAttachSingle));
      // call the device operation
      checkCudaErrors(cublasDgemv(handle[tid + 1], CUBLAS_OP_N, t.size, t.size, &one, t.data,
                                  t.size, t.vector, 1, &zero, t.result, 1));
    }
  }

  pthread_exit(NULL);
}
#else
template <typename T>
void execute(Task<T> &t, cublasHandle_t *handle, cudaStream_t *stream, int tid) {
  if (t.size < 100) {
    // perform on host
    printf("Task [%d], thread [%d] executing on host (%d)\n", t.id, tid, t.size);

    // attach managed memory to a (dummy) stream to allow host access while the device is running
    checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost));
    checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost));
    checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost));
    // necessary to ensure the async cudaStreamAttachMemAsync calls have finished
    checkCudaErrors(cudaStreamSynchronize(stream[0]));
    // call the host operation
    gemv(t.size, t.size, 1.0, t.data, t.vector, 0.0, t.result);
  } else {
    // perform on device
    printf("Task [%d], thread [%d] executing on device (%d)\n", t.id, tid, t.size);
    double one = 1.0;
    double zero = 0.0;

    // attach managed memory to my stream (see the note on cudaMemAttachSingle above)
    checkCudaErrors(cublasSetStream(handle[tid + 1], stream[tid + 1]));
    checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.data, 0, cudaMemAttachSingle));
    checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.vector, 0, cudaMemAttachSingle));
    checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.result, 0, cudaMemAttachSingle));
    // call the device operation
    checkCudaErrors(cublasDgemv(handle[tid + 1], CUBLAS_OP_N, t.size, t.size, &one, t.data,
                                t.size, t.vector, 1, &zero, t.result, 1));
  }
}
#endif

// populate a list of tasks with random sizes
template <typename T>
void initialise_tasks(std::vector<Task<T> > &TaskList) {
  for (unsigned int i = 0; i < TaskList.size(); i++) {
    // generate random size
    int size;
    size = std::max((int)(drand48() * 1000.0), 64);
    TaskList[i].allocate(size, i);
  }
}

int main(int argc, char **argv) {
  // set device
  cudaDeviceProp device_prop;
  int dev_id = findCudaDevice(argc, (const char **)argv);
  checkCudaErrors(cudaGetDeviceProperties(&device_prop, dev_id));

  if (!device_prop.managedMemory) {
    // this sample requires being run on a device that supports Unified Memory
    fprintf(stderr, "Unified Memory not supported on this device\n");
    exit(EXIT_WAIVED);
  }

  if (device_prop.computeMode == cudaComputeModeProhibited) {
    // this sample requires being run with a default or process exclusive mode
    fprintf(stderr, "This sample requires a device in either default or process exclusive mode\n");
    exit(EXIT_WAIVED);
  }

  // randomise task sizes
  int seed = (int)time(NULL);
  srand48(seed);

  // set number of threads
  const int nthreads = 4;

  // number of streams = number of threads + 1 (stream 0 is the dummy host-attach stream)
  cudaStream_t *streams = new cudaStream_t[nthreads + 1];
  cublasHandle_t *handles = new cublasHandle_t[nthreads + 1];

  for (int i = 0; i < nthreads + 1; i++) {
    checkCudaErrors(cudaStreamCreate(&streams[i]));
    checkCudaErrors(cublasCreate(&handles[i]));
  }

  // create list of N tasks
  unsigned int N = 40;
  std::vector<Task<double> > TaskList(N);
  initialise_tasks(TaskList);

  printf("Executing tasks on host / device\n");

  // run through all tasks using threads and streams
#ifdef USE_PTHREADS
  pthread_t threads[nthreads];
  threadData *InputToThreads = new threadData[nthreads];

  for (int i = 0; i < nthreads; i++) {
    checkCudaErrors(cudaSetDevice(dev_id));
    InputToThreads[i].tid = i;
    InputToThreads[i].streams = streams;
    InputToThreads[i].handles = handles;

    if ((TaskList.size() % nthreads) == 0) {
      // tasks divide evenly: every thread gets the same share
      InputToThreads[i].taskSize = (TaskList.size() / nthreads);
      InputToThreads[i].TaskListPtr = &TaskList[i * (TaskList.size() / nthreads)];
    } else {
      if (i == nthreads - 1) {
        // the last thread also picks up the remainder; its chunk still starts
        // at i * (size / nthreads), so no element is skipped or read twice
        InputToThreads[i].taskSize = (TaskList.size() / nthreads) + (TaskList.size() % nthreads);
        InputToThreads[i].TaskListPtr = &TaskList[i * (TaskList.size() / nthreads)];
      } else {
        InputToThreads[i].taskSize = (TaskList.size() / nthreads);
        InputToThreads[i].TaskListPtr = &TaskList[i * (TaskList.size() / nthreads)];
      }
    }

    pthread_create(&threads[i], NULL, &execute, &InputToThreads[i]);
  }
  for (int i = 0; i < nthreads; i++) {
    pthread_join(threads[i], NULL);
  }
#else
  omp_set_num_threads(nthreads);
  #pragma omp parallel for schedule(dynamic)
  for (int i = 0; i < (int)TaskList.size(); i++) {
    checkCudaErrors(cudaSetDevice(dev_id));
    int tid = omp_get_thread_num();
    execute(TaskList[i], handles, streams, tid);
  }
#endif

  cudaDeviceSynchronize();

  // destroy CUDA streams and cuBLAS handles
  for (int i = 0; i < nthreads + 1; i++) {
    cudaStreamDestroy(streams[i]);
    cublasDestroy(handles[i]);
  }

  // free the task list (the Task destructors release the managed allocations)
  std::vector<Task<double> >().swap(TaskList);

  printf("All Done!\n");
  exit(EXIT_SUCCESS);
}
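/*
 * Build note (not part of the original sample; paths and flags are assumptions):
 * this file normally builds through the CUDA samples makefile, which supplies
 * helper_cuda.h. A standalone build would look roughly like:
 *
 *   nvcc -I<samples>/common/inc -Xcompiler -fopenmp UnifiedMemoryStreams.cu \
 *        -lcublas -o UnifiedMemoryStreams
 *
 * Define -DUSE_PTHREADS (and drop -fopenmp) to use the pthread path instead.
 */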