/** * Copyright 1993-2015 NVIDIA Corporation. All rights reserved. * * Please refer to the NVIDIA end user license agreement (EULA) associated * with this source code for terms and conditions that govern your use of * this software. Any use, reproduction, disclosure, or distribution of * this software and related documentation outside the terms of the EULA * is strictly prohibited. * */ /** * * This sample is a simple code that illustrates basic usage of * cooperative groups within the thread block. The code launches a single * thread block, creates a cooperative group of all threads in the block, * and a set of tiled partition cooperative groups. For each, it uses a * generic reduction function to calculate the sum of all the ranks in * that group. In each case the result is printed, together with the * expected answer (which is calculated using the analytical formula * (n-1)*n)/2, noting that the ranks start at zero). * */ #include #include using namespace cooperative_groups; /** * CUDA device function * * calculates the sum of val across the group g. The workspace array, x, * must be large enough to contain g.size() integers. */ __device__ int sumReduction(thread_group g, int *x, int val) { // rank of this thread in the group int lane = g.thread_rank(); // for each iteration of this loop, the number of threads active in the // reduction, i, is halved, and each active thread (with index [lane]) // performs a single summation of it's own value with that // of a "partner" (with index [lane+i]). for (int i = g.size()/2; i > 0; i /= 2) { // store value for this thread in temporary array x[lane] = val; // synchronize all threads in group g.sync(); if(lane tiledPartition16 = tiled_partition<16>(threadBlockGroup); // This offset allows each group to have its own unique area in the workspace array int workspaceOffset=threadBlockGroup.thread_rank()-tiledPartition16.thread_rank(); // input to reduction, for each thread, is its' rank in the group input=tiledPartition16.thread_rank(); // expected output from analytical formula (n-1)(n)/2 // (noting that indexing starts at 0 rather than 1) expectedOutput=15*16/2; // Perform reduction output=sumReduction(tiledPartition16, workspace+workspaceOffset, input); // each master thread prints out result if(tiledPartition16.thread_rank()==0) printf(" Sum of all ranks 0..15 in this tiledPartition16 group is %d (expected %d)\n",output,expectedOutput); return; } /** * Host main routine */ int main(){ // Error code to check return values for CUDA calls cudaError_t err; //Launch the kernel int blocksPerGrid=1; int threadsPerBlock=64; printf("\nLaunching a single block with %d threads...\n\n",threadsPerBlock); // we use the optional third argument to specify the size // of shared memory required in the kernel cgkernel <<>> (); err = cudaDeviceSynchronize(); if (err != cudaSuccess) { fprintf(stderr, "Failed to launch kernel (error code %s)!\n", cudaGetErrorString(err)); exit(EXIT_FAILURE); } printf("\n...Done.\n\n"); return 0; }