/* * Copyright 1993-2015 NVIDIA Corporation. All rights reserved. * * Please refer to the NVIDIA end user license agreement (EULA) associated * with this source code for terms and conditions that govern your use of * this software. Any use, reproduction, disclosure, or distribution of * this software and related documentation outside the terms of the EULA * is strictly prohibited. * */ // This example shows how to use the clock function to measure the performance of // block of threads of a kernel accurately. // // Blocks are executed in parallel and out of order. Since there's no synchronization // mechanism between blocks, we measure the clock once for each block. The clock // samples are written to device memory. // This kernel computes a standard parallel reduction and evaluates the // time it takes to do that for each block. The timing results are stored // in device memory. extern "C" __global__ void timedReduction(const float *input, float *output, clock_t *timer) { // __shared__ float shared[2 * blockDim.x]; extern __shared__ float shared[]; const int tid = threadIdx.x; const int bid = blockIdx.x; if (tid == 0) timer[bid] = clock(); // Copy input. shared[tid] = input[tid]; shared[tid + blockDim.x] = input[tid + blockDim.x]; // Perform reduction to find minimum. for (int d = blockDim.x; d > 0; d /= 2) { __syncthreads(); if (tid < d) { float f0 = shared[tid]; float f1 = shared[tid + d]; if (f1 < f0) { shared[tid] = f1; } } } // Write result. if (tid == 0) output[bid] = shared[0]; __syncthreads(); if (tid == 0) timer[bid+gridDim.x] = clock(); }