#include #define N 128 #define threadsPerBlock 128 #define blocksPerGrid 4 struct Lock { int *mutex; Lock (void) { int state = 0; cudaMalloc ((void **) &mutex, sizeof (int)); cudaMemcpy (mutex, &state, sizeof (int), cudaMemcpyHostToDevice); } ~Lock (void) { cudaFree (mutex); } __device__ void lock (void) { while (atomicCAS (mutex, 0, 1) != 0) ; } __device__ void unlock (void) { atomicExch (mutex, 1); } }; __global__ void dot (Lock lock, float *a, float *b, float *c) { __shared__ float cache[threadsPerBlock]; int tid = threadIdx.x + blockIdx.x * blockDim.x; int cacheIndex = threadIdx.x; float temp = 0; while (tid < N) { temp += a[tid] * b[tid]; tid += blockDim.x * gridDim.x; } // set the cache values cache[cacheIndex] = temp; // synchronize threads in this block __syncthreads (); // for reductions, threadsPerBlock must be a power of 2 // because of the following code int i = blockDim.x / 2; while (i != 0) { if (cacheIndex < i) cache[cacheIndex] += cache[cacheIndex + i]; __syncthreads (); i /= 2; } if (cacheIndex == 0) { lock.lock (); *c += cache[0]; lock.unlock (); } } int main (void) { float *a, *b, c = 0; float *dev_a, *dev_b, *dev_c; // allocate memory on the CPU side a = (float *) malloc (N * sizeof (float)); b = (float *) malloc (N * sizeof (float)); // allocate the memory on the GPU cudaMalloc ((void **) &dev_a, N * sizeof (float)); cudaMalloc ((void **) &dev_b, N * sizeof (float)); cudaMalloc ((void **) &dev_c, sizeof (float)); // fill in the host memory with data for (int i = 0; i < N; i++) { a[i] = i; b[i] = i * 2; } // copy the arrays 'a' and 'b' to the GPU cudaMemcpy (dev_a, a, N * sizeof (float), cudaMemcpyHostToDevice); cudaMemcpy (dev_b, b, N * sizeof (float), cudaMemcpyHostToDevice); cudaMemcpy (dev_c, &c, sizeof (float), cudaMemcpyHostToDevice); Lock lock; dot <<< blocksPerGrid, threadsPerBlock >>> (lock, dev_a, dev_b, dev_c); // copy c back from the GPU to the CPU cudaMemcpy (&c, dev_c, sizeof (float), cudaMemcpyDeviceToHost); #define sum_squares(x) (x*(x+1)*(2*x+1)/6) printf ("Does GPU value %.6g = %.6g?\n", c, 2 * sum_squares ((float) (N - 1))); // free memory on the GPU side cudaFree (dev_a); cudaFree (dev_b); cudaFree (dev_c); // free memory on the CPU side free (a); free (b); }