#include <stdio.h>

#define N 128
#define threadsPerBlock 128
#define blocksPerGrid 4

struct Lock
{
  int *mutex;
    Lock (void)
  {
    int state = 0;
      cudaMalloc ((void **) &mutex, sizeof (int));

      cudaMemcpy (mutex, &state, sizeof (int), cudaMemcpyHostToDevice);
  }
   ~Lock (void)
  {
    cudaFree (mutex);
  }
  __device__ void lock (void)
  {
    while (atomicCAS (mutex, 0, 1) != 0)
      ;
  }
  __device__ void unlock (void)
  {
    atomicExch (mutex, 1);
  }
};

__global__ void
dot (Lock lock, float *a, float *b, float *c)
{
  __shared__ float cache[threadsPerBlock];
  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  int cacheIndex = threadIdx.x;
  float temp = 0;
  while (tid < N)
    {
      temp += a[tid] * b[tid];
      tid += blockDim.x * gridDim.x;
    }
  // set the cache values
  cache[cacheIndex] = temp;

  // synchronize threads in this block
  __syncthreads ();
  // for reductions, threadsPerBlock must be a power of 2
  // because of the following code
  int i = blockDim.x / 2;
  while (i != 0)
    {
      if (cacheIndex < i)
	cache[cacheIndex] += cache[cacheIndex + i];
      __syncthreads ();
      i /= 2;
    }
  if (cacheIndex == 0)
    {
      lock.lock ();
      *c += cache[0];
      lock.unlock ();
    }
}

int
main (void)
{
  float *a, *b, c = 0;
  float *dev_a, *dev_b, *dev_c;
  // allocate memory on the CPU side
  a = (float *) malloc (N * sizeof (float));
  b = (float *) malloc (N * sizeof (float));
  // allocate the memory on the GPU
  cudaMalloc ((void **) &dev_a, N * sizeof (float));
  cudaMalloc ((void **) &dev_b, N * sizeof (float));
  cudaMalloc ((void **) &dev_c, sizeof (float));
  // fill in the host memory with data
  for (int i = 0; i < N; i++)
    {
      a[i] = i;
      b[i] = i * 2;
    }
  // copy the arrays 'a' and 'b' to the GPU
  cudaMemcpy (dev_a, a, N * sizeof (float), cudaMemcpyHostToDevice);
  cudaMemcpy (dev_b, b, N * sizeof (float), cudaMemcpyHostToDevice);
  cudaMemcpy (dev_c, &c, sizeof (float), cudaMemcpyHostToDevice);
  Lock lock;
  dot <<< blocksPerGrid, threadsPerBlock >>> (lock, dev_a, dev_b, dev_c);
  // copy c back from the GPU to the CPU
  cudaMemcpy (&c, dev_c, sizeof (float), cudaMemcpyDeviceToHost);

#define sum_squares(x) (x*(x+1)*(2*x+1)/6)

  printf ("Does GPU value %.6g = %.6g?\n", c,
	  2 * sum_squares ((float) (N - 1)));
// free memory on the GPU side
  cudaFree (dev_a);
  cudaFree (dev_b);
  cudaFree (dev_c);
// free memory on the CPU side
  free (a);
  free (b);
}