>

COSC330/530 Parallel and Distributed Computing

Lecture 20 - Mutual Exclusion and Atomic Functions

Dr. Mitchell Welch


Reading


Summary


Atomic Operations and Mutual Exclusion


Atomic Operations and Mutual Exclusion

int atomicAdd(int* address, 
    int val
);

unsigned int atomicAdd(unsigned int* address,
    unsigned int val
);

unsigned long long int atomicAdd(unsigned long long int* address,
    unsigned long long int val
);

float atomicAdd(float* address, 
    float val
);


Atomic Operations and Mutual Exclusion


Atomic Operations and Mutual Exclusion

// Demonstration kernel: sums d_vect into *result WITHOUT any synchronisation.
// The read-modify-write on *result below is NOT atomic, so concurrent threads
// race and updates are lost — the final value is nondeterministic and almost
// always far below the true sum. This is intentional: it motivates atomicAdd
// in the vectSumAtomic version on the next slide.
// Launch: any 1-D grid/block shape; the stride loop covers all `size` elements.
__global__ void vectSumRace(int* d_vect,size_t size, int* result){

    // Flat global thread index over a 1-D grid of 1-D blocks.
    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;

    while(tid < size){

        // RACE: load of *result, add, and store are three separate steps;
        // another thread can interleave between them and its update is lost.
        *result+=d_vect[tid];

        // Grid-stride: jump by the total number of threads in the grid.
        tid+=blockDim.x * gridDim.x;
    }


}

Atomic Operations and Mutual Exclusion


// Correct vector sum: each contribution is folded into *result with
// atomicAdd, so the read-modify-write is indivisible and no updates are
// lost. A grid-stride loop lets any launch configuration cover all
// `size` elements. (*result is assumed zero-initialised by the host.)
__global__ void vectSumAtomic(int* d_vect, size_t size, int* result)
{
    size_t stride = blockDim.x * gridDim.x;

    for (size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
         tid < size;
         tid += stride) {
        // Hardware-serialised add: safe under concurrent updates.
        atomicAdd(result, d_vect[tid]);
    }
}

Atomic Operations and Mutual Exclusion

[bourbaki Debug] $ ./VectorSum 
4 2 3 1 4 1 2 3 5 2 

vectSumRace Result: 2

vectSumAtomic Result: 27


Atomic Operations and Mutual Exclusion

int atomicExch(int* address, int val);

unsigned int atomicExch(unsigned int* address, unsigned int val);

unsigned long long int atomicExch(unsigned long long int* address, unsigned long long int val);

float atomicExch(float* address, float val);


Atomic Operations and Mutual Exclusion

int atomicCAS(int* address, int compare, int val);

unsigned int atomicCAS(unsigned int* address, unsigned int compare, unsigned int val);

unsigned long long int atomicCAS(unsigned long long int* address, unsigned long long int compare, unsigned long long int val);


Atomic Operations and Mutual Exclusion


Atomic Operations and Mutual Exclusion

// Host-constructed spin lock usable from device code.
// The host side allocates and zero-initialises a device int (0 = unlocked,
// 1 = locked); device code acquires it by atomically swapping 0 -> 1 and
// releases it by writing 0 back.
// WARNING: spinning in lock() serialises the warp; use only for coarse,
// low-contention critical sections (lecture demonstration pattern).
struct Lock {
    int *mutex;   // device pointer to the lock word (0 = free, 1 = held)
    Lock( void ) {
        int state = 0;
        cudaMalloc( (void**)& mutex,
        sizeof(int) ) ;
        // Start in the unlocked state.
        cudaMemcpy( mutex, &state, sizeof(int),
        cudaMemcpyHostToDevice );
    }
    ~Lock( void ) {
        cudaFree( mutex );
    }
    __device__ void lock( void ) {
        // Spin until we are the thread that swaps 0 -> 1.
        // atomicCAS returns the OLD value: 0 means we acquired the lock.
        while( atomicCAS( mutex, 0, 1 ) != 0 );
    }
    __device__ void unlock( void ) {
        // Make this thread's writes inside the critical section visible
        // to the next lock holder before the lock is released.
        __threadfence();
        // FIX: release by storing 0. The original stored 1, which left the
        // lock permanently held — every other thread spun forever in lock().
        atomicExch( mutex, 0 );
    }
};


Atomic Operations and Mutual Exclusion


Warps and Divergence


Warps and Divergence


Warps and Divergence


Warps and Divergence

kernel<<<N, 1>>> ( ... )         /* Bad Design - the warp will be almost empty*/
kernel<<<N / 32, 32>>>( ... )    /*  Okay Design - the warp will be full*/
kernel<<<N / 128, 128>>>( ... )  /* Better Design - Maximise the number of full warps */


Warps and Divergence


// Canonical 1-D kernel skeleton: compute a flat global index, then guard
// against the rounded-up grid tail so out-of-range threads do nothing.
// (`....` is the slide's placeholder for the remaining parameters.)
__global__ void kn(int n, .... )
{
    int i = threadIdx.x + blockDim.x * blockIdx.x;

    // Bounds check: blocks are launched as ceil(n / blockDim.x), so the
    // last block usually contains threads with i >= n.
    if (i < n) {
        // Your task
    }
}

Warps and Divergence

// Naive shared-memory tree reduction, shown to illustrate WARP DIVERGENCE:
// the `t % (2*stride) == 0` predicate scatters active threads across every
// warp, so in each round most lanes of every warp are masked off but the
// warp still executes. The improved version later in the lecture keeps
// active threads contiguous instead.
__shared__ float partialSum[]
...
unsigned int t = threadIdx.x;
// Strides double each round: pairs, then groups of 4, 8, ... up to blockDim.x.
for (unsigned int stride = 1; stride < blockDim.x; stride *= 2){
    // Barrier BEFORE the add: every thread must see the previous round's
    // writes. Note it is outside the `if`, so all threads reach it.
    __syncthreads();

    // Only threads at multiples of 2*stride combine — highly divergent.
    if (t % (2*stride) == 0)
        partialSum[t] += partialSum[t+stride];
 }


Warps and Divergence


Warps and Divergence


center-aligned image


Warps and Divergence


Warps and Divergence


Warps and Divergence

// Divergence-reduced tree reduction: `t < stride` keeps the active threads
// contiguous, so whole warps retire early instead of every warp running
// half-masked.
// NOTE(review): this pattern assumes partialSum holds 2*blockDim.x elements
// (each thread loaded two), since the first round reads partialSum[t+blockDim.x]
// — TODO confirm against the loading code on the earlier slide.
extern __shared__ float partialSum[];   // fix: dynamic shared arrays need `extern` and `;`
unsigned int t = threadIdx.x;
// FIX: loop condition was `stride > 1`, which skipped the final stride==1
// round — partialSum[0] never received partialSum[1], leaving the result
// short by one operand. `stride >= 1` runs that last combine (unsigned
// stride becomes 0 afterwards, cleanly ending the loop).
for (unsigned int stride = blockDim.x; stride >= 1; stride /= 2){
    // All threads must reach this barrier before reading the prior round's writes.
    __syncthreads();
    if (t < stride)
        partialSum[t] += partialSum[t+stride];
}


Warps and Divergence


center-aligned image


Measuring Performance

// GPU timing with CUDA events: events are recorded into a stream (NULL =
// default stream here) and timestamped on the device, so they measure GPU
// work accurately — unlike host timers, which would need a full device sync.
cudaEvent_t start;
cudaEventCreate(&start);
cudaEvent_t stop;
cudaEventCreate(&stop);
// Record the start event
cudaEventRecord(start, NULL);

...
//Launch kernels etc.
...

// Record the stop event
cudaEventRecord(stop, NULL);
// Wait for the stop event to complete
// (kernel launches are asynchronous; without this the elapsed time would
// be read before the GPU has finished)
cudaEventSynchronize(stop);
float msecTotal = 0.0f;
// Elapsed time between the two events, in milliseconds
cudaEventElapsedTime(&msecTotal, start, stop);

Measuring Performance

Review the MatrixMul.cu example.


Using Visual Studio Code for Debugging


Summary


Reading