>

COSC330/530 Parallel and Distributed Computing

Lecture 19 - CUDA Threads and Memory

Dr. Mitchell Welch


Reading


Summary


Error Checking in CUDA C


Error Checking in CUDA C

__host__ __device__ const char* cudaGetErrorString ( cudaError_t error )


Error Checking in CUDA C

__host__ __device__ const char* cudaGetErrorName ( cudaError_t error )


Error Checking in CUDA C

__host__ __device__ cudaError_t cudaGetLastError ( void )


Error Checking in CUDA C


The CUDA Memory Hierarchy


The CUDA Memory Hierarchy


The CUDA Memory Hierarchy


The CUDA Memory Hierarchy


The CUDA Memory Hierarchy

#define WIDTH 10
#define HEIGHT 10

float* myArr = (float*)malloc(WIDTH * HEIGHT * sizeof(float));

for(int y = 0; y< HEIGHT; y++){
    for(int x=0; x < WIDTH; x++){

        printf("%f", myArr[y * WIDTH + x]);

    }
    printf("\n");
}


The CUDA Memory Hierarchy


The CUDA Memory Hierarchy

// Host code
// Allocates a width x height array of floats with cudaMallocPitch, which
// reports the row size it actually used (in bytes) through `pitch`, then
// launches MyKernel on it. Every step's status is checked, and the first
// failure is reported with cudaGetErrorString().
int width = 64, height = 64;
float* devPtr = NULL;   // stays NULL if the allocation fails, so cudaFree below is safe
size_t pitch = 0;

cudaError_t err = cudaMallocPitch(&devPtr, &pitch, width * sizeof(float), height);
if (err == cudaSuccess) {
    MyKernel<<<100, 512>>>(devPtr, pitch, width, height);
    // A launch returns no status of its own: configuration errors are
    // retrieved with cudaGetLastError().
    err = cudaGetLastError();
}
if (err == cudaSuccess) {
    // Errors raised while the kernel runs only surface at a synchronising call.
    err = cudaDeviceSynchronize();
}
if (err != cudaSuccess) {
    fprintf(stderr, "CUDA error %s: %s\n", cudaGetErrorName(err), cudaGetErrorString(err));
}

// Release the pitched allocation; cudaFree(NULL) is a no-op.
cudaFree(devPtr);
devPtr = NULL;

// Device code
// Visits every element of a pitched 2D float allocation, one row at a time.
//   devPtr : base address of the allocation
//   pitch  : distance in bytes between the starts of consecutive rows
//   width  : number of float elements read from each row
//   height : number of rows
// No thread or block index is consulted, so each thread executing the kernel
// performs the complete traversal on its own.
__global__ void MyKernel(float* devPtr, size_t pitch, int width, int height) {
    // View the base address as bytes, because the row offset is given in bytes.
    char* base = (char*)devPtr;

    int r = 0;
    while (r < height) {
        // Step r rows forward in bytes, then go back to addressing floats.
        float* row = (float*)(base + r * pitch);

        int c = 0;
        while (c < width) {
            float element = row[c];
            ++c;
        }
        ++r;
    }
}


The CUDA Memory Hierarchy

 // 2D float array, BLOCK_SIZE elements on each side, declared with the
 // __shared__ memory-space qualifier.
 __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];


The CUDA Memory Hierarchy


The CUDA Memory Hierarchy

// Matrix multiplication kernel called by MatMul()
// Computes C = A * B, with each thread producing at most one element of C.
// Matrices are read and written as flat row-major arrays: element (r, c) of a
// matrix M is M.elements[r * M.width + c].
// Launch layout: 2D grid of 2D blocks; x indexes the columns of C and y the rows.
// NOTE(review): the bounds guard below relies on Matrix having a `height`
// member next to `width` and `elements` (as in the CUDA C++ Programming
// Guide's Matrix struct) - confirm against the actual definition.
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads in partial blocks at the grid edge map to no element of C.
    // Without this guard they would read and write out of bounds whenever the
    // matrix dimensions are not exact multiples of the block dimensions.
    if (row >= C.height || col >= C.width)
        return;

    // Accumulate the dot product of row `row` of A and column `col` of B.
    float Cvalue = 0.0f;
    for (int e = 0; e < A.width; ++e)
        Cvalue += A.elements[row * A.width + e] * B.elements[e * B.width + col];
    C.elements[row * C.width + col] = Cvalue;
}


The CUDA Memory Hierarchy


The CUDA Memory Hierarchy


center-aligned image

The CUDA Memory Hierarchy


The CUDA Memory Hierarchy


The CUDA Memory Hierarchy


The CUDA Memory Hierarchy


The CUDA Memory Hierarchy


The CUDA Memory Hierarchy


The CUDA Memory Hierarchy


Thread Synchronisation


Thread Synchronisation


Thread Synchronisation

__syncthreads()


Thread Synchronisation


Thread Synchronisation


Conway's game of life in CUDA.


Conway's game of life in CUDA.


Conway's game of life in CUDA.


Conway's game of life in CUDA.


Conway's game of life in CUDA.


Conway's game of life in CUDA.


Conway's game of life in CUDA.


Summary


Reading