
COSC330/530 Parallel and Distributed Computing

Lecture 22 - CUDA Processing Streams

Dr. Mitchell Welch


Reading


Summary


Single CUDA Streams


Single CUDA Streams

#include <stdio.h>

int main( void ) {
    cudaDeviceProp prop;
    int whichDevice;

    // query the properties of the currently active device
    cudaGetDevice( &whichDevice );
    cudaGetDeviceProperties( &prop, whichDevice );

    // deviceOverlap reports whether the device can overlap a
    // kernel launch with an asynchronous memory copy
    if (!prop.deviceOverlap) {
        printf( "Device will not handle overlaps, so no speed up from streams\n" );
    } else {
        printf( "Device will handle overlaps\n" );
    }
    return 0;
}


Single CUDA Streams

cudaError_t cudaStreamCreate(cudaStream_t * pStream );


Single CUDA Streams

// initialize the stream
cudaStream_t    stream;
HANDLE_ERROR( cudaStreamCreate( &stream ) );
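
Every stream created this way should be paired with a destroy call once the work queued into it has drained. A minimal sketch, using the same HANDLE_ERROR macro as above:

// release the stream handle once its queued work is complete
HANDLE_ERROR( cudaStreamDestroy( stream ) );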


Single CUDA Streams

__global__ void kernel(int *a, int *b, int *c) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < N) {
        // average each element with its two neighbours (wrapping within 256)
        int idx1 = (idx + 1) % 256;
        int idx2 = (idx + 2) % 256;
        float as = (a[idx] + a[idx1] + a[idx2]) / 3.0f;
        float bs = (b[idx] + b[idx1] + b[idx2]) / 3.0f;
        // store the mean of the two averages
        c[idx] = (as + bs) / 2;
    }
}
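
The kernel operates on one chunk of N elements at a time, so chunk-sized buffers are needed on the device. A minimal sketch, assuming dev_a, dev_b and dev_c are the pointers passed to the kernel above and N is the chunk size:

int *dev_a, *dev_b, *dev_c;

// allocate chunk-sized buffers on the GPU
HANDLE_ERROR( cudaMalloc( (void**)&dev_a, N * sizeof(int) ) );
HANDLE_ERROR( cudaMalloc( (void**)&dev_b, N * sizeof(int) ) );
HANDLE_ERROR( cudaMalloc( (void**)&dev_c, N * sizeof(int) ) );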


Single CUDA Streams


Single CUDA Streams


Single CUDA Streams


Single CUDA Streams

cudaError_t cudaHostAlloc(void ** pHost,
    size_t size,
    unsigned int flags)


Single CUDA Streams

// allocate page-locked memory, used to stream
cudaHostAlloc( (void**)&host_a, FULL_DATA_SIZE * sizeof(int), cudaHostAllocDefault );
cudaHostAlloc( (void**)&host_b, FULL_DATA_SIZE * sizeof(int), cudaHostAllocDefault );
cudaHostAlloc( (void**)&host_c, FULL_DATA_SIZE * sizeof(int), cudaHostAllocDefault );

for (int i = 0; i < FULL_DATA_SIZE; i++) {
    host_a[i] = rand();
    host_b[i] = rand();
}
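
Memory allocated with cudaHostAlloc() must be released with cudaFreeHost() rather than free(). A minimal cleanup sketch:

// page-locked host memory is released with cudaFreeHost(), not free()
cudaFreeHost( host_a );
cudaFreeHost( host_b );
cudaFreeHost( host_c );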


Single CUDA Streams


Single CUDA Streams

cudaError_t cudaMemcpyAsync(void * dst,
    const void * src,
    size_t count,
    enum cudaMemcpyKind kind,
    cudaStream_t stream = 0)


Single CUDA Streams

cudaMemcpyAsync( dev_a, host_a+i, N * sizeof(int), cudaMemcpyHostToDevice, stream);


Single CUDA Streams


kernel<<<N/256,256,0,stream>>>( dev_a, dev_b, dev_c );
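
Putting these pieces together, the host walks through the full data set in chunks of N elements, queuing the input copies, the kernel launch and the result copy into the same stream so they execute in order on the device. A sketch of the loop, assuming the buffers and stream declared earlier:

for (int i = 0; i < FULL_DATA_SIZE; i += N) {
    // queue the next chunk of each input for asynchronous copy to the device
    cudaMemcpyAsync( dev_a, host_a + i, N * sizeof(int), cudaMemcpyHostToDevice, stream );
    cudaMemcpyAsync( dev_b, host_b + i, N * sizeof(int), cudaMemcpyHostToDevice, stream );

    // queue the kernel for this chunk into the same stream
    kernel<<<N/256, 256, 0, stream>>>( dev_a, dev_b, dev_c );

    // queue the copy of this chunk's results back to page-locked host memory
    cudaMemcpyAsync( host_c + i, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost, stream );
}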

Single CUDA Streams


Single CUDA Streams

cudaError_t cudaStreamSynchronize(cudaStream_t stream)
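
Because the copies and kernel launches above are all asynchronous with respect to the host, the host must synchronize with the stream before it reads host_c. A minimal sketch:

// block the host until every operation queued in the stream has completed
HANDLE_ERROR( cudaStreamSynchronize( stream ) );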


Single CUDA Streams


Multiple CUDA Streams


Multiple CUDA Streams




Multiple CUDA Streams
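
With two streams the chunks can be split between them, so that a memory copy queued in one stream can overlap with a kernel running in the other. A sketch of the idea, assuming a second set of chunk-sized device buffers (dev_a0/dev_a1 and so on are illustrative names) and the same page-locked host arrays as before; the operations are queued breadth-first across the two streams to give the hardware the best chance of overlapping them:

cudaStream_t stream0, stream1;
cudaStreamCreate( &stream0 );
cudaStreamCreate( &stream1 );

for (int i = 0; i < FULL_DATA_SIZE; i += N * 2) {
    // interleave the work of the two streams: copies of a, then b,
    // then the kernels, then the copies back
    cudaMemcpyAsync( dev_a0, host_a + i,     N * sizeof(int), cudaMemcpyHostToDevice, stream0 );
    cudaMemcpyAsync( dev_a1, host_a + i + N, N * sizeof(int), cudaMemcpyHostToDevice, stream1 );

    cudaMemcpyAsync( dev_b0, host_b + i,     N * sizeof(int), cudaMemcpyHostToDevice, stream0 );
    cudaMemcpyAsync( dev_b1, host_b + i + N, N * sizeof(int), cudaMemcpyHostToDevice, stream1 );

    kernel<<<N/256, 256, 0, stream0>>>( dev_a0, dev_b0, dev_c0 );
    kernel<<<N/256, 256, 0, stream1>>>( dev_a1, dev_b1, dev_c1 );

    cudaMemcpyAsync( host_c + i,     dev_c0, N * sizeof(int), cudaMemcpyDeviceToHost, stream0 );
    cudaMemcpyAsync( host_c + i + N, dev_c1, N * sizeof(int), cudaMemcpyDeviceToHost, stream1 );
}

// wait for both streams to drain before the host uses host_c
cudaStreamSynchronize( stream0 );
cudaStreamSynchronize( stream1 );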


Using Multiple GPUs

cudaError_t cudaSetDevice(int device)
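
cudaSetDevice() selects which GPU all subsequent CUDA calls made from the current host thread are issued to. A minimal sketch, assuming the system has more than one device:

int deviceCount;
cudaGetDeviceCount( &deviceCount );
printf( "Found %d CUDA devices\n", deviceCount );

// direct subsequent allocations, copies and kernel launches
// from this host thread to device 1
cudaSetDevice( 1 );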

Using Multiple GPUs


Summary


Reading

* N/A