/* * Copyright 1993-2015 NVIDIA Corporation. All rights reserved. * * Please refer to the NVIDIA end user license agreement (EULA) associated * with this source code for terms and conditions that govern your use of * this software. Any use, reproduction, disclosure, or distribution of * this software and related documentation outside the terms of the EULA * is strictly prohibited. * */ #include #include #include namespace cg = cooperative_groups; #include #include "FunctionPointers_kernels.h" // Texture object for reading image cudaTextureObject_t tex; extern __shared__ unsigned char LocalBlock[]; static cudaArray *array = NULL; #define RADIUS 1 // pixel value used for thresholding function, works well with sample image 'lena' #define THRESHOLD 150.0f #ifdef FIXED_BLOCKWIDTH #define BlockWidth 80 #define SharedPitch 384 #endif // A function pointer can be declared explicitly like this line: //__device__ unsigned char (*pointFunction)(unsigned char, float ) = NULL; // or by using typedef's like below: typedef unsigned char(*blockFunction_t)( unsigned char, unsigned char, unsigned char, unsigned char, unsigned char, unsigned char, unsigned char, unsigned char, unsigned char, float); typedef unsigned char(*pointFunction_t)( unsigned char, float); __device__ blockFunction_t blockFunction; __device__ unsigned char ComputeSobel(unsigned char ul, // upper left unsigned char um, // upper middle unsigned char ur, // upper right unsigned char ml, // middle left unsigned char mm, // middle (unused) unsigned char mr, // middle right unsigned char ll, // lower left unsigned char lm, // lower middle unsigned char lr, // lower right float fScale) { short Horz = ur + 2*mr + lr - ul - 2*ml - ll; short Vert = ul + 2*um + ur - ll - 2*lm - lr; short Sum = (short)(fScale*(abs((int)Horz)+abs((int)Vert))); return (unsigned char)((Sum < 0) ? 0 : ((Sum > 255) ? 255 : Sum)) ; } // define a function pointer and initialize to NULL __device__ unsigned char(*varFunction)( unsigned char, unsigned char, unsigned char, unsigned char, unsigned char, unsigned char, unsigned char, unsigned char, unsigned char, float x ) = NULL; __device__ unsigned char ComputeBox(unsigned char ul, // upper left unsigned char um, // upper middle unsigned char ur, // upper right unsigned char ml, // middle left unsigned char mm, // middle...middle unsigned char mr, // middle right unsigned char ll, // lower left unsigned char lm, // lower middle unsigned char lr, // lower right float fscale ) { short Sum = (short)(ul+um+ur + ml+mm+mr + ll+lm+lr)/9; Sum *= fscale; return (unsigned char)((Sum < 0) ? 0 : ((Sum > 255) ? 255 : Sum)) ; } __device__ unsigned char Threshold(unsigned char in, float thresh) { if (in > thresh) { return 0xFF; } else { return 0; } } // Declare function tables, one for the point function chosen, one for the // block function chosen. The number of entries is determined by the // enum in FunctionPointers_kernels.h __device__ blockFunction_t blockFunction_table[LAST_BLOCK_FILTER]; __device__ pointFunction_t pointFunction_table[LAST_POINT_FILTER]; // Declare device side function pointers. We retrieve them later with // cudaMemcpyFromSymbol to set our function tables above in some // particular order specified at runtime. __device__ blockFunction_t pComputeSobel = ComputeSobel; __device__ blockFunction_t pComputeBox = ComputeBox; __device__ pointFunction_t pComputeThreshold = Threshold; // Allocate host side tables to mirror the device side, and later, we // fill these tables with the function pointers. This lets us send // the pointers to the kernel on invocation, as a method of choosing // which function to run. blockFunction_t h_blockFunction_table[2]; pointFunction_t h_pointFunction_table[2]; // Perform a filter operation on the data, using shared memory // The actual operation performed is // determined by the function pointer "blockFunction" and selected // by the integer argument "blockOperation" and has access // to an apron around the current pixel being processed. // Following the block operation, a per-pixel operation, // pointed to by pPointFunction is performed before the final // pixel is produced. __global__ void SobelShared(uchar4 *pSobelOriginal, unsigned short SobelPitch, #ifndef FIXED_BLOCKWIDTH short BlockWidth, short SharedPitch, #endif short w, short h, float fScale, int blockOperation, pointFunction_t pPointFunction, cudaTextureObject_t tex ) { // Handle to thread block group cg::thread_block cta = cg::this_thread_block(); short u = 4*blockIdx.x*BlockWidth; short v = blockIdx.y*blockDim.y + threadIdx.y; short ib; int SharedIdx = threadIdx.y * SharedPitch; for (ib = threadIdx.x; ib < BlockWidth+2*RADIUS; ib += blockDim.x) { LocalBlock[SharedIdx+4*ib+0] = tex2D(tex, (float)(u+4*ib-RADIUS+0), (float)(v-RADIUS)); LocalBlock[SharedIdx+4*ib+1] = tex2D(tex, (float)(u+4*ib-RADIUS+1), (float)(v-RADIUS)); LocalBlock[SharedIdx+4*ib+2] = tex2D(tex, (float)(u+4*ib-RADIUS+2), (float)(v-RADIUS)); LocalBlock[SharedIdx+4*ib+3] = tex2D(tex, (float)(u+4*ib-RADIUS+3), (float)(v-RADIUS)); } if (threadIdx.y < RADIUS*2) { // // copy trailing RADIUS*2 rows of pixels into shared // SharedIdx = (blockDim.y+threadIdx.y) * SharedPitch; for (ib = threadIdx.x; ib < BlockWidth+2*RADIUS; ib += blockDim.x) { LocalBlock[SharedIdx+4*ib+0] = tex2D(tex, (float)(u+4*ib-RADIUS+0), (float)(v+blockDim.y-RADIUS)); LocalBlock[SharedIdx+4*ib+1] = tex2D(tex, (float)(u+4*ib-RADIUS+1), (float)(v+blockDim.y-RADIUS)); LocalBlock[SharedIdx+4*ib+2] = tex2D(tex, (float)(u+4*ib-RADIUS+2), (float)(v+blockDim.y-RADIUS)); LocalBlock[SharedIdx+4*ib+3] = tex2D(tex, (float)(u+4*ib-RADIUS+3), (float)(v+blockDim.y-RADIUS)); } } cg::sync(cta); u >>= 2; // index as uchar4 from here uchar4 *pSobel = (uchar4 *)(((char *) pSobelOriginal)+v*SobelPitch); SharedIdx = threadIdx.y * SharedPitch; blockFunction = blockFunction_table[blockOperation]; for (ib = threadIdx.x; ib < BlockWidth; ib += blockDim.x) { uchar4 out; unsigned char pix00 = LocalBlock[SharedIdx+4*ib+0*SharedPitch+0]; unsigned char pix01 = LocalBlock[SharedIdx+4*ib+0*SharedPitch+1]; unsigned char pix02 = LocalBlock[SharedIdx+4*ib+0*SharedPitch+2]; unsigned char pix10 = LocalBlock[SharedIdx+4*ib+1*SharedPitch+0]; unsigned char pix11 = LocalBlock[SharedIdx+4*ib+1*SharedPitch+1]; unsigned char pix12 = LocalBlock[SharedIdx+4*ib+1*SharedPitch+2]; unsigned char pix20 = LocalBlock[SharedIdx+4*ib+2*SharedPitch+0]; unsigned char pix21 = LocalBlock[SharedIdx+4*ib+2*SharedPitch+1]; unsigned char pix22 = LocalBlock[SharedIdx+4*ib+2*SharedPitch+2]; out.x = (*blockFunction)(pix00, pix01, pix02, pix10, pix11, pix12, pix20, pix21, pix22, fScale); pix00 = LocalBlock[SharedIdx+4*ib+0*SharedPitch+3]; pix10 = LocalBlock[SharedIdx+4*ib+1*SharedPitch+3]; pix20 = LocalBlock[SharedIdx+4*ib+2*SharedPitch+3]; out.y = (*blockFunction)(pix01, pix02, pix00, pix11, pix12, pix10, pix21, pix22, pix20, fScale); pix01 = LocalBlock[SharedIdx+4*ib+0*SharedPitch+4]; pix11 = LocalBlock[SharedIdx+4*ib+1*SharedPitch+4]; pix21 = LocalBlock[SharedIdx+4*ib+2*SharedPitch+4]; out.z = (*blockFunction)(pix02, pix00, pix01, pix12, pix10, pix11, pix22, pix20, pix21, fScale); pix02 = LocalBlock[SharedIdx+4*ib+0*SharedPitch+5]; pix12 = LocalBlock[SharedIdx+4*ib+1*SharedPitch+5]; pix22 = LocalBlock[SharedIdx+4*ib+2*SharedPitch+5]; out.w = (*blockFunction)(pix00, pix01, pix02, pix10, pix11, pix12, pix20, pix21, pix22, fScale); if (pPointFunction != NULL) { out.x = (*pPointFunction)(out.x, THRESHOLD); out.y = (*pPointFunction)(out.y, THRESHOLD); out.z = (*pPointFunction)(out.z, THRESHOLD); out.w = (*pPointFunction)(out.w, THRESHOLD); } if (u+ib < w/4 && v < h) { pSobel[u+ib] = out; } } cg::sync(cta); } __global__ void SobelCopyImage(Pixel *pSobelOriginal, unsigned int Pitch, int w, int h, float fscale, cudaTextureObject_t tex) { unsigned char *pSobel = (unsigned char *)(((char *) pSobelOriginal)+blockIdx.x*Pitch); for (int i = threadIdx.x; i < w; i += blockDim.x) { pSobel[i] = min(max((tex2D(tex, (float) i, (float) blockIdx.x) * fscale), 0.f), 255.f); } } // Perform block and pointer filtering using texture lookups. // The block and point operations are determined by the // input argument (see comment above for "SobelShared" function) __global__ void SobelTex(Pixel *pSobelOriginal, unsigned int Pitch, int w, int h, float fScale, int blockOperation, pointFunction_t pPointOperation, cudaTextureObject_t tex) { unsigned char *pSobel = (unsigned char *)(((char *) pSobelOriginal)+blockIdx.x*Pitch); unsigned char tmp = 0; for (int i = threadIdx.x; i < w; i += blockDim.x) { unsigned char pix00 = tex2D(tex, (float) i-1, (float) blockIdx.x-1); unsigned char pix01 = tex2D(tex, (float) i+0, (float) blockIdx.x-1); unsigned char pix02 = tex2D(tex, (float) i+1, (float) blockIdx.x-1); unsigned char pix10 = tex2D(tex, (float) i-1, (float) blockIdx.x+0); unsigned char pix11 = tex2D(tex, (float) i+0, (float) blockIdx.x+0); unsigned char pix12 = tex2D(tex, (float) i+1, (float) blockIdx.x+0); unsigned char pix20 = tex2D(tex, (float) i-1, (float) blockIdx.x+1); unsigned char pix21 = tex2D(tex, (float) i+0, (float) blockIdx.x+1); unsigned char pix22 = tex2D(tex, (float) i+1, (float) blockIdx.x+1); tmp = (*(blockFunction_table[blockOperation]))(pix00, pix01, pix02, pix10, pix11, pix12, pix20, pix21, pix22, fScale); if (pPointOperation != NULL) { tmp = (*pPointOperation)(tmp, 150.0); } pSobel[i] = tmp; } } extern "C" void setupTexture(int iw, int ih, Pixel *data, int Bpp) { cudaChannelFormatDesc desc; if (Bpp == 1) { desc = cudaCreateChannelDesc(); } else { desc = cudaCreateChannelDesc(); } checkCudaErrors(cudaMallocArray(&array, &desc, iw, ih)); checkCudaErrors(cudaMemcpy2DToArray(array, 0, 0, data, iw * Bpp * sizeof(Pixel), iw * Bpp * sizeof(Pixel), ih, cudaMemcpyHostToDevice)); cudaResourceDesc texRes; memset(&texRes,0,sizeof(cudaResourceDesc)); texRes.resType = cudaResourceTypeArray; texRes.res.array.array = array; cudaTextureDesc texDescr; memset(&texDescr,0,sizeof(cudaTextureDesc)); checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL)); } extern "C" void deleteTexture(void) { checkCudaErrors(cudaFreeArray(array)); checkCudaErrors(cudaDestroyTextureObject(tex)); } // Copy the pointers from the function tables to the host side void setupFunctionTables() { // Dynamically assign the function table. // Copy the function pointers to their appropriate locations according to the enum checkCudaErrors(cudaMemcpyFromSymbol(&h_blockFunction_table[SOBEL_FILTER], pComputeSobel, sizeof(blockFunction_t))); checkCudaErrors(cudaMemcpyFromSymbol(&h_blockFunction_table[BOX_FILTER], pComputeBox, sizeof(blockFunction_t))); // do the same for the point function, where the 2nd function is NULL ("no-op" filter, skipped in kernel code) checkCudaErrors(cudaMemcpyFromSymbol(&h_pointFunction_table[THRESHOLD_FILTER], pComputeThreshold, sizeof(pointFunction_t))); h_pointFunction_table[NULL_FILTER] = NULL; // now copy the function tables back to the device, so if we wish we can use an index into the table to choose them // We have now set the order in the function table according to our enum. checkCudaErrors(cudaMemcpyToSymbol(blockFunction_table, h_blockFunction_table, sizeof(blockFunction_t)*LAST_BLOCK_FILTER)); checkCudaErrors(cudaMemcpyToSymbol(pointFunction_table, h_pointFunction_table, sizeof(pointFunction_t)*LAST_POINT_FILTER)); } // Wrapper for the __global__ call that sets up the texture and threads // Below two methods for selecting the image processing function to run are shown. // BlockOperation is an integer kernel argument used as an index into the blockFunction_table on the device side // pPointOp is itself a function pointer passed as a kernel argument, retrieved from a host side copy of the function table extern "C" void sobelFilter(Pixel *odata, int iw, int ih, enum SobelDisplayMode mode, float fScale, int blockOperation, int pointOperation) { pointFunction_t pPointOp = h_pointFunction_table[pointOperation]; switch (mode) { case SOBELDISPLAY_IMAGE: SobelCopyImage<<>>(odata, iw, iw, ih, fScale, tex); break; case SOBELDISPLAY_SOBELTEX: SobelTex<<>>(odata, iw, iw, ih, fScale, blockOperation, pPointOp, tex); break; case SOBELDISPLAY_SOBELSHARED: { dim3 threads(16,4); #ifndef FIXED_BLOCKWIDTH int BlockWidth = 80; // must be divisible by 16 for coalescing #endif dim3 blocks = dim3(iw/(4*BlockWidth)+(0!=iw%(4*BlockWidth)), ih/threads.y+(0!=ih%threads.y)); int SharedPitch = ~0x3f&(4*(BlockWidth+2*RADIUS)+0x3f); int sharedMem = SharedPitch*(threads.y+2*RADIUS); // for the shared kernel, width must be divisible by 4 iw &= ~3; SobelShared<<>>((uchar4 *) odata, iw, #ifndef FIXED_BLOCKWIDTH BlockWidth, SharedPitch, #endif iw, ih, fScale, blockOperation, pPointOp, tex); } break; } }