/*
 * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

#include <stdio.h>
#include <stdlib.h>
#include <cooperative_groups.h>

namespace cg = cooperative_groups;

#include <helper_cuda.h>

#include "FunctionPointers_kernels.h"

// Texture object for reading image
cudaTextureObject_t tex;
extern __shared__ unsigned char LocalBlock[];
static cudaArray *array = NULL;

#define RADIUS 1

// pixel value used for thresholding function, works well with sample image 'lena'
#define THRESHOLD 150.0f

#ifdef FIXED_BLOCKWIDTH
#define BlockWidth 80
#define SharedPitch 384
#endif

// A function pointer can be declared explicitly like this line:
//__device__ unsigned char (*pointFunction)(unsigned char, float ) = NULL;
// or by using typedef's like below:

typedef unsigned char(*blockFunction_t)(
    unsigned char, unsigned char, unsigned char,
    unsigned char, unsigned char, unsigned char,
    unsigned char, unsigned char, unsigned char,
    float);

typedef unsigned char(*pointFunction_t)(
    unsigned char, float);

__device__ blockFunction_t blockFunction;

__device__ unsigned char
ComputeSobel(unsigned char ul, // upper left
             unsigned char um, // upper middle
             unsigned char ur, // upper right
             unsigned char ml, // middle left
             unsigned char mm, // middle (unused)
             unsigned char mr, // middle right
             unsigned char ll, // lower left
             unsigned char lm, // lower middle
             unsigned char lr, // lower right
             float fScale)
{
    short Horz = ur + 2*mr + lr - ul - 2*ml - ll;
    short Vert = ul + 2*um + ur - ll - 2*lm - lr;
    short Sum = (short)(fScale*(abs((int)Horz)+abs((int)Vert)));
    return (unsigned char)((Sum < 0) ? 0 : ((Sum > 255) ? 255 : Sum)) ;
}

// define a function pointer and initialize to NULL
__device__ unsigned char(*varFunction)(
    unsigned char, unsigned char, unsigned char,
    unsigned char, unsigned char, unsigned char,
    unsigned char, unsigned char, unsigned char,
    float x
) = NULL;

__device__ unsigned char
ComputeBox(unsigned char ul,   // upper left
           unsigned char um, // upper middle
           unsigned char ur, // upper right
           unsigned char ml, // middle left
           unsigned char mm, // middle...middle
           unsigned char mr, // middle right
           unsigned char ll, // lower left
           unsigned char lm, // lower middle
           unsigned char lr, // lower right
           float fscale
          )
{

    short Sum = (short)(ul+um+ur + ml+mm+mr + ll+lm+lr)/9;
    Sum *= fscale;
    return (unsigned char)((Sum < 0) ? 0 : ((Sum > 255) ? 255 : Sum)) ;
}
__device__ unsigned char
Threshold(unsigned char in, float thresh)
{
    if (in > thresh)
    {
        return 0xFF;
    }
    else
    {
        return 0;
    }
}


// Declare function tables, one for the point function chosen, one for the
// block function chosen.  The number of entries is determined by the
// enum in FunctionPointers_kernels.h
__device__ blockFunction_t blockFunction_table[LAST_BLOCK_FILTER];
__device__ pointFunction_t pointFunction_table[LAST_POINT_FILTER];

// Declare device side function pointers.  We retrieve them later with
// cudaMemcpyFromSymbol to set our function tables above in some
// particular order specified at runtime.
__device__ blockFunction_t pComputeSobel = ComputeSobel;
__device__ blockFunction_t pComputeBox   = ComputeBox;
__device__ pointFunction_t pComputeThreshold = Threshold;

// Allocate host side tables to mirror the device side, and later, we
// fill these tables with the function pointers.  This lets us send
// the pointers to the kernel on invocation, as a method of choosing
// which function to run.
blockFunction_t h_blockFunction_table[2];
pointFunction_t h_pointFunction_table[2];


// Perform a filter operation on the data, using shared memory
// The actual operation performed is
// determined by the function pointer "blockFunction" and selected
// by the integer argument "blockOperation" and has access
// to an apron around the current pixel being processed.
// Following the block operation, a per-pixel operation,
// pointed to by pPointFunction is performed before the final
// pixel is produced.
__global__ void
SobelShared(uchar4 *pSobelOriginal, unsigned short SobelPitch,
#ifndef FIXED_BLOCKWIDTH
            short BlockWidth, short SharedPitch,
#endif
            short w, short h, float fScale,
            int blockOperation, pointFunction_t pPointFunction,
            cudaTextureObject_t tex
           )
{
    // Handle to thread block group
    cg::thread_block cta = cg::this_thread_block();
    short u = 4*blockIdx.x*BlockWidth;
    short v = blockIdx.y*blockDim.y + threadIdx.y;
    short ib;

    int SharedIdx = threadIdx.y * SharedPitch;

    for (ib = threadIdx.x; ib < BlockWidth+2*RADIUS; ib += blockDim.x)
    {
        LocalBlock[SharedIdx+4*ib+0] = tex2D<unsigned char>(tex,
                                             (float)(u+4*ib-RADIUS+0), (float)(v-RADIUS));
        LocalBlock[SharedIdx+4*ib+1] = tex2D<unsigned char>(tex,
                                             (float)(u+4*ib-RADIUS+1), (float)(v-RADIUS));
        LocalBlock[SharedIdx+4*ib+2] = tex2D<unsigned char>(tex,
                                             (float)(u+4*ib-RADIUS+2), (float)(v-RADIUS));
        LocalBlock[SharedIdx+4*ib+3] = tex2D<unsigned char>(tex,
                                             (float)(u+4*ib-RADIUS+3), (float)(v-RADIUS));
    }

    if (threadIdx.y < RADIUS*2)
    {
        //
        // copy trailing RADIUS*2 rows of pixels into shared
        //
        SharedIdx = (blockDim.y+threadIdx.y) * SharedPitch;

        for (ib = threadIdx.x; ib < BlockWidth+2*RADIUS; ib += blockDim.x)
        {
            LocalBlock[SharedIdx+4*ib+0] = tex2D<unsigned char>(tex,
                                                 (float)(u+4*ib-RADIUS+0), (float)(v+blockDim.y-RADIUS));
            LocalBlock[SharedIdx+4*ib+1] = tex2D<unsigned char>(tex,
                                                 (float)(u+4*ib-RADIUS+1), (float)(v+blockDim.y-RADIUS));
            LocalBlock[SharedIdx+4*ib+2] = tex2D<unsigned char>(tex,
                                                 (float)(u+4*ib-RADIUS+2), (float)(v+blockDim.y-RADIUS));
            LocalBlock[SharedIdx+4*ib+3] = tex2D<unsigned char>(tex,
                                                 (float)(u+4*ib-RADIUS+3), (float)(v+blockDim.y-RADIUS));
        }
    }

    cg::sync(cta);

    u >>= 2;    // index as uchar4 from here
    uchar4 *pSobel = (uchar4 *)(((char *) pSobelOriginal)+v*SobelPitch);
    SharedIdx = threadIdx.y * SharedPitch;

    blockFunction = blockFunction_table[blockOperation];

    for (ib = threadIdx.x; ib < BlockWidth; ib += blockDim.x)
    {

        uchar4 out;

        unsigned char pix00 = LocalBlock[SharedIdx+4*ib+0*SharedPitch+0];
        unsigned char pix01 = LocalBlock[SharedIdx+4*ib+0*SharedPitch+1];
        unsigned char pix02 = LocalBlock[SharedIdx+4*ib+0*SharedPitch+2];
        unsigned char pix10 = LocalBlock[SharedIdx+4*ib+1*SharedPitch+0];
        unsigned char pix11 = LocalBlock[SharedIdx+4*ib+1*SharedPitch+1];
        unsigned char pix12 = LocalBlock[SharedIdx+4*ib+1*SharedPitch+2];
        unsigned char pix20 = LocalBlock[SharedIdx+4*ib+2*SharedPitch+0];
        unsigned char pix21 = LocalBlock[SharedIdx+4*ib+2*SharedPitch+1];
        unsigned char pix22 = LocalBlock[SharedIdx+4*ib+2*SharedPitch+2];

        out.x = (*blockFunction)(pix00, pix01, pix02,
                                 pix10, pix11, pix12,
                                 pix20, pix21, pix22, fScale);

        pix00 = LocalBlock[SharedIdx+4*ib+0*SharedPitch+3];
        pix10 = LocalBlock[SharedIdx+4*ib+1*SharedPitch+3];
        pix20 = LocalBlock[SharedIdx+4*ib+2*SharedPitch+3];
        out.y = (*blockFunction)(pix01, pix02, pix00,
                                 pix11, pix12, pix10,
                                 pix21, pix22, pix20, fScale);

        pix01 = LocalBlock[SharedIdx+4*ib+0*SharedPitch+4];
        pix11 = LocalBlock[SharedIdx+4*ib+1*SharedPitch+4];
        pix21 = LocalBlock[SharedIdx+4*ib+2*SharedPitch+4];
        out.z = (*blockFunction)(pix02, pix00, pix01,
                                 pix12, pix10, pix11,
                                 pix22, pix20, pix21, fScale);

        pix02 = LocalBlock[SharedIdx+4*ib+0*SharedPitch+5];
        pix12 = LocalBlock[SharedIdx+4*ib+1*SharedPitch+5];
        pix22 = LocalBlock[SharedIdx+4*ib+2*SharedPitch+5];
        out.w = (*blockFunction)(pix00, pix01, pix02,
                                 pix10, pix11, pix12,
                                 pix20, pix21, pix22, fScale);

        if (pPointFunction != NULL)
        {
            out.x = (*pPointFunction)(out.x, THRESHOLD);
            out.y = (*pPointFunction)(out.y, THRESHOLD);
            out.z = (*pPointFunction)(out.z, THRESHOLD);
            out.w = (*pPointFunction)(out.w, THRESHOLD);
        }

        if (u+ib < w/4 && v < h)
        {
            pSobel[u+ib] = out;
        }

    }

    cg::sync(cta);
}

__global__ void
SobelCopyImage(Pixel *pSobelOriginal, unsigned int Pitch,
               int w, int h, float fscale, cudaTextureObject_t tex)
{
    unsigned char *pSobel =
        (unsigned char *)(((char *) pSobelOriginal)+blockIdx.x*Pitch);

    for (int i = threadIdx.x; i < w; i += blockDim.x)
    {
        pSobel[i] = min(max((tex2D<unsigned char>(tex, (float) i, (float) blockIdx.x) * fscale), 0.f), 255.f);
    }
}

// Perform block and pointer filtering using texture lookups.
// The block and point operations are determined by the
// input argument (see comment above for "SobelShared" function)
__global__ void
SobelTex(Pixel *pSobelOriginal, unsigned int Pitch,
         int w, int h, float fScale, int blockOperation, pointFunction_t pPointOperation, cudaTextureObject_t tex)
{
    unsigned char *pSobel =
        (unsigned char *)(((char *) pSobelOriginal)+blockIdx.x*Pitch);
    unsigned char tmp = 0;

    for (int i = threadIdx.x; i < w; i += blockDim.x)
    {
        unsigned char pix00 = tex2D<unsigned char>(tex, (float) i-1, (float) blockIdx.x-1);
        unsigned char pix01 = tex2D<unsigned char>(tex, (float) i+0, (float) blockIdx.x-1);
        unsigned char pix02 = tex2D<unsigned char>(tex, (float) i+1, (float) blockIdx.x-1);
        unsigned char pix10 = tex2D<unsigned char>(tex, (float) i-1, (float) blockIdx.x+0);
        unsigned char pix11 = tex2D<unsigned char>(tex, (float) i+0, (float) blockIdx.x+0);
        unsigned char pix12 = tex2D<unsigned char>(tex, (float) i+1, (float) blockIdx.x+0);
        unsigned char pix20 = tex2D<unsigned char>(tex, (float) i-1, (float) blockIdx.x+1);
        unsigned char pix21 = tex2D<unsigned char>(tex, (float) i+0, (float) blockIdx.x+1);
        unsigned char pix22 = tex2D<unsigned char>(tex, (float) i+1, (float) blockIdx.x+1);
        tmp = (*(blockFunction_table[blockOperation]))(pix00, pix01, pix02,
                                                       pix10, pix11, pix12,
                                                       pix20, pix21, pix22, fScale);

        if (pPointOperation != NULL)
        {
            tmp = (*pPointOperation)(tmp, 150.0);
        }

        pSobel[i] = tmp;
    }
}

extern "C" void setupTexture(int iw, int ih, Pixel *data, int Bpp)
{
    cudaChannelFormatDesc desc;

    if (Bpp == 1)
    {
        desc = cudaCreateChannelDesc<unsigned char>();
    }
    else
    {
        desc = cudaCreateChannelDesc<uchar4>();
    }

    checkCudaErrors(cudaMallocArray(&array, &desc, iw, ih));
    checkCudaErrors(cudaMemcpy2DToArray(array, 0, 0, data, iw * Bpp * sizeof(Pixel), 
                                        iw * Bpp * sizeof(Pixel), ih, cudaMemcpyHostToDevice));

    cudaResourceDesc            texRes;
    memset(&texRes,0,sizeof(cudaResourceDesc));

    texRes.resType            = cudaResourceTypeArray;
    texRes.res.array.array    = array;

    cudaTextureDesc             texDescr;
    memset(&texDescr,0,sizeof(cudaTextureDesc));

    checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));
}

extern "C" void deleteTexture(void)
{
    checkCudaErrors(cudaFreeArray(array));
    checkCudaErrors(cudaDestroyTextureObject(tex));
}

// Copy the pointers from the function tables to the host side
void setupFunctionTables()
{
    // Dynamically assign the function table.
    // Copy the function pointers to their appropriate locations according to the enum
    checkCudaErrors(cudaMemcpyFromSymbol(&h_blockFunction_table[SOBEL_FILTER], pComputeSobel, sizeof(blockFunction_t)));
    checkCudaErrors(cudaMemcpyFromSymbol(&h_blockFunction_table[BOX_FILTER], pComputeBox, sizeof(blockFunction_t)));

    // do the same for the point function, where the 2nd function is NULL ("no-op" filter, skipped in kernel code)
    checkCudaErrors(cudaMemcpyFromSymbol(&h_pointFunction_table[THRESHOLD_FILTER], pComputeThreshold, sizeof(pointFunction_t)));
    h_pointFunction_table[NULL_FILTER] = NULL;

    // now copy the function tables back to the device, so if we wish we can use an index into the table to choose them
    // We have now set the order in the function table according to our enum.
    checkCudaErrors(cudaMemcpyToSymbol(blockFunction_table, h_blockFunction_table, sizeof(blockFunction_t)*LAST_BLOCK_FILTER));
    checkCudaErrors(cudaMemcpyToSymbol(pointFunction_table, h_pointFunction_table, sizeof(pointFunction_t)*LAST_POINT_FILTER));
}


// Wrapper for the __global__ call that sets up the texture and threads
// Below two methods for selecting the image processing function to run are shown.
// BlockOperation is an integer kernel argument used as an index into the blockFunction_table on the device side
// pPointOp is itself a function pointer passed as a kernel argument, retrieved from a host side copy of the function table
extern "C" void sobelFilter(Pixel *odata, int iw, int ih, enum SobelDisplayMode mode, float fScale, int blockOperation, int pointOperation)
{
    pointFunction_t pPointOp = h_pointFunction_table[pointOperation];

    switch (mode)
    {
        case SOBELDISPLAY_IMAGE:
            SobelCopyImage<<<ih, 384>>>(odata, iw, iw, ih, fScale, tex);
            break;

        case SOBELDISPLAY_SOBELTEX:
            SobelTex<<<ih, 384>>>(odata, iw, iw, ih, fScale, blockOperation, pPointOp, tex);
            break;

        case SOBELDISPLAY_SOBELSHARED:
            {
                dim3 threads(16,4);
#ifndef FIXED_BLOCKWIDTH
                int BlockWidth = 80; // must be divisible by 16 for coalescing
#endif
                dim3 blocks = dim3(iw/(4*BlockWidth)+(0!=iw%(4*BlockWidth)),
                                   ih/threads.y+(0!=ih%threads.y));
                int SharedPitch = ~0x3f&(4*(BlockWidth+2*RADIUS)+0x3f);
                int sharedMem = SharedPitch*(threads.y+2*RADIUS);

                // for the shared kernel, width must be divisible by 4
                iw &= ~3;

                SobelShared<<<blocks, threads, sharedMem>>>((uchar4 *) odata,
                                                            iw,
#ifndef FIXED_BLOCKWIDTH
                                                            BlockWidth, SharedPitch,
#endif
                                                            iw, ih, fScale, blockOperation, pPointOp, tex);
            }
            break;
    }
}