/*
 * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

/*
    Recursive Gaussian filter
*/

#ifndef _RECURSIVEGAUSSIAN_KERNEL_CU_
#define _RECURSIVEGAUSSIAN_KERNEL_CU_

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <cooperative_groups.h>

namespace cg = cooperative_groups;

#include <helper_cuda.h>
#include <helper_math.h>

#define BLOCK_DIM 16
#define CLAMP_TO_EDGE 1

// Transpose kernel (see transpose CUDA Sample for details)
__global__ void d_transpose(uint *odata, uint *idata, int width, int height)
{
    // Handle to thread block group
    cg::thread_block cta = cg::this_thread_block();

    __shared__ uint block[BLOCK_DIM][BLOCK_DIM+1];  // +1 padding avoids shared memory bank conflicts

    // read the matrix tile into shared memory
    unsigned int xIndex = blockIdx.x * BLOCK_DIM + threadIdx.x;
    unsigned int yIndex = blockIdx.y * BLOCK_DIM + threadIdx.y;

    if ((xIndex < width) && (yIndex < height))
    {
        unsigned int index_in = yIndex * width + xIndex;
        block[threadIdx.y][threadIdx.x] = idata[index_in];
    }

    cg::sync(cta);

    // write the transposed matrix tile to global memory
    xIndex = blockIdx.y * BLOCK_DIM + threadIdx.x;
    yIndex = blockIdx.x * BLOCK_DIM + threadIdx.y;

    if ((xIndex < height) && (yIndex < width))
    {
        unsigned int index_out = yIndex * height + xIndex;
        odata[index_out] = block[threadIdx.x][threadIdx.y];
    }
}
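
/*
    Illustrative host-side launch sketch for d_transpose (not part of the
    original sample's host code; the wrapper name and the assumption of
    separate source/destination device buffers are mine). It shows the usual
    BLOCK_DIM x BLOCK_DIM thread block and ceil-divided grid.
*/
static inline void transposeSketch(uint *d_dest, uint *d_src, int width, int height)
{
    dim3 threads(BLOCK_DIM, BLOCK_DIM, 1);
    dim3 grid((width  + BLOCK_DIM - 1) / BLOCK_DIM,
              (height + BLOCK_DIM - 1) / BLOCK_DIM, 1);

    // d_transpose reads a width x height image from d_src and writes the
    // height x width transpose to d_dest
    d_transpose<<<grid, threads>>>(d_dest, d_src, width, height);
    getLastCudaError("d_transpose kernel execution failed");
}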

// RGBA version
// reads from 32-bit uint array holding 8-bit RGBA

// convert floating point rgba color to 32-bit integer
__device__ uint rgbaFloatToInt(float4 rgba)
{
    rgba.x = __saturatef(rgba.x);   // clamp to [0.0, 1.0]
    rgba.y = __saturatef(rgba.y);
    rgba.z = __saturatef(rgba.z);
    rgba.w = __saturatef(rgba.w);
    return (uint(rgba.w*255)<<24) | (uint(rgba.z*255)<<16) | (uint(rgba.y*255)<<8) | uint(rgba.x*255);
}

// convert from 32-bit int to float4
__device__ float4 rgbaIntToFloat(uint c)
{
    float4 rgba;
    rgba.x = (c & 0xff) / 255.0f;
    rgba.y = ((c>>8) & 0xff) / 255.0f;
    rgba.z = ((c>>16) & 0xff) / 255.0f;
    rgba.w = ((c>>24) & 0xff) / 255.0f;
    return rgba;
}
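
// Packed layout (matches the two conversions above):
//   bits 31..24 = alpha (w), 23..16 = blue (z), 15..8 = green (y), 7..0 = red (x)
// e.g. rgbaIntToFloat(0xFF0000FF) == (1.0f, 0.0f, 0.0f, 1.0f), i.e. opaque red,
// and rgbaFloatToInt() round-trips it back to 0xFF0000FF.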

/*
    simple 1st order recursive filter
    - processes one image column per thread

    parameters:
    id - pointer to input data (RGBA image packed into 32-bit integers)
    od - pointer to output data
    w  - image width
    h  - image height
    a  - blur parameter (feedback coefficient in [0, 1); larger values blur more)
*/

__global__ void
d_simpleRecursive_rgba(uint *id, uint *od, int w, int h, float a)
{
    unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;

    if (x >= w) return;

    id += x;    // advance pointers to correct column
    od += x;

    // forward pass
    float4 yp = rgbaIntToFloat(*id);  // previous output

    for (int y = 0; y < h; y++)
    {
        float4 xc = rgbaIntToFloat(*id);
        float4 yc = xc + a*(yp - xc);   // simple lerp between current and previous value
        *od = rgbaFloatToInt(yc);
        id += w;
        od += w;    // move to next row
        yp = yc;
    }

    // reset pointers to point to last element in column
    id -= w;
    od -= w;

    // reverse pass
    // ensures the response is symmetric
    yp = rgbaIntToFloat(*id);

    for (int y = h-1; y >= 0; y--)
    {
        float4 xc = rgbaIntToFloat(*id);
        float4 yc = xc + a*(yp - xc);
        *od = rgbaFloatToInt((rgbaIntToFloat(*od) + yc)*0.5f);
        id -= w;
        od -= w;  // move to previous row
        yp = yc;
    }
}
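
/*
    Sketch only: the recursive kernels in this file filter columns. A full 2D
    blur is typically assembled on the host by filtering columns, transposing,
    filtering again (which now covers the original rows), and transposing back.
    The wrapper name, the temporary buffer d_temp, and the per-block thread
    count are illustrative assumptions, not the sample's actual host code.
*/
static inline void simpleGaussian2DSketch(uint *d_src, uint *d_dest, uint *d_temp,
                                          int width, int height, float a,
                                          int nthreads = 64)
{
    dim3 blocks2D(BLOCK_DIM, BLOCK_DIM, 1);

    // blur columns of the width x height image
    d_simpleRecursive_rgba<<<(width + nthreads - 1) / nthreads, nthreads>>>(d_src, d_temp, width, height, a);

    // transpose so the original rows become columns (result is height x width)
    dim3 gridT1((width + BLOCK_DIM - 1) / BLOCK_DIM, (height + BLOCK_DIM - 1) / BLOCK_DIM, 1);
    d_transpose<<<gridT1, blocks2D>>>(d_dest, d_temp, width, height);

    // blur the transposed image's columns, i.e. the original rows
    d_simpleRecursive_rgba<<<(height + nthreads - 1) / nthreads, nthreads>>>(d_dest, d_temp, height, width, a);

    // transpose back to width x height
    dim3 gridT2((height + BLOCK_DIM - 1) / BLOCK_DIM, (width + BLOCK_DIM - 1) / BLOCK_DIM, 1);
    d_transpose<<<gridT2, blocks2D>>>(d_dest, d_temp, height, width);
}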

/*
    recursive Gaussian filter

    parameters:
    id - pointer to input data (RGBA image packed into 32-bit integers)
    od - pointer to output data
    w  - image width
    h  - image height
    a0-a3, b1, b2 - recursion coefficients
    coefp, coefn  - boundary coefficients used when CLAMP_TO_EDGE is enabled
*/

__global__ void
d_recursiveGaussian_rgba(uint *id, uint *od, int w, int h, float a0, float a1, float a2, float a3, float b1, float b2, float coefp, float coefn)
{
    unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;

    if (x >= w) return;

    id += x;    // advance pointers to correct column
    od += x;

    // forward pass
    float4 xp = make_float4(0.0f);  // previous input
    float4 yp = make_float4(0.0f);  // previous output
    float4 yb = make_float4(0.0f);  // previous output by 2
#if CLAMP_TO_EDGE
    xp = rgbaIntToFloat(*id);
    yb = coefp*xp;
    yp = yb;
#endif

    for (int y = 0; y < h; y++)
    {
        float4 xc = rgbaIntToFloat(*id);
        float4 yc = a0*xc + a1*xp - b1*yp - b2*yb;
        *od = rgbaFloatToInt(yc);
        id += w;
        od += w;    // move to next row
        xp = xc;
        yb = yp;
        yp = yc;
    }

    // reset pointers to point to last element in column
    id -= w;
    od -= w;

    // reverse pass
    // ensures the response is symmetric
    float4 xn = make_float4(0.0f);
    float4 xa = make_float4(0.0f);
    float4 yn = make_float4(0.0f);
    float4 ya = make_float4(0.0f);
#if CLAMP_TO_EDGE
    xn = xa = rgbaIntToFloat(*id);
    yn = coefn*xn;
    ya = yn;
#endif

    for (int y = h-1; y >= 0; y--)
    {
        float4 xc = rgbaIntToFloat(*id);
        float4 yc = a2*xn + a3*xa - b1*yn - b2*ya;
        xa = xn;
        xn = xc;
        ya = yn;
        yn = yc;
        *od = rgbaFloatToInt(rgbaIntToFloat(*od) + yc);
        id -= w;
        od -= w;  // move to previous row
    }
}
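
/*
    Sketch of one common way to derive the coefficients above from the Gaussian
    sigma (a Deriche-style approximation). This is illustrative only: the
    constant 1.695f, the clamping of tiny sigma, and the exact coefficient
    expressions are assumptions, not taken from this file's host code.
*/
static inline void computeGaussianCoefficientsSketch(float sigma,
    float &a0, float &a1, float &a2, float &a3,
    float &b1, float &b2, float &coefp, float &coefn)
{
    const float alpha = 1.695f / fmaxf(sigma, 0.1f);  // guard against very small sigma
    const float ema   = expf(-alpha);
    const float ema2  = expf(-2.0f * alpha);

    b1 = -2.0f * ema;
    b2 = ema2;

    const float k = (1.0f - ema) * (1.0f - ema) / (1.0f + 2.0f * alpha * ema - ema2);
    a0 = k;
    a1 = k * (alpha - 1.0f) * ema;
    a2 = k * (alpha + 1.0f) * ema;
    a3 = -k * ema2;

    // boundary terms used for the CLAMP_TO_EDGE initialisation of the
    // forward (coefp) and reverse (coefn) passes
    coefp = (a0 + a1) / (1.0f + b1 + b2);
    coefn = (a2 + a3) / (1.0f + b1 + b2);
}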

#endif // #ifndef _RECURSIVEGAUSSIAN_KERNEL_CU_