////////////////////////////////////////////////////////////////////////////
//
// Copyright 2014 - 2015 NVIDIA Corporation.  All rights reserved.
//
// Please refer to the NVIDIA end user license agreement (EULA) associated
// with this source code for terms and conditions that govern your use of
// this software. Any use, reproduction, disclosure, or distribution of
// this software and related documentation outside the terms of the EULA
// is strictly prohibited.
//
////////////////////////////////////////////////////////////////////////////

/*
   This example demonstrates how to use the CUDA C bindings to OpenGL ES to
   dynamically modify a vertex buffer using a CUDA C kernel.

   The steps are:
   1. Create an empty vertex buffer object (VBO)
   2. Register the VBO with CUDA C
   3. Map the VBO for writing from CUDA C
   4. Run CUDA C kernel to modify the vertex positions
   5. Unmap the VBO
   6. Render the results using OpenGL ES

   Host code
 */

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

#include <stdarg.h>
#include <unistd.h>
#include <screen/screen.h>

#include "graphics_interface.c"

// includes, cuda
#include <cuda_runtime.h>
#include <cuda_gl_interop.h>

// Utilities and timing functions
#include <helper_functions.h>    // includes cuda.h and cuda_runtime_api.h

// CUDA helper functions
#include <helper_cuda.h>         // helper functions for CUDA error check

#include <vector_types.h>

#define MAX_EPSILON_ERROR 0.0f
#define THRESHOLD          0.0f
#define REFRESH_DELAY     1 //ms

#define GUI_IDLE 0x100
#define GUI_ROTATE 0x101
#define GUI_TRANSLATE 0x102

int gui_mode; 

////////////////////////////////////////////////////////////////////////////////
// Default configuration
unsigned int window_width  = 512;
unsigned int window_height = 512;
unsigned int dispno = 0;

// constants
const unsigned int mesh_width    = 256;
const unsigned int mesh_height   = 256;

// OpenGL ES variables and interop with CUDA C
GLuint mesh_vao, mesh_vbo;
struct cudaGraphicsResource *cuda_vbo_resource;
void *d_vbo_buffer = NULL;

float g_fAnim = 0.0;

// UI / mouse controls
int mouse_old_x, mouse_old_y;
int mouse_buttons = 0;
float rotate_x = 0.0, rotate_y = 0.0;
float translate_z = -3.0;

StopWatchInterface *timer = NULL;

// Frame statistics
int frame;
int fpsCount = 0;        // FPS count for averaging
int fpsLimit = 1;        // FPS limit for sampling
int g_Index = 0;
float avgFPS = 0.0f;
unsigned int frameCount = 0;
unsigned int g_TotalErrors = 0;

// The default number of seconds after which the test will end.
#define TIME_LIMIT 10.0 // 10 secs


// Flag indicating it is time to shut down
static GLboolean shutdown = GL_FALSE;

// Callback to close window
static void
closeCB_app(void)
{
    shutdown = GL_TRUE;
}

// Callback to handle key presses
static void
keyCB_app(char key, int state)
{
    // Ignoring releases
    if (!state) return;

    if ((key == 'q') || (key == 'Q') || (key == NvGlDemoKeyCode_Escape)) shutdown = GL_TRUE;
}

// Auto-Verification Code
bool g_bQAReadback = false;

int *pArgc = NULL;
char **pArgv = NULL;

#define MAX(a,b) ((a > b) ? a : b)

////////////////////////////////////////////////////////////////////////////////
// declaration, forward

// CUDA functionality
void runCuda(struct cudaGraphicsResource **vbo_resource);
void runAutoTest(int devID, char **argv, char *ref_file);
void checkResultCuda(int argc, char **argv, const GLuint &vbo);

const char *sSDKsample = "simpleGLES on Screen (VBO)";

void computeFPS()
{
    frameCount++;
    fpsCount++;

    if (fpsCount == fpsLimit)
    {
        avgFPS = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f);
        fpsCount = 0;
        fpsLimit = (int)MAX(avgFPS, 1.f);

        sdkResetTimer(&timer);
    }

    char fps[256];
    sprintf(fps, "Cuda/OpenGL ES Interop (VBO): %3.1f fps (Max 1000 fps)", avgFPS);
    graphics_set_windowtitle(fps);
}

///////////////////////////////////////////////////////////////////////////////
//! Simple kernel to modify vertex positions in sine wave pattern
//! @param data  data in global memory
///////////////////////////////////////////////////////////////////////////////
__global__ void simple_vbo_kernel(float4 *pos, unsigned int width, unsigned int height, float time)
{
    unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;

    // calculate uv coordinates
    float u = x / (float) width;
    float v = y / (float) height;
    u = u*2.0f - 1.0f;
    v = v*2.0f - 1.0f;

    // calculate simple sine wave pattern
    float freq = 4.0f;
    float w = sinf(u*freq + time) * cosf(v*freq + time) * 0.5f;

    // write output vertex
    pos[y*width+x] = make_float4(u, w, v, 1.0f);
}


void launch_kernel(float4 *pos, unsigned int mesh_width, unsigned int mesh_height, float time)
{
    // execute the kernel
    dim3 block(8, 8, 1);
    dim3 grid(mesh_width / block.x, mesh_height / block.y, 1);
    simple_vbo_kernel<<< grid, block>>>(pos, mesh_width, mesh_height, time);
}

////////////////////////////////////////////////////////////////////////////////
//! Run the Cuda part of the computation
////////////////////////////////////////////////////////////////////////////////
void runCuda(struct cudaGraphicsResource **vbo_resource)
{
    // map OpenGL buffer object for writing from CUDA
    float4 *dptr;
    cudaGraphicsMapResources(1, vbo_resource, 0);
    size_t num_bytes;
    cudaGraphicsResourceGetMappedPointer((void **)&dptr, &num_bytes, *vbo_resource);

    launch_kernel(dptr, mesh_width, mesh_height, g_fAnim);

    // unmap buffer object
    cudaGraphicsUnmapResources(1, vbo_resource, 0);
}

#ifndef FOPEN
#define FOPEN(fHandle,filename,mode) (fHandle = fopen(filename, mode))
#endif

void sdkDumpBin2(void *data, unsigned int bytes, const char *filename)
{
    printf("sdkDumpBin: <%s>\n", filename);
    FILE *fp;
    FOPEN(fp, filename, "wb");
    fwrite(data, bytes, 1, fp);
    fflush(fp);
    fclose(fp);
}

////////////////////////////////////////////////////////////////////////////////
//! Run the Cuda part of the computation
////////////////////////////////////////////////////////////////////////////////
void runAutoTest(int devID, char **argv, char *ref_file)
{
    char *reference_file = NULL;
    void *imageData = malloc(mesh_width*mesh_height*sizeof(float));

    // execute the kernel
    launch_kernel((float4 *)d_vbo_buffer, mesh_width, mesh_height, g_fAnim);

    cudaDeviceSynchronize();
    getLastCudaError("launch_kernel failed");

    cudaMemcpy(imageData, d_vbo_buffer, mesh_width*mesh_height*sizeof(float), cudaMemcpyDeviceToHost);

    sdkDumpBin2(imageData, mesh_width*mesh_height*sizeof(float), "simpleGLES_screen.bin");
    reference_file = sdkFindFilePath(ref_file, argv[0]);

    if (reference_file && !sdkCompareBin2BinFloat("simpleGLES_screen.bin", reference_file, 
                mesh_width*mesh_height*sizeof(float),
                MAX_EPSILON_ERROR, THRESHOLD, pArgv[0]))
    {
        g_TotalErrors++;
    }
}

////////////////////////////////////////////////////////////////////////////////
//! Display callback
////////////////////////////////////////////////////////////////////////////////
void display_thisframe(float time_delta)
{
    sdkStartTimer(&timer);

    // run CUDA kernel to generate vertex positions
    runCuda(&cuda_vbo_resource);

    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);

    glDrawArrays(GL_POINTS, 0, mesh_width * mesh_height);

    glFinish();

    g_fAnim += time_delta;

    sdkStopTimer(&timer);
    computeFPS();
}


////////////////////////////////////////////////////////////////////////////////
//! Check if the result is correct or write data to file for external
//! regression testing
////////////////////////////////////////////////////////////////////////////////
void checkResultCuda(int argc, char **argv, const GLuint &vbo)
{
    if (!d_vbo_buffer)
    {
        printf("%s: Mapping result buffer from OpenGL ES\n", __FUNCTION__); 

        cudaGraphicsUnregisterResource(cuda_vbo_resource);

        // map buffer object
        glBindBuffer(GL_ARRAY_BUFFER, vbo);
        float *data = (float *) glMapBufferRange(GL_ARRAY_BUFFER, 0, mesh_width * mesh_height * 4 * sizeof(float), GL_READ_ONLY);

        // check result
        if (checkCmdLineFlag(argc, (const char **) argv, "regression"))
        {
            // write file for regression test
            sdkWriteFile<float>("./data/regression.dat", data, mesh_width * mesh_height * 3, 0.0, false);
        }

        // unmap GL buffer object
        if (!glUnmapBuffer(GL_ARRAY_BUFFER))
        {
            fprintf(stderr, "Unmap buffer failed.\n");
            fflush(stderr);
        }

        checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo, cudaGraphicsMapFlagsWriteDiscard));

        CHECK_GLERROR(); 
    }
}

GLuint mesh_shader = 0;

void readAndCompileShaderFromGLSLFile(GLuint new_shaderprogram, const char *filename, GLenum shaderType)
{
    FILE *file = fopen(filename,"rb"); // open shader text file
    if (!file) 
    {
        error_exit("Filename %s does not exist\n", filename);
    }

    // get the size of the file and read it
    fseek(file,0,SEEK_END);
    GLint size = ftell(file);
    char *data = (char*)malloc(sizeof(char)*(size + 1));
    memset(data, 0, sizeof(char)*(size + 1));
    fseek(file,0,SEEK_SET);
    size_t res = fread(data,1,size,file);
    fclose(file);

    GLuint shader = glCreateShader(shaderType);
    glShaderSource(shader, 1, (const GLchar**)&data, &size);
    glCompileShader(shader);

    CHECK_GLERROR();
    GLint compile_success = 0;
    glGetShaderiv(shader, GL_COMPILE_STATUS, &compile_success);
    CHECK_GLERROR();

    if (compile_success == GL_FALSE)
    {
        printf("Compilation of %s failed!\n Reason:\n", filename);

        GLint maxLength = 0;
        glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &maxLength);

        char errorLog[maxLength];
        glGetShaderInfoLog(shader, maxLength, &maxLength, &errorLog[0]);

        printf("%s", errorLog);

        glDeleteShader(shader); 
        exit(1);
    }

    glAttachShader(new_shaderprogram, shader);
    glDeleteShader(shader);

    free(data);  
}

GLuint ShaderCreate(const char *vshader_filename, const char *fshader_filename)
{
    printf("Loading GLSL shaders %s %s\n", vshader_filename, fshader_filename);

    GLuint new_shaderprogram = glCreateProgram();

    CHECK_GLERROR();
    if (vshader_filename)
    {
        readAndCompileShaderFromGLSLFile(new_shaderprogram, vshader_filename, GL_VERTEX_SHADER);
    }

    CHECK_GLERROR();
    if (fshader_filename)
    {
        readAndCompileShaderFromGLSLFile(new_shaderprogram, fshader_filename, GL_FRAGMENT_SHADER);
    }

    CHECK_GLERROR();

    glLinkProgram(new_shaderprogram);

    CHECK_GLERROR();
    GLint link_success;
    glGetProgramiv(new_shaderprogram, GL_LINK_STATUS, &link_success);

    if (link_success == GL_FALSE)
    {
        printf("Linking of %s with %s failed!\n Reason:\n", vshader_filename, fshader_filename);

        GLint maxLength = 0;
        glGetShaderiv(new_shaderprogram, GL_INFO_LOG_LENGTH, &maxLength);

        char errorLog[maxLength];
        glGetShaderInfoLog(new_shaderprogram, maxLength, &maxLength, &errorLog[0]);

        printf("%s", errorLog);

        exit(EXIT_FAILURE);
    }

    return new_shaderprogram;
}

//===========================================================================
// InitGraphicsState() - initialize OpenGL
//===========================================================================
static void InitGraphicsState(void)
{
    char *GL_version=(char *)glGetString(GL_VERSION);
    char *GL_vendor=(char *)glGetString(GL_VENDOR);
    char *GL_renderer=(char *)glGetString(GL_RENDERER);

    printf("Version: %s\n", GL_version);
    printf("Vendor: %s\n", GL_vendor);
    printf("Renderer: %s\n", GL_renderer);

    // RENDERING SETUP (OpenGL ES or OpenGL Core Profile!)
    glGenVertexArrays(1, &mesh_vao);  // Features' Vertex Array Object allocation
    glBindVertexArray(mesh_vao); // bind VAO

    // initialize buffer object
    glGenBuffers(1, &mesh_vbo);
    glBindBuffer(GL_ARRAY_BUFFER, mesh_vbo);

    unsigned int size = mesh_width * mesh_height * 4 * sizeof(float);
    glBufferData(GL_ARRAY_BUFFER, size, NULL, GL_DYNAMIC_DRAW);
    glVertexAttribPointer((GLuint)0, 4, GL_FLOAT, GL_FALSE, 0, 0);
    glEnableVertexAttribArray(0); 

    checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, mesh_vbo, cudaGraphicsMapFlagsNone));

    // GLSL stuff
    char *vertex_shader_path = sdkFindFilePath("mesh.vert.glsl", pArgv[0]);
    char *fragment_shader_path = sdkFindFilePath("mesh.frag.glsl", pArgv[0]);

    if (vertex_shader_path == NULL || fragment_shader_path == NULL)
    {
        printf("Error finding shader file\n");
        exit(EXIT_FAILURE);
    }

    mesh_shader = ShaderCreate(vertex_shader_path, fragment_shader_path);
    CHECK_GLERROR();

    free(vertex_shader_path);
    free(fragment_shader_path);

    glUseProgram(mesh_shader);
}


////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
bool runTest(int argc, char **argv, char *ref_file)
{
    // command line mode only
    if (ref_file != NULL)
    {
        // This will pick the best possible CUDA capable device
        //int devID = findCudaDevice(argc, (const char **)argv);
#if defined (__aarch64__) || defined(__arm__)
        // find iGPU on the system which is compute capable which will perform GLES-CUDA interop
        int devID = findIntegratedGPU();
#else
        // use command-line specified CUDA device, otherwise use device with highest Gflops/s
        int devID = findCudaDevice(argc, (const char **)argv);
#endif

        // create VBO
        checkCudaErrors(cudaMalloc((void **)&d_vbo_buffer, mesh_width*mesh_height*4*sizeof(float)));

        // run the cuda part
        runAutoTest(devID, argv, ref_file);

        // check result of Cuda step
        checkResultCuda(argc, argv, mesh_vbo);

        cudaFree(d_vbo_buffer);
        d_vbo_buffer = NULL;
    }
    else
    {
        double endTime = TIME_LIMIT;

        // this would use command-line specified CUDA device, note that CUDA defaults to highest Gflops/s device
        if (checkCmdLineFlag(argc, (const char **)argv, "device"))
        {
            error_exit("Device setting not yet implemented!\n");
        }

        // display selection 
        if (checkCmdLineFlag(argc, (const char **)argv, "dispno"))
        {
            dispno = getCmdLineArgumentInt(argc, (const char **)argv, "dispno");
        }

        // Window width 
        if (checkCmdLineFlag(argc, (const char **)argv, "width"))
        {
            window_width = getCmdLineArgumentInt(argc, (const char **)argv, "width");
        }

        // Window Height 
        if (checkCmdLineFlag(argc, (const char **)argv, "height"))
        {
            window_height = getCmdLineArgumentInt(argc, (const char **)argv, "height");
        }

        // Determine how long to run for in secs: default is 10s
        if (checkCmdLineFlag(argc, (const char **)argv, "runtime"))
        {
            endTime = getCmdLineArgumentInt(argc, (const char **)argv, "runtime");
        }

        SetCloseCB(closeCB_app);
        SetKeyCB(keyCB_app);

        // create QNX screen window and set up associated OpenGL ES context
        graphics_setup_window(0,0, window_width, window_height, sSDKsample, dispno);

#if defined (__aarch64__) || defined(__arm__)
        // find iGPU on the system which is compute capable which will perform GLES-CUDA interop
        int devID = findIntegratedGPU();
#else
        // use command-line specified CUDA device, otherwise use device with highest Gflops/s
        int devID = findCudaDevice(argc, (const char **)argv);
#endif
        InitGraphicsState(); // set up GLES stuff

        glClearColor( 0, 0.5, 1, 1 ); // blue-ish background
        glClear( GL_COLOR_BUFFER_BIT );

        graphics_swap_buffers();

        int frame = 0;

        struct timeval begin, end;
        gettimeofday(&begin, NULL);

        // Print runtime
        if (endTime < 0.0) {
            endTime = TIME_LIMIT;
            printf(" running forever...\n");
        } else {
            printf(" running for %f seconds...\n", endTime);
        }

        while (!shutdown)
        {
            frame++;
            display_thisframe(0.010);
            usleep(1000);
            graphics_swap_buffers();
            CheckEvents();

            gettimeofday(&end, 0);
            double elapsed = (end.tv_sec - begin.tv_sec) + ((end.tv_usec - begin.tv_usec)/1000000.0);

            // Check whether time limit has been exceeded
            if (!shutdown) shutdown = (endTime <= elapsed);
        }

        // NOTE: Before destroying OpenGL ES context, must unregister all shared resources from CUDA !
        checkCudaErrors(cudaGraphicsUnregisterResource(cuda_vbo_resource));

        graphics_close_window(); // close window and destroy OpenGL ES context
    }

    return true;
}

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    char *ref_file = NULL;

    pArgc = &argc;
    pArgv = argv;

#if defined(__linux__)
    setenv ("DISPLAY", ":0", 0);
#endif

    printf("%s starting...\n", sSDKsample);

    if (argc > 1)
    {
        if (checkCmdLineFlag(argc, (const char **)argv, "file"))
        {
            // In this mode, we run without OpenGL and see if VBO is generated correctly
            getCmdLineArgumentString(argc, (const char **)argv, "file", (char **)&ref_file);
        }
    }

    printf("\n");

    runTest(argc, argv, ref_file);

    printf("%s completed, returned %s\n", sSDKsample, (g_TotalErrors == 0) ? "OK" : "ERROR!");

    exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
}