#pragma warning(disable:4819) /* * Copyright 1993-2015 NVIDIA Corporation. All rights reserved. * * Please refer to the NVIDIA end user license agreement (EULA) associated * with this source code for terms and conditions that govern your use of * this software. Any use, reproduction, disclosure, or distribution of * this software and related documentation outside the terms of the EULA * is strictly prohibited. * */ /* Recursive Gaussian filter sgreen 8/1/08 This code sample implements a Gaussian blur using Deriche's recursive method: http://citeseer.ist.psu.edu/deriche93recursively.html This is similar to the box filter sample in the SDK, but it uses the previous outputs of the filter as well as the previous inputs. This is also known as an IIR (infinite impulse response) filter, since its response to an input impulse can last forever. The main advantage of this method is that the execution time is independent of the filter width. The GPU processes columns of the image in parallel. To avoid uncoalesced reads for the row pass we transpose the image and then transpose it back again afterwards. The implementation is based on code from the CImg library: http://cimg.sourceforge.net/ Thanks to David Tschumperl� and all the CImg contributors! */ // OpenGL Graphics includes #include #if defined (__APPLE__) || defined(MACOSX) #pragma clang diagnostic ignored "-Wdeprecated-declarations" #include #ifndef glutCloseFunc #define glutCloseFunc glutWMCloseFunc #endif #else #include #endif // CUDA includes and interop headers #include #include // CUDA utilities and system includes #include #include // includes cuda.h and cuda_runtime_api.h // Includes #include #include #include #include #define MAX(a,b) ((a > b) ? a : b) #define USE_SIMPLE_FILTER 0 #define MAX_EPSILON_ERROR 5.0f #define THRESHOLD 0.15f // Define the files that are to be save and the reference images for validation const char *sOriginal[] = { "lena_10.ppm", "lena_14.ppm", "lena_18.ppm", "lena_22.ppm", NULL }; const char *sReference[] = { "ref_10.ppm", "ref_14.ppm", "ref_18.ppm", "ref_22.ppm", NULL }; const char *image_filename = "lena.ppm"; float sigma = 10.0f; int order = 0; int nthreads = 64; // number of threads per block unsigned int width, height; unsigned int *h_img = NULL; unsigned int *d_img = NULL; unsigned int *d_temp = NULL; GLuint pbo = 0; // OpenGL pixel buffer object GLuint texid = 0; // texture cudaGraphicsResource_t cuda_vbo_resource; StopWatchInterface *timer = 0; // Auto-Verification Code const int frameCheckNumber = 4; int fpsCount = 0; // FPS count for averaging int fpsLimit = 1; // FPS limit for sampling unsigned int frameCount = 0; int *pArgc = NULL; char **pArgv = NULL; bool runBenchmark = false; const char *sSDKsample = "CUDA Recursive Gaussian"; extern "C" void transpose(unsigned int *d_src, unsigned int *d_dest, unsigned int width, int height); extern "C" void gaussianFilterRGBA(unsigned int *d_src, unsigned int *d_dest, unsigned int *d_temp, int width, int height, float sigma, int order, int nthreads); void cleanup(); void computeFPS() { frameCount++; fpsCount++; if (fpsCount == fpsLimit) { char fps[256]; float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f); sprintf(fps, "%s (sigma=%4.2f): %3.1f fps", sSDKsample, sigma, ifps); glutSetWindowTitle(fps); fpsCount = 0; fpsLimit = ftoi(MAX(ifps, 1.f)); sdkResetTimer(&timer); } } // display results using OpenGL void display() { sdkStartTimer(&timer); // execute filter, writing results to pbo unsigned int *d_result; checkCudaErrors(cudaGraphicsMapResources(1, &cuda_vbo_resource, 0)); size_t num_bytes; checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&d_result, &num_bytes, cuda_vbo_resource)); gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads); // unmap buffer object checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_vbo_resource, 0)); // load texture from pbo glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); glBindTexture(GL_TEXTURE_2D, texid); glPixelStorei(GL_UNPACK_ALIGNMENT, 1); glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, GL_RGBA, GL_UNSIGNED_BYTE, 0); glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); // display results glClear(GL_COLOR_BUFFER_BIT); glEnable(GL_TEXTURE_2D); glDisable(GL_DEPTH_TEST); glBegin(GL_QUADS); glTexCoord2f(0, 1); glVertex2f(0, 0); glTexCoord2f(1, 1); glVertex2f(1, 0); glTexCoord2f(1, 0); glVertex2f(1, 1); glTexCoord2f(0, 0); glVertex2f(0, 1); glEnd(); glDisable(GL_TEXTURE_2D); glutSwapBuffers(); sdkStopTimer(&timer); computeFPS(); } void idle() { glutPostRedisplay(); } void cleanup() { sdkDeleteTimer(&timer); checkCudaErrors(cudaFree(d_img)); checkCudaErrors(cudaFree(d_temp)); if (!runBenchmark) { if (pbo) { // unregister this buffer object with CUDA checkCudaErrors(cudaGraphicsUnregisterResource(cuda_vbo_resource)); glDeleteBuffers(1, &pbo); } if (texid) { glDeleteTextures(1, &texid); } } } void keyboard(unsigned char key, int x, int y) { switch (key) { case 27: #if defined(__APPLE__) || defined(MACOSX) exit(EXIT_SUCCESS); #else glutDestroyWindow(glutGetWindow()); return; #endif break; case '=': sigma+=0.1f; break; case '-': sigma-=0.1f; if (sigma < 0.0) { sigma = 0.0f; } break; case '+': sigma+=1.0f; break; case '_': sigma-=1.0f; if (sigma < 0.0) { sigma = 0.0f; } break; case '0': order = 0; break; case '1': order = 1; sigma = 0.5f; break; case '2': order = 2; sigma = 0.5f; break; default: break; } printf("sigma = %f\n", sigma); glutPostRedisplay(); } void reshape(int x, int y) { glViewport(0, 0, x, y); glMatrixMode(GL_MODELVIEW); glLoadIdentity(); glMatrixMode(GL_PROJECTION); glLoadIdentity(); glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0); } void initCudaBuffers() { unsigned int size = width * height * sizeof(unsigned int); // allocate device memory checkCudaErrors(cudaMalloc((void **) &d_img, size)); checkCudaErrors(cudaMalloc((void **) &d_temp, size)); checkCudaErrors(cudaMemcpy(d_img, h_img, size, cudaMemcpyHostToDevice)); sdkCreateTimer(&timer); } void initGLBuffers() { // create pixel buffer object to store final image glGenBuffers(1, &pbo); glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB, width*height*sizeof(GLubyte)*4, h_img, GL_STREAM_DRAW_ARB); glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, pbo, cudaGraphicsRegisterFlagsWriteDiscard)); // create texture for display glGenTextures(1, &texid); glBindTexture(GL_TEXTURE_2D, texid); glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, width, height, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); glBindTexture(GL_TEXTURE_2D, 0); } void initGL(int *argc, char **argv) { glutInit(argc, argv); glutInitDisplayMode(GLUT_RGB | GLUT_DOUBLE); glutInitWindowSize(width, height); glutCreateWindow(sSDKsample); glutDisplayFunc(display); glutKeyboardFunc(keyboard); glutReshapeFunc(reshape); glutIdleFunc(idle); #if defined (__APPLE__) || defined(MACOSX) atexit(cleanup); #else glutCloseFunc(cleanup); #endif printf("Press '+' and '-' to change filter width\n"); printf("0, 1, 2 - change filter order\n"); if (!isGLVersionSupported(2,0) || !areGLExtensionsSupported("GL_ARB_vertex_buffer_object GL_ARB_pixel_buffer_object")) { fprintf(stderr, "Required OpenGL extensions missing."); exit(EXIT_FAILURE); } } void benchmark(int iterations) { // allocate memory for result unsigned int *d_result; unsigned int size = width * height * sizeof(unsigned int); checkCudaErrors(cudaMalloc((void **) &d_result, size)); // warm-up gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads); checkCudaErrors(cudaDeviceSynchronize()); sdkStartTimer(&timer); // execute the kernel for (int i = 0; i < iterations; i++) { gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads); } checkCudaErrors(cudaDeviceSynchronize()); sdkStopTimer(&timer); // check if kernel execution generated an error getLastCudaError("Kernel execution failed"); printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); printf("%.2f Mpixels/sec\n", (width*height*iterations / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6); checkCudaErrors(cudaFree(d_result)); } bool runSingleTest(const char *ref_file, const char *exec_path) { // allocate memory for result int nTotalErrors = 0; unsigned int *d_result; unsigned int size = width * height * sizeof(unsigned int); checkCudaErrors(cudaMalloc((void **) &d_result, size)); // warm-up gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads); checkCudaErrors(cudaDeviceSynchronize()); sdkStartTimer(&timer); gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads); checkCudaErrors(cudaDeviceSynchronize()); getLastCudaError("Kernel execution failed"); sdkStopTimer(&timer); unsigned char *h_result = (unsigned char *)malloc(width*height*4); checkCudaErrors(cudaMemcpy(h_result, d_result, width*height*4, cudaMemcpyDeviceToHost)); char dump_file[1024]; sprintf(dump_file, "lena_%02d.ppm", (int)sigma); sdkSavePPM4ub(dump_file, h_result, width, height); if (!sdkComparePPM(dump_file, sdkFindFilePath(ref_file, exec_path), MAX_EPSILON_ERROR, THRESHOLD, false)) { nTotalErrors++; } printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); printf("%.2f Mpixels/sec\n", (width*height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6); checkCudaErrors(cudaFree(d_result)); free(h_result); printf("Summary: %d errors!\n", nTotalErrors); printf(nTotalErrors == 0 ? "Test passed\n": "Test failed!\n"); return (nTotalErrors == 0); } //////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { pArgc = &argc; pArgv = argv; char *ref_file = NULL; #if defined(__linux__) setenv ("DISPLAY", ":0", 0); #endif printf("%s Starting...\n\n", sSDKsample); printf("NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n\n"); // use command-line specified CUDA device, otherwise use device with highest Gflops/s if (argc > 1) { if (checkCmdLineFlag(argc, (const char **)argv, "file")) { getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file); fpsLimit = frameCheckNumber; } } // Get the path of the filename char *filename; if (getCmdLineArgumentString(argc, (const char **) argv, "image", &filename)) { image_filename = filename; } // load image char *image_path = sdkFindFilePath(image_filename, argv[0]); if (image_path == NULL) { fprintf(stderr, "Error unable to find and load image file: '%s'\n", image_filename); exit(EXIT_FAILURE); } sdkLoadPPM4ub(image_path, (unsigned char **)&h_img, &width, &height); if (!h_img) { printf("Error unable to load PPM file: '%s'\n", image_path); exit(EXIT_FAILURE); } printf("Loaded '%s', %d x %d pixels\n", image_path, width, height); if (checkCmdLineFlag(argc, (const char **)argv, "threads")) { nthreads = getCmdLineArgumentInt(argc, (const char **) argv, "threads"); } if (checkCmdLineFlag(argc, (const char **)argv, "sigma")) { sigma = getCmdLineArgumentFloat(argc, (const char **) argv, "sigma"); } runBenchmark = checkCmdLineFlag(argc, (const char **) argv, "benchmark"); int device; struct cudaDeviceProp prop; cudaGetDevice(&device); cudaGetDeviceProperties(&prop, device); if (!strncmp("Tesla", prop.name, 5)) { printf("Tesla card detected, running the test in benchmark mode (no OpenGL display)\n"); // runBenchmark = true; runBenchmark = true; } // Benchmark or AutoTest mode detected, no OpenGL if (runBenchmark == true || ref_file != NULL) { findCudaDevice(argc, (const char **)argv); } else { // First initialize OpenGL context, and then select CUDA device. initGL(&argc, argv); findCudaDevice(argc, (const char **)argv); } initCudaBuffers(); if (ref_file) { printf("(Automated Testing)\n"); bool testPassed = runSingleTest(ref_file, argv[0]); cleanup(); exit(testPassed ? EXIT_SUCCESS : EXIT_FAILURE); } if (runBenchmark) { printf("(Run Benchmark)\n"); benchmark(100); cleanup(); exit(EXIT_SUCCESS); } initGLBuffers(); glutMainLoop(); exit(EXIT_SUCCESS); }