/* * Copyright 1993-2019 NVIDIA Corporation. All rights reserved. * * Please refer to the NVIDIA end user license agreement (EULA) associated * with this source code for terms and conditions that govern your use of * this software. Any use, reproduction, disclosure, or distribution of * this software and related documentation outside the terms of the EULA * is strictly prohibited. * */ /* NVIDIA HW Decoder, both dGPU and Tegra, normally outputs NV12 pitch format frames. For the inference using TensorRT, the input frame needs to be BGR planar format with possibly different size. So, conversion and resizing from NV12 to BGR planar is usually required for the inference following decoding. This CUDA code is to provide a reference implementation for conversion and resizing. Limitaion ========= NV12resize needs the height to be a even value. Note ==== Resize function needs the pitch of image buffer to be 32 alignment. Run ==== ./NV12toBGRandResize OR ./NV12toBGRandResize -input=data/test1920x1080.nv12 -width=1920 -height=1080 \ -dst_width=640 -dst_height=480 -batch=40 -device=0 */ #include #include #include #include #include #include #include #include #include #include #include #include "resize_convert.h" #include "utils.h" #define TEST_LOOP 20 typedef struct _nv12_to_bgr24_context_t { int width; int height; int pitch; int dst_width; int dst_height; int dst_pitch; int batch; int device; // cuda device ID char *input_nv12_file; int ctx_pitch; // the value will be suitable for Texture memroy. int ctx_heights; // the value will be even. } nv12_to_bgr24_context; nv12_to_bgr24_context g_ctx; static void printHelp(const char *app_name) { std::cout << "Usage:" << app_name << " [options]\n\n"; std::cout << "OPTIONS:\n"; std::cout << "\t-h,--help\n\n"; std::cout << "\t-input=nv12file nv12 input file\n"; std::cout << "\t-width=width input nv12 image width, <1 -- 4096>\n"; std::cout << "\t-height=height input nv12 image height, <1 -- 4096>\n"; std::cout << "\t-pitch=pitch(optional) input nv12 image pitch, <0 -- 4096>\n"; std::cout << "\t-dst_width=width output BGR image width, <1 -- 4096>\n"; std::cout << "\t-dst_height=height output BGR image height, <1 -- 4096>\n"; std::cout << "\t-dst_pitch=pitch(optional) output BGR image pitch, <0 -- 4096>\n"; std::cout << "\t-batch=batch process frames count, <1 -- 4096>\n\n"; std::cout << "\t-device=device_num(optional) cuda device number, <0 -- 4096>\n\n"; return; } int parseCmdLine(int argc, char *argv[]) { char **argp = (char **)argv; char *arg = (char *)argv[0]; memset(&g_ctx, 0, sizeof(g_ctx)); if ((arg && (!strcmp(arg, "-h") || !strcmp(arg, "--help")))) { printHelp(argv[0]); return -1; } if (argc == 1) { // Run using default arguments g_ctx.input_nv12_file = sdkFindFilePath("test1920x1080.nv12", argv[0]); if (g_ctx.input_nv12_file == NULL) { printf("Cannot find input file test1920x1080.nv12\n Exiting\n"); return EXIT_FAILURE; } g_ctx.width = 1920; g_ctx.height = 1080; g_ctx.dst_width = 640; g_ctx.dst_height = 480; g_ctx.batch = 24; } else if (argc > 1) { if (checkCmdLineFlag(argc, (const char **)argv, "width")) { g_ctx.width = getCmdLineArgumentInt(argc, (const char **)argv, "width"); } if (checkCmdLineFlag(argc, (const char **)argv, "height")) { g_ctx.height = getCmdLineArgumentInt(argc, (const char **)argv, "height"); } if (checkCmdLineFlag(argc, (const char **)argv, "pitch")) { g_ctx.pitch = getCmdLineArgumentInt(argc, (const char **)argv, "pitch"); } if (checkCmdLineFlag(argc, (const char **)argv, "input")) { getCmdLineArgumentString(argc, (const char **)argv, "input", (char **)&g_ctx.input_nv12_file); } if (checkCmdLineFlag(argc, (const char **)argv, "dst_width")) { g_ctx.dst_width = getCmdLineArgumentInt(argc, (const char **)argv, "dst_width"); } if (checkCmdLineFlag(argc, (const char **)argv, "dst_height")) { g_ctx.dst_height = getCmdLineArgumentInt(argc, (const char **)argv, "dst_height"); } if (checkCmdLineFlag(argc, (const char **)argv, "dst_pitch")) { g_ctx.dst_pitch = getCmdLineArgumentInt(argc, (const char **)argv, "dst_pitch"); } if (checkCmdLineFlag(argc, (const char **)argv, "batch")) { g_ctx.batch = getCmdLineArgumentInt(argc, (const char **)argv, "batch"); } } g_ctx.device = findCudaDevice(argc, (const char **)argv); if ((g_ctx.width == 0) || (g_ctx.height == 0) || (g_ctx.dst_width == 0) || (g_ctx.dst_height == 0) || !g_ctx.input_nv12_file) { printHelp(argv[0]); return -1; } if (g_ctx.pitch == 0) g_ctx.pitch = g_ctx.width; if (g_ctx.dst_pitch == 0) g_ctx.dst_pitch = g_ctx.dst_width; return 0; } /* load nv12 yuvfile data into GPU device memory with batch of copy */ static int loadNV12Frame(unsigned char *d_inputNV12) { unsigned char *pNV12FrameData; unsigned char *d_nv12; int frameSize; std::ifstream nv12File(g_ctx.input_nv12_file, std::ifstream::in | std::ios::binary); if (!nv12File.is_open()) { std::cerr << "Can't open files\n"; return -1; } frameSize = g_ctx.pitch * g_ctx.ctx_heights; #if USE_UVM_MEM pNV12FrameData = d_inputNV12; #else pNV12FrameData = (unsigned char *)malloc(frameSize); if (pNV12FrameData == NULL) { std::cerr << "Failed to malloc pNV12FrameData\n"; return -1; } #endif nv12File.read((char *)pNV12FrameData, frameSize); if (nv12File.gcount() < frameSize) { std::cerr << "can't get one frame!\n"; return -1; } #if USE_UVM_MEM // Prefetch to GPU for following GPU operation cudaStreamAttachMemAsync(NULL, pNV12FrameData, 0, cudaMemAttachGlobal); #endif // expand one frame to multi frames for batch processing d_nv12 = d_inputNV12; for (int i = 0; i < g_ctx.batch; i++) { checkCudaErrors(cudaMemcpy2D((void *)d_nv12, g_ctx.ctx_pitch, pNV12FrameData, g_ctx.width, g_ctx.width, g_ctx.ctx_heights, cudaMemcpyHostToDevice)); d_nv12 += g_ctx.ctx_pitch * g_ctx.ctx_heights; } #if (USE_UVM_MEM == 0) free(pNV12FrameData); #endif nv12File.close(); return 0; } /* 1. resize interlace nv12 to target size 2. convert nv12 to bgr 3 progressive planars */ void nv12ResizeAndNV12ToBGR(unsigned char *d_inputNV12) { unsigned char *d_resizedNV12; float *d_outputBGR; int size; char filename[40]; /* allocate device memory for resized nv12 output */ size = g_ctx.dst_width * (int)ceil(g_ctx.dst_height * 3.0f / 2.0f) * g_ctx.batch * sizeof(unsigned char); checkCudaErrors(cudaMalloc((void **)&d_resizedNV12, size)); /* allocate device memory for bgr output */ size = g_ctx.dst_pitch * g_ctx.dst_height * 3 * g_ctx.batch * sizeof(float); checkCudaErrors(cudaMalloc((void **)&d_outputBGR, size)); cudaStream_t stream; checkCudaErrors(cudaStreamCreate(&stream)); /* create cuda event handles */ cudaEvent_t start, stop; checkCudaErrors(cudaEventCreate(&start)); checkCudaErrors(cudaEventCreate(&stop)); float elapsedTime = 0.0f; /* resize interlace nv12 */ cudaEventRecord(start, 0); for (int i = 0; i < TEST_LOOP; i++) { resizeNV12Batch(d_inputNV12, g_ctx.ctx_pitch, g_ctx.width, g_ctx.height, d_resizedNV12, g_ctx.dst_width, g_ctx.dst_width, g_ctx.dst_height, g_ctx.batch); } checkCudaErrors(cudaEventRecord(stop, 0)); checkCudaErrors(cudaEventSynchronize(stop)); checkCudaErrors(cudaEventElapsedTime(&elapsedTime, start, stop)); printf( " CUDA resize nv12(%dx%d --> %dx%d), batch: %d," " average time: %.3f ms ==> %.3f ms/frame\n", g_ctx.width, g_ctx.height, g_ctx.dst_width, g_ctx.dst_height, g_ctx.batch, (elapsedTime / (TEST_LOOP * 1.0f)), (elapsedTime / (TEST_LOOP * 1.0f)) / g_ctx.batch); SPRINTF(filename, "resized_nv12_%dx%d", g_ctx.dst_width, g_ctx.dst_height); /* convert nv12 to bgr 3 progressive planars */ checkCudaErrors(cudaEventRecord(start, 0)); for (int i = 0; i < TEST_LOOP; i++) { nv12ToBGRplanarBatch(d_resizedNV12, g_ctx.dst_pitch, // intput d_outputBGR, g_ctx.dst_pitch * sizeof(float), // output g_ctx.dst_width, g_ctx.dst_height, // output g_ctx.batch, 0); } checkCudaErrors(cudaEventRecord(stop, 0)); checkCudaErrors(cudaEventSynchronize(stop)); checkCudaErrors(cudaEventElapsedTime(&elapsedTime, start, stop)); printf( " CUDA convert nv12(%dx%d) to bgr(%dx%d), batch: %d," " average time: %.3f ms ==> %.3f ms/frame\n", g_ctx.dst_width, g_ctx.dst_height, g_ctx.dst_width, g_ctx.dst_height, g_ctx.batch, (elapsedTime / (TEST_LOOP * 1.0f)), (elapsedTime / (TEST_LOOP * 1.0f)) / g_ctx.batch); SPRINTF(filename, "converted_bgr_%dx%d", g_ctx.dst_width, g_ctx.dst_height); dumpBGR(d_outputBGR, g_ctx.dst_pitch, g_ctx.dst_width, g_ctx.dst_height, g_ctx.batch, (char *)"t1", filename); /* release resources */ checkCudaErrors(cudaEventDestroy(start)); checkCudaErrors(cudaEventDestroy(stop)); checkCudaErrors(cudaStreamDestroy(stream)); checkCudaErrors(cudaFree(d_resizedNV12)); checkCudaErrors(cudaFree(d_outputBGR)); } /* 1. convert nv12 to bgr 3 progressive planars 2. resize bgr 3 planars to target size */ void nv12ToBGRandBGRresize(unsigned char *d_inputNV12) { float *d_bgr; float *d_resizedBGR; int size; char filename[40]; /* allocate device memory for bgr output */ size = g_ctx.ctx_pitch * g_ctx.height * 3 * g_ctx.batch * sizeof(float); checkCudaErrors(cudaMalloc((void **)&d_bgr, size)); /* allocate device memory for resized bgr output */ size = g_ctx.dst_width * g_ctx.dst_height * 3 * g_ctx.batch * sizeof(float); checkCudaErrors(cudaMalloc((void **)&d_resizedBGR, size)); cudaStream_t stream; checkCudaErrors(cudaStreamCreate(&stream)); /* create cuda event handles */ cudaEvent_t start, stop; checkCudaErrors(cudaEventCreate(&start)); checkCudaErrors(cudaEventCreate(&stop)); float elapsedTime = 0.0f; /* convert interlace nv12 to bgr 3 progressive planars */ checkCudaErrors(cudaEventRecord(start, 0)); checkCudaErrors(cudaDeviceSynchronize()); for (int i = 0; i < TEST_LOOP; i++) { nv12ToBGRplanarBatch(d_inputNV12, g_ctx.ctx_pitch, d_bgr, g_ctx.ctx_pitch * sizeof(float), g_ctx.width, g_ctx.height, g_ctx.batch, 0); } checkCudaErrors(cudaEventRecord(stop, 0)); checkCudaErrors(cudaEventSynchronize(stop)); checkCudaErrors(cudaEventElapsedTime(&elapsedTime, start, stop)); printf( " CUDA convert nv12(%dx%d) to bgr(%dx%d), batch: %d," " average time: %.3f ms ==> %.3f ms/frame\n", g_ctx.width, g_ctx.height, g_ctx.width, g_ctx.height, g_ctx.batch, (elapsedTime / (TEST_LOOP * 1.0f)), (elapsedTime / (TEST_LOOP * 1.0f)) / g_ctx.batch); SPRINTF(filename, "converted_bgr_%dx%d", g_ctx.width, g_ctx.height); /* resize bgr 3 progressive planars */ checkCudaErrors(cudaEventRecord(start, 0)); for (int i = 0; i < TEST_LOOP; i++) { resizeBGRplanarBatch(d_bgr, g_ctx.ctx_pitch, g_ctx.width, g_ctx.height, d_resizedBGR, g_ctx.dst_width, g_ctx.dst_width, g_ctx.dst_height, g_ctx.batch); } checkCudaErrors(cudaEventRecord(stop, 0)); checkCudaErrors(cudaEventSynchronize(stop)); checkCudaErrors(cudaEventElapsedTime(&elapsedTime, start, stop)); printf( " CUDA resize bgr(%dx%d --> %dx%d), batch: %d," " average time: %.3f ms ==> %.3f ms/frame\n", g_ctx.width, g_ctx.height, g_ctx.dst_width, g_ctx.dst_height, g_ctx.batch, (elapsedTime / (TEST_LOOP * 1.0f)), (elapsedTime / (TEST_LOOP * 1.0f)) / g_ctx.batch); memset(filename, 0, sizeof(filename)); SPRINTF(filename, "resized_bgr_%dx%d", g_ctx.dst_width, g_ctx.dst_height); dumpBGR(d_resizedBGR, g_ctx.dst_pitch, g_ctx.dst_width, g_ctx.dst_height, g_ctx.batch, (char *)"t2", filename); /* release resources */ checkCudaErrors(cudaEventDestroy(start)); checkCudaErrors(cudaEventDestroy(stop)); checkCudaErrors(cudaStreamDestroy(stream)); checkCudaErrors(cudaFree(d_bgr)); checkCudaErrors(cudaFree(d_resizedBGR)); } int main(int argc, char *argv[]) { unsigned char *d_inputNV12; if (parseCmdLine(argc, argv) < 0) return EXIT_FAILURE; g_ctx.ctx_pitch = g_ctx.width; int ctx_alignment = 32; g_ctx.ctx_pitch += (g_ctx.ctx_pitch % ctx_alignment != 0) ? (ctx_alignment - g_ctx.ctx_pitch % ctx_alignment) : 0; g_ctx.ctx_heights = (int)ceil(g_ctx.height * 3.0f / 2.0f); /* load nv12 yuv data into d_inputNV12 with batch of copies */ #if USE_UVM_MEM checkCudaErrors(cudaMallocManaged( (void **)&d_inputNV12, (g_ctx.ctx_pitch * g_ctx.ctx_heights * g_ctx.batch), cudaMemAttachHost)); printf("\nUSE_UVM_MEM\n"); #else checkCudaErrors( cudaMalloc((void **)&d_inputNV12, (g_ctx.ctx_pitch * g_ctx.ctx_heights * g_ctx.batch))); #endif if (loadNV12Frame(d_inputNV12)) { std::cerr << "failed to load batch data!\n"; return EXIT_FAILURE; } /* firstly resize nv12, then convert nv12 to bgr */ printf("\nTEST#1:\n"); nv12ResizeAndNV12ToBGR(d_inputNV12); /* first convert nv12 to bgr, then resize bgr */ printf("\nTEST#2:\n"); nv12ToBGRandBGRresize(d_inputNV12); checkCudaErrors(cudaFree(d_inputNV12)); return EXIT_SUCCESS; }