/** * Copyright 1993-2015 NVIDIA Corporation. All rights reserved. * * Please refer to the NVIDIA end user license agreement (EULA) associated * with this source code for terms and conditions that govern your use of * this software. Any use, reproduction, disclosure, or distribution of * this software and related documentation outside the terms of the EULA * is strictly prohibited. * */ /* Bicubic filtering See GPU Gems 2: "Fast Third-Order Texture Filtering", Sigg & Hadwiger http://http.developer.nvidia.com/GPUGems2/gpugems2_chapter20.html Reformulation thanks to Keenan Crane */ #ifndef _BICUBICTEXTURE_KERNEL_CUH_ #define _BICUBICTEXTURE_KERNEL_CUH_ enum Mode { MODE_NEAREST, MODE_BILINEAR, MODE_BICUBIC, MODE_FAST_BICUBIC, MODE_CATROM }; cudaTextureObject_t texObjPoint, texObjLinear; // w0, w1, w2, and w3 are the four cubic B-spline basis functions __host__ __device__ float w0(float a) { // return (1.0f/6.0f)*(-a*a*a + 3.0f*a*a - 3.0f*a + 1.0f); return (1.0f/6.0f)*(a*(a*(-a + 3.0f) - 3.0f) + 1.0f); // optimized } __host__ __device__ float w1(float a) { // return (1.0f/6.0f)*(3.0f*a*a*a - 6.0f*a*a + 4.0f); return (1.0f/6.0f)*(a*a*(3.0f*a - 6.0f) + 4.0f); } __host__ __device__ float w2(float a) { // return (1.0f/6.0f)*(-3.0f*a*a*a + 3.0f*a*a + 3.0f*a + 1.0f); return (1.0f/6.0f)*(a*(a*(-3.0f*a + 3.0f) + 3.0f) + 1.0f); } __host__ __device__ float w3(float a) { return (1.0f/6.0f)*(a*a*a); } // g0 and g1 are the two amplitude functions __device__ float g0(float a) { return w0(a) + w1(a); } __device__ float g1(float a) { return w2(a) + w3(a); } // h0 and h1 are the two offset functions __device__ float h0(float a) { // note +0.5 offset to compensate for CUDA linear filtering convention return -1.0f + w1(a) / (w0(a) + w1(a)) + 0.5f; } __device__ float h1(float a) { return 1.0f + w3(a) / (w2(a) + w3(a)) + 0.5f; } // filter 4 values using cubic splines template __device__ T cubicFilter(float x, T c0, T c1, T c2, T c3) { T r; r = c0 * w0(x); r += c1 * w1(x); r += c2 * w2(x); r += c3 * w3(x); return r; } // slow but precise bicubic lookup using 16 texture lookups template // texture data type, return type __device__ R tex2DBicubic(const cudaTextureObject_t tex, float x, float y) { x -= 0.5f; y -= 0.5f; float px = floorf(x); float py = floorf(y); float fx = x - px; float fy = y - py; return cubicFilter(fy, cubicFilter(fx, tex2D(tex, px-1, py-1), tex2D(tex, px, py-1), tex2D(tex, px+1, py-1), tex2D(tex, px+2,py-1)), cubicFilter(fx, tex2D(tex, px-1, py), tex2D(tex, px, py), tex2D(tex, px+1, py), tex2D(tex, px+2, py)), cubicFilter(fx, tex2D(tex, px-1, py+1), tex2D(tex, px, py+1), tex2D(tex, px+1, py+1), tex2D(tex, px+2, py+1)), cubicFilter(fx, tex2D(tex, px-1, py+2), tex2D(tex, px, py+2), tex2D(tex, px+1, py+2), tex2D(tex, px+2, py+2)) ); } // fast bicubic texture lookup using 4 bilinear lookups // assumes texture is set to non-normalized coordinates, point sampling template // texture data type, return type __device__ R tex2DFastBicubic(const cudaTextureObject_t tex, float x, float y) { x -= 0.5f; y -= 0.5f; float px = floorf(x); float py = floorf(y); float fx = x - px; float fy = y - py; // note: we could store these functions in a lookup table texture, but maths is cheap float g0x = g0(fx); float g1x = g1(fx); float h0x = h0(fx); float h1x = h1(fx); float h0y = h0(fy); float h1y = h1(fy); R r = g0(fy) * (g0x * tex2D(tex, px + h0x, py + h0y) + g1x * tex2D(tex, px + h1x, py + h0y)) + g1(fy) * (g0x * tex2D(tex, px + h0x, py + h1y) + g1x * tex2D(tex, px + h1x, py + h1y)); return r; } // higher-precision 2D bilinear lookup template // texture data type, return type __device__ R tex2DBilinear(const cudaTextureObject_t tex, float x, float y) { x -= 0.5f; y -= 0.5f; float px = floorf(x); // integer position float py = floorf(y); float fx = x - px; // fractional position float fy = y - py; px += 0.5f; py += 0.5f; return lerp(lerp(tex2D(tex, px, py), tex2D(tex, px + 1.0f, py), fx), lerp(tex2D(tex, px, py + 1.0f), tex2D(tex, px + 1.0f, py + 1.0f), fx), fy); } #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 200 /* bilinear 2D texture lookup using tex2Dgather function - tex2Dgather() returns the four neighbouring samples in a single texture lookup - it is only supported on the Fermi architecture - you can select which component to fetch using the "comp" parameter - it can be used to efficiently implement custom texture filters The samples are returned in a 4-vector in the following order: x: (0, 1) y: (1, 1) z: (1, 0) w: (0, 0) */ template // texture data type, return type __device__ float tex2DBilinearGather(const cudaTextureObject_t tex, float x, float y, int comp=0) { x -= 0.5f; y -= 0.5f; float px = floorf(x); // integer position float py = floorf(y); float fx = x - px; // fractional position float fy = y - py; R samples = tex2Dgather(tex, px + 0.5f, py + 0.5f, comp); return lerp(lerp((float) samples.w, (float) samples.z, fx), lerp((float) samples.x, (float) samples.y, fx), fy); } #endif // Catmull-Rom interpolation __host__ __device__ float catrom_w0(float a) { //return -0.5f*a + a*a - 0.5f*a*a*a; return a*(-0.5f + a*(1.0f - 0.5f*a)); } __host__ __device__ float catrom_w1(float a) { //return 1.0f - 2.5f*a*a + 1.5f*a*a*a; return 1.0f + a*a*(-2.5f + 1.5f*a); } __host__ __device__ float catrom_w2(float a) { //return 0.5f*a + 2.0f*a*a - 1.5f*a*a*a; return a*(0.5f + a*(2.0f - 1.5f*a)); } __host__ __device__ float catrom_w3(float a) { //return -0.5f*a*a + 0.5f*a*a*a; return a*a*(-0.5f + 0.5f*a); } template __device__ T catRomFilter(float x, T c0, T c1, T c2, T c3) { T r; r = c0 * catrom_w0(x); r += c1 * catrom_w1(x); r += c2 * catrom_w2(x); r += c3 * catrom_w3(x); return r; } // Note - can't use bilinear trick here because of negative lobes template // texture data type, return type __device__ R tex2DCatRom(const cudaTextureObject_t tex, float x, float y) { x -= 0.5f; y -= 0.5f; float px = floorf(x); float py = floorf(y); float fx = x - px; float fy = y - py; return catRomFilter(fy, catRomFilter(fx, tex2D(tex, px-1, py-1), tex2D(tex, px, py-1), tex2D(tex, px+1, py-1), tex2D(tex, px+2,py-1)), catRomFilter(fx, tex2D(tex, px-1, py), tex2D(tex, px, py), tex2D(tex, px+1, py), tex2D(tex, px+2, py)), catRomFilter(fx, tex2D(tex, px-1, py+1), tex2D(tex, px, py+1), tex2D(tex, px+1, py+1), tex2D(tex, px+2, py+1)), catRomFilter(fx, tex2D(tex, px-1, py+2), tex2D(tex, px, py+2), tex2D(tex, px+1, py+2), tex2D(tex, px+2, py+2)) ); } // test functions // render image using normal bilinear texture lookup __global__ void d_render(uchar4 *d_output, uint width, uint height, float tx, float ty, float scale, float cx, float cy, cudaTextureObject_t texObj) { uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x; uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y; uint i = __umul24(y, width) + x; float u = (x-cx)*scale+cx + tx; float v = (y-cy)*scale+cy + ty; if ((x < width) && (y < height)) { // write output color float c = tex2D(texObj, u, v); //float c = tex2DBilinear(tex, u, v); //float c = tex2DBilinearGather(tex2, u, v, 0) / 255.0f; d_output[i] = make_uchar4(c * 0xff, c * 0xff, c * 0xff, 0); } } // render image using bicubic texture lookup __global__ void d_renderBicubic(uchar4 *d_output, uint width, uint height, float tx, float ty, float scale, float cx, float cy, cudaTextureObject_t texObj) { uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x; uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y; uint i = __umul24(y, width) + x; float u = (x-cx)*scale+cx + tx; float v = (y-cy)*scale+cy + ty; if ((x < width) && (y < height)) { // write output color float c = tex2DBicubic(texObj, u, v); d_output[i] = make_uchar4(c * 0xff, c * 0xff, c * 0xff, 0); } } // render image using fast bicubic texture lookup __global__ void d_renderFastBicubic(uchar4 *d_output, uint width, uint height, float tx, float ty, float scale, float cx, float cy, cudaTextureObject_t texObj) { uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x; uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y; uint i = __umul24(y, width) + x; float u = (x-cx)*scale+cx + tx; float v = (y-cy)*scale+cy + ty; if ((x < width) && (y < height)) { // write output color float c = tex2DFastBicubic(texObj, u, v); d_output[i] = make_uchar4(c * 0xff, c * 0xff, c * 0xff, 0); } } // render image using Catmull-Rom texture lookup __global__ void d_renderCatRom(uchar4 *d_output, uint width, uint height, float tx, float ty, float scale, float cx, float cy, cudaTextureObject_t texObj) { uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x; uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y; uint i = __umul24(y, width) + x; float u = (x-cx)*scale+cx + tx; float v = (y-cy)*scale+cy + ty; if ((x < width) && (y < height)) { // write output color float c = tex2DCatRom(texObj, u, v); d_output[i] = make_uchar4(c * 0xff, c * 0xff, c * 0xff, 0); } } #endif // _BICUBICTEXTURE_KERNEL_CUH_