/*
 * Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

///////////////////////////////////////////////////////////////////////////////
// CPU Fast Walsh Transform
//
// Copies the 2^log2N input samples into h_Output and then transforms
// h_Output in place with log2N butterfly passes. No normalization is applied.
//
//   h_Output  receives the 2^log2N transformed values
//   h_Input   the 2^log2N samples to transform (only read)
//   log2N     base-2 logarithm of the vector length
///////////////////////////////////////////////////////////////////////////////
extern "C" void fwtCPU(float *h_Output, float *h_Input, int log2N)
{
    const int length = 1 << log2N;

    // Seed the output with the input; every later pass works on h_Output only.
    for (int k = 0; k < length; ++k)
    {
        h_Output[k] = h_Input[k];
    }

    // One pass per stage: the distance between butterfly partners starts at
    // half the vector length and is halved after each pass.
    int half = length / 2;

    while (half >= 1)
    {
        const int span = 2 * half;

        // Walk over consecutive subvectors of `span` elements.
        for (int start = 0; start < length; start += span)
        {
            float *lower = h_Output + start;
            float *upper = lower + half;

            // Pair element k of the lower half with element k of the upper half.
            for (int k = 0; k < half; ++k)
            {
                const float a = lower[k];
                const float b = upper[k];

                lower[k] = a + b;
                upper[k] = a - b;
            }
        }

        half >>= 1;
    }
}

///////////////////////////////////////////////////////////////////////////////
// Straightforward Walsh Transform: used to test both CPU and GPU FWT
// Slow: every output element is a sum over all 2^log2N input elements.
// The sum is accumulated in double and narrowed to float on store.
//
//   h_Output  receives the 2^log2N transformed values
//   h_Input   the 2^log2N samples to transform (only read)
//   log2N     base-2 logarithm of the vector length
///////////////////////////////////////////////////////////////////////////////
extern "C" void slowWTcpu(float *h_Output, float *h_Input, int log2N)
{
    const int length = 1 << log2N;

    for (int row = 0; row < length; ++row)
    {
        double acc = 0;

        for (int col = 0; col < length; ++col)
        {
            // Walsh-Hadamard coefficient: starts at +1 and is negated once
            // for every bit that is set in both row and col.
            double sign = 1.0;
            int common = row & col;

            while (common != 0)
            {
                if (common & 1)
                {
                    sign = -sign;
                }

                common >>= 1;
            }

            acc += sign * h_Input[col];
        }

        h_Output[row] = (float)acc;
    }
}

////////////////////////////////////////////////////////////////////////////////
// Reference CPU dyadic convolution.
// Extremely slow because of non-linear memory access patterns (cache thrashing)
////////////////////////////////////////////////////////////////////////////////
// For every i in [0, 2^log2dataN):
//     h_Result[i] = sum over j in [0, 2^log2kernelN) of h_Data[i ^ j] * h_Kernel[j]
//
//   h_Result     receives 2^log2dataN output values
//   h_Data       input signal (only read)
//   h_Kernel     2^log2kernelN kernel coefficients (only read)
//   log2dataN    base-2 logarithm of the data length
//   log2kernelN  base-2 logarithm of the kernel length
//
// The index i ^ j can be as large as max(dataN, kernelN) - 1, so h_Data is
// only read within its 2^log2dataN elements when log2kernelN <= log2dataN.
extern "C" void dyadicConvolutionCPU(
    float *h_Result,
    float *h_Data,
    float *h_Kernel,
    int log2dataN,
    int log2kernelN
)
{
    const int dataLen   = 1 << log2dataN;
    const int kernelLen = 1 << log2kernelN;

    for (int out = 0; out < dataLen; ++out)
    {
        // Running sum is kept in double and narrowed to float on store.
        double acc = 0;
        int tap = 0;

        while (tap < kernelLen)
        {
            // XOR of the output index with the tap index picks the data sample.
            acc += h_Data[out ^ tap] * h_Kernel[tap];
            ++tap;
        }

        h_Result[out] = (float)acc;
    }
}