//////////////////////////////////////////////////////////////////////////// // // Copyright 1993-2020 NVIDIA Corporation. All rights reserved. // // Please refer to the NVIDIA end user license agreement (EULA) associated // with this source code for terms and conditions that govern your use of // this software. Any use, reproduction, disclosure, or distribution of // this software and related documentation outside the terms of the EULA // is strictly prohibited. // //////////////////////////////////////////////////////////////////////////// // includes, system #include #include #include #include // includes CUDA #include // includes, project #include #include // helper functions for SDK examples //////////////////////////////////////////////////////////////////////////////// // declaration, forward void runTest(int argc, char **argv); cudaAccessPolicyWindow initAccessPolicyWindow(void) { cudaAccessPolicyWindow accessPolicyWindow = { 0 }; accessPolicyWindow.base_ptr = (void *)0; accessPolicyWindow.num_bytes = 0; accessPolicyWindow.hitRatio = 0.f; accessPolicyWindow.hitProp = cudaAccessPropertyNormal; accessPolicyWindow.missProp = cudaAccessPropertyStreaming; return accessPolicyWindow; } //////////////////////////////////////////////////////////////////////////////// //! Simple test kernel for device functionality //! @param data input data in global memory //! @param dataSize input data size //! @param bigData input bigData in global memory //! @param bigDataSize input bigData size //! @param hitcount how many data access are done within block //////////////////////////////////////////////////////////////////////////////// static __global__ void kernCacheSegmentTest(int* data, int dataSize, int *trash, int bigDataSize, int hitCount) { __shared__ unsigned int hit; int row = blockIdx.y * blockDim.y + threadIdx.y; int col = blockIdx.x * blockDim.x + threadIdx.x; int tID = row * blockDim.y + col; uint32_t psRand = tID; atomicExch(&hit, 0); __syncthreads(); while (hit < hitCount) { psRand ^= psRand << 13; psRand ^= psRand >> 17; psRand ^= psRand << 5; int idx = tID - psRand; if (idx < 0) { idx = -idx; } if((tID % 2) == 0) { data[psRand % dataSize] = data[psRand % dataSize] + data[idx % dataSize]; } else { trash[psRand % bigDataSize] = trash[psRand % bigDataSize] + trash[idx % bigDataSize]; } atomicAdd(&hit, 1); } } //////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { runTest(argc, argv); } //////////////////////////////////////////////////////////////////////////////// //! Run a simple test for CUDA //////////////////////////////////////////////////////////////////////////////// void runTest(int argc, char **argv) { bool bTestResult = true; cudaAccessPolicyWindow accessPolicyWindow; cudaDeviceProp deviceProp; cudaStreamAttrValue streamAttrValue; cudaStream_t stream; cudaStreamAttrID streamAttrID; dim3 threads(32, 32); int *dataDevicePointer; int *dataHostPointer; int dataSize; int *bigDataDevicePointer; int *bigDataHostPointer; int bigDataSize; StopWatchInterface *timer = 0; printf("%s Starting...\n\n", argv[0]); // use command-line specified CUDA device, otherwise use device with highest Gflops/s int devID = findCudaDevice(argc, (const char **)argv); sdkCreateTimer(&timer); sdkStartTimer(&timer); //Get device properties checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); dim3 blocks(deviceProp.maxGridSize[1], 1); //Make sure device the l2 optimization if (deviceProp.persistingL2CacheMaxSize == 0) { printf("Waiving execution as device %d does not support persisting L2 Caching\n", devID); exit(EXIT_WAIVED); } //Create stream to assiocate with window checkCudaErrors(cudaStreamCreate(&stream)); //Set the amount of l2 cache that will be persisting to maximum the device can support checkCudaErrors(cudaDeviceSetLimit(cudaLimitPersistingL2CacheSize, deviceProp.persistingL2CacheMaxSize)); //Stream attribute to set streamAttrID = cudaStreamAttributeAccessPolicyWindow; //Default window streamAttrValue.accessPolicyWindow = initAccessPolicyWindow(); accessPolicyWindow = initAccessPolicyWindow(); //Allocate size of both buffers bigDataSize = (deviceProp.l2CacheSize * 4) / sizeof(int); dataSize = (deviceProp.l2CacheSize / 4) / sizeof(int); //Allocate data checkCudaErrors(cudaMallocHost(&dataHostPointer, dataSize * sizeof(int))); checkCudaErrors(cudaMallocHost(&bigDataHostPointer, bigDataSize * sizeof(int))); for ( int i = 0; i < bigDataSize; ++i) { if (i < dataSize) { dataHostPointer[i] = i; } bigDataHostPointer[bigDataSize - i - 1] = i; } checkCudaErrors(cudaMalloc((void**) &dataDevicePointer, dataSize * sizeof(int))); checkCudaErrors(cudaMalloc((void**) &bigDataDevicePointer, bigDataSize * sizeof(int))); checkCudaErrors(cudaMemcpyAsync(dataDevicePointer, dataHostPointer, dataSize * sizeof(int), cudaMemcpyHostToDevice, stream)); checkCudaErrors(cudaMemcpyAsync(bigDataDevicePointer, bigDataHostPointer, bigDataSize * sizeof(int), cudaMemcpyHostToDevice, stream)); //Make a window for the buffer of interest accessPolicyWindow.base_ptr = (void *)dataDevicePointer; accessPolicyWindow.num_bytes = dataSize * sizeof(int); accessPolicyWindow.hitRatio = 1.f; accessPolicyWindow.hitProp = cudaAccessPropertyPersisting; accessPolicyWindow.missProp = cudaAccessPropertyNormal; streamAttrValue.accessPolicyWindow = accessPolicyWindow; //Assign window to stream checkCudaErrors(cudaStreamSetAttribute(stream, streamAttrID, &streamAttrValue)); //Demote any previous persisting lines checkCudaErrors(cudaCtxResetPersistingL2Cache()); checkCudaErrors(cudaStreamSynchronize(stream)); kernCacheSegmentTest<<>>(dataDevicePointer, dataSize, bigDataDevicePointer, bigDataSize, 0xAFFFF); checkCudaErrors(cudaStreamSynchronize(stream)); // check if kernel execution generated and error getLastCudaError("Kernel execution failed"); //Free memory checkCudaErrors(cudaFreeHost(dataHostPointer)); checkCudaErrors(cudaFreeHost(bigDataHostPointer)); checkCudaErrors(cudaFree(dataDevicePointer)); checkCudaErrors(cudaFree(bigDataDevicePointer)); sdkStopTimer(&timer); printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); sdkDeleteTimer(&timer); exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE); }