/* * Copyright 1993-2015 NVIDIA Corporation. All rights reserved. * * Please refer to the NVIDIA end user license agreement (EULA) associated * with this source code for terms and conditions that govern your use of * this software. Any use, reproduction, disclosure, or distribution of * this software and related documentation outside the terms of the EULA * is strictly prohibited. * */ /* Determine eigenvalues for large matrices for intervals that contained after * the first step one eigenvalue */ #ifndef _BISECT_KERNEL_LARGE_ONEI_H_ #define _BISECT_KERNEL_LARGE_ONEI_H_ #include namespace cg = cooperative_groups; // includes, project #include "config.h" #include "util.h" // additional kernel #include "bisect_util.cu" //////////////////////////////////////////////////////////////////////////////// //! Determine eigenvalues for large matrices for intervals that after //! the first step contained one eigenvalue //! @param g_d diagonal elements of symmetric, tridiagonal matrix //! @param g_s superdiagonal elements of symmetric, tridiagonal matrix //! @param n matrix size //! @param num_intervals total number of intervals containing one eigenvalue //! after the first step //! @param g_left left interval limits //! @param g_right right interval limits //! @param g_pos index of interval / number of intervals that are smaller than //! right interval limit //! @param precision desired precision of eigenvalues //////////////////////////////////////////////////////////////////////////////// __global__ void bisectKernelLarge_OneIntervals(float *g_d, float *g_s, const unsigned int n, unsigned int num_intervals, float *g_left, float *g_right, unsigned int *g_pos, float precision) { // Handle to thread block group cg::thread_block cta = cg::this_thread_block(); const unsigned int gtid = (blockDim.x * blockIdx.x) + threadIdx.x; __shared__ float s_left_scratch[MAX_THREADS_BLOCK]; __shared__ float s_right_scratch[MAX_THREADS_BLOCK]; // active interval of thread // left and right limit of current interval float left, right; // number of threads smaller than the right limit (also corresponds to the // global index of the eigenvalues contained in the active interval) unsigned int right_count; // flag if current thread converged unsigned int converged = 0; // midpoint when current interval is subdivided float mid = 0.0f; // number of eigenvalues less than mid unsigned int mid_count = 0; // read data from global memory if (gtid < num_intervals) { left = g_left[gtid]; right = g_right[gtid]; right_count = g_pos[gtid]; } // flag to determine if all threads converged to eigenvalue __shared__ unsigned int converged_all_threads; // initialized shared flag if (0 == threadIdx.x) { converged_all_threads = 0; } cg::sync(cta); // process until all threads converged to an eigenvalue // while( 0 == converged_all_threads) { while (true) { atomicExch(&converged_all_threads, 1); // update midpoint for all active threads if ((gtid < num_intervals) && (0 == converged)) { mid = computeMidpoint(left, right); } // find number of eigenvalues that are smaller than midpoint mid_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n, mid, gtid, num_intervals, s_left_scratch, s_right_scratch, converged, cta); cg::sync(cta); // for all active threads if ((gtid < num_intervals) && (0 == converged)) { // udpate intervals -- always one child interval survives if (right_count == mid_count) { right = mid; } else { left = mid; } // check for convergence float t0 = right - left; float t1 = max(abs(right), abs(left)) * precision; if (t0 < min(precision, t1)) { float lambda = computeMidpoint(left, right); left = lambda; right = lambda; converged = 1; } else { atomicExch(&converged_all_threads, 0); } } cg::sync(cta); if (1 == converged_all_threads) { break; } cg::sync(cta); } // write data back to global memory cg::sync(cta); if (gtid < num_intervals) { // intervals converged so left and right interval limit are both identical // and identical to the eigenvalue g_left[gtid] = left; } } #endif // #ifndef _BISECT_KERNEL_LARGE_ONEI_H_