#include <cudpp_globals.h>
#include <cudpp_util.h>
#include <math.h>
#include <cudpp.h>
Classes | |
| class | ScanTraits< T, oper, backward, exclusive, multiRow, sums, fullBlock > |
| Template class containing compile-time parameters for the scan functions. More... | |
Scan Functions | |
| #define | __SYNC |
| Macro that inserts the necessary __syncthreads() calls in device emulation mode. | |
| #define | DISALLOW_LOADSTORE_OVERLAP 1 |
| template<class T, class traits> | |
| __device__ void | loadSharedChunkFromMem4 (T *s_out, T threadScan0[4], T threadScan1[4], const T *d_in, int numElements, int iDataOffset, int &ai, int &bi, int &aiDev, int &biDev) |
| Handles loading input s_data from global memory to shared memory (vec4 version). | |
| template<class T, class traits> | |
| __device__ void | storeSharedChunkToMem4 (T *d_out, T threadScan0[4], T threadScan1[4], T *s_in, int numElements, int oDataOffset, int ai, int bi, int aiDev, int biDev) |
| Handles storing result s_data from shared memory to global memory (vec4 version). | |
| template<class T, class traits, int maxlevel> | |
| __device__ T | warpscan (T val, volatile T *s_data) |
| Scan all warps of a CTA without synchronization. | |
| template<class T, class traits> | |
| __device__ void | scanWarps (T x, T y, T *s_data) |
| Perform a full CTA scan using the warp-scan algorithm. | |
| template<class T, class traits> | |
| __device__ void | scanCTA (T *s_data, T *d_blockSums, unsigned int blockSumIndex) |
| CTA-level scan routine; scans s_data in shared memory in each thread block. | |
1.5.5