Compact Functions | |
| template<bool isBackward> | |
| __global__ void | computeNumValidElements (size_t *d_numValidElements, const unsigned int *d_indices, const unsigned int *d_isValid, size_t numElements) |
| Compute the number of valid flags in an array given the array and its exclusive sum scan. | |
| template<class T, bool isBackward> | |
| __global__ void | compactData (T *d_out, size_t *d_numValidElements, const unsigned int *d_indices, const unsigned int *d_isValid, const T *d_in, unsigned int numElements) |
| Consolidate non-null elements - for each non-null element in d_in write it to d_out, in the position specified by d_isValid. Called by compactArray(). | |
Scan Functions | |
| template<class T, class traits> | |
| __global__ void | scan4 (T *d_out, const T *d_in, T *d_blockSums, int numElements, unsigned int dataRowPitch, unsigned int blockSumRowPitch) |
| Main scan kernel. | |
Segmented scan Functions | |
| template<class T, class traits> | |
| __global__ void | segmentedScan4 (T *d_odata, const T *d_idata, const unsigned int *d_iflags, unsigned int numElements, T *d_blockSums=0, unsigned int *d_blockFlags=0, unsigned int *d_blockIndices=0) |
| Main segmented scan kernel. | |
Sort Functions | |
| template<class T> | |
| __global__ void | radixsort_kernel (T *d_out, const T *d_in, int numElements) |
| Sorts a chunk of data using radix sort. | |
| template<class T> | |
| __global__ void | radixGlobalSetup_kernel (unsigned int *d_out, const T *d_in, unsigned int bit, unsigned int last) |
| For each input in d_in, set output in d_out to 1 if bit is set in d_in or 0 if bit is not set in d_in. | |
| __global__ void | radixGlobalSplit_kernel (unsigned int *d_out, unsigned int *d_address, unsigned int *d_in, unsigned int bit, unsigned int last) |
| One pass of a global radix sort that splits its input based on which bit is specified in 'bit'. Bits set to 0 are placed before bits set to 1. Each radix pass is stable. | |
| template<class T> | |
| __global__ void | merge_kernel (T *d_out, T *d_in, int chunkSize, int numElements) |
| One pass of a merge sort that (in parallel) merges 2n sorted chunks of size k into chunkSize sorted chunks of size 2k. | |
Sparse Matrix-Vector multiply Functions | |
| template<class T, bool isFullBlock> | |
| __global__ void | sparseMatrixVectorFetchAndMultiply (unsigned int *d_flags, T *d_prod, const T *d_A, const T *d_x, const unsigned int *d_indx, unsigned int numNZElts) |
| Fetch and multiply kernel. | |
| __global__ void | sparseMatrixVectorSetFlags (unsigned int *d_flags, const unsigned int *d_rowindx, unsigned int numRows) |
| Set Flags kernel. | |
| template<class T> | |
| __global__ void | yGather (T *d_y, const T *d_prod, const unsigned int *d_rowFindx, unsigned int numRows) |
| Gather final y values kernel. | |
Vector Functions | |
| CUDA kernel methods for basic operations on vectors. | |
| template<class T> | |
| __global__ void | vectorAddConstant (T *d_vector, T constant, int n, int baseIndex) |
| Adds a constant value to all values in the input d_vector. | |
| template<class T> | |
| __global__ void | vectorAddUniform (T *d_vector, const T *d_uniforms, int numElements, int blockOffset, int baseIndex) |
| Add a uniform value to each data element of an array. | |
| template<class T, CUDPPOperator op, int elementsPerThread> | |
| __global__ void | vectorAddUniform4 (T *d_vector, const T *d_uniforms, int numElements, int vectorRowPitch, int uniformRowPitch, int blockOffset, int baseIndex) |
| Add a uniform value to each data element of an array (vec4 version). | |
| template<class T> | |
| __global__ void | vectorAddVector (T *d_vectorA, const T *d_vectorB, int numElements, int baseIndex) |
| Adds together two vectors. | |
| template<class T, CUDPPOperator oper, bool isLastBlockFull> | |
| __global__ void | vectorSegmentedAddUniform4 (T *d_vector, const T *d_uniforms, const unsigned int *d_maxIndices, unsigned int numElements, int blockOffset, int baseIndex) |
| Add a uniform value to data elements of an array (vec4 version). | |
__global__ so that they must be invoked from host (CPU) code. They generally invoke GPU __device__ routines in the CUDPP CTA-Level API. Kernel-Level API functions are used by CUDPP Application-Level functions to implement their functionality. | __global__ void computeNumValidElements | ( | size_t * | d_numValidElements, | |
| const unsigned int * | d_indices, | |||
| const unsigned int * | d_isValid, | |||
| size_t | numElements | |||
| ) | [inline] |
Compute the number of valid flags in an array given the array and its exclusive sum scan.
| [out] | d_numValidElements | Number of valid eleents in d_isValid. |
| [in] | d_indices | Array of output indices created using an exclusive sum scan. |
| [in] | d_isValid | Array of flags indicating which elements are valid (1) and invalid (0). |
| [in] | numElements | The length of the d_isValid and d_indices arrays. |
| __global__ void compactData | ( | T * | d_out, | |
| size_t * | d_numValidElements, | |||
| const unsigned int * | d_indices, | |||
| const unsigned int * | d_isValid, | |||
| const T * | d_in, | |||
| unsigned int | numElements | |||
| ) | [inline] |
Consolidate non-null elements - for each non-null element in d_in write it to d_out, in the position specified by d_isValid. Called by compactArray().
| [out] | d_out | Output array of compacted values. |
| [out] | d_numValidElements | The number of elements in d_in with valid flags set to 1. |
| [in] | d_indices | Positions where non-null elements will go in d_out. |
| [in] | d_isValid | Flags indicating valid (1) and invalid (0) elements. Only valid elements will be copied to d_out. |
| [in] | d_in | The input array |
| [in] | numElements | The length of the d_in in elements. |
| __global__ void scan4 | ( | T * | d_out, | |
| const T * | d_in, | |||
| T * | d_blockSums, | |||
| int | numElements, | |||
| unsigned int | dataRowPitch, | |||
| unsigned int | blockSumRowPitch | |||
| ) | [inline] |
Main scan kernel.
This __global__ device function performs one level of a multiblock scan on an arbitrary-dimensioned array in d_in, returning the result in d_out (which may point to the same array). The same function may be used for single or multi-row scans. To perform a multirow scan, pass the width of each row of the input row (in elements) in dataRowPitch, and the width of the rows of d_blockSums (in elements) in blockSumRowPitch, and invoke with a thread block grid with height greater than 1.
This function peforms one level of a recursive, multiblock scan. At the app level, this function is called by cudppScan and cudppMultiScan and used in combination with vectorAddUniform4() to produce a complete scan.
Template parameter T is the datatype of the array to be scanned. Template parameter traits is the ScanTraits struct containing compile-time options for the scan, such as whether it is forward or backward, exclusive or inclusive, multi- or single-row, etc.
| [out] | d_out | The output (scanned) array |
| [in] | d_in | The input array to be scanned |
| [out] | d_blockSums | The array of per-block sums |
| [in] | numElements | The number of elements to scan |
| [in] | dataRowPitch | The width of each row of d_in in elements (for multi-row scans) |
| [in] | blockSumRowPitch | The with of each row of d_blockSums in elements (for multi-row scans) |
| __global__ void segmentedScan4 | ( | T * | d_odata, | |
| const T * | d_idata, | |||
| const unsigned int * | d_iflags, | |||
| unsigned int | numElements, | |||
| T * | d_blockSums = 0, |
|||
| unsigned int * | d_blockFlags = 0, |
|||
| unsigned int * | d_blockIndices = 0 | |||
| ) | [inline] |
Main segmented scan kernel.
This __global__ device function performs one level of a multiblock segmented scan on an one-dimensioned array in d_idata, returning the result in d_odata (which may point to the same array).
This function peforms one level of a recursive, multiblock scan. At the app level, this function is called by cudppSegmentedScan and used in combination with vectorSegmentedAddUniform4() to produce a complete segmented scan.
Template parameter T is the datatype of the array to be scanned. Template parameter traits is the SegmentedScanTraits struct containing compile-time options for the segmented scan, such as whether it is forward or backward, inclusive or exclusive, etc.
| [out] | d_odata | The output (scanned) array |
| [in] | d_idata | The input array to be scanned |
| [in] | d_iflags | The input array of flags |
| [out] | d_blockSums | The array of per-block sums |
| [out] | d_blockFlags | The array of per-block OR-reduction of flags |
| [out] | d_blockIndices | The array of per-block min-reduction of indices |
| [in] | numElements | The number of elements to scan |
| __global__ void radixsort_kernel | ( | T * | d_out, | |
| const T * | d_in, | |||
| int | numElements | |||
| ) | [inline] |
Sorts a chunk of data using radix sort.
Sorts a chunk of data using radix sort, looping through each bit and calling split. On each pass, checks whether data is sorted and terminates if not. Since the check is non-trivial, it's debatable whether this is a good idea or not. (Could instead check every few passes or only check at the end.)
| [out] | d_out | Output data in GPU main memory |
| [in] | d_in | Input data in GPU main memory |
| [in] | numElements | Total number of elements to sort |
| __global__ void radixGlobalSetup_kernel | ( | unsigned int * | d_out, | |
| const T * | d_in, | |||
| unsigned int | bit, | |||
| unsigned int | last | |||
| ) | [inline] |
For each input in d_in, set output in d_out to 1 if bit is set in d_in or 0 if bit is not set in d_in.
Processes 2 elements per thread.
| [out] | d_out | Output data in GPU main memory |
| [in] | d_in | Input data in GPU main memory |
| [in] | bit | Which bit is checked (one-hot) |
| [in] | last | Index of the last element to split (for n d_in elements, "last" is n-1) |
| __global__ void radixGlobalSplit_kernel | ( | unsigned int * | d_out, | |
| unsigned int * | d_address, | |||
| unsigned int * | d_in, | |||
| unsigned int | bit, | |||
| unsigned int | last | |||
| ) |
One pass of a global radix sort that splits its input based on which bit is specified in 'bit'. Bits set to 0 are placed before bits set to 1. Each radix pass is stable.
Currently this is hardcoded for unsigned ints only.
Processes 2 elements per thread.
| [out] | d_out | Output data in GPU main memory |
| [in] | d_address | Temporary buffer used to store addresses |
| [in] | d_in | Input data in GPU main memory |
| [in] | bit | Which bit is used to split (one-hot) |
| [in] | last | Index of the last element to split (for n d_in elements, "last" is n-1) |
| __global__ void merge_kernel | ( | T * | d_out, | |
| T * | d_in, | |||
| int | chunkSize, | |||
| int | numElements | |||
| ) | [inline] |
One pass of a merge sort that (in parallel) merges 2n sorted chunks of size k into chunkSize sorted chunks of size 2k.
The current algorithm is described in Harris et al., GPU Gems 3, though we hope to improve the algorithm. Briefly: We keep two buffers in shared memory, one for each input, and use a parallel bitonic sort to merge the smallest elements from each buffer. We then refills the buffers from main memory if necessary, and repeat until both inputs are exhausted. All reads from global memory into shared memory and all writes to global memory are coherent and blocked; we also guarantee that each input element is only read once from global memory and each output element is only written once.
Each thread processes one element per input.
| [out] | d_out | Output data (for this pass) in GPU main memory |
| [in] | d_in | Input data (for this pass) in GPU main memory |
| [in] | chunkSize | Size of each chunk (in elements) |
| [in] | numElements | Total number of elements |
| __global__ void sparseMatrixVectorFetchAndMultiply | ( | unsigned int * | d_flags, | |
| T * | d_prod, | |||
| const T * | d_A, | |||
| const T * | d_x, | |||
| const unsigned int * | d_indx, | |||
| unsigned int | numNZElts | |||
| ) | [inline] |
Fetch and multiply kernel.
This __global__ device function takes an element from the vector d_A, finds its column in d_indx and multiplies the element from d_A with its corresponding (that is having the same row) element in d_x and stores the resulting product in d_prod. It also sets all the elements of d_flags to 0.
Template parameter T is the datatype of the matrix A and x.
| [out] | d_flags | The output flags array |
| [out] | d_prod | The output products array |
| [in] | d_A | The input matrix A |
| [in] | d_x | The input array x |
| [in] | d_indx | The input array of column indices for each element in A |
| [in] | numNZElts | The number of non-zero elements in matrix A |
| __global__ void sparseMatrixVectorSetFlags | ( | unsigned int * | d_flags, | |
| const unsigned int * | d_rowindx, | |||
| unsigned int | numRows | |||
| ) |
Set Flags kernel.
This __global__ device function takes an element from the vector d_rowindx, and sets the corresponding position in d_flags to 1
| [out] | d_flags | The output flags array |
| [in] | d_rowindx | The starting index of each row in the "flattened" version of matrix A |
| [in] | numRows | The number of rows in matrix A |
| __global__ void yGather | ( | T * | d_y, | |
| const T * | d_prod, | |||
| const unsigned int * | d_rowFindx, | |||
| unsigned int | numRows | |||
| ) | [inline] |
Gather final y values kernel.
This __global__ device function takes an element from the vector d_rowFindx, which for each row gives the index of the last element of that row, reads the corresponding position in d_prod and write it in d_y
Template parameter T is the datatype of the matrix A and x.
| [out] | d_y | The output result array |
| [in] | d_prod | The input products array (which now contains sums for each row) |
| [in] | d_rowFindx | The starting index of each row in the "flattened" version of matrix A |
| [in] | numRows | The number of rows in matrix A |
| __global__ void vectorAddConstant | ( | T * | d_vector, | |
| T | constant, | |||
| int | n, | |||
| int | baseIndex | |||
| ) | [inline] |
Adds a constant value to all values in the input d_vector.
Each thread adds two pairs of elements.
| [in,out] | d_vector | The array of elements to be modified |
| [in] | constant | The constant value to be added to elements of d_vector |
| [in] | n | The number of elements in the d_vector to be modified |
| [in] | baseIndex | An optional offset to the beginning of the elements in the input array to be processed |
| __global__ void vectorAddUniform | ( | T * | d_vector, | |
| const T * | d_uniforms, | |||
| int | numElements, | |||
| int | blockOffset, | |||
| int | baseIndex | |||
| ) | [inline] |
Add a uniform value to each data element of an array.
This function reads one value per CTA from d_uniforms into shared memory and adds that value to all values "owned" by the CTA in d_vector. Each thread adds two pairs of values.
| [out] | d_vector | The d_vector whose values will have the uniform added |
| [in] | d_uniforms | The array of uniform values (one per CTA) |
| [in] | numElements | The number of elements in d_vector to process |
| [in] | blockOffset | an optional offset to the beginning of this block's data. |
| [in] | baseIndex | an optional offset to the beginning of the array within d_vector. |
| __global__ void vectorAddUniform4 | ( | T * | d_vector, | |
| const T * | d_uniforms, | |||
| int | numElements, | |||
| int | vectorRowPitch, | |||
| int | uniformRowPitch, | |||
| int | blockOffset, | |||
| int | baseIndex | |||
| ) | [inline] |
Add a uniform value to each data element of an array (vec4 version).
This function reads one value per CTA from d_uniforms into shared memory and adds that value to all values "owned" by the CTA in d_vector. Each thread adds the uniform value to eight values in d_vector.
| [out] | d_vector | The d_vector whose values will have the uniform added |
| [in] | d_uniforms | The array of uniform values (one per CTA) |
| [in] | numElements | The number of elements in d_vector to process |
| [in] | vectorRowPitch | For 2D arrays, the pitch (in elements) of the rows of d_vector. |
| [in] | uniformRowPitch | For 2D arrays, the pitch (in elements) of the rows of d_uniforms. |
| [in] | blockOffset | an optional offset to the beginning of this block's data. |
| [in] | baseIndex | an optional offset to the beginning of the array within d_vector. |
| __global__ void vectorAddVector | ( | T * | d_vectorA, | |
| const T * | d_vectorB, | |||
| int | numElements, | |||
| int | baseIndex | |||
| ) | [inline] |
Adds together two vectors.
Each thread adds two pairs of elements.
| [out] | d_vectorA | The left operand array and the result |
| [in] | d_vectorB | The right operand array |
| [in] | numElements | The number of elements in the vectors to be added. |
| [in] | baseIndex | An optional offset to the beginning of the elements in the input arrays to be processed |
| __global__ void vectorSegmentedAddUniform4 | ( | T * | d_vector, | |
| const T * | d_uniforms, | |||
| const unsigned int * | d_maxIndices, | |||
| unsigned int | numElements, | |||
| int | blockOffset, | |||
| int | baseIndex | |||
| ) | [inline] |
Add a uniform value to data elements of an array (vec4 version).
This function reads one value per CTA from d_uniforms into shared memory and adds that value to values "owned" by the CTA in d_vector. The uniform value is added to only those values "owned" by the CTA which have an index less than d_maxIndex. If d_maxIndex for that CTA is UINT_MAX it adds the uniform to all values "owned" by the CTA. Each thread adds the uniform value to eight values in d_vector.
| [out] | d_vector | The d_vector whose values will have the uniform added |
| [in] | d_uniforms | The array of uniform values (one per CTA) |
| [in] | d_maxIndices | The array of maximum indices (one per CTA). This is index upto which the uniform would be added. If this is UINT_MAX the uniform is added to all elements of the CTA. |
| [in] | numElements | The number of elements in d_vector to process |
| [in] | blockOffset | an optional offset to the beginning of this block's data. |
| [in] | baseIndex | an optional offset to the beginning of the array within d_vector. |
1.5.5