32#ifndef _REDUCE_KERNEL_H_
33#define _REDUCE_KERNEL_H_
35#if defined(MORPHEUS_ENABLE_HIP)
36#include <impl/Morpheus_HIPUtils.hpp>
38#define SHFL_DOWN(mask, val, offset) __shfl_down(val, offset)
39#define BALLOT(mask, predicate) __ballot(predicate)
40#define BIT_MASK 0xffffffffffffffff
41#elif defined(MORPHEUS_ENABLE_CUDA)
42#include <impl/Morpheus_CudaUtils.hpp>
44#define SHFL_DOWN(mask, val, offset) __shfl_down_sync(mask, val, offset)
45#define BALLOT(mask, predicate) __ballot_sync(mask, predicate)
46#define BIT_MASK 0xffffffff
57 __device__
inline operator T *() {
58 extern __shared__
int __smem[];
62 __device__
inline operator const T *()
const {
63 extern __shared__
int __smem[];
72 __device__
inline operator double *() {
73 extern __shared__
double __smem_d[];
74 return (
double *)__smem_d;
77 __device__
inline operator const double *()
const {
78 extern __shared__
double __smem_d[];
79 return (
double *)__smem_d;
84__device__ __forceinline__ T warpReduceSum(
unsigned int mask, T mySum) {
85 for (
int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
87 mySum += SHFL_DOWN(mask, mySum, offset);
92#if __CUDA_ARCH__ >= 800
96__device__ __forceinline__
int warpReduceSum<int>(
unsigned int mask,
98 mySum = __reduce_add_sync(mask, mySum);
103template <
typename SizeType,
typename ValueType,
unsigned int blockSize,
105__global__
void reduce_kernel(
const ValueType *__restrict__ g_idata,
106 ValueType *__restrict__ g_odata, SizeType n) {
107 ValueType *sdata = SharedMemory<ValueType>();
111 SizeType tid = threadIdx.x;
112 SizeType gridSize = blockSize * gridDim.x;
116 SizeType maskLength = (blockSize & (WARP_SIZE - 1));
117 maskLength = (maskLength > 0) ? (WARP_SIZE - maskLength) : maskLength;
118 const SizeType mask = (BIT_MASK) >> maskLength;
126 SizeType i = blockIdx.x * blockSize * 2 + threadIdx.x;
127 gridSize = gridSize << 1;
133 if ((i + blockSize) < n) {
134 mySum += g_idata[i + blockSize];
139 SizeType i = blockIdx.x * blockSize + threadIdx.x;
148 mySum = warpReduceSum<ValueType>(mask, mySum);
151 if ((tid % WARP_SIZE) == 0) {
152 sdata[tid / WARP_SIZE] = mySum;
157 const SizeType shmem_extent =
158 (blockSize / WARP_SIZE) > 0 ? (blockSize / WARP_SIZE) : 1;
159 const SizeType ballot_result = BALLOT(mask, tid < shmem_extent);
160 if (tid < shmem_extent) {
164 mySum = warpReduceSum<ValueType>(ballot_result, mySum);
169 g_odata[blockIdx.x] = mySum;
Generic Morpheus interfaces.
Definition: dummy.cpp:24
Definition: DenseVector/Kernels/Morpheus_Reduction_Impl.hpp:56