Morpheus 1.0.0
Dynamic matrix type and algorithms for sparse matrices
DenseVector/Kernels/Morpheus_Reduction_Impl.hpp
/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
  Parallel reduction kernels
*/

#ifndef _REDUCE_KERNEL_H_
#define _REDUCE_KERNEL_H_

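// Portability shims over the warp primitives: CUDA provides the *_sync
// intrinsics, while HIP exposes the unmasked variants and (typically) runs
// 64-lane wavefronts, hence the wider full-lane BIT_MASK on that path.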
#if defined(MORPHEUS_ENABLE_HIP)
#include <impl/Morpheus_HIPUtils.hpp>

#define SHFL_DOWN(mask, val, offset) __shfl_down(val, offset)
#define BALLOT(mask, predicate) __ballot(predicate)
#define BIT_MASK 0xffffffffffffffff
#elif defined(MORPHEUS_ENABLE_CUDA)
#include <impl/Morpheus_CudaUtils.hpp>

#define SHFL_DOWN(mask, val, offset) __shfl_down_sync(mask, val, offset)
#define BALLOT(mask, predicate) __ballot_sync(mask, predicate)
#define BIT_MASK 0xffffffff
#endif

namespace Morpheus {
namespace Impl {
namespace Kernels {

// Utility class used to avoid linker errors with extern
// unsized shared memory arrays with templated type
template <class T>
struct SharedMemory {
  __device__ inline operator T *() {
    extern __shared__ int __smem[];
    return (T *)__smem;
  }

  __device__ inline operator const T *() const {
    extern __shared__ int __smem[];
    return (T *)__smem;
  }
};

// specialize for double to avoid unaligned memory
// access compile errors
template <>
struct SharedMemory<double> {
  __device__ inline operator double *() {
    extern __shared__ double __smem_d[];
    return (double *)__smem_d;
  }

  __device__ inline operator const double *() const {
    extern __shared__ double __smem_d[];
    return (double *)__smem_d;
  }
};

template <class T>
__device__ __forceinline__ T warpReduceSum(unsigned int mask, T mySum) {
  for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
    mySum += SHFL_DOWN(mask, mySum, offset);
  }
  return mySum;
}

#if __CUDA_ARCH__ >= 800
// Specialize warpReduceSum for int inputs to use the __reduce_add_sync
// intrinsic when on SM 8.0 or higher
template <>
__device__ __forceinline__ int warpReduceSum<int>(unsigned int mask,
                                                  int mySum) {
  mySum = __reduce_add_sync(mask, mySum);
  return mySum;
}
#endif

template <typename SizeType, typename ValueType, unsigned int blockSize,
          bool nIsPow2>
__global__ void reduce_kernel(const ValueType *__restrict__ g_idata,
                              ValueType *__restrict__ g_odata, SizeType n) {
  ValueType *sdata = SharedMemory<ValueType>();

  // perform first level of reduction,
  // reading from global memory, writing to shared memory
  SizeType tid = threadIdx.x;
  SizeType gridSize = blockSize * gridDim.x;
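  // Build the warp mask: if blockSize is not a multiple of WARP_SIZE the last
  // warp is only partially populated, so shift the full-lane BIT_MASK down to
  // cover just its active lanes.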
  SizeType maskLength = (blockSize & (WARP_SIZE - 1));
  maskLength = (maskLength > 0) ? (WARP_SIZE - maskLength) : maskLength;
  const SizeType mask = (BIT_MASK) >> maskLength;

  ValueType mySum = 0;

  // we reduce multiple elements per thread. The number is determined by the
  // number of active thread blocks (via gridDim). More blocks will result
  // in a larger gridSize and therefore fewer elements per thread
  if (nIsPow2) {
    SizeType i = blockIdx.x * blockSize * 2 + threadIdx.x;
    gridSize = gridSize << 1;

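    // unrolled by a factor of two: each trip reads elements i and
    // i + blockSize, which is why the stride above was doubled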
    while (i < n) {
      mySum += g_idata[i];
      // ensure we don't read out of bounds -- this is optimized away for
      // powerOf2 sized arrays
      if ((i + blockSize) < n) {
        mySum += g_idata[i + blockSize];
      }
      i += gridSize;
    }
  } else {
    SizeType i = blockIdx.x * blockSize + threadIdx.x;
    while (i < n) {
      mySum += g_idata[i];
      i += gridSize;
    }
  }

  // Reduce within warp using shuffle, or __reduce_add_sync if T==int on
  // SM 8.0 or higher
  mySum = warpReduceSum<ValueType>(mask, mySum);

  // the first lane of each warp writes its warp's partial sum to shared memory
  if ((tid % WARP_SIZE) == 0) {
    sdata[tid / WARP_SIZE] = mySum;
  }

  __syncthreads();

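  // Second stage: the first warp reduces the per-warp partial sums written
  // above (one shared-memory slot per warp).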
  const SizeType shmem_extent =
      (blockSize / WARP_SIZE) > 0 ? (blockSize / WARP_SIZE) : 1;
  const SizeType ballot_result = BALLOT(mask, tid < shmem_extent);
  if (tid < shmem_extent) {
    mySum = sdata[tid];
    // Reduce final warp using shuffle, or __reduce_add_sync if T==int on
    // SM 8.0 or higher
    mySum = warpReduceSum<ValueType>(ballot_result, mySum);
  }

  // write result for this block to global mem
  if (tid == 0) {
    g_odata[blockIdx.x] = mySum;
  }
}
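// Note: g_odata holds one partial sum per block; a follow-up pass (a second
// kernel launch, or a host-side loop over the block results) produces the
// final scalar.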

}  // namespace Kernels
}  // namespace Impl
}  // namespace Morpheus

#endif  // _REDUCE_KERNEL_H_
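
For context, a host-side caller must size the dynamic shared-memory allocation at one ValueType slot per warp and pick the nIsPow2 template branch at launch time. The sketch below is illustrative only: launch_reduce, is_pow2, the fixed 256-thread block size, and the include path are assumptions, not part of the Morpheus API.

// Hypothetical CUDA-path usage sketch for reduce_kernel.
#include <cuda_runtime.h>

#include <impl/DenseVector/Kernels/Morpheus_Reduction_Impl.hpp>  // path assumed

namespace {

// True when x is a non-zero power of two.
inline bool is_pow2(unsigned int x) { return x != 0 && (x & (x - 1)) == 0; }

// Writes one partial sum per block into d_out; reducing d_out again (with a
// single block) yields the final scalar.
template <typename SizeType, typename ValueType>
void launch_reduce(const ValueType *d_in, ValueType *d_out, SizeType n,
                   unsigned int blocks) {
  constexpr unsigned int blockSize = 256;  // multiple of the 32-lane CUDA warp
  // One dynamic shared-memory slot per warp, since the kernel stores
  // sdata[tid / WARP_SIZE].
  const size_t smem = (blockSize / 32) * sizeof(ValueType);
  if (is_pow2(static_cast<unsigned int>(n))) {
    Morpheus::Impl::Kernels::reduce_kernel<SizeType, ValueType, blockSize,
                                           true>
        <<<blocks, blockSize, smem>>>(d_in, d_out, n);
  } else {
    Morpheus::Impl::Kernels::reduce_kernel<SizeType, ValueType, blockSize,
                                           false>
        <<<blocks, blockSize, smem>>>(d_in, d_out, n);
  }
}

}  // namespace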