/**
 * Morpheus_Reduction_Impl.hpp
 *
 * Part of Morpheus 1.0.0 — dynamic matrix type and algorithms for sparse
 * matrices.  CUDA backend reduction implementation for DenseVector.
 */
#ifndef MORPHEUS_DENSEVECTOR_CUDA_REDUCTION_IMPL_HPP
#define MORPHEUS_DENSEVECTOR_CUDA_REDUCTION_IMPL_HPP

#include <Morpheus_Macros.hpp>
#if defined(MORPHEUS_ENABLE_CUDA)

#include <Morpheus_SpaceTraits.hpp>
#include <Morpheus_FormatTraits.hpp>
#include <Morpheus_FormatTags.hpp>
#include <Morpheus_Spaces.hpp>

#include <impl/DenseVector/Morpheus_Copy_Impl.hpp>
#include <impl/DenseVector/Kernels/Morpheus_Reduction_Impl.hpp>
#include <impl/DenseVector/Serial/Morpheus_Reduction_Impl.hpp>
#include <impl/Morpheus_CudaUtils.hpp>

namespace Morpheus {
namespace Impl {

43template <typename ExecSpace, typename Vector, typename SizeType>
44void reduce(const Vector& in, Vector& out, SizeType size, SizeType threads,
45 SizeType blocks,
46 typename std::enable_if_t<
47 Morpheus::is_dense_vector_format_container_v<Vector> &&
48 Morpheus::has_custom_backend_v<ExecSpace> &&
49 Morpheus::has_cuda_execution_space_v<ExecSpace> &&
50 Morpheus::has_access_v<ExecSpace, Vector>>* = nullptr) {
51 using size_type = typename Vector::size_type;
52 using value_type = typename Vector::value_type;
53
54 value_type* d_idata = in.data();
55 value_type* d_odata = out.data();
56
57 // For reduce kernel we require only blockSize/warpSize
58 // number of elements in shared memory
59 SizeType smemSize = ((threads / 32) + 1) * sizeof(value_type);
60 if (isPow2<SizeType>(size)) {
61 switch (threads) {
62 case 1024:
63 Kernels::reduce_kernel<size_type, value_type, 1024, true>
64 <<<blocks, threads, smemSize>>>(d_idata, d_odata, size);
65 break;
66 case 512:
67 Kernels::reduce_kernel<size_type, value_type, 512, true>
68 <<<blocks, threads, smemSize>>>(d_idata, d_odata, size);
69 break;
70
71 case 256:
72 Kernels::reduce_kernel<size_type, value_type, 256, true>
73 <<<blocks, threads, smemSize>>>(d_idata, d_odata, size);
74 break;
75
76 case 128:
77 Kernels::reduce_kernel<size_type, value_type, 128, true>
78 <<<blocks, threads, smemSize>>>(d_idata, d_odata, size);
79 break;
80
81 case 64:
82 Kernels::reduce_kernel<size_type, value_type, 64, true>
83 <<<blocks, threads, smemSize>>>(d_idata, d_odata, size);
84 break;
85
86 case 32:
87 Kernels::reduce_kernel<size_type, value_type, 32, true>
88 <<<blocks, threads, smemSize>>>(d_idata, d_odata, size);
89 break;
90
91 case 16:
92 Kernels::reduce_kernel<size_type, value_type, 16, true>
93 <<<blocks, threads, smemSize>>>(d_idata, d_odata, size);
94 break;
95
96 case 8:
97 Kernels::reduce_kernel<size_type, value_type, 8, true>
98 <<<blocks, threads, smemSize>>>(d_idata, d_odata, size);
99 break;
100
101 case 4:
102 Kernels::reduce_kernel<size_type, value_type, 4, true>
103 <<<blocks, threads, smemSize>>>(d_idata, d_odata, size);
104 break;
105
106 case 2:
107 Kernels::reduce_kernel<size_type, value_type, 2, true>
108 <<<blocks, threads, smemSize>>>(d_idata, d_odata, size);
109 break;
110
111 case 1:
112 Kernels::reduce_kernel<size_type, value_type, 1, true>
113 <<<blocks, threads, smemSize>>>(d_idata, d_odata, size);
114 break;
115 }
116 } else {
117 switch (threads) {
118 case 1024:
119 Kernels::reduce_kernel<size_type, value_type, 1024, false>
120 <<<blocks, threads, smemSize>>>(d_idata, d_odata, size);
121 break;
122 case 512:
123 Kernels::reduce_kernel<size_type, value_type, 512, false>
124 <<<blocks, threads, smemSize>>>(d_idata, d_odata, size);
125 break;
126
127 case 256:
128 Kernels::reduce_kernel<size_type, value_type, 256, false>
129 <<<blocks, threads, smemSize>>>(d_idata, d_odata, size);
130 break;
131
132 case 128:
133 Kernels::reduce_kernel<size_type, value_type, 128, false>
134 <<<blocks, threads, smemSize>>>(d_idata, d_odata, size);
135 break;
136
137 case 64:
138 Kernels::reduce_kernel<size_type, value_type, 64, false>
139 <<<blocks, threads, smemSize>>>(d_idata, d_odata, size);
140 break;
141
142 case 32:
143 Kernels::reduce_kernel<size_type, value_type, 32, false>
144 <<<blocks, threads, smemSize>>>(d_idata, d_odata, size);
145 break;
146
147 case 16:
148 Kernels::reduce_kernel<size_type, value_type, 16, false>
149 <<<blocks, threads, smemSize>>>(d_idata, d_odata, size);
150 break;
151
152 case 8:
153 Kernels::reduce_kernel<size_type, value_type, 8, false>
154 <<<blocks, threads, smemSize>>>(d_idata, d_odata, size);
155 break;
156
157 case 4:
158 Kernels::reduce_kernel<size_type, value_type, 4, false>
159 <<<blocks, threads, smemSize>>>(d_idata, d_odata, size);
160 break;
161
162 case 2:
163 Kernels::reduce_kernel<size_type, value_type, 2, false>
164 <<<blocks, threads, smemSize>>>(d_idata, d_odata, size);
165 break;
166
167 case 1:
168 Kernels::reduce_kernel<size_type, value_type, 1, false>
169 <<<blocks, threads, smemSize>>>(d_idata, d_odata, size);
170 break;
171 }
172 }
173}
175template <typename ExecSpace, typename Vector>
176typename Vector::value_type reduce(
177 const Vector& in, typename Vector::size_type size,
178 typename std::enable_if_t<
179 Morpheus::is_dense_vector_format_container_v<Vector> &&
180 Morpheus::has_custom_backend_v<ExecSpace> &&
181 Morpheus::has_cuda_execution_space_v<ExecSpace> &&
182 Morpheus::has_access_v<ExecSpace, Vector>>* = nullptr) {
183 using size_type = typename Vector::size_type;
184 using value_type = typename Vector::value_type;
185
186 value_type result = 0;
187 const size_type maxThreads = 256; // number of threads per block
188 const size_type maxBlocks =
189 min((size_type)size / maxThreads + 1, (size_type)MAX_BLOCK_DIM_SIZE);
190 const size_type cpuFinalThreshold = WARP_SIZE;
191 size_type numBlocks = 0;
192 size_type numThreads = 0;
193
194 getNumBlocksAndThreads<size_type>(size, maxBlocks, maxThreads, numBlocks,
195 numThreads);
196
197 Vector inter_sums(maxBlocks, 0);
198 Vector out(numBlocks, 0);
199 reduce<ExecSpace>(in, out, size, numThreads, numBlocks);
200#if defined(DEBUG) || defined(MORPHEUS_DEBUG)
201 getLastCudaError("reduce_kernel: Kernel execution failed");
202#endif
203
204 // sum partial block sums on GPU
205 size_type reduced_size = numBlocks;
206
207 while (reduced_size > cpuFinalThreshold) {
208 size_type threads = 0, blocks = 0;
209 getNumBlocksAndThreads<size_type>(reduced_size, maxBlocks, maxThreads,
210 blocks, threads);
211 Impl::copy(out, inter_sums, 0, reduced_size, 0, reduced_size);
212
213 Impl::reduce<ExecSpace>(inter_sums, out, reduced_size, threads, blocks);
214#if defined(DEBUG) || defined(MORPHEUS_DEBUG)
215 getLastCudaError("reduce_kernel: Kernel execution failed");
216#endif
217
218 reduced_size = (reduced_size + (threads * 2 - 1)) / (threads * 2);
219 }
220
221 if (reduced_size > 1) {
222 typename Vector::HostMirror h_out(reduced_size, 0);
223 // copy result from device to host
224 Impl::copy(out, h_out, 0, reduced_size, 0, reduced_size);
225 result = Impl::reduce<Morpheus::Serial>(h_out, reduced_size);
226 } else {
227 // copy final sum from device to host
228 typename Vector::HostMirror h_out(1, 0);
229 Impl::copy(out, h_out, 0, 1, 0, 1);
230 result = h_out[0];
231 }
232
233 return result;
234}

}  // namespace Impl
}  // namespace Morpheus

#endif  // MORPHEUS_ENABLE_CUDA
#endif  // MORPHEUS_DENSEVECTOR_CUDA_REDUCTION_IMPL_HPP