24#ifndef MORPHEUS_DENSEVECTOR_HIP_REDUCTION_IMPL_HPP
25#define MORPHEUS_DENSEVECTOR_HIP_REDUCTION_IMPL_HPP
27#include <Morpheus_Macros.hpp>
28#if defined(MORPHEUS_ENABLE_HIP)
30#include <Morpheus_SpaceTraits.hpp>
31#include <Morpheus_FormatTraits.hpp>
32#include <Morpheus_FormatTags.hpp>
33#include <Morpheus_Spaces.hpp>
35#include <impl/DenseVector/Morpheus_Copy_Impl.hpp>
36#include <impl/DenseVector/Kernels/Morpheus_Reduction_Impl.hpp>
37#include <impl/DenseVector/Serial/Morpheus_Reduction_Impl.hpp>
38#include <impl/Morpheus_HIPUtils.hpp>
43template <
typename ExecSpace,
typename Vector,
typename SizeType>
44void reduce(
const Vector& in, Vector& out, SizeType size, SizeType threads,
46 typename std::enable_if_t<
47 Morpheus::is_dense_vector_format_container_v<Vector> &&
48 Morpheus::has_custom_backend_v<ExecSpace> &&
49 Morpheus::has_hip_execution_space_v<ExecSpace> &&
50 Morpheus::has_access_v<ExecSpace, Vector>>* =
nullptr) {
51 using size_type =
typename Vector::size_type;
52 using value_type =
typename Vector::value_type;
54 value_type* d_idata = in.data();
55 value_type* d_odata = out.data();
59 SizeType smemSize = ((threads / 32) + 1) *
sizeof(value_type);
60 if (isPow2<SizeType>(size)) {
63 Kernels::reduce_kernel<size_type, value_type, 1024, true>
64 <<<blocks, threads, smemSize>>>(d_idata, d_odata, size);
67 Kernels::reduce_kernel<size_type, value_type, 512, true>
68 <<<blocks, threads, smemSize>>>(d_idata, d_odata, size);
72 Kernels::reduce_kernel<size_type, value_type, 256, true>
73 <<<blocks, threads, smemSize>>>(d_idata, d_odata, size);
77 Kernels::reduce_kernel<size_type, value_type, 128, true>
78 <<<blocks, threads, smemSize>>>(d_idata, d_odata, size);
82 Kernels::reduce_kernel<size_type, value_type, 64, true>
83 <<<blocks, threads, smemSize>>>(d_idata, d_odata, size);
87 Kernels::reduce_kernel<size_type, value_type, 32, true>
88 <<<blocks, threads, smemSize>>>(d_idata, d_odata, size);
92 Kernels::reduce_kernel<size_type, value_type, 16, true>
93 <<<blocks, threads, smemSize>>>(d_idata, d_odata, size);
97 Kernels::reduce_kernel<size_type, value_type, 8, true>
98 <<<blocks, threads, smemSize>>>(d_idata, d_odata, size);
102 Kernels::reduce_kernel<size_type, value_type, 4, true>
103 <<<blocks, threads, smemSize>>>(d_idata, d_odata, size);
107 Kernels::reduce_kernel<size_type, value_type, 2, true>
108 <<<blocks, threads, smemSize>>>(d_idata, d_odata, size);
112 Kernels::reduce_kernel<size_type, value_type, 1, true>
113 <<<blocks, threads, smemSize>>>(d_idata, d_odata, size);
119 Kernels::reduce_kernel<size_type, value_type, 1024, false>
120 <<<blocks, threads, smemSize>>>(d_idata, d_odata, size);
123 Kernels::reduce_kernel<size_type, value_type, 512, false>
124 <<<blocks, threads, smemSize>>>(d_idata, d_odata, size);
128 Kernels::reduce_kernel<size_type, value_type, 256, false>
129 <<<blocks, threads, smemSize>>>(d_idata, d_odata, size);
133 Kernels::reduce_kernel<size_type, value_type, 128, false>
134 <<<blocks, threads, smemSize>>>(d_idata, d_odata, size);
138 Kernels::reduce_kernel<size_type, value_type, 64, false>
139 <<<blocks, threads, smemSize>>>(d_idata, d_odata, size);
143 Kernels::reduce_kernel<size_type, value_type, 32, false>
144 <<<blocks, threads, smemSize>>>(d_idata, d_odata, size);
148 Kernels::reduce_kernel<size_type, value_type, 16, false>
149 <<<blocks, threads, smemSize>>>(d_idata, d_odata, size);
153 Kernels::reduce_kernel<size_type, value_type, 8, false>
154 <<<blocks, threads, smemSize>>>(d_idata, d_odata, size);
158 Kernels::reduce_kernel<size_type, value_type, 4, false>
159 <<<blocks, threads, smemSize>>>(d_idata, d_odata, size);
163 Kernels::reduce_kernel<size_type, value_type, 2, false>
164 <<<blocks, threads, smemSize>>>(d_idata, d_odata, size);
168 Kernels::reduce_kernel<size_type, value_type, 1, false>
169 <<<blocks, threads, smemSize>>>(d_idata, d_odata, size);
175template <
typename ExecSpace,
typename Vector>
176typename Vector::value_type reduce(
177 const Vector& in,
typename Vector::size_type size,
178 typename std::enable_if_t<
179 Morpheus::is_dense_vector_format_container_v<Vector> &&
180 Morpheus::has_custom_backend_v<ExecSpace> &&
181 Morpheus::has_hip_execution_space_v<ExecSpace> &&
182 Morpheus::has_access_v<ExecSpace, Vector>>* =
nullptr) {
183 using size_type =
typename Vector::size_type;
184 using value_type =
typename Vector::value_type;
186 value_type result = 0;
187 const size_type maxThreads = 256;
188 const size_type maxBlocks =
189 min((size_type)size / maxThreads + 1, (size_type)MAX_BLOCK_DIM_SIZE);
190 const size_type cpuFinalThreshold = WARP_SIZE;
191 size_type numBlocks = 0;
192 size_type numThreads = 0;
194 getNumBlocksAndThreads<size_type>(size, maxBlocks, maxThreads, numBlocks,
197 Vector inter_sums(maxBlocks, 0);
198 Vector out(numBlocks, 0);
199 Impl::reduce<ExecSpace>(in, out, size, numThreads, numBlocks);
200#if defined(DEBUG) || defined(MORPHEUS_DEBUG)
201 getLastHIPError(
"reduce_kernel: Kernel execution failed");
205 size_type reduced_size = numBlocks;
207 while (reduced_size > cpuFinalThreshold) {
208 size_type threads = 0, blocks = 0;
209 getNumBlocksAndThreads<size_type>(reduced_size, maxBlocks, maxThreads,
211 Impl::copy(out, inter_sums, 0, reduced_size, 0, reduced_size);
213 Impl::reduce<ExecSpace>(inter_sums, out, reduced_size, threads, blocks);
214#if defined(DEBUG) || defined(MORPHEUS_DEBUG)
215 getLastHIPError(
"reduce_kernel: Kernel execution failed");
218 reduced_size = (reduced_size + (threads * 2 - 1)) / (threads * 2);
221 if (reduced_size > 1) {
222 typename Vector::HostMirror h_out(s, 0);
224 Impl::copy(out, h_out, 0, reduced_size, 0, reduced_size);
225 result = reduce<Kokkos::Serial>(h_out, reduced_size);
228 typename Vector::HostMirror h_out(1, 0);
229 Impl::copy(out, h_out, 0, 1, 0, 1);
Generic Morpheus interfaces.
Definition: dummy.cpp:24