#ifndef MORPHEUS_COO_KERNELS_MULTIPLY_IMPL_HPP
#define MORPHEUS_COO_KERNELS_MULTIPLY_IMPL_HPP

#include <Morpheus_Macros.hpp>

#if defined(MORPHEUS_ENABLE_CUDA) || defined(MORPHEUS_ENABLE_HIP)
template <typename IndexType, typename ValueType>
__device__ void segreduce_block(const IndexType* idx, ValueType* val);
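// Reference kernel: a single thread walks all nnnz entries and accumulates
// y[I[n]] += V[n] * x[J[n]].  It is typically launched with one thread only,
// either for very small matrices or for the tail of entries left over after
// the warp-parallel flat kernel below has processed whole warp-sized chunks.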
template <typename SizeType, typename IndexType, typename ValueType>
__global__ void spmv_coo_serial_kernel(const SizeType nnnz, const IndexType* I,
                                       const IndexType* J, const ValueType* V,
                                       const ValueType* x, ValueType* y) {
  for (SizeType n = 0; n < nnnz; n++) {
    y[I[n]] += V[n] * x[J[n]];
  }
}
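// The flat COO kernel below follows the classic CUSP scheme: the nonzeros
// (assumed sorted by row index) are split into intervals of interval_size
// entries and each interval is assigned to one warp.  Within a warp, every
// thread computes one product V[n] * x[J[n]] per iteration, and products that
// belong to the same row are combined with a warp-level segmented scan over
// the shared arrays rows[]/vals[].  Rows that are completed inside the warp
// are added to y directly; each warp's final (row, partial sum) carry is
// written to temp_rows/temp_vals and folded into y afterwards by
// spmv_coo_reduce_update_kernel.  The rows[] array is padded with MID_LANE
// entries of -1 per warp so the scan may read rows[idx - k] without bounds
// checks.  WARP_SIZE is assumed here to be the platform warp/wavefront width
// (32 for CUDA, 64 for HIP) made available through Morpheus_Macros.hpp, and
// BLOCK_SIZE is the compile-time thread-block size supplied as a template
// parameter.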
template <typename SizeType, typename IndexType, typename ValueType,
          size_t BLOCK_SIZE>
__launch_bounds__(BLOCK_SIZE, 1) __global__
    void spmv_coo_flat_kernel(const SizeType nnnz, const SizeType interval_size,
                              const IndexType* I, const IndexType* J,
                              const ValueType* V, const ValueType* x,
                              ValueType* y, IndexType* temp_rows,
                              ValueType* temp_vals) {
  const SizeType MID_LANE  = WARP_SIZE / 2;
  const SizeType LAST_LANE = WARP_SIZE - 1;

  __shared__ volatile IndexType
      rows[(WARP_SIZE + MID_LANE) * (BLOCK_SIZE / WARP_SIZE)];
  __shared__ volatile ValueType vals[BLOCK_SIZE];

  const SizeType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x;  // global thread index
  const SizeType thread_lane = threadIdx.x & (WARP_SIZE - 1);        // lane within the warp
  const SizeType warp_id = thread_id / WARP_SIZE;                    // global warp index

  const SizeType interval_begin = warp_id * interval_size;  // warp's offset into I, J, V
  const SizeType interval_end = Morpheus::Impl::min(
      interval_begin + interval_size, nnnz);

  // thread's index into the padded rows[] array (MID_LANE padding slots per warp)
  const SizeType idx = MID_LANE * (threadIdx.x / WARP_SIZE + 1) + threadIdx.x;

  rows[idx - MID_LANE] = -1;  // fill padding with an invalid row index

  if (interval_begin >= interval_end)  // warp has no work to do
    return;

  if (thread_lane == WARP_SIZE - 1) {
    // initialize the carry-in values
    rows[idx]         = I[interval_begin];
    vals[threadIdx.x] = ValueType(0);
  }

  for (SizeType n = interval_begin + thread_lane; n < interval_end;
       n += WARP_SIZE) {
    IndexType row = I[n];            // row index (i)
    ValueType val = V[n] * x[J[n]];  // A(i,j) * x(j)

    if (thread_lane == 0) {
      if (row == rows[idx + LAST_LANE])
        val += ValueType(vals[threadIdx.x + LAST_LANE]);  // row continues
      else
        y[rows[idx + LAST_LANE]] +=
            ValueType(vals[threadIdx.x + LAST_LANE]);  // row terminated
    }

    rows[idx]         = row;
    vals[threadIdx.x] = val;

    // intra-warp segmented scan over vals[], keyed by rows[]
    if (row == rows[idx - 1]) {
      vals[threadIdx.x] = val += ValueType(vals[threadIdx.x - 1]);
    }
    if (row == rows[idx - 2]) {
      vals[threadIdx.x] = val += ValueType(vals[threadIdx.x - 2]);
    }
    if (row == rows[idx - 4]) {
      vals[threadIdx.x] = val += ValueType(vals[threadIdx.x - 4]);
    }
    if (row == rows[idx - 8]) {
      vals[threadIdx.x] = val += ValueType(vals[threadIdx.x - 8]);
    }
    if (row == rows[idx - 16]) {
      vals[threadIdx.x] = val += ValueType(vals[threadIdx.x - 16]);
    }
#if defined(MORPHEUS_ENABLE_HIP)
    // HIP wavefronts are 64 lanes wide, so one extra scan step is required.
    if (row == rows[idx - 32]) {
      vals[threadIdx.x] = val += ValueType(vals[threadIdx.x - 32]);
    }
#endif  // MORPHEUS_ENABLE_HIP

    if (thread_lane < LAST_LANE && row != rows[idx + 1])
      y[row] += ValueType(vals[threadIdx.x]);  // row terminated within the warp
  }

  if (thread_lane == LAST_LANE) {
    // write the carry-out values for spmv_coo_reduce_update_kernel
    temp_rows[warp_id] = IndexType(rows[idx]);
    temp_vals[warp_id] = ValueType(vals[threadIdx.x]);
  }
}
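// Second phase of the flat SpMV: the num_warps (row, partial sum) carry pairs
// emitted by spmv_coo_flat_kernel are streamed through shared memory in
// chunks of BLOCK_SIZE, reduced with segreduce_block(), and every completed
// segment is added into y.  The loop structure assumes the kernel is launched
// as a single block of BLOCK_SIZE threads.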
template <typename SizeType, typename IndexType, typename ValueType,
          size_t BLOCK_SIZE>
__launch_bounds__(BLOCK_SIZE, 1) __global__
    void spmv_coo_reduce_update_kernel(const SizeType num_warps,
                                       const IndexType* temp_rows,
                                       const ValueType* temp_vals,
                                       ValueType* y) {
  __shared__ IndexType rows[BLOCK_SIZE + 1];
  __shared__ ValueType vals[BLOCK_SIZE + 1];

  const SizeType end = num_warps - (num_warps & (BLOCK_SIZE - 1));

  if (threadIdx.x == 0) {
    rows[BLOCK_SIZE] = (IndexType)-1;
    vals[BLOCK_SIZE] = (ValueType)0;
  }
  __syncthreads();

  SizeType i = threadIdx.x;

  // process full blocks of BLOCK_SIZE carry pairs
  while (i < end) {
    rows[threadIdx.x] = temp_rows[i];
    vals[threadIdx.x] = temp_vals[i];
    __syncthreads();

    segreduce_block(rows, vals);

    if (rows[threadIdx.x] != rows[threadIdx.x + 1])
      y[rows[threadIdx.x]] += vals[threadIdx.x];
    __syncthreads();

    i += BLOCK_SIZE;
  }

  // process the final, partially filled block
  if (end < num_warps) {
    if (i < num_warps) {
      rows[threadIdx.x] = temp_rows[i];
      vals[threadIdx.x] = temp_vals[i];
    } else {
      rows[threadIdx.x] = (IndexType)-1;
      vals[threadIdx.x] = (ValueType)0;
    }
    __syncthreads();

    segreduce_block(rows, vals);

    if (i < num_warps)
      if (rows[threadIdx.x] != rows[threadIdx.x + 1])
        y[rows[threadIdx.x]] += vals[threadIdx.x];
  }
}
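// Block-wide segmented reduction in shared memory: on return,
// val[threadIdx.x] holds the sum of all values at positions <= threadIdx.x
// that share the same segment index idx[threadIdx.x].  The scan uses strides
// 1, 2, ..., 256, so it supports thread blocks of up to 512 threads.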
template <typename IndexType, typename ValueType>
__device__ void segreduce_block(const IndexType* idx, ValueType* val) {
  ValueType left = 0;
  if (threadIdx.x >= 1 && idx[threadIdx.x] == idx[threadIdx.x - 1]) {
    left = val[threadIdx.x - 1];
  }
  __syncthreads();
  val[threadIdx.x] += left;
  left = 0;
  __syncthreads();
  if (threadIdx.x >= 2 && idx[threadIdx.x] == idx[threadIdx.x - 2]) {
    left = val[threadIdx.x - 2];
  }
  __syncthreads();
  val[threadIdx.x] += left;
  left = 0;
  __syncthreads();
  if (threadIdx.x >= 4 && idx[threadIdx.x] == idx[threadIdx.x - 4]) {
    left = val[threadIdx.x - 4];
  }
  __syncthreads();
  val[threadIdx.x] += left;
  left = 0;
  __syncthreads();
  if (threadIdx.x >= 8 && idx[threadIdx.x] == idx[threadIdx.x - 8]) {
    left = val[threadIdx.x - 8];
  }
  __syncthreads();
  val[threadIdx.x] += left;
  left = 0;
  __syncthreads();
  if (threadIdx.x >= 16 && idx[threadIdx.x] == idx[threadIdx.x - 16]) {
    left = val[threadIdx.x - 16];
  }
  __syncthreads();
  val[threadIdx.x] += left;
  left = 0;
  __syncthreads();
  if (threadIdx.x >= 32 && idx[threadIdx.x] == idx[threadIdx.x - 32]) {
    left = val[threadIdx.x - 32];
  }
  __syncthreads();
  val[threadIdx.x] += left;
  left = 0;
  __syncthreads();
  if (threadIdx.x >= 64 && idx[threadIdx.x] == idx[threadIdx.x - 64]) {
    left = val[threadIdx.x - 64];
  }
  __syncthreads();
  val[threadIdx.x] += left;
  left = 0;
  __syncthreads();
  if (threadIdx.x >= 128 && idx[threadIdx.x] == idx[threadIdx.x - 128]) {
    left = val[threadIdx.x - 128];
  }
  __syncthreads();
  val[threadIdx.x] += left;
  left = 0;
  __syncthreads();
  if (threadIdx.x >= 256 && idx[threadIdx.x] == idx[threadIdx.x - 256]) {
    left = val[threadIdx.x - 256];
  }
  __syncthreads();
  val[threadIdx.x] += left;
  left = 0;
  __syncthreads();
}
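// Illustrative host-side sketch (not part of the original header) of how the
// three kernels above are typically composed, following the CUSP-style
// dispatch this implementation derives from.  The helper name
// spmv_coo_flat_sketch, the fixed BLOCK_SIZE/MAX_BLOCKS values, and the direct
// use of the CUDA runtime API are illustrative assumptions rather than
// Morpheus API; the actual Morpheus dispatch is implemented elsewhere.
template <typename SizeType, typename IndexType, typename ValueType>
void spmv_coo_flat_sketch(const SizeType nnnz, const IndexType* I,
                          const IndexType* J, const ValueType* V,
                          const ValueType* x, ValueType* y) {
  constexpr size_t BLOCK_SIZE = 256;  // threads per block (assumed)
  constexpr size_t MAX_BLOCKS = 1024;  // resident-block cap (assumed; a real
                                       // dispatch would query occupancy)
  const size_t WARPS_PER_BLOCK = BLOCK_SIZE / WARP_SIZE;

  // Whole warps consume WARP_SIZE nonzeros per iteration; the leftover tail
  // is handled by the serial kernel.
  const SizeType num_units = nnnz / WARP_SIZE;
  const SizeType tail      = num_units * WARP_SIZE;

  if (num_units == 0) {  // too few entries to occupy even one warp
    spmv_coo_serial_kernel<<<1, 1>>>(nnnz, I, J, V, x, y);
    return;
  }

  const SizeType max_warps = SizeType(WARPS_PER_BLOCK * MAX_BLOCKS);
  const SizeType num_warps = num_units < max_warps ? num_units : max_warps;
  const SizeType num_blocks =
      SizeType((num_warps + WARPS_PER_BLOCK - 1) / WARPS_PER_BLOCK);
  const SizeType num_iters = (num_units + num_warps - 1) / num_warps;

  const SizeType interval_size = WARP_SIZE * num_iters;  // nonzeros per warp
  const SizeType active_warps  = (tail + interval_size - 1) / interval_size;

  // One carry-out (row, partial sum) pair per active warp.
  IndexType* temp_rows = nullptr;
  ValueType* temp_vals = nullptr;
  cudaMalloc((void**)&temp_rows, active_warps * sizeof(IndexType));
  cudaMalloc((void**)&temp_vals, active_warps * sizeof(ValueType));

  spmv_coo_flat_kernel<SizeType, IndexType, ValueType, BLOCK_SIZE>
      <<<num_blocks, BLOCK_SIZE>>>(tail, interval_size, I, J, V, x, y,
                                   temp_rows, temp_vals);
  spmv_coo_reduce_update_kernel<SizeType, IndexType, ValueType, BLOCK_SIZE>
      <<<1, BLOCK_SIZE>>>(active_warps, temp_rows, temp_vals, y);
  spmv_coo_serial_kernel<<<1, 1>>>(SizeType(nnnz - tail), I + tail, J + tail,
                                   V + tail, x, y);

  cudaFree(temp_rows);
  cudaFree(temp_vals);
}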
#endif  // MORPHEUS_ENABLE_CUDA || MORPHEUS_ENABLE_HIP

#endif  // MORPHEUS_COO_KERNELS_MULTIPLY_IMPL_HPP