25#ifndef MORPHEUS_CSR_KERNELS_MULTIPLY_IMPL_HPP
26#define MORPHEUS_CSR_KERNELS_MULTIPLY_IMPL_HPP
28#include <Morpheus_Macros.hpp>
29#if defined(MORPHEUS_ENABLE_CUDA) || defined(MORPHEUS_ENABLE_HIP)
36template <
typename SizeType,
typename IndexType,
typename ValueType>
37__global__
void spmv_csr_scalar_kernel(
const SizeType nrows,
38 const IndexType* Ap,
const IndexType* Aj,
39 const ValueType* Ax,
const ValueType* x,
41 const SizeType thread_id = blockDim.x * blockIdx.x + threadIdx.x;
42 const SizeType grid_size = gridDim.x * blockDim.x;
44 for (SizeType row = thread_id; row < nrows; row += grid_size) {
45 const IndexType row_start = Ap[row];
46 const IndexType row_end = Ap[row + 1];
48 ValueType sum = ValueType(0);
50 for (IndexType jj = row_start; jj < row_end; jj++)
51 sum += Ax[jj] * x[Aj[jj]];
75template <
typename SizeType,
typename IndexType,
typename ValueType,
76 size_t VECTORS_PER_BLOCK,
size_t THREADS_PER_VECTOR>
77__global__
void spmv_csr_vector_kernel(
const SizeType nrows,
78 const IndexType* Ap,
const IndexType* Aj,
79 const ValueType* Ax,
const ValueType* x,
81 __shared__
volatile ValueType
82 sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR +
83 THREADS_PER_VECTOR / 2];
84 __shared__
volatile IndexType ptrs[VECTORS_PER_BLOCK][2];
86 const SizeType THREADS_PER_BLOCK = VECTORS_PER_BLOCK * THREADS_PER_VECTOR;
88 const SizeType thread_id =
89 THREADS_PER_BLOCK * blockIdx.x + threadIdx.x;
90 const SizeType thread_lane =
91 threadIdx.x & (THREADS_PER_VECTOR - 1);
92 const SizeType vector_id =
93 thread_id / THREADS_PER_VECTOR;
94 const SizeType vector_lane =
95 threadIdx.x / THREADS_PER_VECTOR;
96 const SizeType num_vectors =
97 VECTORS_PER_BLOCK * gridDim.x;
99 for (SizeType row = vector_id; row < nrows; row += num_vectors) {
102 if (thread_lane < 2) ptrs[vector_lane][thread_lane] = Ap[row + thread_lane];
104 const IndexType row_start =
105 ptrs[vector_lane][0];
106 const IndexType row_end =
107 ptrs[vector_lane][1];
110 ValueType sum = ValueType(0);
112 if (THREADS_PER_VECTOR == WARP_SIZE && row_end - row_start > WARP_SIZE) {
116 row_start - (row_start & (THREADS_PER_VECTOR - 1)) + thread_lane;
119 if (jj >= row_start && jj < row_end) sum += Ax[jj] * x[Aj[jj]];
122 for (jj += THREADS_PER_VECTOR; jj < row_end; jj += THREADS_PER_VECTOR)
123 sum += Ax[jj] * x[Aj[jj]];
126 for (IndexType jj = row_start + thread_lane; jj < row_end;
127 jj += THREADS_PER_VECTOR)
128 sum += Ax[jj] * x[Aj[jj]];
132 sdata[threadIdx.x] = sum;
136#if defined(MORPHEUS_ENABLE_HIP)
137 if (THREADS_PER_VECTOR > 32) {
138 temp = sdata[threadIdx.x + 32];
139 sdata[threadIdx.x] = sum += temp;
144 if (THREADS_PER_VECTOR > 16) {
145 temp = sdata[threadIdx.x + 16];
146 sdata[threadIdx.x] = sum += temp;
148 if (THREADS_PER_VECTOR > 8) {
149 temp = sdata[threadIdx.x + 8];
150 sdata[threadIdx.x] = sum += temp;
152 if (THREADS_PER_VECTOR > 4) {
153 temp = sdata[threadIdx.x + 4];
154 sdata[threadIdx.x] = sum += temp;
156 if (THREADS_PER_VECTOR > 2) {
157 temp = sdata[threadIdx.x + 2];
158 sdata[threadIdx.x] = sum += temp;
160 if (THREADS_PER_VECTOR > 1) {
161 temp = sdata[threadIdx.x + 1];
162 sdata[threadIdx.x] = sum += temp;
166 if (thread_lane == 0) y[row] += ValueType(sdata[threadIdx.x]);
Generic Morpheus interfaces.
Definition: dummy.cpp:24