Morpheus 1.0.0
Dynamic matrix type and algorithms for sparse matrices
Csr/Kernels/Morpheus_Multiply_Impl.hpp
#ifndef MORPHEUS_CSR_KERNELS_MULTIPLY_IMPL_HPP
#define MORPHEUS_CSR_KERNELS_MULTIPLY_IMPL_HPP

#include <Morpheus_Macros.hpp>
#if defined(MORPHEUS_ENABLE_CUDA) || defined(MORPHEUS_ENABLE_HIP)

namespace Morpheus {
namespace Impl {

namespace Kernels {
// One thread per row
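// Example CSR layout (illustrative comment, not from the original source):
// for
//   A = [[10,  0, 20],
//        [ 0,  0,  0],
//        [ 0, 30, 40]]
// the arrays are Ap = {0, 2, 2, 4} (row offsets), Aj = {0, 2, 1, 2} (column
// indices) and Ax = {10, 20, 30, 40} (values). Thread t accumulates
// y[t] += dot(A[t,:], x) by sweeping the index range Ap[t]..Ap[t+1].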
template <typename SizeType, typename IndexType, typename ValueType>
__global__ void spmv_csr_scalar_kernel(const SizeType nrows,
                                       const IndexType* Ap, const IndexType* Aj,
                                       const ValueType* Ax, const ValueType* x,
                                       ValueType* y) {
  const SizeType thread_id = blockDim.x * blockIdx.x + threadIdx.x;
  const SizeType grid_size = gridDim.x * blockDim.x;

  for (SizeType row = thread_id; row < nrows; row += grid_size) {
    const IndexType row_start = Ap[row];
    const IndexType row_end   = Ap[row + 1];

    ValueType sum = ValueType(0);

    for (IndexType jj = row_start; jj < row_end; jj++)
      sum += Ax[jj] * x[Aj[jj]];

    y[row] += sum;
  }
}
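
// Illustrative sketch, not part of the original header: one plausible
// host-side launch of the scalar kernel. The 256-thread block size and the
// helper name are assumptions for illustration only; Morpheus reaches this
// kernel through its own multiply() dispatch layer.
template <typename SizeType, typename IndexType, typename ValueType>
void example_launch_spmv_csr_scalar(const SizeType nrows, const IndexType* Ap,
                                    const IndexType* Aj, const ValueType* Ax,
                                    const ValueType* x, ValueType* y) {
  const size_t BLOCK_SIZE = 256;  // assumed block size
  const size_t NUM_BLOCKS = (nrows + BLOCK_SIZE - 1) / BLOCK_SIZE;
  // The grid-stride loop inside the kernel makes any positive grid size valid.
  spmv_csr_scalar_kernel<SizeType, IndexType, ValueType>
      <<<NUM_BLOCKS, BLOCK_SIZE>>>(nrows, Ap, Aj, Ax, x, y);
}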

// CSR SpMV kernels based on a vector model (one warp per row)
//
// spmv_csr_vector_kernel
// Each row of the CSR matrix is assigned to a warp. The warp computes
// y[i] = A[i,:] * x, i.e. the dot product of the i-th row of A with
// the x vector, in parallel. This division of work implies that
// the CSR index and data arrays (Aj and Ax) are accessed in a contiguous
// manner (but generally not aligned). On GT200 these accesses are
// coalesced, unlike kernels based on the one-row-per-thread division of
// work. Since an entire 32-thread warp is assigned to each row, many
// threads will remain idle when their row contains a small number
// of elements. This code relies on implicit synchronization among
// threads in a warp; note that on CUDA devices with independent thread
// scheduling (Volta and later) such implicit synchronization is no longer
// guaranteed and may require explicit warp synchronization.
//
// Note: THREADS_PER_VECTOR must be one of [2,4,8,16,32] (on HIP, where the
// wavefront is 64 threads wide, 64 is handled as well; see below)

template <typename SizeType, typename IndexType, typename ValueType,
          size_t VECTORS_PER_BLOCK, size_t THREADS_PER_VECTOR>
__global__ void spmv_csr_vector_kernel(const SizeType nrows,
                                       const IndexType* Ap, const IndexType* Aj,
                                       const ValueType* Ax, const ValueType* x,
                                       ValueType* y) {
  __shared__ volatile ValueType
      sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR +
            THREADS_PER_VECTOR / 2];  // padded to avoid reduction conditionals
  __shared__ volatile IndexType ptrs[VECTORS_PER_BLOCK][2];

  const SizeType THREADS_PER_BLOCK = VECTORS_PER_BLOCK * THREADS_PER_VECTOR;

  const SizeType thread_id =
      THREADS_PER_BLOCK * blockIdx.x + threadIdx.x;  // global thread index
  const SizeType thread_lane =
      threadIdx.x & (THREADS_PER_VECTOR - 1);  // thread index within the vector
  const SizeType vector_id =
      thread_id / THREADS_PER_VECTOR;  // global vector index
  const SizeType vector_lane =
      threadIdx.x / THREADS_PER_VECTOR;  // vector index within the block
  const SizeType num_vectors =
      VECTORS_PER_BLOCK * gridDim.x;  // total number of active vectors

  for (SizeType row = vector_id; row < nrows; row += num_vectors) {
    // use two threads to fetch Ap[row] and Ap[row+1]
    // this is considerably faster than the straightforward version
    if (thread_lane < 2) ptrs[vector_lane][thread_lane] = Ap[row + thread_lane];

    const IndexType row_start =
        ptrs[vector_lane][0];  // same as: row_start = Ap[row];
    const IndexType row_end =
        ptrs[vector_lane][1];  // same as: row_end = Ap[row+1];

    // initialize local sum
    ValueType sum = ValueType(0);

    if (THREADS_PER_VECTOR == WARP_SIZE && row_end - row_start > WARP_SIZE) {
      // ensure aligned memory access to Aj and Ax

      IndexType jj =
          row_start - (row_start & (THREADS_PER_VECTOR - 1)) + thread_lane;

      // accumulate the (possibly partial) first chunk, starting at the
      // aligned boundary at or before row_start
      if (jj >= row_start && jj < row_end) sum += Ax[jj] * x[Aj[jj]];

      // accumulate the remaining, aligned chunks
      for (jj += THREADS_PER_VECTOR; jj < row_end; jj += THREADS_PER_VECTOR)
        sum += Ax[jj] * x[Aj[jj]];
    } else {
      // accumulate local sums
      for (IndexType jj = row_start + thread_lane; jj < row_end;
           jj += THREADS_PER_VECTOR)
        sum += Ax[jj] * x[Aj[jj]];
    }

    // store local sum in shared memory
    sdata[threadIdx.x] = sum;

    ValueType temp;

#if defined(MORPHEUS_ENABLE_HIP)
    // extra reduction step for HIP, where the 64-wide wavefront allows
    // THREADS_PER_VECTOR == 64
    if (THREADS_PER_VECTOR > 32) {
      temp = sdata[threadIdx.x + 32];
      sdata[threadIdx.x] = sum += temp;
    }
#endif  // MORPHEUS_ENABLE_HIP

    // reduce local sums to row sum
    if (THREADS_PER_VECTOR > 16) {
      temp = sdata[threadIdx.x + 16];
      sdata[threadIdx.x] = sum += temp;
    }
    if (THREADS_PER_VECTOR > 8) {
      temp = sdata[threadIdx.x + 8];
      sdata[threadIdx.x] = sum += temp;
    }
    if (THREADS_PER_VECTOR > 4) {
      temp = sdata[threadIdx.x + 4];
      sdata[threadIdx.x] = sum += temp;
    }
    if (THREADS_PER_VECTOR > 2) {
      temp = sdata[threadIdx.x + 2];
      sdata[threadIdx.x] = sum += temp;
    }
    if (THREADS_PER_VECTOR > 1) {
      temp = sdata[threadIdx.x + 1];
      sdata[threadIdx.x] = sum += temp;
    }

    // first thread of each vector writes the row result
    if (thread_lane == 0) y[row] += ValueType(sdata[threadIdx.x]);
  }
}
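
// Illustrative sketch, not part of the original header: one way the vector
// kernel could be configured. A common heuristic (not necessarily the one
// Morpheus uses) picks THREADS_PER_VECTOR as the smallest power of two at
// least the mean number of nonzeros per row, capped at the warp size; the
// 128-thread block below is likewise an assumption.
template <size_t THREADS_PER_VECTOR, typename SizeType, typename IndexType,
          typename ValueType>
void example_launch_spmv_csr_vector(const SizeType nrows, const IndexType* Ap,
                                    const IndexType* Aj, const ValueType* Ax,
                                    const ValueType* x, ValueType* y) {
  constexpr size_t THREADS_PER_BLOCK = 128;  // assumed block size
  constexpr size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR;
  const size_t NUM_BLOCKS =
      (nrows + VECTORS_PER_BLOCK - 1) / VECTORS_PER_BLOCK;
  spmv_csr_vector_kernel<SizeType, IndexType, ValueType, VECTORS_PER_BLOCK,
                         THREADS_PER_VECTOR>
      <<<NUM_BLOCKS, THREADS_PER_BLOCK>>>(nrows, Ap, Aj, Ax, x, y);
}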

}  // namespace Kernels
}  // namespace Impl
}  // namespace Morpheus

#endif  // MORPHEUS_ENABLE_CUDA || MORPHEUS_ENABLE_HIP

#endif  // MORPHEUS_CSR_KERNELS_MULTIPLY_IMPL_HPP