Morpheus 1.0.0
Dynamic matrix type and algorithms for sparse matrices
Loading...
Searching...
No Matches
Coo/Cuda/Morpheus_Multiply_Impl.hpp
1
24#ifndef MORPHEUS_COO_CUDA_MULTIPLY_IMPL_HPP
25#define MORPHEUS_COO_CUDA_MULTIPLY_IMPL_HPP
26
27#include <Morpheus_Macros.hpp>
28#if defined(MORPHEUS_ENABLE_CUDA)
29
30#include <Morpheus_SpaceTraits.hpp>
31#include <Morpheus_FormatTraits.hpp>
32#include <Morpheus_FormatTags.hpp>
33#include <Morpheus_Spaces.hpp>
34
35#include <impl/Morpheus_CudaUtils.hpp>
36#include <impl/Coo/Kernels/Morpheus_Multiply_Impl.hpp>
37
38namespace Morpheus {
39namespace Impl {
40
41// forward decl
42template <typename Matrix, typename Vector>
43void __spmv_coo_flat(const Matrix& A, const Vector& x, Vector& y,
44 const bool init);
45
46template <typename Matrix, typename Vector>
47void __spmv_coo_serial(const Matrix& A, const Vector& x, Vector& y,
48 const bool init);
49
50template <typename ExecSpace, typename Matrix, typename Vector>
51inline void multiply(
52 const Matrix& A, const Vector& x, Vector& y, const bool init,
53 typename std::enable_if_t<
54 Morpheus::is_coo_matrix_format_container_v<Matrix> &&
55 Morpheus::is_dense_vector_format_container_v<Vector> &&
56 Morpheus::has_custom_backend_v<ExecSpace> &&
57 Morpheus::has_cuda_execution_space_v<ExecSpace> &&
58 Morpheus::has_access_v<ExecSpace, Matrix, Vector>>* = nullptr) {
59 switch (A.options()) {
60 case MATOPT_SHORT_ROWS: __spmv_coo_serial(A, x, y, init); break;
61 default: __spmv_coo_flat(A, x, y, init);
62 }
63}
64
65template <typename Matrix, typename Vector>
66void __spmv_coo_serial(const Matrix& A, const Vector& x, Vector& y,
67 const bool init) {
68 using size_type = typename Matrix::size_type;
69 using index_type = typename Matrix::index_type;
70 using value_type = typename Matrix::value_type;
71 const index_type* I = A.crow_indices().data();
72 const index_type* J = A.ccolumn_indices().data();
73 const value_type* V = A.cvalues().data();
74
75 const value_type* x_ptr = x.data();
76 value_type* y_ptr = y.data();
77
78 if (init) {
79 y.assign(y.size(), 0);
80 }
81
82 Kernels::spmv_coo_serial_kernel<size_type, index_type, value_type>
83 <<<1, 1>>>(A.nnnz(), I, J, V, x_ptr, y_ptr);
84
85#if defined(DEBUG) || defined(MORPHEUS_DEBUG)
86 getLastCudaError("spmv_coo_serial_kernel: Kernel execution failed");
87#endif
88}
89
91// COO SpMV kernel which flattens data irregularity (segmented reduction)
93// Copyright 2008-2014 NVIDIA Corporation
94// spmv_coo_flat
95// The input coo_matrix must be sorted by row. Columns within each row
96// may appear in any order and duplicate entries are also acceptable.
97// This sorted COO format is easily obtained by expanding the row pointer
98// of a CSR matrix (csr.Ap) into proper row indices and then copying
99// the arrays containing the CSR column indices (csr.Aj) and nonzero values
100// (csr.Ax) verbatim. A segmented reduction is used to compute the per-row
101// sums.
102//
103//
104template <typename Matrix, typename Vector>
105void __spmv_coo_flat(const Matrix& A, const Vector& x, Vector& y,
106 const bool init) {
107 using size_type = typename Matrix::size_type;
108 using index_type = typename Matrix::index_type;
109 using value_type = typename Matrix::value_type;
110
111 if (init) {
112 y.assign(y.size(), 0);
113 }
114
115 if (A.nnnz() == 0) {
116 // empty matrix
117 return;
118 } else if (A.nnnz() < static_cast<size_type>(WARP_SIZE)) {
119 // small matrix
120 Kernels::spmv_coo_serial_kernel<size_type, index_type, value_type>
121 <<<1, 1, 0>>>(A.nnnz(), A.crow_indices().data(),
122 A.ccolumn_indices().data(), A.cvalues().data(), x.data(),
123 y.data());
124#if defined(DEBUG) || defined(MORPHEUS_DEBUG)
125 getLastCudaError("spmv_coo_serial_kernel: Kernel execution failed");
126#endif
127 return;
128 }
129
130 const size_type BLOCK_SIZE = 256;
131 const size_type MAX_BLOCKS =
132 max_active_blocks(Kernels::spmv_coo_flat_kernel<size_type, index_type,
133 value_type, BLOCK_SIZE>,
134 BLOCK_SIZE, 0);
135 const size_type WARPS_PER_BLOCK = BLOCK_SIZE / WARP_SIZE;
136
137 const size_type num_units = A.nnnz() / WARP_SIZE;
138 const size_type num_warps = std::min(num_units, WARPS_PER_BLOCK * MAX_BLOCKS);
139 const size_type num_blocks =
140 Impl::ceil_div<size_type>(num_warps, WARPS_PER_BLOCK);
141 const size_type num_iters = Impl::ceil_div<size_type>(num_units, num_warps);
142
143 const size_type interval_size = WARP_SIZE * num_iters;
144
145 const size_type tail =
146 num_units * WARP_SIZE; // do the last few nonzeros separately (fewer
147 // than WARP_SIZE elements)
148
149 const size_type active_warps =
150 (interval_size == 0) ? 0 : Impl::ceil_div<size_type>(tail, interval_size);
151
152 typename Matrix::index_array_type temp_rows(active_warps, 0);
153 typename Matrix::value_array_type temp_vals(active_warps, 0);
154
155 Kernels::spmv_coo_flat_kernel<size_type, index_type, value_type, BLOCK_SIZE>
156 <<<num_blocks, BLOCK_SIZE, 0>>>(
157 tail, interval_size, A.crow_indices().data(),
158 A.ccolumn_indices().data(), A.cvalues().data(), x.data(), y.data(),
159 temp_rows.data(), temp_vals.data());
160#if defined(DEBUG) || defined(MORPHEUS_DEBUG)
161 getLastCudaError("spmv_coo_flat_kernel: Kernel execution failed");
162#endif
163
164 Kernels::spmv_coo_reduce_update_kernel<size_type, index_type, value_type,
165 BLOCK_SIZE><<<1, BLOCK_SIZE, 0>>>(
166 active_warps, temp_rows.data(), temp_vals.data(), y.data());
167#if defined(DEBUG) || defined(MORPHEUS_DEBUG)
168 getLastCudaError("spmv_coo_reduce_kernel: Kernel execution failed");
169#endif
170
171 Kernels::spmv_coo_serial_kernel<size_type, index_type, value_type>
172 <<<1, 1, 0>>>(A.nnnz() - tail, A.crow_indices().data() + tail,
173 A.ccolumn_indices().data() + tail,
174 A.cvalues().data() + tail, x.data(), y.data());
175#if defined(DEBUG) || defined(MORPHEUS_DEBUG)
176 getLastCudaError("spmv_coo_serial_kernel: Kernel execution failed");
177#endif
178}
179
180} // namespace Impl
181} // namespace Morpheus
182
183#endif // MORPHEUS_ENABLE_CUDA
184#endif // MORPHEUS_COO_CUDA_MULTIPLY_IMPL_HPP
Generic Morpheus interfaces.
Definition: dummy.cpp:24