Morpheus 1.0.0
Dynamic matrix type and algorithms for sparse matrices
DenseVector/Kernels/Morpheus_Dot_Impl.hpp
#ifndef MORPHEUS_DENSEVECTOR_KERNELS_DOT_IMPL_HPP
#define MORPHEUS_DENSEVECTOR_KERNELS_DOT_IMPL_HPP

#if defined(MORPHEUS_ENABLE_HIP)
#include <impl/Morpheus_HIPUtils.hpp>
#elif defined(MORPHEUS_ENABLE_CUDA)
#include <impl/Morpheus_CudaUtils.hpp>
#endif

namespace Morpheus {
namespace Impl {
namespace Kernels {

// Element-wise product: res[i] = x[i] * y[i]. The partial products are
// reduced to a scalar separately.
template <typename ValueType, typename SizeType>
__global__ void dot_kernel(SizeType n, const ValueType* x, const ValueType* y,
                           ValueType* res) {
  const SizeType tid = blockDim.x * blockIdx.x + threadIdx.x;
  if (tid >= n) return;

  res[tid] = x[tid] * y[tid];
}

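// Illustrative launch for dot_kernel (a sketch, not part of this header):
// one thread per element, with res sized to hold all n partial products.
// The names n, x, y and res are assumed to be device-side values here.
//
//   const int threads = 256;
//   const int blocks  = (n + threads - 1) / threads;
//   dot_kernel<<<blocks, threads>>>(n, x, y, res);
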
// Phase 1 of the two-phase dot product: each block accumulates a grid-strided
// partial sum in shared memory and writes one value per block to workspace.
// NOTE: the unrolled tree reduction below assumes BLOCKSIZE == 256.
template <unsigned int BLOCKSIZE, typename ValueType, typename SizeType>
__launch_bounds__(BLOCKSIZE) __global__
    void dot_kernel_part1(SizeType n, const ValueType* x, const ValueType* y,
                          ValueType* workspace) {
  SizeType gid = blockIdx.x * BLOCKSIZE + threadIdx.x;
  SizeType inc = gridDim.x * BLOCKSIZE;

  ValueType sum = 0.0;
  for (SizeType idx = gid; idx < n; idx += inc) {
    sum += y[idx] * x[idx];
  }

  __shared__ ValueType sdata[BLOCKSIZE];
  sdata[threadIdx.x] = sum;

  __syncthreads();

  if (threadIdx.x < 128) sdata[threadIdx.x] += sdata[threadIdx.x + 128];
  __syncthreads();
  if (threadIdx.x < 64) sdata[threadIdx.x] += sdata[threadIdx.x + 64];
  __syncthreads();
  if (threadIdx.x < 32) sdata[threadIdx.x] += sdata[threadIdx.x + 32];
  __syncthreads();
  if (threadIdx.x < 16) sdata[threadIdx.x] += sdata[threadIdx.x + 16];
  __syncthreads();
  if (threadIdx.x < 8) sdata[threadIdx.x] += sdata[threadIdx.x + 8];
  __syncthreads();
  if (threadIdx.x < 4) sdata[threadIdx.x] += sdata[threadIdx.x + 4];
  __syncthreads();
  if (threadIdx.x < 2) sdata[threadIdx.x] += sdata[threadIdx.x + 2];
  __syncthreads();

  if (threadIdx.x == 0) {
    workspace[blockIdx.x] = sdata[0] + sdata[1];
  }
}

// Phase 2: a single block reduces the BLOCKSIZE per-block partial sums left
// in workspace by phase 1; the final result is stored in workspace[0].
// NOTE: as above, the unrolled reduction assumes BLOCKSIZE == 256.
template <unsigned int BLOCKSIZE, typename ValueType>
__launch_bounds__(BLOCKSIZE) __global__
    void dot_kernel_part2(ValueType* workspace) {
  __shared__ ValueType sdata[BLOCKSIZE];
  sdata[threadIdx.x] = workspace[threadIdx.x];

  __syncthreads();

  if (threadIdx.x < 128) sdata[threadIdx.x] += sdata[threadIdx.x + 128];
  __syncthreads();
  if (threadIdx.x < 64) sdata[threadIdx.x] += sdata[threadIdx.x + 64];
  __syncthreads();
  if (threadIdx.x < 32) sdata[threadIdx.x] += sdata[threadIdx.x + 32];
  __syncthreads();
  if (threadIdx.x < 16) sdata[threadIdx.x] += sdata[threadIdx.x + 16];
  __syncthreads();
  if (threadIdx.x < 8) sdata[threadIdx.x] += sdata[threadIdx.x + 8];
  __syncthreads();
  if (threadIdx.x < 4) sdata[threadIdx.x] += sdata[threadIdx.x + 4];
  __syncthreads();
  if (threadIdx.x < 2) sdata[threadIdx.x] += sdata[threadIdx.x + 2];
  __syncthreads();

  if (threadIdx.x == 0) {
    workspace[0] = sdata[0] + sdata[1];
  }
}

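// Illustrative host-side usage of the two-phase reduction (a sketch, not
// part of this header). Because dot_kernel_part2 reads BLOCKSIZE entries from
// the workspace, part1 must be launched with a grid of exactly BLOCKSIZE
// blocks so that every workspace slot is written. The names n, x, y and
// workspace are assumed device-side; workspace holds 256 ValueType entries.
//
//   dot_kernel_part1<256><<<256, 256>>>(n, x, y, workspace);
//   dot_kernel_part2<256><<<1, 256>>>(workspace);
//   // The dot product is left in workspace[0].
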
template <typename ValueType, typename SizeType>
__global__ void DOT_D_ini(SizeType n, ValueType* x, ValueType* y,
                          ValueType* valpha) {
  // Dynamic shared memory, declared as raw bytes and cast so the template can
  // be instantiated for more than one ValueType in the same translation unit.
  extern __shared__ unsigned char vtmp_raw[];
  ValueType* vtmp = reinterpret_cast<ValueType*>(vtmp_raw);
  // Each thread loads two elements from each chunk
  // from global to shared memory
  SizeType tid = threadIdx.x;
  SizeType NumBlk = gridDim.x;    // = 256
  SizeType BlkSize = blockDim.x;  // = 192
  SizeType Chunk = 2 * NumBlk * BlkSize;
  SizeType i = blockIdx.x * (2 * BlkSize) + tid;
  volatile ValueType* vtmp2 = vtmp;

  // Reduce from n to NumBlk * BlkSize elements. Each thread
  // operates with two elements of each chunk.
  vtmp[tid] = 0;
  while (i < n) {
    vtmp[tid] += x[i] * y[i];
    vtmp[tid] += (i + BlkSize < n) ? (x[i + BlkSize] * y[i + BlkSize]) : 0;
    i += Chunk;
  }
  __syncthreads();
  // Reduce from BlkSize=192 elements to 96, 48, 24, 12, 6 and 3;
  // thread 0 then sums the final three.
  if (tid < 96) {
    vtmp[tid] += vtmp[tid + 96];
  }
  __syncthreads();
  if (tid < 48) {
    vtmp[tid] += vtmp[tid + 48];
  }
  __syncthreads();
  // Warp-synchronous tail: relies on the volatile qualifier and on lockstep
  // execution within a warp (not guaranteed on Volta and newer GPUs).
  if (tid < 24) {
    vtmp2[tid] += vtmp2[tid + 24];
    vtmp2[tid] += vtmp2[tid + 12];
    vtmp2[tid] += vtmp2[tid + 6];
    vtmp2[tid] += vtmp2[tid + 3];
  }
  // Write result for this block to global mem
  if (tid == 0) valpha[blockIdx.x] = vtmp[0] + vtmp[1] + vtmp[2];
}

template <typename ValueType, typename SizeType>
__global__ void DOT_D_fin(ValueType* valpha) {
  // Dynamic shared memory, declared as raw bytes for the same reason as in
  // DOT_D_ini.
  extern __shared__ unsigned char vtmp_raw[];
  ValueType* vtmp = reinterpret_cast<ValueType*>(vtmp_raw);
  // Each thread loads one element from global to shared mem
  SizeType tid = threadIdx.x;
  volatile ValueType* vtmp2 = vtmp;
  vtmp[tid] = valpha[tid];
  __syncthreads();
  // Reduce from 256 elements to 128, 64, 32, 16, 8, 4, 2 and 1
  if (tid < 128) {
    vtmp[tid] += vtmp[tid + 128];
  }
  __syncthreads();
  if (tid < 64) {
    vtmp[tid] += vtmp[tid + 64];
  }
  __syncthreads();
  // Warp-synchronous tail, as in DOT_D_ini.
  if (tid < 32) {
    vtmp2[tid] += vtmp2[tid + 32];
    vtmp2[tid] += vtmp2[tid + 16];
    vtmp2[tid] += vtmp2[tid + 8];
    vtmp2[tid] += vtmp2[tid + 4];
    vtmp2[tid] += vtmp2[tid + 2];
    vtmp2[tid] += vtmp2[tid + 1];
  }
  // Write result for this block to global mem
  if (tid == 0) valpha[blockIdx.x] = vtmp[0];
}

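// Illustrative host-side usage (a sketch under the launch sizes assumed in
// the comments above: 256 blocks of 192 threads for DOT_D_ini, then a single
// block of 256 threads for DOT_D_fin). The buffers x, y (length n) and
// valpha (length 256) are assumed device-side. Note that DOT_D_fin cannot
// deduce SizeType from its argument, so both template parameters must be
// spelled out.
//
//   DOT_D_ini<<<256, 192, 192 * sizeof(double)>>>(n, x, y, valpha);
//   DOT_D_fin<double, int><<<1, 256, 256 * sizeof(double)>>>(valpha);
//   // The dot product is left in valpha[0].
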
}  // namespace Kernels
}  // namespace Impl
}  // namespace Morpheus

#endif  // MORPHEUS_DENSEVECTOR_KERNELS_DOT_IMPL_HPP