24#ifndef MORPHEUS_DENSEVECTOR_KERNELS_DOT_IMPL_HPP
25#define MORPHEUS_DENSEVECTOR_KERNELS_DOT_IMPL_HPP
27#if defined(MORPHEUS_ENABLE_HIP)
28#include <impl/Morpheus_HIPUtils.hpp>
29#elif defined(MORPHEUS_ENABLE_CUDA)
30#include <impl/Morpheus_CudaUtils.hpp>
37template <
typename ValueType,
typename SizeType>
38__global__
void dot_kernel(SizeType n,
const ValueType* x,
const ValueType* y,
40 const SizeType tid = blockDim.x * blockIdx.x + threadIdx.x;
43 res[tid] = x[tid] * y[tid];
46template <
unsigned int BLOCKSIZE,
typename ValueType,
typename SizeType>
47__launch_bounds__(BLOCKSIZE) __global__
48 void dot_kernel_part1(SizeType n,
const ValueType* x,
const ValueType* y,
49 ValueType* workspace) {
50 SizeType gid = blockIdx.x * BLOCKSIZE + threadIdx.x;
51 SizeType inc = gridDim.x * BLOCKSIZE;
54 for (SizeType idx = gid; idx < n; idx += inc) {
55 sum += y[idx] * x[idx];
58 __shared__ ValueType sdata[BLOCKSIZE];
59 sdata[threadIdx.x] = sum;
63 if (threadIdx.x < 128) sdata[threadIdx.x] += sdata[threadIdx.x + 128];
65 if (threadIdx.x < 64) sdata[threadIdx.x] += sdata[threadIdx.x + 64];
67 if (threadIdx.x < 32) sdata[threadIdx.x] += sdata[threadIdx.x + 32];
69 if (threadIdx.x < 16) sdata[threadIdx.x] += sdata[threadIdx.x + 16];
71 if (threadIdx.x < 8) sdata[threadIdx.x] += sdata[threadIdx.x + 8];
73 if (threadIdx.x < 4) sdata[threadIdx.x] += sdata[threadIdx.x + 4];
75 if (threadIdx.x < 2) sdata[threadIdx.x] += sdata[threadIdx.x + 2];
78 if (threadIdx.x == 0) {
79 workspace[blockIdx.x] = sdata[0] + sdata[1];
83template <
unsigned int BLOCKSIZE,
typename ValueType>
84__launch_bounds__(BLOCKSIZE) __global__
85 void dot_kernel_part2(ValueType* workspace) {
86 __shared__ ValueType sdata[BLOCKSIZE];
87 sdata[threadIdx.x] = workspace[threadIdx.x];
91 if (threadIdx.x < 128) sdata[threadIdx.x] += sdata[threadIdx.x + 128];
93 if (threadIdx.x < 64) sdata[threadIdx.x] += sdata[threadIdx.x + 64];
95 if (threadIdx.x < 32) sdata[threadIdx.x] += sdata[threadIdx.x + 32];
97 if (threadIdx.x < 16) sdata[threadIdx.x] += sdata[threadIdx.x + 16];
99 if (threadIdx.x < 8) sdata[threadIdx.x] += sdata[threadIdx.x + 8];
101 if (threadIdx.x < 4) sdata[threadIdx.x] += sdata[threadIdx.x + 4];
103 if (threadIdx.x < 2) sdata[threadIdx.x] += sdata[threadIdx.x + 2];
106 if (threadIdx.x == 0) {
107 workspace[0] = sdata[0] + sdata[1];
111template <
typename ValueType,
typename SizeType>
112__global__
void DOT_D_ini(SizeType n, ValueType* x, ValueType* y,
114 extern __shared__ ValueType vtmp[];
117 SizeType tid = threadIdx.x;
118 SizeType NumBlk = gridDim.x;
119 SizeType BlkSize = blockDim.x;
120 SizeType Chunk = 2 * NumBlk * BlkSize;
121 SizeType i = blockIdx.x * (2 * BlkSize) + tid;
122 volatile ValueType* vtmp2 = vtmp;
128 vtmp[tid] += x[i] * y[i];
129 vtmp[tid] += (i + BlkSize < n) ? (x[i + BlkSize] * y[i + BlkSize]) : 0;
135 vtmp[tid] += vtmp[tid + 96];
139 vtmp[tid] += vtmp[tid + 48];
143 vtmp2[tid] += vtmp2[tid + 24];
144 vtmp2[tid] += vtmp2[tid + 12];
145 vtmp2[tid] += vtmp2[tid + 6];
146 vtmp2[tid] += vtmp2[tid + 3];
149 if (tid == 0) valpha[blockIdx.x] = vtmp[0] + vtmp[1] + vtmp[2];
152template <
typename ValueType,
typename SizeType>
153__global__
void DOT_D_fin(ValueType* valpha) {
154 extern __shared__ ValueType vtmp[];
156 SizeType tid = threadIdx.x;
157 volatile ValueType* vtmp2 = vtmp;
158 vtmp[tid] = valpha[tid];
162 vtmp[tid] += vtmp[tid + 128];
166 vtmp[tid] += vtmp[tid + 64];
170 vtmp2[tid] += vtmp2[tid + 32];
171 vtmp2[tid] += vtmp2[tid + 16];
172 vtmp2[tid] += vtmp2[tid + 8];
173 vtmp2[tid] += vtmp2[tid + 4];
174 vtmp2[tid] += vtmp2[tid + 2];
175 vtmp2[tid] += vtmp2[tid + 1];
178 if (tid == 0) valpha[blockIdx.x] = *vtmp;
Generic Morpheus interfaces.
Definition: dummy.cpp:24