Morpheus 1.0.0
Dynamic matrix type and algorithms for sparse matrices
Loading...
Searching...
No Matches
Morpheus_CudaUtils.hpp
1
24#ifndef MORPHEUS_CUDA_UTILS_HPP
25#define MORPHEUS_CUDA_UTILS_HPP
26
27#include <Morpheus_Macros.hpp>
28#if defined(MORPHEUS_ENABLE_CUDA)
29
30#include <impl/Morpheus_Utils.hpp>
31
32#include <stdio.h>
33#include <stdlib.h>
34#include <cuda.h>
35
36namespace Morpheus {
37namespace Impl {
38
// Upper limit (65535) used for a launch dimension by the CUDA backend.
// NOTE(review): this was previously described as the "maximum number of
// co-resident threads"; the value looks like a grid-dimension limit instead
// -- confirm against the call sites.
constexpr int MAX_BLOCK_DIM_SIZE = 65535;
// Number of threads assumed per warp.
constexpr int WARP_SIZE = 32;
42
// Maps a CUDA runtime status value to the textual name of its enumerator by
// forwarding it to cudaGetErrorName.
//
// \param error  Status value to translate.
// \return       Pointer to the string returned by cudaGetErrorName.
template <typename T>
static const char *_cudaGetErrorEnum(T error) {
  const char *enum_name = cudaGetErrorName(error);
  return enum_name;
}
47
// Terminates the program when a CUDA call reports a failure.
//
// \param result  Status returned by the checked call; any value that converts
//                to true (i.e. non-zero) is treated as an error.
// \param func    Source text of the checked expression, echoed in the message.
// \param file    Source file of the call site.
// \param line    Line number of the call site.
//
// On failure a diagnostic containing the location, numeric code, error name
// and the offending expression is written to stderr and the process exits
// with EXIT_FAILURE. On success the function returns without doing anything.
template <typename T>
void check_cuda(T result, char const *const func, const char *const file,
                int const line) {
  if (!result) {
    return;
  }

  // "%d" expects an int argument, so the status is converted to int (it was
  // previously passed as unsigned int, which does not match the specifier).
  fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line,
          static_cast<int>(result), _cudaGetErrorEnum(result), func);
  exit(EXIT_FAILURE);
}

// Evaluates `val` once and passes its status to check_cuda together with the
// stringified expression and the file/line of the call site.
#define checkCudaErrors(val) check_cuda((val), #val, __FILE__, __LINE__)
59
60// This will output the proper error string when calling cudaGetLastError
61#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__)
62
63inline void __getLastCudaError(const char *errorMessage, const char *file,
64 const int line) {
65 cudaError_t err = cudaGetLastError();
66
67 if (cudaSuccess != err) {
68 fprintf(stderr,
69 "%s(%i) : getLastCudaError() CUDA error :"
70 " %s : (%d) %s.\n",
71 file, line, errorMessage, static_cast<int>(err),
72 cudaGetErrorString(err));
73 exit(EXIT_FAILURE);
74 }
75}
76
77template <typename KernelFunction>
78size_t max_active_blocks(KernelFunction kernel, const size_t CTA_SIZE,
79 const size_t dynamic_smem_bytes) {
80 int MAX_BLOCKS;
81 cudaOccupancyMaxActiveBlocksPerMultiprocessor(
82 &MAX_BLOCKS, kernel, (int)CTA_SIZE, dynamic_smem_bytes);
83
84 cudaDeviceProp prop;
85 int device;
86 checkCudaErrors(cudaGetDevice(&device));
87 checkCudaErrors(cudaGetDeviceProperties(&prop, device));
88
89 return (size_t)MAX_BLOCKS * prop.multiProcessorCount;
90}
91
92// Compute the number of threads and blocks to use for the given reduction
93// kernel. We set threads / block to the minimum of maxThreads and n/2.
94// We observe the maximum specified number of blocks, because
95// each thread in that kernel can process a variable number of elements.
96template <typename IndexType>
97void getNumBlocksAndThreads(
98 IndexType n, IndexType maxBlocks, IndexType maxThreads, IndexType &blocks,
99 IndexType &threads,
100 typename std::enable_if<std::is_integral<IndexType>::value>::type * =
101 nullptr) {
102 // get device capability, to avoid block/grid size exceed the upper bound
103 cudaDeviceProp prop;
104 int device;
105 checkCudaErrors(cudaGetDevice(&device));
106 checkCudaErrors(cudaGetDeviceProperties(&prop, device));
107
108 threads = (n < maxThreads * 2) ? nextPow2((n + 1) / 2) : maxThreads;
109 blocks = (n + (threads * 2 - 1)) / (threads * 2);
110
111 if ((float)threads * blocks >
112 (float)prop.maxGridSize[0] * prop.maxThreadsPerBlock) {
113 printf("n is too large, please choose a smaller number!\n");
114 }
115
116 if (blocks > (IndexType)prop.maxGridSize[0]) {
117 printf(
118 "Grid size <%lld> exceeds the device capability <%lld>, set block size "
119 "as "
120 "%lld (original %lld)\n",
121 blocks, prop.maxGridSize[0], threads * 2, threads);
122
123 blocks /= 2;
124 threads *= 2;
125 }
126
127 blocks = min(maxBlocks, blocks);
128}
129
130} // namespace Impl
131} // namespace Morpheus
132
133#endif // MORPHEUS_ENABLE_CUDA
134#endif // MORPHEUS_CUDA_UTILS_HPP
Generic Morpheus interfaces.
Definition: dummy.cpp:24