#ifndef MORPHEUS_CUDA_UTILS_HPP
#define MORPHEUS_CUDA_UTILS_HPP

#include <Morpheus_Macros.hpp>
#if defined(MORPHEUS_ENABLE_CUDA)

#include <impl/Morpheus_Utils.hpp>

#include <cstdio>
#include <cstdlib>
// Upper bound on a launch dimension. The value 65535 matches the legacy CUDA
// per-dimension grid limit.
// NOTE(review): not referenced in the visible part of this header - confirm
// which launch dimension callers clamp with it.
const int MAX_BLOCK_DIM_SIZE = 65535;

// Number of threads in a warp (32 lanes).
const int WARP_SIZE = 32;
/**
 * @brief Returns the symbolic name of a CUDA status code.
 *
 * Thin wrapper that forwards @p error to cudaGetErrorName.
 *
 * @tparam T Status type; it must be accepted by cudaGetErrorName.
 * @param error Status value to translate.
 * @return The string returned by cudaGetErrorName for @p error.
 */
template <typename T>
static const char *_cudaGetErrorEnum(T error) {
  return cudaGetErrorName(error);
}
/**
 * @brief Reports a failed CUDA call on stderr.
 *
 * Does nothing when @p result converts to false (a zero status). Otherwise
 * prints the call site, the numeric status, its symbolic name (obtained from
 * _cudaGetErrorEnum) and the text of the failing expression.
 *
 * Normally invoked through the checkCudaErrors macro, which supplies @p func,
 * @p file and @p line.
 *
 * @tparam T Status type returned by the checked call.
 * @param result Status value to inspect.
 * @param func Text of the expression that produced @p result.
 * @param file Source file of the call site.
 * @param line Source line of the call site.
 */
template <typename T>
void check_cuda(T result, char const *const func, const char *const file,
                int const line) {
  if (result) {
    // The status is cast to int so the argument type matches the %d
    // conversion (and the cast used by __getLastCudaError).
    fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line,
            static_cast<int>(result), _cudaGetErrorEnum(result), func);
    // NOTE(review): the statements following the fprintf were not visible in
    // the excerpt this block was restored from; terminating here follows the
    // CUDA samples' helper_cuda.h - confirm against the upstream header.
    exit(EXIT_FAILURE);
  }
}
// Evaluates `val` once and hands its status to check_cuda together with the
// stringified expression and the file/line of the call site.
#define checkCudaErrors(val) check_cuda((val), #val, __FILE__, __LINE__)

// Forwards `msg` and the file/line of the call site to __getLastCudaError.
#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__)
63inline void __getLastCudaError(
const char *errorMessage,
const char *file,
65 cudaError_t err = cudaGetLastError();
67 if (cudaSuccess != err) {
69 "%s(%i) : getLastCudaError() CUDA error :"
71 file, line, errorMessage,
static_cast<int>(err),
72 cudaGetErrorString(err));
77template <
typename KernelFunction>
78size_t max_active_blocks(KernelFunction kernel,
const size_t CTA_SIZE,
79 const size_t dynamic_smem_bytes) {
81 cudaOccupancyMaxActiveBlocksPerMultiprocessor(
82 &MAX_BLOCKS, kernel, (
int)CTA_SIZE, dynamic_smem_bytes);
86 checkCudaErrors(cudaGetDevice(&device));
87 checkCudaErrors(cudaGetDeviceProperties(&prop, device));
89 return (
size_t)MAX_BLOCKS * prop.multiProcessorCount;
96template <
typename IndexType>
97void getNumBlocksAndThreads(
98 IndexType n, IndexType maxBlocks, IndexType maxThreads, IndexType &blocks,
100 typename std::enable_if<std::is_integral<IndexType>::value>::type * =
105 checkCudaErrors(cudaGetDevice(&device));
106 checkCudaErrors(cudaGetDeviceProperties(&prop, device));
108 threads = (n < maxThreads * 2) ? nextPow2((n + 1) / 2) : maxThreads;
109 blocks = (n + (threads * 2 - 1)) / (threads * 2);
111 if ((
float)threads * blocks >
112 (float)prop.maxGridSize[0] * prop.maxThreadsPerBlock) {
113 printf(
"n is too large, please choose a smaller number!\n");
116 if (blocks > (IndexType)prop.maxGridSize[0]) {
118 "Grid size <%lld> exceeds the device capability <%lld>, set block size "
120 "%lld (original %lld)\n",
121 blocks, prop.maxGridSize[0], threads * 2, threads);
127 blocks = min(maxBlocks, blocks);
// Generic Morpheus interfaces. (Definition: dummy.cpp:24)