diff --git a/CMakeLists.txt b/CMakeLists.txt
index 068baabf1e2..241b8499d26 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -36,7 +36,7 @@ check_cxx_compiler_flag(-fopenmp-simd CXX_HAVE_OMP_SIMD)
 # Build options
 option(BUILD_DALI_NODEPS "Disable components that require extra external libraries to be present in the system. Effectively, it builds only the DALI core and kernel libraries")
-option(LINK_LIBCUDA "Links directly with libcuda.so instead of dlopen it at runtime" OFF)
+option(LINK_DRIVER "Links directly with libcuda.so instead of dlopen it at runtime" OFF)
 # Tests use OpenCV...
 cmake_dependent_option(BUILD_TEST "Build googletest test suite" ON
@@ -142,7 +142,7 @@ propagate_option(BUILD_NVJPEG2K)
 propagate_option(BUILD_NVOF)
 propagate_option(BUILD_NVDEC)
 propagate_option(BUILD_NVML)
-propagate_option(LINK_LIBCUDA)
+propagate_option(LINK_DRIVER)
 get_dali_version(${PROJECT_SOURCE_DIR}/VERSION DALI_VERSION)
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index 85d4487025f..60122cb1baa 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -68,6 +68,9 @@ list(APPEND DALI_EXCLUDES libculibos.a)
 if (LINK_LIBCUDA)
   CUDA_find_library_stub(CUDA_cuda_LIBRARY cuda)
   list(APPEND DALI_LIBS ${CUDA_cuda_LIBRARY})
+
+  CUDA_find_library_stub(CUDA_nvml_LIBRARY nvidia-ml)
+  list(APPEND DALI_LIBS ${CUDA_nvml_LIBRARY})
 endif()
 # NVTX for profiling
diff --git a/dali/CMakeLists.txt b/dali/CMakeLists.txt
index f2e35d9d841..424fd9473a4 100644
--- a/dali/CMakeLists.txt
+++ b/dali/CMakeLists.txt
@@ -57,8 +57,7 @@ collect_sources(DALI_SRCS PARENT_SCOPE)
 if (BUILD_PROTOBUF)
   set(DALI_PROTO_OBJ $)
   add_library(dali ${LIBTYPE} ${DALI_SRCS} ${DALI_PROTO_OBJ} ${CUDART_LIB})
-  set_target_properties(dali PROPERTIES
-    LIBRARY_OUTPUT_DIRECTORY "${DALI_LIBRARY_OUTPUT_DIR}")
+  set_target_properties(dali PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${DALI_LIBRARY_OUTPUT_DIR}")
 endif()
 if (BUILD_DALI_PIPELINE)
@@ -75,6 +74,10 @@ if (BUILD_DALI_PIPELINE)
   target_link_libraries(dali PRIVATE "-Wl,--exclude-libs,${exclude_libs}")
 endif()
+if (BUILD_NVML)
+  target_link_libraries(dali PRIVATE $)
+endif(BUILD_NVML)
+
 ################################################
 # Build test suite
 ################################################
@@ -84,6 +87,9 @@ if (BUILD_DALI_PIPELINE AND BUILD_TEST)
   target_link_libraries(dali_test PUBLIC dali dali_core dali_kernels dali_operators ${DALI_LIBS} gtest)
   target_link_libraries(dali_test PRIVATE dynlink_cuda ${CUDART_LIB})
+  if (BUILD_NVML)
+    target_link_libraries(dali_test PRIVATE $)
+  endif(BUILD_NVML)
   target_link_libraries(dali_test PRIVATE "-pie")
   set_target_properties(dali_test PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${TEST_BINARY_DIR})
   set_target_properties(dali_test PROPERTIES POSITION_INDEPENDENT_CODE ON)
diff --git a/dali/benchmark/CMakeLists.txt b/dali/benchmark/CMakeLists.txt
index c619f3dfa15..eafeadc036c 100644
--- a/dali/benchmark/CMakeLists.txt
+++ b/dali/benchmark/CMakeLists.txt
@@ -44,6 +44,9 @@ if (BUILD_BENCHMARK)
   add_executable(dali_benchmark "${DALI_BENCHMARK_SRCS}")
   target_link_libraries(dali_benchmark PRIVATE dali dali_operators benchmark ${DALI_LIBS})
+  if (BUILD_NVML)
+    target_link_libraries(dali_benchmark PRIVATE $)
+  endif(BUILD_NVML)
   target_link_libraries(dali_benchmark PRIVATE "-pie")
   set_target_properties(dali_benchmark PROPERTIES POSITION_INDEPENDENT_CODE ON)
   set_target_properties(dali_benchmark PROPERTIES OUTPUT_NAME "dali_benchmark.bin")
diff --git a/dali/core/CMakeLists.txt b/dali/core/CMakeLists.txt
index 9e0ce7804fa..5a455452590 100644
--- a/dali/core/CMakeLists.txt
+++ b/dali/core/CMakeLists.txt
@@ -27,7 +27,7 @@ foreach(incl_dir ${INFERED_COMPILER_INCLUDE})
 endforeach(incl_dir)
 separate_arguments(DEFAULT_COMPILER_INCLUDE UNIX_COMMAND "${DEFAULT_COMPILER_INCLUDE}")
-if (NOT LINK_LIBCUDA)
+if (NOT LINK_DRIVER)
   set(CUDA_GENERATED_STUB "${CMAKE_CURRENT_BINARY_DIR}/dynlink_cuda_gen.cc")
   add_custom_command(
     OUTPUT ${CUDA_GENERATED_STUB}
diff --git a/dali/core/dynlink_cuda.cc b/dali/core/dynlink_cuda.cc
index c0fabafcff4..9192694bbb7 100644
--- a/dali/core/dynlink_cuda.cc
+++ b/dali/core/dynlink_cuda.cc
@@ -12,14 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
+#include
 #include
 #include
 #include
 #include "dali/core/dynlink_cuda.h"
-#include
-
 namespace {
 typedef void *CUDADRIVER;
@@ -48,7 +47,7 @@ void *LoadSymbol(const char *name) {
   return ret;
 }
-} // namespace
+}  // namespace
 // it is defined in the generated file
 typedef void *tLoadSymbol(const char *name);
@@ -67,7 +66,7 @@ bool cuInitChecked() {
     return true;
   // set symbol loader for this library
-#if !LINK_LIBCUDA_ENABLED
+#if !LINK_DRIVER_ENABLED
   CudaSetSymbolLoader(LoadSymbol);
 #endif
   static CUresult res = cuInit(0);
diff --git a/dali/operators/CMakeLists.txt b/dali/operators/CMakeLists.txt
index fe64fc6bd87..8825ab225a9 100644
--- a/dali/operators/CMakeLists.txt
+++ b/dali/operators/CMakeLists.txt
@@ -75,6 +75,9 @@ if (BUILD_TEST)
   target_link_libraries(dali_operator_test PUBLIC dali_operators)
   target_link_libraries(dali_operator_test PRIVATE gtest dynlink_cuda ${DALI_LIBS})
+  if (BUILD_NVML)
+    target_link_libraries(dali_operator_test PRIVATE $)
+  endif(BUILD_NVML)
   target_link_libraries(dali_operator_test PRIVATE "-Wl,--exclude-libs,${exclude_libs}")
   target_link_libraries(dali_operator_test PRIVATE "-pie")
   set_target_properties(dali_operator_test PROPERTIES POSITION_INDEPENDENT_CODE ON)
diff --git a/dali/operators/decoder/nvjpeg/decoupled_api/nvjpeg_decoder_decoupled_api_test.cc b/dali/operators/decoder/nvjpeg/decoupled_api/nvjpeg_decoder_decoupled_api_test.cc
index 11ab892c114..34c5d94a8eb 100644
--- a/dali/operators/decoder/nvjpeg/decoupled_api/nvjpeg_decoder_decoupled_api_test.cc
+++ b/dali/operators/decoder/nvjpeg/decoupled_api/nvjpeg_decoder_decoupled_api_test.cc
@@ -209,7 +209,7 @@ class HwDecoderUtilizationTest : public ::testing::Test {
   if (!node->op->GetDiagnostic("using_hw_decoder")) {
     if (nvml::HasCuda11NvmlFunctions()) {
       unsigned int device_count;
-      DALI_CALL(nvml::wrapNvmlDeviceGetCount_v2(&device_count));
+      CUDA_CALL(nvmlDeviceGetCount_v2(&device_count));
       for (unsigned int device_idx = 0; device_idx < device_count; device_idx++) {
         auto info = nvml::GetDeviceInfo(device_idx);
         std::cerr << "Device " << device_idx
diff --git a/dali/operators/reader/nvdecoder/dynlink_nvcuvid.cc b/dali/operators/reader/nvdecoder/dynlink_nvcuvid.cc
index 9e19a94f6aa..48d88311f78 100644
--- a/dali/operators/reader/nvdecoder/dynlink_nvcuvid.cc
+++ b/dali/operators/reader/nvdecoder/dynlink_nvcuvid.cc
@@ -12,14 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
+#include
 #include
 #include
 #include
 #include "dali/operators/reader/nvdecoder/dynlink_nvcuvid.h"
-#include
-
 namespace {
 static char __DriverLibName[] = "libnvcuvid.so";
@@ -50,7 +49,7 @@ void *LoadSymbol(const char *name) {
   return ret;
 }
-}
+}  // namespace
 // it is defined in the generated file
 typedef void *tLoadSymbol(const char *name);
diff --git a/dali/operators/reader/nvdecoder/dynlink_nvcuvid.h b/dali/operators/reader/nvdecoder/dynlink_nvcuvid.h
index ae128ae9c17..52fce88e7dc 100644
--- a/dali/operators/reader/nvdecoder/dynlink_nvcuvid.h
+++ b/dali/operators/reader/nvdecoder/dynlink_nvcuvid.h
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#if !defined(DALI_OPERATORS_READER_NVDECODER_DYNLINK_NVCUVID_H_)
+#ifndef DALI_OPERATORS_READER_NVDECODER_DYNLINK_NVCUVID_H_
 #define DALI_OPERATORS_READER_NVDECODER_DYNLINK_NVCUVID_H_
 #include
@@ -24,4 +24,4 @@ bool cuvidInitChecked(unsigned int Flags);
 bool cuvidIsSymbolAvailable(const char *name);
-#endif // DALI_OPERATORS_READER_NVDECODER_DYNLINK_NVCUVID_H_
+#endif  // DALI_OPERATORS_READER_NVDECODER_DYNLINK_NVCUVID_H_
diff --git a/dali/operators/reader/video_reader_op_test.cc b/dali/operators/reader/video_reader_op_test.cc
index 2f4c485d639..93847fb03e6 100644
--- a/dali/operators/reader/video_reader_op_test.cc
+++ b/dali/operators/reader/video_reader_op_test.cc
@@ -107,15 +107,12 @@ TEST_F(VideoReaderTest, MultipleVideoResolution) {
   float driverVersion = 0;
   char version[80];
-  if (nvml::wrapSymbols() != DALISuccess) {
-    FAIL() << "wrapSymbols() failed";
-  }
-  if (nvml::wrapNvmlInit() != DALISuccess) {
-    FAIL() << "wrapNvmlInit() failed";
+  if (nvmlInitChecked() != NVML_SUCCESS) {
+    FAIL() << "nvmlInitChecked() failed";
   }
-  if (nvml::wrapNvmlSystemGetDriverVersion(version, sizeof version) != DALISuccess) {
-    FAIL() << "wrapNvmlSystemGetDriverVersion failed!";
+  if (nvmlSystemGetDriverVersion(version, sizeof version) != NVML_SUCCESS) {
+    FAIL() << "nvmlSystemGetDriverVersion failed!";
   }
   driverVersion = std::stof(version);
diff --git a/dali/util/CMakeLists.txt b/dali/util/CMakeLists.txt
index 0f27d5ee030..1877c179b3e 100644
--- a/dali/util/CMakeLists.txt
+++ b/dali/util/CMakeLists.txt
@@ -45,10 +45,28 @@ if(BUILD_NVML)
     "${CMAKE_CURRENT_SOURCE_DIR}/nvml.h"
     "${CMAKE_CURRENT_SOURCE_DIR}/nvml_wrap.h")
-  set(DALI_SRCS ${DALI_SRCS}
-    "${CMAKE_CURRENT_SOURCE_DIR}/nvml_wrap.cc")
-endif()
+  if (NOT LINK_DRIVER)
+    set(NVML_GENERATED_STUB "${CMAKE_CURRENT_BINARY_DIR}/dynlink_nvml_gen.cc")
+    add_custom_command(
+      OUTPUT ${NVML_GENERATED_STUB}
+      COMMAND python ${CMAKE_CURRENT_SOURCE_DIR}/../../tools/stub_generator/stub_codegen.py --unique_prefix=Nvml --
+              "${CMAKE_CURRENT_SOURCE_DIR}/../../tools/stub_generator/nvml.json" ${NVML_GENERATED_STUB}
+              "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}/nvml.h" "-I${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}"
+              # for some reason QNX fails with 'too many errors emitted' if this is not set
+              "-ferror-limit=0"
+              ${DEFAULT_COMPILER_INCLUDE}
+      DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/../../tools/stub_generator/stub_codegen.py
+              "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}/nvml.h"
+              "${CMAKE_CURRENT_SOURCE_DIR}/../../tools/stub_generator/nvml.json"
+      COMMENT "Running nvml.h stub generator"
+      VERBATIM)
+    set_source_files_properties(${NVML_GENERATED_STUB} PROPERTIES GENERATED TRUE)
+    add_library(dynlink_nvml OBJECT nvml_wrap.cc ${NVML_GENERATED_STUB})
+  else()
+    add_library(dynlink_nvml OBJECT nvml_wrap.cc)
+  endif()
+endif()
 set(DALI_INST_HDRS ${DALI_INST_HDRS} PARENT_SCOPE)
 set(DALI_SRCS ${DALI_SRCS} PARENT_SCOPE)
diff --git a/dali/util/nvml.h b/dali/util/nvml.h
index 01530444722..bf10e19ff76 100644
--- a/dali/util/nvml.h
+++ b/dali/util/nvml.h
@@ -17,18 +17,114 @@
 #include
 #include
-
 #include
 #include
-
 #include
 #include
-
-#include "dali/core/error_handling.h"
+#include
 #include "dali/core/cuda_utils.h"
 #include "dali/util/nvml_wrap.h"
+#include "dali/core/cuda_error.h"
+#include "dali/core/format.h"
 namespace dali {
+
+class NvmlError : public std::runtime_error {
+ public:
+  explicit NvmlError(nvmlReturn_t result, const char *details = nullptr)
+  : std::runtime_error(Message(result, details))
+  , result_(result) {}
+
+  static const char *ErrorString(nvmlReturn_t result) {
+    switch (result) {
+      case NVML_SUCCESS:
+        return "nvml operation was successful";
+      case NVML_ERROR_UNINITIALIZED:
+        return "nvml was not first initialized with nvmlInit()";
+      case NVML_ERROR_INVALID_ARGUMENT:
+        return "a nvml supplied argument is invalid";
+      case NVML_ERROR_NOT_SUPPORTED:
+        return "The nvml requested operation is not available on target device";
+      case NVML_ERROR_NO_PERMISSION:
+        return "The nvml current user does not have permission for operation";
+      case NVML_ERROR_ALREADY_INITIALIZED:
+        return "Deprecated: Multiple initializations are now allowed through ref counting";
+      case NVML_ERROR_NOT_FOUND:
+        return "A nvml query to find an object was unsuccessful";
+      case NVML_ERROR_INSUFFICIENT_SIZE:
+        return "A nvml input argument is not large enough";
+      case NVML_ERROR_INSUFFICIENT_POWER:
+        return "A nvml device's external power cables are not properly attached";
+      case NVML_ERROR_DRIVER_NOT_LOADED:
+        return "nvml: NVIDIA driver is not loaded";
+      case NVML_ERROR_TIMEOUT:
+        return "nvml user provided timeout passed";
+      case NVML_ERROR_IRQ_ISSUE:
+        return "nvml: NVIDIA Kernel detected an interrupt issue with a GPU";
+      case NVML_ERROR_LIBRARY_NOT_FOUND:
+        return "NVML Shared Library couldn't be found or loaded";
+      case NVML_ERROR_FUNCTION_NOT_FOUND:
+        return "Local version of NVML doesn't implement this function";
+      case NVML_ERROR_CORRUPTED_INFOROM:
+        return "nvml: infoROM is corrupted";
+      case NVML_ERROR_GPU_IS_LOST:
+        return "nvml: the GPU has fallen off the bus or has otherwise become inaccessible";
+      case NVML_ERROR_RESET_REQUIRED:
+        return "nvml: the GPU requires a reset before it can be used again";
+      case NVML_ERROR_OPERATING_SYSTEM:
+        return "nvml: the GPU control device has been blocked by the operating system/cgroups";
+      case NVML_ERROR_LIB_RM_VERSION_MISMATCH:
+        return "nvml: RM detects a driver/library version mismatch";
+      case NVML_ERROR_IN_USE:
+        return "A nvml operation cannot be performed because the GPU is currently in use";
+      case NVML_ERROR_MEMORY:
+        return "Nvml insufficient memory";
+      case NVML_ERROR_NO_DATA:
+        return "Nvml: no data";
+#if (CUDART_VERSION >= 11000)
+      case NVML_ERROR_VGPU_ECC_NOT_SUPPORTED:
+        return "The nvml requested vgpu operation is not available on target device, because ECC is "
+               "enabled";
+      case NVML_ERROR_INSUFFICIENT_RESOURCES:
+        return "Nvml: ran out of critical resources, other than memory";
+#endif
+      case NVML_ERROR_UNKNOWN:
+        return "A nvml internal driver error occurred";
+      default:
+        return "< unknown error >";
+    }
+  }
+
+  static std::string Message(nvmlReturn_t result, const char *details) {
+    if (details && *details) {
+      return make_string("nvml error: ", result, " ", ErrorString(result),
+                         "\nDetails:\n", details);
+    } else {
+      return make_string("nvml error: ", result, " ", ErrorString(result));
+    }
+  }
+
+
+  nvmlReturn_t result() const { return result_; }
+
+ private:
+  nvmlReturn_t result_;
+};
+
+class NvmlBadAlloc : public CUDABadAlloc {};
+
+template <>
+inline void cudaResultCheck(nvmlReturn_t status) {
+  switch (status) {
+    case NVML_SUCCESS:
+      return;
+    case NVML_ERROR_MEMORY:
+      throw dali::NvmlBadAlloc();
+    default:
+      throw dali::NvmlError(status);
+  }
+}
+
 namespace nvml {
 /**
@@ -43,9 +139,7 @@ inline std::mutex& Mutex() {
  * @brief Initializes the NVML library
  */
 inline void Init() {
-  std::lock_guard lock(Mutex());
-  DALI_CALL(wrapSymbols());
-  DALI_CALL(wrapNvmlInit());
+  CUDA_CALL(nvmlInitChecked());
 }
 /**
@@ -53,7 +147,7 @@ inline void Init() {
  * respecting previously set mask.
  */
 inline void GetNVMLAffinityMask(cpu_set_t * mask, size_t num_cpus) {
-  if (!wrapIsInitialized()) {
+  if (!nvmlIsInitialized()) {
     return;
   }
   int device_idx;
@@ -64,16 +158,16 @@ inline void GetNVMLAffinityMask(cpu_set_t * mask, size_t num_cpus) {
   std::vector nvml_mask_container(cpu_set_size);  // NOLINT(runtime/int)
   auto * nvml_mask = nvml_mask_container.data();
   nvmlDevice_t device;
-  DALI_CALL(wrapNvmlDeviceGetHandleByIndex(device_idx, &device));
+  CUDA_CALL(nvmlDeviceGetHandleByIndex(device_idx, &device));
 #if (CUDART_VERSION >= 11000)
-  if (wrapHasCuda11NvmlFunctions()) {
-    DALI_CALL(wrapNvmlDeviceGetCpuAffinityWithinScope(device, cpu_set_size, nvml_mask,
+  if (nvmlHasCuda11NvmlFunctions()) {
+    CUDA_CALL(nvmlDeviceGetCpuAffinityWithinScope(device, cpu_set_size, nvml_mask,
                                                   NVML_AFFINITY_SCOPE_SOCKET));
   } else {
-    DALI_CALL(wrapNvmlDeviceGetCpuAffinity(device, cpu_set_size, nvml_mask));
+    CUDA_CALL(nvmlDeviceGetCpuAffinity(device, cpu_set_size, nvml_mask));
   }
 #else
-  DALI_CALL(wrapNvmlDeviceGetCpuAffinity(device, cpu_set_size, nvml_mask));
+  CUDA_CALL(nvmlDeviceGetCpuAffinity(device, cpu_set_size, nvml_mask));
 #endif
   // Convert it to cpu_set_t
@@ -111,9 +205,8 @@ inline void SetCPUAffinity(int core = -1) {
   CPU_ZERO(&requested_set);
   if (core != -1) {
     if (core < 0 || (size_t)core >= num_cpus) {
-      DALI_WARN("Requested setting affinity to core " + to_string(core) +
-                " but only " + to_string(num_cpus) + " cores available. " +
-                "Ignoring...");
+      DALI_WARN(make_string("Requested setting affinity to core ", core,
+                            " but only ", num_cpus, " cores available. Ignoring..."));
Ignoring...")); GetNVMLAffinityMask(&requested_set, num_cpus); } else { CPU_SET(core, &requested_set); @@ -143,10 +236,10 @@ inline void SetCPUAffinity(int core = -1) { inline void Shutdown() { std::lock_guard lock(Mutex()); - if (!wrapIsInitialized()) { + if (!nvmlIsInitialized()) { return; } - DALI_CALL(wrapNvmlShutdown()); + CUDA_CALL(nvmlShutdown()); } #if (CUDART_VERSION >= 11000) @@ -169,9 +262,9 @@ struct DeviceProperties { inline DeviceProperties GetDeviceInfo(int device_idx) { DeviceProperties ret; nvmlDevice_t device; - DALI_CALL(wrapNvmlDeviceGetHandleByIndex_v2(device_idx, &device)); - DALI_CALL(wrapNvmlDeviceGetBrand(device, &ret.type)); - DALI_CALL(wrapNvmlDeviceGetCudaComputeCapability(device, &ret.cap_major, &ret.cap_minor)); + CUDA_CALL(nvmlDeviceGetHandleByIndex_v2(device_idx, &device)); + CUDA_CALL(nvmlDeviceGetBrand(device, &ret.type)); + CUDA_CALL(nvmlDeviceGetCudaComputeCapability(device, &ret.cap_major, &ret.cap_minor)); return ret; } @@ -181,7 +274,7 @@ inline DeviceProperties GetDeviceInfo(int device_idx) { * @throws std::runtime_error */ inline bool HasHwDecoder(int device_idx) { - if (!wrapIsInitialized()) { + if (!nvmlIsInitialized()) { return false; } auto info = GetDeviceInfo(device_idx); @@ -195,11 +288,11 @@ inline bool HasHwDecoder(int device_idx) { * @throws std::runtime_error */ inline bool HasHwDecoder() { - if (!wrapIsInitialized()) { + if (!nvmlIsInitialized()) { return false; } unsigned int device_count; - DALI_CALL(wrapNvmlDeviceGetCount_v2(&device_count)); + CUDA_CALL(nvmlDeviceGetCount_v2(&device_count)); for (unsigned int device_idx = 0; device_idx < device_count; device_idx++) { if (HasHwDecoder(device_idx)) return true; } @@ -211,24 +304,13 @@ inline bool HasHwDecoder() { * Checks, whether CUDA11-proper NVML functions have been successfully loaded */ inline bool HasCuda11NvmlFunctions() { - if (!wrapIsInitialized()) { + if (!nvmlIsInitialized()) { return false; } - return wrapHasCuda11NvmlFunctions(); + return nvmlHasCuda11NvmlFunctions(); } } // namespace nvml } // namespace dali -#define NVML_CALL(code) \ - do { \ - nvmlReturn_t status = code; \ - if (status != NVML_SUCCESS) { \ - dali::string error = dali::string("NVML error \"") + \ - nvmlErrorString(status) + "\""; \ - DALI_FAIL(error); \ - } \ - } while (0) - #endif // DALI_UTIL_NVML_H_ - diff --git a/dali/util/nvml_wrap.cc b/dali/util/nvml_wrap.cc index 971178a59ca..58a59b14072 100644 --- a/dali/util/nvml_wrap.cc +++ b/dali/util/nvml_wrap.cc @@ -1,271 +1,98 @@ -/************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - ************************************************************************/ - +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include
 #include
-#include
 #include
-#include "dali/util/nvml_wrap.h"
-#include "dali/core/cuda_error.h"
-
-
-namespace dali {
-
-namespace nvml {
-
-int symbolsLoaded = 0;
-
-static nvmlReturn_t (*nvmlInternalInit)(void);
-static nvmlReturn_t (*nvmlInternalShutdown)(void);
-static nvmlReturn_t (*nvmlInternalDeviceGetHandleByPciBusId)(const char* pciBusId,
-                                                             nvmlDevice_t* device);
-static nvmlReturn_t (*nvmlInternalDeviceGetHandleByIndex)(const int device_id,
-                                                          nvmlDevice_t* device);
-static nvmlReturn_t (*nvmlInternalDeviceGetIndex)(nvmlDevice_t device, unsigned* index);
-
-static nvmlReturn_t (*nvmlInternalDeviceSetCpuAffinity)(nvmlDevice_t device);
-static nvmlReturn_t (*nvmlInternalDeviceClearCpuAffinity)(nvmlDevice_t device);
-static nvmlReturn_t (*nvmlInternalSystemGetDriverVersion)(char* name, unsigned int length);
-static nvmlReturn_t (*nvmlInternalDeviceGetCpuAffinity)(nvmlDevice_t device,
-                                                        unsigned int cpuSetSize,
-                                                        unsigned long* cpuSet);  // NOLINT(*)
+#include
+#include
+#include
-#if (CUDART_VERSION >= 11000)
-static nvmlReturn_t (*nvmlInternalDeviceGetCpuAffinityWithinScope)(nvmlDevice_t device,
-                                                                   unsigned int nodeSetSize,
-                                                                   unsigned long *nodeSet,  // NOLINT(*)
-                                                                   nvmlAffinityScope_t scope);
-static nvmlReturn_t (*nvmlInternalDeviceGetBrand)(nvmlDevice_t device, nvmlBrandType_t *type);
-static nvmlReturn_t (*nvmlInternalDeviceGetCount_v2)(unsigned int *deviceCount);
-static nvmlReturn_t (*nvmlInternalDeviceGetHandleByIndex_v2)(unsigned int index,
-                                                             nvmlDevice_t *device);
-static nvmlReturn_t (*nvmlInternalDeviceGetCudaComputeCapability)(nvmlDevice_t device,
-                                                                  int *major, int *minor);
-#endif
+#include "dali/util/nvml_wrap.h"
-static const char* (*nvmlInternalErrorString)(nvmlReturn_t r);
 namespace {
-std::once_flag driver_check;
+typedef void *NVMLRIVER;
-/*
- * This function is used to learn the real driver version, and compare with the provided value
- * It should be available as soon as possible. However it requires wrapNvmlSystemGetDriverVersion
- * and wrapNvmlInit be intialized first. If they are not it will warn and return INF driver version
- * and all checks will fail
- */
-bool is_driver_sufficient(float requestedDriverVersion) {
-  static float availableDriverVersion = std::numeric_limits::max();
+static char __NvmlLibName[] = "libnvidia-ml.so";
+static char __NvmlLibName1[] = "libnvidia-ml.so.1";
-  std::call_once(driver_check, [] {
-    char version[80];
-    if (nvml::wrapNvmlInit() != DALISuccess) {
-      DALI_WARN("wrapNvmlInit failed, driver version check not available");
-      return;
-    }
-    nvml::wrapNvmlSystemGetDriverVersion(version, sizeof version);
-    availableDriverVersion = std::stof(version);
-  });
-
-  return requestedDriverVersion <= availableDriverVersion;
-}
+NVMLRIVER loadNvmlLibrary() {
+  NVMLRIVER ret = nullptr;
-}  // namespace
-
-bool wrapHasCuda11NvmlFunctions() {
-  #if (CUDART_VERSION >= 11000)
-  return nvmlInternalDeviceGetCount_v2 && nvmlInternalDeviceGetHandleByIndex_v2 &&
-         nvmlInternalDeviceGetCudaComputeCapability && nvmlInternalDeviceGetBrand &&
-         nvmlInternalDeviceGetCpuAffinityWithinScope;
-  #else
-  return false;
-  #endif
-}
+  ret = dlopen(__NvmlLibName1, RTLD_NOW);
+  if (!ret) {
+    ret = dlopen(__NvmlLibName, RTLD_NOW);
-bool wrapIsInitialized(void) {
-  return symbolsLoaded;
-}
-
-DALIError_t wrapSymbols(void) {
-  if (symbolsLoaded)
-    return DALISuccess;
-
-  static void* nvmlhandle = nullptr;
-  void* tmp;
-  void** cast;
-
-  nvmlhandle = dlopen("libnvidia-ml.so", RTLD_NOW);
-  if (!nvmlhandle) {
-    nvmlhandle = dlopen("libnvidia-ml.so.1", RTLD_NOW);
-    if (!nvmlhandle) {
-      DALI_FAIL("Failed to open libnvidia-ml.so[.1]");
+    if (!ret) {
+      printf("dlopen \"%s\" failed!\n", __NvmlLibName);
     }
   }
-
-#define LOAD_SYM(handle, symbol, funcptr) do {                      \
-    cast = reinterpret_cast(&funcptr);                              \
-    tmp = dlsym(handle, symbol);                                    \
-    if (tmp == nullptr) {                                           \
-      DALI_FAIL("dlsym failed on " + symbol + " - " + dlerror());   \
-    }                                                               \
-    *cast = tmp;                                                    \
-  } while (0)
-
-#define LOAD_SYM_MIN_DRIVER(handle, symbol, funcptr, driver_v) do { \
-    if (!is_driver_sufficient(driver_v)) {                          \
-      funcptr = nullptr;                                            \
-      break;                                                        \
-    }                                                               \
-    cast = reinterpret_cast(&funcptr);                              \
-    tmp = dlsym(handle, symbol);                                    \
-    if (tmp == nullptr) {                                           \
-      DALI_FAIL("dlsym failed on " + symbol + " - " + dlerror());   \
-    }                                                               \
-    *cast = tmp;                                                    \
-  } while (0)
-
-  /*
-   * make sure that nvmlInit and nvmlSystemGetDriverVersion are first on the list as they are needed
-   * by is_driver_sufficient function
-   */
-  LOAD_SYM(nvmlhandle, "nvmlInit", nvmlInternalInit);
-  LOAD_SYM(nvmlhandle, "nvmlSystemGetDriverVersion", nvmlInternalSystemGetDriverVersion);
-
-  LOAD_SYM(nvmlhandle, "nvmlShutdown", nvmlInternalShutdown);
-  LOAD_SYM(nvmlhandle, "nvmlDeviceGetHandleByPciBusId", nvmlInternalDeviceGetHandleByPciBusId);
-  LOAD_SYM(nvmlhandle, "nvmlDeviceGetHandleByIndex", nvmlInternalDeviceGetHandleByIndex);
-  LOAD_SYM(nvmlhandle, "nvmlDeviceGetIndex", nvmlInternalDeviceGetIndex);
-  LOAD_SYM(nvmlhandle, "nvmlDeviceSetCpuAffinity", nvmlInternalDeviceSetCpuAffinity);
-  LOAD_SYM(nvmlhandle, "nvmlDeviceClearCpuAffinity", nvmlInternalDeviceClearCpuAffinity);
-  LOAD_SYM(nvmlhandle, "nvmlDeviceGetCpuAffinity", nvmlInternalDeviceGetCpuAffinity);
-  LOAD_SYM(nvmlhandle, "nvmlErrorString", nvmlInternalErrorString);
-
-  #if (CUDART_VERSION >= 11000)
-  LOAD_SYM_MIN_DRIVER(nvmlhandle, "nvmlDeviceGetCpuAffinityWithinScope",
-                      nvmlInternalDeviceGetCpuAffinityWithinScope, 450.36);
-  LOAD_SYM_MIN_DRIVER(nvmlhandle, "nvmlDeviceGetBrand", nvmlInternalDeviceGetBrand, 450.36);
-  LOAD_SYM_MIN_DRIVER(nvmlhandle, "nvmlDeviceGetCount_v2", nvmlInternalDeviceGetCount_v2,
-                      450.36);
-  LOAD_SYM_MIN_DRIVER(nvmlhandle, "nvmlDeviceGetHandleByIndex_v2",
-                      nvmlInternalDeviceGetHandleByIndex_v2, 450.36);
-  LOAD_SYM_MIN_DRIVER(nvmlhandle, "nvmlDeviceGetCudaComputeCapability",
-                      nvmlInternalDeviceGetCudaComputeCapability, 450.36);
-  #endif
-
-  symbolsLoaded = 1;
-  return DALISuccess;
+  return ret;
 }
+void *LoadSymbol(const char *name) {
+  static NVMLRIVER nvmlDrvLib = loadNvmlLibrary();
+  void *ret = nvmlDrvLib ? dlsym(nvmlDrvLib, name) : nullptr;
+  return ret;
+}
-#define FUNC_BODY(INTERNAL_FUNC, ARGS...)             \
-  do {                                                \
-    if (INTERNAL_FUNC == nullptr) {                   \
-      return DALIError;                               \
-    }                                                 \
-    nvmlReturn_t ret = INTERNAL_FUNC(ARGS);           \
-    if (ret != NVML_SUCCESS) {                        \
-      DALI_WARN(#INTERNAL_FUNC "(...) failed: " +     \
-                nvmlInternalErrorString(ret));        \
-      return DALIError;                               \
-    }                                                 \
-    return DALISuccess;                               \
-  } while (false)
+}  // namespace
+std::atomic_bool symbolsLoaded{false};
-DALIError_t wrapNvmlInit(void) {
-  FUNC_BODY(nvmlInternalInit);
-}
+// it is defined in the generated file
+typedef void *tLoadSymbol(const char *name);
+void NvmlSetSymbolLoader(tLoadSymbol loader_func);
-DALIError_t wrapNvmlShutdown(void) {
-  if (nvmlInternalInit == nullptr) {
-    return DALISuccess;
-  }
-  if (nvmlInternalShutdown == nullptr) {
-    DALI_FAIL("lib wrapper not initialized.");
-    return DALIError;
-  }
-  nvmlReturn_t ret = nvmlInternalShutdown();
+nvmlReturn_t nvmlInitChecked() {
+  // set symbol loader for this library
+#if !LINK_DRIVER_ENABLED
+  static std::once_flag nvml_once;
+  std::call_once(nvml_once, NvmlSetSymbolLoader, LoadSymbol);
+#endif
+  symbolsLoaded = true;
+  nvmlReturn_t ret = nvmlInit();
   if (ret != NVML_SUCCESS) {
-    DALI_FAIL("nvmlShutdown() failed: " +
-              nvmlInternalErrorString(ret));
-    return DALIError;
+    DALI_WARN("nvmlInitChecked failed: " + nvmlErrorString(ret));
   }
-  return DALISuccess;
-}
-
-DALIError_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device) {
-  FUNC_BODY(nvmlInternalDeviceGetHandleByPciBusId, pciBusId, device);
-}
-
-DALIError_t wrapNvmlDeviceGetHandleByIndex(const int device_id, nvmlDevice_t* device) {
-  FUNC_BODY(nvmlInternalDeviceGetHandleByIndex, device_id, device);
-}
-
-DALIError_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
-  FUNC_BODY(nvmlInternalDeviceGetIndex, device, index);
-}
-
-DALIError_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device) {
-  FUNC_BODY(nvmlInternalDeviceSetCpuAffinity, device);
-}
-
-DALIError_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device) {
-  FUNC_BODY(nvmlInternalDeviceClearCpuAffinity, device);
-}
-
-DALIError_t wrapNvmlSystemGetDriverVersion(char* name, unsigned int length) {
-  FUNC_BODY(nvmlInternalSystemGetDriverVersion, name, length);
+  return ret;
 }
-DALIError_t wrapNvmlDeviceGetCpuAffinity(nvmlDevice_t device,
-                                         unsigned int cpuSetSize,
-                                         unsigned long* cpuSet) {  // NOLINT(runtime/int)
-  FUNC_BODY(nvmlInternalDeviceGetCpuAffinity, device, cpuSetSize, cpuSet);
-}
-
-#if (CUDART_VERSION >= 11000)
-
-DALIError_t wrapNvmlDeviceGetCpuAffinityWithinScope(nvmlDevice_t device,
-                                                    unsigned int nodeSetSize,
-                                                    unsigned long *nodeSet,  // NOLINT(runtime/int)
-                                                    nvmlAffinityScope_t scope) {
-  FUNC_BODY(nvmlInternalDeviceGetCpuAffinityWithinScope, device, nodeSetSize, nodeSet, scope);
-}
-
-DALIError_t wrapNvmlDeviceGetBrand(nvmlDevice_t device, nvmlBrandType_t* type) {
-  FUNC_BODY(nvmlInternalDeviceGetBrand, device, type);
-}
-
-DALIError_t wrapNvmlDeviceGetCount_v2(unsigned int* deviceCount) {
-  FUNC_BODY(nvmlInternalDeviceGetCount_v2, deviceCount);
+bool nvmlIsInitialized(void) {
+  return symbolsLoaded;
 }
-DALIError_t wrapNvmlDeviceGetHandleByIndex_v2(unsigned int index, nvmlDevice_t* device) {
-  FUNC_BODY(nvmlInternalDeviceGetHandleByIndex_v2, index, device);
+bool nvmlIsSymbolAvailable(const char *name) {
+  static std::mutex symbol_mutex;
+  static std::unordered_map symbol_map;
+  std::lock_guard lock(symbol_mutex);
+  auto it = symbol_map.find(name);
+  if (it == symbol_map.end()) {
+    auto *ptr = LoadSymbol(name);
+    symbol_map.insert({name, ptr});
+    return ptr != nullptr;
+  }
+  return it->second != nullptr;
 }
-DALIError_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int *major, int *minor) {
-  FUNC_BODY(nvmlInternalDeviceGetCudaComputeCapability, device, major, minor);
+bool nvmlHasCuda11NvmlFunctions(void) {
+  return nvmlIsSymbolAvailable("nvmlDeviceGetCount_v2") &&
+         nvmlIsSymbolAvailable("nvmlDeviceGetHandleByIndex_v2") &&
+         nvmlIsSymbolAvailable("nvmlDeviceGetCudaComputeCapability") &&
+         nvmlIsSymbolAvailable("nvmlDeviceGetBrand") &&
+         nvmlIsSymbolAvailable("nvmlDeviceGetCpuAffinityWithinScope");
 }
-
-#endif
-
-#undef FUNC_BODY
-
-}  // namespace nvml
-
-}  // namespace dali
diff --git a/dali/util/nvml_wrap.h b/dali/util/nvml_wrap.h
index 419a0f9ddfd..f2cea86a8de 100644
--- a/dali/util/nvml_wrap.h
+++ b/dali/util/nvml_wrap.h
@@ -20,55 +20,22 @@
 #ifndef DALI_UTIL_NVML_WRAP_H_
 #define DALI_UTIL_NVML_WRAP_H_
+
+
 #include
 #include
 #include "dali/core/common.h"
 #include "dali/core/error_handling.h"
-namespace dali {
-
-namespace nvml {
-
-DLL_PUBLIC bool wrapIsInitialized(void);
-DLL_PUBLIC DALIError_t wrapSymbols(void);
-
-DLL_PUBLIC DALIError_t wrapNvmlInit(void);
-DLL_PUBLIC DALIError_t wrapNvmlShutdown(void);
-DLL_PUBLIC DALIError_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId,
-                                                         nvmlDevice_t* device);
-DLL_PUBLIC DALIError_t wrapNvmlDeviceGetHandleByIndex(const int device_id,
-                                                      nvmlDevice_t* device);
-DLL_PUBLIC DALIError_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index);
-DLL_PUBLIC DALIError_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device);
-DLL_PUBLIC DALIError_t wrapNvmlSystemGetDriverVersion(char* name, unsigned int length);
-DLL_PUBLIC DALIError_t wrapNvmlDeviceGetCpuAffinity(nvmlDevice_t device,
-                                                    unsigned int cpuSetSize,
-                                                    unsigned long* cpuSet);  // NOLINT(runtime/int)
-DLL_PUBLIC DALIError_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device);
-
-#if (CUDART_VERSION >= 11000)
-
-DLL_PUBLIC DALIError_t wrapNvmlDeviceGetCpuAffinityWithinScope(nvmlDevice_t device,
-                                                               unsigned int nodeSetSize,
-                                                               unsigned long *nodeSet,  // NOLINT(*)
-                                                               nvmlAffinityScope_t scope);
-DLL_PUBLIC DALIError_t wrapNvmlDeviceGetBrand(nvmlDevice_t device, nvmlBrandType_t* type);
-DLL_PUBLIC DALIError_t wrapNvmlDeviceGetCount_v2(unsigned int* deviceCount);
-DLL_PUBLIC DALIError_t wrapNvmlDeviceGetHandleByIndex_v2(unsigned int index, nvmlDevice_t* device);
-DLL_PUBLIC DALIError_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device,
-                                                              int* major, int* minor);
-
-#endif
+bool nvmlIsInitialized(void);
+nvmlReturn_t nvmlInitChecked(void);
+bool nvmlIsSymbolAvailable(const char *name);
 /**
  * Checks, whether CUDA11-proper NVML functions have been successfully loaded
  */
-DLL_PUBLIC bool wrapHasCuda11NvmlFunctions();
-
-}  // namespace nvml
-
-}  // namespace dali
+bool nvmlHasCuda11NvmlFunctions(void);
 #endif  // DALI_UTIL_NVML_WRAP_H_
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 47a2bf4a280..7883fb28dca 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -79,8 +79,8 @@ ARG BUILD_NVDEC
 ENV BUILD_NVDEC=${BUILD_NVDEC}
 ARG BUILD_NVML
 ENV BUILD_NVML=${BUILD_NVML}
-ARG LINK_LIBCUDA
-ENV LINK_LIBCUDA=${LINK_LIBCUDA}
+ARG LINK_DRIVER
+ENV LINK_DRIVER=${LINK_DRIVER}
 ARG STRIP_BINARY
 ENV STRIP_BINARY=${STRIP_BINARY}
 ARG VERBOSE_LOGS
diff --git a/docs/compilation.rst b/docs/compilation.rst
index 136a96bbf5d..04177030d06 100644
--- a/docs/compilation.rst
+++ b/docs/compilation.rst
@@ -350,7 +350,7 @@ Optional CMake build parameters
 - ``WERROR`` - treat all build warnings as errors (default: OFF)
 - ``BUILD_WITH_ASAN`` - build with ASAN support (default: OFF). To run issue:
 - ``BUILD_DALI_NODEPS`` - disables support for third party libraries that are normally expected to be available in the system
-- ``LINK_LIBCUDA`` - enables direct linking with libcuda.so or an appropriate stub instead of dlopen
+- ``LINK_DRIVER`` - enables direct linking with driver libraries or an appropriate stub instead of dlopen
   it in the runtime (removes the requirement to have clang-python bindings available to generate the stubs)
 .. warning::
diff --git a/include/dali/core/dynlink_cuda.h b/include/dali/core/dynlink_cuda.h
index 76da7f5496d..2613fcdcf85 100644
--- a/include/dali/core/dynlink_cuda.h
+++ b/include/dali/core/dynlink_cuda.h
@@ -12,12 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#ifndef DALI_CORE_DYNLINK_CUDA_H
-#define DALI_CORE_DYNLINK_CUDA_H
+#ifndef DALI_CORE_DYNLINK_CUDA_H_
+#define DALI_CORE_DYNLINK_CUDA_H_
 #include
 bool cuInitChecked();
 bool cuIsSymbolAvailable(const char *name);
-#endif  //DALI_CORE_DYNLINK_CUDA_H
+#endif  // DALI_CORE_DYNLINK_CUDA_H_
diff --git a/tools/lint.py b/tools/lint.py
index 7fe59aefacb..02f65e3ea53 100644
--- a/tools/lint.py
+++ b/tools/lint.py
@@ -41,12 +41,8 @@
 # Specifies, which files are to be excluded
 # These filters are regexes, not typical unix-like path specification
 negative_filters = [
-    ".*core/dynlink_cuda.cc",
     ".*operators/reader/nvdecoder/nvcuvid.h",
     ".*operators/reader/nvdecoder/cuviddec.h",
-    ".*operators/reader/nvdecoder/dynlink_nvcuvid.cc",
-    ".*operators/reader/nvdecoder/dynlink_nvcuvid.h",
-    ".*dali/core/dynlink_cuda.h",
     ".*python/dummy.cu"
 ]
diff --git a/tools/stub_generator/cuda.json b/tools/stub_generator/cuda.json
index a8eaa1f1d10..fd52cbae73a 100644
--- a/tools/stub_generator/cuda.json
+++ b/tools/stub_generator/cuda.json
@@ -3,6 +3,7 @@
     ""
   ],
   "return_type":"CUresult",
+  "calling_conv":"CUDAAPI",
   "not_found_error":"CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND",
   "functions": {
     "cuInit": {},
diff --git a/tools/stub_generator/nvcuvid.json b/tools/stub_generator/nvcuvid.json
index a8d5e10e8b0..0589f0f7547 100644
--- a/tools/stub_generator/nvcuvid.json
+++ b/tools/stub_generator/nvcuvid.json
@@ -5,6 +5,7 @@
     "\"dali/operators/reader/nvdecoder/nvcuvid.h\""
   ],
   "return_type":"CUresult",
+  "calling_conv":"CUDAAPI",
   "not_found_error":"CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND",
   "functions": {
     "cuvidCreateVideoSource": {},
diff --git a/tools/stub_generator/nvml.json b/tools/stub_generator/nvml.json
new file mode 100644
index 00000000000..30b1903e5ce
--- /dev/null
+++ b/tools/stub_generator/nvml.json
@@ -0,0 +1,29 @@
+{
+  "extra_include":[
+    ""
+  ],
+  "return_type":"nvmlReturn_t",
+  "calling_conv":"DECLDIR",
+  "not_found_error":"NVML_ERROR_FUNCTION_NOT_FOUND ",
+  "functions": {
+    "nvmlInit": {},
+    "nvmlInit_v2": {},
+    "nvmlShutdown": {},
+    "nvmlSystemGetDriverVersion": {},
+    "nvmlDeviceGetHandleByPciBusId": {},
+    "nvmlDeviceGetHandleByIndex": {},
+    "nvmlDeviceGetIndex": {},
+    "nvmlDeviceSetCpuAffinity": {},
"nvmlDeviceClearCpuAffinity": {}, + "nvmlDeviceGetCpuAffinity": {}, + "nvmlErrorString": { + "return_type":"char*", + "not_found_error":"const_cast(\"\")" + }, + "nvmlDeviceGetCpuAffinityWithinScope": {}, + "nvmlDeviceGetBrand": {}, + "nvmlDeviceGetCount_v2": {}, + "nvmlDeviceGetHandleByIndex_v2": {}, + "nvmlDeviceGetCudaComputeCapability": {} + } +} diff --git a/tools/stub_generator/stub_codegen.py b/tools/stub_generator/stub_codegen.py index d7c71a52d01..6dacb5a1455 100644 --- a/tools/stub_generator/stub_codegen.py +++ b/tools/stub_generator/stub_codegen.py @@ -52,7 +52,7 @@ def main(): reinterpret_cast(load_symbol_func("{1}")) : {1}NotFound; return func_ptr({3}); -}}\n""" % ('CUDAAPI', 'CUDAAPI') +}}\n""" % (config['calling_conv'], config['calling_conv']) prolog = """ typedef void *tLoadSymbol(const char *name);