diff --git a/CMakeLists.txt b/CMakeLists.txt
index 068baabf1e2..241b8499d26 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -36,7 +36,7 @@ check_cxx_compiler_flag(-fopenmp-simd CXX_HAVE_OMP_SIMD)
 # Build options
 option(BUILD_DALI_NODEPS "Disable components that require extra external libraries to be present in the system. Effectively, it builds only the DALI core and kernel libraries")
-option(LINK_LIBCUDA "Links directly with libcuda.so instead of dlopen it at runtime" OFF)
+option(LINK_DRIVER "Links directly with libcuda.so instead of dlopen it at runtime" OFF)
 # Tests use OpenCV...
 cmake_dependent_option(BUILD_TEST "Build googletest test suite" ON
@@ -142,7 +142,7 @@ propagate_option(BUILD_NVJPEG2K)
 propagate_option(BUILD_NVOF)
 propagate_option(BUILD_NVDEC)
 propagate_option(BUILD_NVML)
-propagate_option(LINK_LIBCUDA)
+propagate_option(LINK_DRIVER)
 get_dali_version(${PROJECT_SOURCE_DIR}/VERSION DALI_VERSION)
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index 85d4487025f..60122cb1baa 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -68,6 +68,9 @@ list(APPEND DALI_EXCLUDES libculibos.a)
 if (LINK_LIBCUDA)
   CUDA_find_library_stub(CUDA_cuda_LIBRARY cuda)
   list(APPEND DALI_LIBS ${CUDA_cuda_LIBRARY})
+
+  CUDA_find_library_stub(CUDA_nvml_LIBRARY nvidia-ml)
+  list(APPEND DALI_LIBS ${CUDA_nvml_LIBRARY})
 endif()
 # NVTX for profiling
diff --git a/dali/CMakeLists.txt b/dali/CMakeLists.txt
index f2e35d9d841..424fd9473a4 100644
--- a/dali/CMakeLists.txt
+++ b/dali/CMakeLists.txt
@@ -57,8 +57,7 @@ collect_sources(DALI_SRCS PARENT_SCOPE)
 if (BUILD_PROTOBUF)
   set(DALI_PROTO_OBJ $)
   add_library(dali ${LIBTYPE} ${DALI_SRCS} ${DALI_PROTO_OBJ} ${CUDART_LIB})
-  set_target_properties(dali PROPERTIES
-    LIBRARY_OUTPUT_DIRECTORY "${DALI_LIBRARY_OUTPUT_DIR}")
+  set_target_properties(dali PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${DALI_LIBRARY_OUTPUT_DIR}")
 endif()
 if (BUILD_DALI_PIPELINE)
@@ -75,6 +74,10 @@ if (BUILD_DALI_PIPELINE)
   target_link_libraries(dali PRIVATE "-Wl,--exclude-libs,${exclude_libs}")
 endif()
+if (BUILD_NVML)
+  target_link_libraries(dali PRIVATE $)
+endif(BUILD_NVML)
+
 ################################################
 # Build test suite
 ################################################
@@ -84,6 +87,9 @@ if (BUILD_DALI_PIPELINE AND BUILD_TEST)
   target_link_libraries(dali_test PUBLIC dali dali_core dali_kernels dali_operators ${DALI_LIBS} gtest)
   target_link_libraries(dali_test PRIVATE dynlink_cuda ${CUDART_LIB})
+  if (BUILD_NVML)
+    target_link_libraries(dali_test PRIVATE $)
+  endif(BUILD_NVML)
   target_link_libraries(dali_test PRIVATE "-pie")
   set_target_properties(dali_test PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${TEST_BINARY_DIR})
   set_target_properties(dali_test PROPERTIES POSITION_INDEPENDENT_CODE ON)
diff --git a/dali/benchmark/CMakeLists.txt b/dali/benchmark/CMakeLists.txt
index c619f3dfa15..eafeadc036c 100644
--- a/dali/benchmark/CMakeLists.txt
+++ b/dali/benchmark/CMakeLists.txt
@@ -44,6 +44,9 @@ if (BUILD_BENCHMARK)
   add_executable(dali_benchmark "${DALI_BENCHMARK_SRCS}")
   target_link_libraries(dali_benchmark PRIVATE dali dali_operators benchmark ${DALI_LIBS})
+  if (BUILD_NVML)
+    target_link_libraries(dali_benchmark PRIVATE $)
+  endif(BUILD_NVML)
   target_link_libraries(dali_benchmark PRIVATE "-pie")
   set_target_properties(dali_benchmark PROPERTIES POSITION_INDEPENDENT_CODE ON)
   set_target_properties(dali_benchmark PROPERTIES OUTPUT_NAME "dali_benchmark.bin")
diff --git a/dali/core/CMakeLists.txt b/dali/core/CMakeLists.txt
index 9e0ce7804fa..5a455452590 100644
--- a/dali/core/CMakeLists.txt
+++ b/dali/core/CMakeLists.txt
@@ -27,7 +27,7 @@ foreach(incl_dir ${INFERED_COMPILER_INCLUDE})
 endforeach(incl_dir)
 separate_arguments(DEFAULT_COMPILER_INCLUDE UNIX_COMMAND "${DEFAULT_COMPILER_INCLUDE}")
-if (NOT LINK_LIBCUDA)
+if (NOT LINK_DRIVER)
   set(CUDA_GENERATED_STUB "${CMAKE_CURRENT_BINARY_DIR}/dynlink_cuda_gen.cc")
   add_custom_command(
     OUTPUT ${CUDA_GENERATED_STUB}
diff --git a/dali/core/dynlink_cuda.cc b/dali/core/dynlink_cuda.cc
index c0fabafcff4..9192694bbb7 100644
--- a/dali/core/dynlink_cuda.cc
+++ b/dali/core/dynlink_cuda.cc
@@ -12,14 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
+#include
 #include
 #include
 #include
 #include "dali/core/dynlink_cuda.h"
-#include
-
 namespace {
 typedef void *CUDADRIVER;
@@ -48,7 +47,7 @@ void *LoadSymbol(const char *name) {
   return ret;
 }
-} // namespace
+}  // namespace
 // it is defined in the generated file
 typedef void *tLoadSymbol(const char *name);
@@ -67,7 +66,7 @@ bool cuInitChecked() {
     return true;
   // set symbol loader for this library
-#if !LINK_LIBCUDA_ENABLED
+#if !LINK_DRIVER_ENABLED
   CudaSetSymbolLoader(LoadSymbol);
 #endif
   static CUresult res = cuInit(0);
diff --git a/dali/operators/CMakeLists.txt b/dali/operators/CMakeLists.txt
index fe64fc6bd87..8825ab225a9 100644
--- a/dali/operators/CMakeLists.txt
+++ b/dali/operators/CMakeLists.txt
@@ -75,6 +75,9 @@ if (BUILD_TEST)
   target_link_libraries(dali_operator_test PUBLIC dali_operators)
   target_link_libraries(dali_operator_test PRIVATE gtest dynlink_cuda ${DALI_LIBS})
+  if (BUILD_NVML)
+    target_link_libraries(dali_operator_test PRIVATE $)
+  endif(BUILD_NVML)
   target_link_libraries(dali_operator_test PRIVATE "-Wl,--exclude-libs,${exclude_libs}")
   target_link_libraries(dali_operator_test PRIVATE "-pie")
   set_target_properties(dali_operator_test PROPERTIES POSITION_INDEPENDENT_CODE ON)
diff --git a/dali/operators/decoder/nvjpeg/decoupled_api/nvjpeg_decoder_decoupled_api_test.cc b/dali/operators/decoder/nvjpeg/decoupled_api/nvjpeg_decoder_decoupled_api_test.cc
index 11ab892c114..34c5d94a8eb 100644
--- a/dali/operators/decoder/nvjpeg/decoupled_api/nvjpeg_decoder_decoupled_api_test.cc
+++ b/dali/operators/decoder/nvjpeg/decoupled_api/nvjpeg_decoder_decoupled_api_test.cc
@@ -209,7 +209,7 @@ class HwDecoderUtilizationTest : public ::testing::Test {
   if (!node->op->GetDiagnostic("using_hw_decoder")) {
     if (nvml::HasCuda11NvmlFunctions()) {
       unsigned int device_count;
-      DALI_CALL(nvml::wrapNvmlDeviceGetCount_v2(&device_count));
+      CUDA_CALL(nvmlDeviceGetCount_v2(&device_count));
       for (unsigned int device_idx = 0; device_idx < device_count; device_idx++) {
         auto info = nvml::GetDeviceInfo(device_idx);
         std::cerr << "Device " << device_idx
diff --git a/dali/operators/reader/nvdecoder/dynlink_nvcuvid.cc b/dali/operators/reader/nvdecoder/dynlink_nvcuvid.cc
index 9e19a94f6aa..48d88311f78 100644
--- a/dali/operators/reader/nvdecoder/dynlink_nvcuvid.cc
+++ b/dali/operators/reader/nvdecoder/dynlink_nvcuvid.cc
@@ -12,14 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
+#include
 #include
 #include
 #include
 #include "dali/operators/reader/nvdecoder/dynlink_nvcuvid.h"
-#include
-
 namespace {
 static char __DriverLibName[] = "libnvcuvid.so";
@@ -50,7 +49,7 @@ void *LoadSymbol(const char *name) {
   return ret;
 }
-}
+}  // namespace
 // it is defined in the generated file
 typedef void *tLoadSymbol(const char *name);
diff --git a/dali/operators/reader/nvdecoder/dynlink_nvcuvid.h b/dali/operators/reader/nvdecoder/dynlink_nvcuvid.h
index ae128ae9c17..52fce88e7dc 100644
--- a/dali/operators/reader/nvdecoder/dynlink_nvcuvid.h
+++ b/dali/operators/reader/nvdecoder/dynlink_nvcuvid.h
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#if !defined(DALI_OPERATORS_READER_NVDECODER_DYNLINK_NVCUVID_H_)
+#ifndef DALI_OPERATORS_READER_NVDECODER_DYNLINK_NVCUVID_H_
 #define DALI_OPERATORS_READER_NVDECODER_DYNLINK_NVCUVID_H_
 #include
@@ -24,4 +24,4 @@ bool cuvidInitChecked(unsigned int Flags);
 bool cuvidIsSymbolAvailable(const char *name);
-#endif // DALI_OPERATORS_READER_NVDECODER_DYNLINK_NVCUVID_H_
+#endif  // DALI_OPERATORS_READER_NVDECODER_DYNLINK_NVCUVID_H_
diff --git a/dali/operators/reader/video_reader_op_test.cc b/dali/operators/reader/video_reader_op_test.cc
index 2f4c485d639..93847fb03e6 100644
--- a/dali/operators/reader/video_reader_op_test.cc
+++ b/dali/operators/reader/video_reader_op_test.cc
@@ -107,15 +107,12 @@ TEST_F(VideoReaderTest, MultipleVideoResolution) {
   float driverVersion = 0;
   char version[80];
-  if (nvml::wrapSymbols() != DALISuccess) {
-    FAIL() << "wrapSymbols() failed";
-  }
-  if (nvml::wrapNvmlInit() != DALISuccess) {
-    FAIL() << "wrapNvmlInit() failed";
+  if (nvmlInitChecked() != NVML_SUCCESS) {
+    FAIL() << "nvmlInitChecked() failed";
   }
-  if (nvml::wrapNvmlSystemGetDriverVersion(version, sizeof version) != DALISuccess) {
-    FAIL() << "wrapNvmlSystemGetDriverVersion failed!";
+  if (nvmlSystemGetDriverVersion(version, sizeof version) != NVML_SUCCESS) {
+    FAIL() << "nvmlSystemGetDriverVersion failed!";
   }
   driverVersion = std::stof(version);
diff --git a/dali/util/CMakeLists.txt b/dali/util/CMakeLists.txt
index 0f27d5ee030..1877c179b3e 100644
--- a/dali/util/CMakeLists.txt
+++ b/dali/util/CMakeLists.txt
@@ -45,10 +45,28 @@ if(BUILD_NVML)
     "${CMAKE_CURRENT_SOURCE_DIR}/nvml.h"
     "${CMAKE_CURRENT_SOURCE_DIR}/nvml_wrap.h")
-  set(DALI_SRCS ${DALI_SRCS}
-    "${CMAKE_CURRENT_SOURCE_DIR}/nvml_wrap.cc")
-endif()
+  if (NOT LINK_DRIVER)
+    set(NVML_GENERATED_STUB "${CMAKE_CURRENT_BINARY_DIR}/dynlink_nvml_gen.cc")
+    add_custom_command(
+      OUTPUT ${NVML_GENERATED_STUB}
+      COMMAND python ${CMAKE_CURRENT_SOURCE_DIR}/../../tools/stub_generator/stub_codegen.py --unique_prefix=Nvml --
+              "${CMAKE_CURRENT_SOURCE_DIR}/../../tools/stub_generator/nvml.json" ${NVML_GENERATED_STUB}
+              "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}/nvml.h" "-I${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}"
+              # for some reason QNX fails with 'too many errors emitted' if this is not set
+              "-ferror-limit=0"
+              ${DEFAULT_COMPILER_INCLUDE}
+      DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/../../tools/stub_generator/stub_codegen.py
+              "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}/nvml.h"
+              "${CMAKE_CURRENT_SOURCE_DIR}/../../tools/stub_generator/nvml.json"
+      COMMENT "Running nvml.h stub generator"
+      VERBATIM)
+    set_source_files_properties(${NVML_GENERATED_STUB} PROPERTIES GENERATED TRUE)
+    add_library(dynlink_nvml OBJECT nvml_wrap.cc ${NVML_GENERATED_STUB})
+  else()
+    add_library(dynlink_nvml OBJECT nvml_wrap.cc)
+  endif()
+endif()
 set(DALI_INST_HDRS ${DALI_INST_HDRS} PARENT_SCOPE)
 set(DALI_SRCS ${DALI_SRCS} PARENT_SCOPE)
diff --git a/dali/util/nvml.h b/dali/util/nvml.h
index 01530444722..bf10e19ff76 100644
--- a/dali/util/nvml.h
+++ b/dali/util/nvml.h
@@ -17,18 +17,114 @@
 #include
 #include
-
 #include
 #include
-
 #include
 #include
-
-#include "dali/core/error_handling.h"
+#include
 #include "dali/core/cuda_utils.h"
 #include "dali/util/nvml_wrap.h"
+#include "dali/core/cuda_error.h"
+#include "dali/core/format.h"
 namespace dali {
+
+class NvmlError : public std::runtime_error {
+ public:
+  explicit NvmlError(nvmlReturn_t result, const char *details = nullptr)
+  : std::runtime_error(Message(result, details))
+  , result_(result) {}
+
+  static const char *ErrorString(nvmlReturn_t result) {
+    switch (result) {
+      case NVML_SUCCESS:
+        return "nvml operation was successful";
+      case NVML_ERROR_UNINITIALIZED:
+        return "nvml was not first initialized with nvmlInit()";
+      case NVML_ERROR_INVALID_ARGUMENT:
+        return "a nvml supplied argument is invalid";
+      case NVML_ERROR_NOT_SUPPORTED:
+        return "The nvml requested operation is not available on target device";
+      case NVML_ERROR_NO_PERMISSION:
+        return "The nvml current user does not have permission for operation";
+      case NVML_ERROR_ALREADY_INITIALIZED:
+        return "Deprecated: Multiple initializations are now allowed through ref counting";
+      case NVML_ERROR_NOT_FOUND:
+        return "A nvml query to find an object was unsuccessful";
+      case NVML_ERROR_INSUFFICIENT_SIZE:
+        return "A nvml input argument is not large enough";
+      case NVML_ERROR_INSUFFICIENT_POWER:
+        return "A nvml device's external power cables are not properly attached";
+      case NVML_ERROR_DRIVER_NOT_LOADED:
+        return "nvml: NVIDIA driver is not loaded";
+      case NVML_ERROR_TIMEOUT:
+        return "nvml user provided timeout passed";
+      case NVML_ERROR_IRQ_ISSUE:
+        return "nvml: NVIDIA Kernel detected an interrupt issue with a GPU";
+      case NVML_ERROR_LIBRARY_NOT_FOUND:
+        return "NVML Shared Library couldn't be found or loaded";
+      case NVML_ERROR_FUNCTION_NOT_FOUND:
+        return "Local version of NVML doesn't implement this function";
+      case NVML_ERROR_CORRUPTED_INFOROM:
+        return "nvml: infoROM is corrupted";
+      case NVML_ERROR_GPU_IS_LOST:
+        return "nvml: the GPU has fallen off the bus or has otherwise become inaccessible";
+      case NVML_ERROR_RESET_REQUIRED:
+        return "nvml: the GPU requires a reset before it can be used again";
+      case NVML_ERROR_OPERATING_SYSTEM:
+        return "nvml: the GPU control device has been blocked by the operating system/cgroups";
+      case NVML_ERROR_LIB_RM_VERSION_MISMATCH:
+        return "nvml: RM detects a driver/library version mismatch";
+      case NVML_ERROR_IN_USE:
+        return "A nvml operation cannot be performed because the GPU is currently in use";
+      case NVML_ERROR_MEMORY:
+        return "Nvml insufficient memory";
+      case NVML_ERROR_NO_DATA:
+        return "Nvml: no data";
+#if (CUDART_VERSION >= 11000)
+      case NVML_ERROR_VGPU_ECC_NOT_SUPPORTED:
+        return "The nvml requested vgpu operation is not available on target device, because ECC is "
+               "enabled";
+      case NVML_ERROR_INSUFFICIENT_RESOURCES:
+        return "Nvml: ran out of critical resources, other than memory";
+#endif
+      case NVML_ERROR_UNKNOWN:
+        return "A nvml internal driver error occurred";
+      default:
+        return "< unknown error >";
+    }
+  }
+
+  static std::string Message(nvmlReturn_t result, const char *details) {
+    if (details && *details) {
+      return make_string("nvml error: ", result, " ", ErrorString(result),
+                         "\nDetails:\n", details);
+    } else {
+      return make_string("nvml error: ", result, " ", ErrorString(result));
+    }
+  }
+
+
+  nvmlReturn_t result() const { return result_; }
+
+ private:
+  nvmlReturn_t result_;
+};
+
+class NvmlBadAlloc : public CUDABadAlloc {};
+
+template <>
+inline void cudaResultCheck(nvmlReturn_t status) {
+  switch (status) {
+    case NVML_SUCCESS:
+      return;
+    case NVML_ERROR_MEMORY:
+      throw dali::NvmlBadAlloc();
+    default:
+      throw dali::NvmlError(status);
+  }
+}
+
 namespace nvml {
 /**
@@ -43,9 +139,7 @@ inline std::mutex& Mutex() {
  * @brief Initializes the NVML library
  */
 inline void Init() {
-  std::lock_guard lock(Mutex());
-  DALI_CALL(wrapSymbols());
-  DALI_CALL(wrapNvmlInit());
+  CUDA_CALL(nvmlInitChecked());
 }
 /**
@@ -53,7 +147,7 @@ inline void Init() {
  * respecting previously set mask.
  */
 inline void GetNVMLAffinityMask(cpu_set_t * mask, size_t num_cpus) {
-  if (!wrapIsInitialized()) {
+  if (!nvmlIsInitialized()) {
     return;
   }
   int device_idx;
@@ -64,16 +158,16 @@ inline void GetNVMLAffinityMask(cpu_set_t * mask, size_t num_cpus) {
   std::vector nvml_mask_container(cpu_set_size);  // NOLINT(runtime/int)
   auto * nvml_mask = nvml_mask_container.data();
   nvmlDevice_t device;
-  DALI_CALL(wrapNvmlDeviceGetHandleByIndex(device_idx, &device));
+  CUDA_CALL(nvmlDeviceGetHandleByIndex(device_idx, &device));
 #if (CUDART_VERSION >= 11000)
-  if (wrapHasCuda11NvmlFunctions()) {
-    DALI_CALL(wrapNvmlDeviceGetCpuAffinityWithinScope(device, cpu_set_size, nvml_mask,
+  if (nvmlHasCuda11NvmlFunctions()) {
+    CUDA_CALL(nvmlDeviceGetCpuAffinityWithinScope(device, cpu_set_size, nvml_mask,
                                                   NVML_AFFINITY_SCOPE_SOCKET));
   } else {
-    DALI_CALL(wrapNvmlDeviceGetCpuAffinity(device, cpu_set_size, nvml_mask));
+    CUDA_CALL(nvmlDeviceGetCpuAffinity(device, cpu_set_size, nvml_mask));
   }
 #else
-  DALI_CALL(wrapNvmlDeviceGetCpuAffinity(device, cpu_set_size, nvml_mask));
+  CUDA_CALL(nvmlDeviceGetCpuAffinity(device, cpu_set_size, nvml_mask));
 #endif
   // Convert it to cpu_set_t
@@ -111,9 +205,8 @@ inline void SetCPUAffinity(int core = -1) {
   CPU_ZERO(&requested_set);
   if (core != -1) {
     if (core < 0 || (size_t)core >= num_cpus) {
-      DALI_WARN("Requested setting affinity to core " + to_string(core) +
-                " but only " + to_string(num_cpus) + " cores available. " +
-                "Ignoring...");
+      DALI_WARN(make_string("Requested setting affinity to core ", core,
+                            " but only ", num_cpus, " cores available. Ignoring..."));
Ignoring...")); GetNVMLAffinityMask(&requested_set, num_cpus); } else { CPU_SET(core, &requested_set); @@ -143,10 +236,10 @@ inline void SetCPUAffinity(int core = -1) { inline void Shutdown() { std::lock_guard lock(Mutex()); - if (!wrapIsInitialized()) { + if (!nvmlIsInitialized()) { return; } - DALI_CALL(wrapNvmlShutdown()); + CUDA_CALL(nvmlShutdown()); } #if (CUDART_VERSION >= 11000) @@ -169,9 +262,9 @@ struct DeviceProperties { inline DeviceProperties GetDeviceInfo(int device_idx) { DeviceProperties ret; nvmlDevice_t device; - DALI_CALL(wrapNvmlDeviceGetHandleByIndex_v2(device_idx, &device)); - DALI_CALL(wrapNvmlDeviceGetBrand(device, &ret.type)); - DALI_CALL(wrapNvmlDeviceGetCudaComputeCapability(device, &ret.cap_major, &ret.cap_minor)); + CUDA_CALL(nvmlDeviceGetHandleByIndex_v2(device_idx, &device)); + CUDA_CALL(nvmlDeviceGetBrand(device, &ret.type)); + CUDA_CALL(nvmlDeviceGetCudaComputeCapability(device, &ret.cap_major, &ret.cap_minor)); return ret; } @@ -181,7 +274,7 @@ inline DeviceProperties GetDeviceInfo(int device_idx) { * @throws std::runtime_error */ inline bool HasHwDecoder(int device_idx) { - if (!wrapIsInitialized()) { + if (!nvmlIsInitialized()) { return false; } auto info = GetDeviceInfo(device_idx); @@ -195,11 +288,11 @@ inline bool HasHwDecoder(int device_idx) { * @throws std::runtime_error */ inline bool HasHwDecoder() { - if (!wrapIsInitialized()) { + if (!nvmlIsInitialized()) { return false; } unsigned int device_count; - DALI_CALL(wrapNvmlDeviceGetCount_v2(&device_count)); + CUDA_CALL(nvmlDeviceGetCount_v2(&device_count)); for (unsigned int device_idx = 0; device_idx < device_count; device_idx++) { if (HasHwDecoder(device_idx)) return true; } @@ -211,24 +304,13 @@ inline bool HasHwDecoder() { * Checks, whether CUDA11-proper NVML functions have been successfully loaded */ inline bool HasCuda11NvmlFunctions() { - if (!wrapIsInitialized()) { + if (!nvmlIsInitialized()) { return false; } - return wrapHasCuda11NvmlFunctions(); + return nvmlHasCuda11NvmlFunctions(); } } // namespace nvml } // namespace dali -#define NVML_CALL(code) \ - do { \ - nvmlReturn_t status = code; \ - if (status != NVML_SUCCESS) { \ - dali::string error = dali::string("NVML error \"") + \ - nvmlErrorString(status) + "\""; \ - DALI_FAIL(error); \ - } \ - } while (0) - #endif // DALI_UTIL_NVML_H_ - diff --git a/dali/util/nvml_wrap.cc b/dali/util/nvml_wrap.cc index 971178a59ca..58a59b14072 100644 --- a/dali/util/nvml_wrap.cc +++ b/dali/util/nvml_wrap.cc @@ -1,271 +1,98 @@ -/************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - ************************************************************************/ - +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include
 #include
-#include
 #include
-#include "dali/util/nvml_wrap.h"
-#include "dali/core/cuda_error.h"
-
-
-namespace dali {
-
-namespace nvml {
-
-int symbolsLoaded = 0;
-
-static nvmlReturn_t (*nvmlInternalInit)(void);
-static nvmlReturn_t (*nvmlInternalShutdown)(void);
-static nvmlReturn_t (*nvmlInternalDeviceGetHandleByPciBusId)(const char* pciBusId,
-                                                             nvmlDevice_t* device);
-static nvmlReturn_t (*nvmlInternalDeviceGetHandleByIndex)(const int device_id,
-                                                          nvmlDevice_t* device);
-static nvmlReturn_t (*nvmlInternalDeviceGetIndex)(nvmlDevice_t device, unsigned* index);
-
-static nvmlReturn_t (*nvmlInternalDeviceSetCpuAffinity)(nvmlDevice_t device);
-static nvmlReturn_t (*nvmlInternalDeviceClearCpuAffinity)(nvmlDevice_t device);
-static nvmlReturn_t (*nvmlInternalSystemGetDriverVersion)(char* name, unsigned int length);
-static nvmlReturn_t (*nvmlInternalDeviceGetCpuAffinity)(nvmlDevice_t device,
-                                                        unsigned int cpuSetSize,
-                                                        unsigned long* cpuSet);  // NOLINT(*)
+#include
+#include
+#include
-#if (CUDART_VERSION >= 11000)
-static nvmlReturn_t (*nvmlInternalDeviceGetCpuAffinityWithinScope)(nvmlDevice_t device,
-                                                                   unsigned int nodeSetSize,
-                                                                   unsigned long *nodeSet,  // NOLINT(*)
-                                                                   nvmlAffinityScope_t scope);
-static nvmlReturn_t (*nvmlInternalDeviceGetBrand)(nvmlDevice_t device, nvmlBrandType_t *type);
-static nvmlReturn_t (*nvmlInternalDeviceGetCount_v2)(unsigned int *deviceCount);
-static nvmlReturn_t (*nvmlInternalDeviceGetHandleByIndex_v2)(unsigned int index,
-                                                             nvmlDevice_t *device);
-static nvmlReturn_t (*nvmlInternalDeviceGetCudaComputeCapability)(nvmlDevice_t device,
-                                                                  int *major, int *minor);
-#endif
+#include "dali/util/nvml_wrap.h"
-static const char* (*nvmlInternalErrorString)(nvmlReturn_t r);
 namespace {
-std::once_flag driver_check;
+typedef void *NVMLRIVER;
-/*
- * This function is used to learn the real driver version, and compare with the provided value
- * It should be available as soon as possible. However it requires wrapNvmlSystemGetDriverVersion
- * and wrapNvmlInit be intialized first. If they are not it will warn and return INF driver version
- * and all checks will fail
- */
-bool is_driver_sufficient(float requestedDriverVersion) {
-  static float availableDriverVersion = std::numeric_limits::max();
+static char __NvmlLibName[] = "libnvidia-ml.so";
+static char __NvmlLibName1[] = "libnvidia-ml.so.1";
-  std::call_once(driver_check, [] {
-    char version[80];
-    if (nvml::wrapNvmlInit() != DALISuccess) {
-      DALI_WARN("wrapNvmlInit failed, driver version check not available");
-      return;
-    }
-    nvml::wrapNvmlSystemGetDriverVersion(version, sizeof version);
-    availableDriverVersion = std::stof(version);
-  });
-
-  return requestedDriverVersion <= availableDriverVersion;
-}
+NVMLRIVER loadNvmlLibrary() {
+  NVMLRIVER ret = nullptr;
-}  // namespace
-
-bool wrapHasCuda11NvmlFunctions() {
-  #if (CUDART_VERSION >= 11000)
-  return nvmlInternalDeviceGetCount_v2 && nvmlInternalDeviceGetHandleByIndex_v2 &&
-         nvmlInternalDeviceGetCudaComputeCapability && nvmlInternalDeviceGetBrand &&
-         nvmlInternalDeviceGetCpuAffinityWithinScope;
-  #else
-  return false;
-  #endif
-}
+  ret = dlopen(__NvmlLibName1, RTLD_NOW);
+  if (!ret) {
+    ret = dlopen(__NvmlLibName, RTLD_NOW);
-bool wrapIsInitialized(void) {
-  return symbolsLoaded;
-}
-
-DALIError_t wrapSymbols(void) {
-  if (symbolsLoaded)
-    return DALISuccess;
-
-  static void* nvmlhandle = nullptr;
-  void* tmp;
-  void** cast;
-
-  nvmlhandle = dlopen("libnvidia-ml.so", RTLD_NOW);
-  if (!nvmlhandle) {
-    nvmlhandle = dlopen("libnvidia-ml.so.1", RTLD_NOW);
-    if (!nvmlhandle) {
-      DALI_FAIL("Failed to open libnvidia-ml.so[.1]");
+    if (!ret) {
+      printf("dlopen \"%s\" failed!\n", __NvmlLibName);
     }
   }
-
-#define LOAD_SYM(handle, symbol, funcptr) do {                      \
-    cast = reinterpret_cast(&funcptr);                              \
-    tmp = dlsym(handle, symbol);                                    \
-    if (tmp == nullptr) {                                           \
-      DALI_FAIL("dlsym failed on " + symbol + " - " + dlerror());   \
-    }                                                               \
-    *cast = tmp;                                                    \
-  } while (0)
-
-#define LOAD_SYM_MIN_DRIVER(handle, symbol, funcptr, driver_v) do { \
-    if (!is_driver_sufficient(driver_v)) {                          \
-      funcptr = nullptr;                                            \
-      break;                                                        \
-    }                                                               \
-    cast = reinterpret_cast(&funcptr);                              \
-    tmp = dlsym(handle, symbol);                                    \
-    if (tmp == nullptr) {                                           \
-      DALI_FAIL("dlsym failed on " + symbol + " - " + dlerror());   \
-    }                                                               \
-    *cast = tmp;                                                    \
-  } while (0)
-
-  /*
-   * make sure that nvmlInit and nvmlSystemGetDriverVersion are first on the list as they are needed
-   * by is_driver_sufficient function
-   */
-  LOAD_SYM(nvmlhandle, "nvmlInit", nvmlInternalInit);
-  LOAD_SYM(nvmlhandle, "nvmlSystemGetDriverVersion", nvmlInternalSystemGetDriverVersion);
-
-  LOAD_SYM(nvmlhandle, "nvmlShutdown", nvmlInternalShutdown);
-  LOAD_SYM(nvmlhandle, "nvmlDeviceGetHandleByPciBusId", nvmlInternalDeviceGetHandleByPciBusId);
-  LOAD_SYM(nvmlhandle, "nvmlDeviceGetHandleByIndex", nvmlInternalDeviceGetHandleByIndex);
-  LOAD_SYM(nvmlhandle, "nvmlDeviceGetIndex", nvmlInternalDeviceGetIndex);
-  LOAD_SYM(nvmlhandle, "nvmlDeviceSetCpuAffinity", nvmlInternalDeviceSetCpuAffinity);
-  LOAD_SYM(nvmlhandle, "nvmlDeviceClearCpuAffinity", nvmlInternalDeviceClearCpuAffinity);
-  LOAD_SYM(nvmlhandle, "nvmlDeviceGetCpuAffinity", nvmlInternalDeviceGetCpuAffinity);
-  LOAD_SYM(nvmlhandle, "nvmlErrorString", nvmlInternalErrorString);
-
-  #if (CUDART_VERSION >= 11000)
-  LOAD_SYM_MIN_DRIVER(nvmlhandle, "nvmlDeviceGetCpuAffinityWithinScope",
-                      nvmlInternalDeviceGetCpuAffinityWithinScope, 450.36);
-  LOAD_SYM_MIN_DRIVER(nvmlhandle, "nvmlDeviceGetBrand", nvmlInternalDeviceGetBrand, 450.36);
-  LOAD_SYM_MIN_DRIVER(nvmlhandle, "nvmlDeviceGetCount_v2", nvmlInternalDeviceGetCount_v2,
-                      450.36);
-  LOAD_SYM_MIN_DRIVER(nvmlhandle, "nvmlDeviceGetHandleByIndex_v2",
-                      nvmlInternalDeviceGetHandleByIndex_v2, 450.36);
-  LOAD_SYM_MIN_DRIVER(nvmlhandle, "nvmlDeviceGetCudaComputeCapability",
-                      nvmlInternalDeviceGetCudaComputeCapability, 450.36);
-  #endif
-
-  symbolsLoaded = 1;
-  return DALISuccess;
+  return ret;
 }
+void *LoadSymbol(const char *name) {
+  static NVMLRIVER nvmlDrvLib = loadNvmlLibrary();
+  void *ret = nvmlDrvLib ? dlsym(nvmlDrvLib, name) : nullptr;
+  return ret;
+}
-#define FUNC_BODY(INTERNAL_FUNC, ARGS...)             \
-  do {                                                \
-    if (INTERNAL_FUNC == nullptr) {                   \
-      return DALIError;                               \
-    }                                                 \
-    nvmlReturn_t ret = INTERNAL_FUNC(ARGS);           \
-    if (ret != NVML_SUCCESS) {                        \
-      DALI_WARN(#INTERNAL_FUNC "(...) failed: " +     \
-                nvmlInternalErrorString(ret));        \
-      return DALIError;                               \
-    }                                                 \
-    return DALISuccess;                               \
-  } while (false)
+}  // namespace
+std::atomic_bool symbolsLoaded{false};
-DALIError_t wrapNvmlInit(void) {
-  FUNC_BODY(nvmlInternalInit);
-}
+// it is defined in the generated file
+typedef void *tLoadSymbol(const char *name);
+void NvmlSetSymbolLoader(tLoadSymbol loader_func);
-DALIError_t wrapNvmlShutdown(void) {
-  if (nvmlInternalInit == nullptr) {
-    return DALISuccess;
-  }
-  if (nvmlInternalShutdown == nullptr) {
-    DALI_FAIL("lib wrapper not initialized.");
-    return DALIError;
-  }
-  nvmlReturn_t ret = nvmlInternalShutdown();
+nvmlReturn_t nvmlInitChecked() {
+  // set symbol loader for this library
+#if !LINK_DRIVER_ENABLED
+  static std::once_flag nvml_once;
+  std::call_once(nvml_once, NvmlSetSymbolLoader, LoadSymbol);
+#endif
+  symbolsLoaded = true;
+  nvmlReturn_t ret = nvmlInit();
   if (ret != NVML_SUCCESS) {
-    DALI_FAIL("nvmlShutdown() failed: " +
-              nvmlInternalErrorString(ret));
-    return DALIError;
+    DALI_WARN("nvmlInitChecked failed: " + nvmlErrorString(ret));
   }
-  return DALISuccess;
-}
-
-DALIError_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device) {
-  FUNC_BODY(nvmlInternalDeviceGetHandleByPciBusId, pciBusId, device);
-}
-
-DALIError_t wrapNvmlDeviceGetHandleByIndex(const int device_id, nvmlDevice_t* device) {
-  FUNC_BODY(nvmlInternalDeviceGetHandleByIndex, device_id, device);
-}
-
-DALIError_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
-  FUNC_BODY(nvmlInternalDeviceGetIndex, device, index);
-}
-
-DALIError_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device) {
-  FUNC_BODY(nvmlInternalDeviceSetCpuAffinity, device);
-}
-
-DALIError_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device) {
-  FUNC_BODY(nvmlInternalDeviceClearCpuAffinity, device);
-}
-
-DALIError_t wrapNvmlSystemGetDriverVersion(char* name, unsigned int length) {
-  FUNC_BODY(nvmlInternalSystemGetDriverVersion, name, length);
+  return ret;
 }
-DALIError_t wrapNvmlDeviceGetCpuAffinity(nvmlDevice_t device,
-                                         unsigned int cpuSetSize,
-                                         unsigned long* cpuSet) {  // NOLINT(runtime/int)
-  FUNC_BODY(nvmlInternalDeviceGetCpuAffinity, device, cpuSetSize, cpuSet);
-}
-
-#if (CUDART_VERSION >= 11000)
-
-DALIError_t wrapNvmlDeviceGetCpuAffinityWithinScope(nvmlDevice_t device,
-                                                    unsigned int nodeSetSize,
-                                                    unsigned long *nodeSet,  // NOLINT(runtime/int)
-                                                    nvmlAffinityScope_t scope) {
-  FUNC_BODY(nvmlInternalDeviceGetCpuAffinityWithinScope, device, nodeSetSize, nodeSet, scope);
-}
-
-DALIError_t wrapNvmlDeviceGetBrand(nvmlDevice_t device, nvmlBrandType_t* type) {
-  FUNC_BODY(nvmlInternalDeviceGetBrand, device, type);
-}
-
-DALIError_t wrapNvmlDeviceGetCount_v2(unsigned int* deviceCount) {
-  FUNC_BODY(nvmlInternalDeviceGetCount_v2, deviceCount);
+bool nvmlIsInitialized(void) {
+  return symbolsLoaded;
 }
-DALIError_t wrapNvmlDeviceGetHandleByIndex_v2(unsigned int index, nvmlDevice_t* device) {
-  FUNC_BODY(nvmlInternalDeviceGetHandleByIndex_v2, index, device);
+bool nvmlIsSymbolAvailable(const char *name) {
+  static std::mutex symbol_mutex;
+  static std::unordered_map symbol_map;
+  std::lock_guard lock(symbol_mutex);
+  auto it = symbol_map.find(name);
+  if (it == symbol_map.end()) {
+    auto *ptr = LoadSymbol(name);
+    symbol_map.insert({name, ptr});
+    return ptr != nullptr;
+  }
+  return it->second != nullptr;
 }
-DALIError_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int *major, int *minor) {
-  FUNC_BODY(nvmlInternalDeviceGetCudaComputeCapability, device, major, minor);
+bool nvmlHasCuda11NvmlFunctions(void) {
+  return nvmlIsSymbolAvailable("nvmlDeviceGetCount_v2") &&
+         nvmlIsSymbolAvailable("nvmlDeviceGetHandleByIndex_v2") &&
+         nvmlIsSymbolAvailable("nvmlDeviceGetCudaComputeCapability") &&
+         nvmlIsSymbolAvailable("nvmlDeviceGetBrand") &&
+         nvmlIsSymbolAvailable("nvmlDeviceGetCpuAffinityWithinScope");
 }
-
-#endif
-
-#undef FUNC_BODY
-
-}  // namespace nvml
-
-}  // namespace dali
diff --git a/dali/util/nvml_wrap.h b/dali/util/nvml_wrap.h
index 419a0f9ddfd..f2cea86a8de 100644
--- a/dali/util/nvml_wrap.h
+++ b/dali/util/nvml_wrap.h
@@ -20,55 +20,22 @@
 #ifndef DALI_UTIL_NVML_WRAP_H_
 #define DALI_UTIL_NVML_WRAP_H_
+
+
 #include
 #include
 #include "dali/core/common.h"
 #include "dali/core/error_handling.h"
-namespace dali {
-
-namespace nvml {
-
-DLL_PUBLIC bool wrapIsInitialized(void);
-DLL_PUBLIC DALIError_t wrapSymbols(void);
-
-DLL_PUBLIC DALIError_t wrapNvmlInit(void);
-DLL_PUBLIC DALIError_t wrapNvmlShutdown(void);
-DLL_PUBLIC DALIError_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId,
-                                                         nvmlDevice_t* device);
-DLL_PUBLIC DALIError_t wrapNvmlDeviceGetHandleByIndex(const int device_id,
-                                                      nvmlDevice_t* device);
-DLL_PUBLIC DALIError_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index);
-DLL_PUBLIC DALIError_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device);
-DLL_PUBLIC DALIError_t wrapNvmlSystemGetDriverVersion(char* name, unsigned int length);
-DLL_PUBLIC DALIError_t wrapNvmlDeviceGetCpuAffinity(nvmlDevice_t device,
-                                                    unsigned int cpuSetSize,
-                                                    unsigned long* cpuSet);  // NOLINT(runtime/int)
-DLL_PUBLIC DALIError_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device);
-
-#if (CUDART_VERSION >= 11000)
-
-DLL_PUBLIC DALIError_t wrapNvmlDeviceGetCpuAffinityWithinScope(nvmlDevice_t device,
-                                                               unsigned int nodeSetSize,
-                                                               unsigned long *nodeSet,  // NOLINT(*)
-                                                               nvmlAffinityScope_t scope);
-DLL_PUBLIC DALIError_t wrapNvmlDeviceGetBrand(nvmlDevice_t device, nvmlBrandType_t* type);
-DLL_PUBLIC DALIError_t wrapNvmlDeviceGetCount_v2(unsigned int* deviceCount);
-DLL_PUBLIC DALIError_t wrapNvmlDeviceGetHandleByIndex_v2(unsigned int index, nvmlDevice_t* device);
-DLL_PUBLIC DALIError_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device,
-                                                              int* major, int* minor);
-
-#endif
+bool nvmlIsInitialized(void);
+nvmlReturn_t nvmlInitChecked(void);
+bool nvmlIsSymbolAvailable(const char *name);
 /**
  * Checks, whether CUDA11-proper NVML functions have been successfully loaded
  */
-DLL_PUBLIC bool wrapHasCuda11NvmlFunctions();
-
-}  // namespace nvml
-
-}  // namespace dali
+bool nvmlHasCuda11NvmlFunctions(void);
 #endif  // DALI_UTIL_NVML_WRAP_H_
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 47a2bf4a280..7883fb28dca 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -79,8 +79,8 @@ ARG BUILD_NVDEC
 ENV BUILD_NVDEC=${BUILD_NVDEC}
 ARG BUILD_NVML
 ENV BUILD_NVML=${BUILD_NVML}
-ARG LINK_LIBCUDA
-ENV LINK_LIBCUDA=${LINK_LIBCUDA}
+ARG LINK_DRIVER
+ENV LINK_DRIVER=${LINK_DRIVER}
 ARG STRIP_BINARY
 ENV STRIP_BINARY=${STRIP_BINARY}
 ARG VERBOSE_LOGS
diff --git a/docs/compilation.rst b/docs/compilation.rst
index 136a96bbf5d..04177030d06 100644
--- a/docs/compilation.rst
+++ b/docs/compilation.rst
@@ -350,7 +350,7 @@ Optional CMake build parameters
 - ``WERROR`` - treat all build warnings as errors (default: OFF)
 - ``BUILD_WITH_ASAN`` - build with ASAN support (default: OFF). To run issue:
 - ``BUILD_DALI_NODEPS`` - disables support for third party libraries that are normally expected to be available in the system
-- ``LINK_LIBCUDA`` - enables direct linking with libcuda.so or an appropriate stub instead of dlopen
+- ``LINK_DRIVER`` - enables direct linking with driver libraries or an appropriate stub instead of dlopen
   it in the runtime (removes the requirement to have clang-python bindings available to generate the stubs)
 .. warning::
diff --git a/include/dali/core/dynlink_cuda.h b/include/dali/core/dynlink_cuda.h
index 76da7f5496d..2613fcdcf85 100644
--- a/include/dali/core/dynlink_cuda.h
+++ b/include/dali/core/dynlink_cuda.h
@@ -12,12 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#ifndef DALI_CORE_DYNLINK_CUDA_H
-#define DALI_CORE_DYNLINK_CUDA_H
+#ifndef DALI_CORE_DYNLINK_CUDA_H_
+#define DALI_CORE_DYNLINK_CUDA_H_
 #include
 bool cuInitChecked();
 bool cuIsSymbolAvailable(const char *name);
-#endif  //DALI_CORE_DYNLINK_CUDA_H
+#endif  // DALI_CORE_DYNLINK_CUDA_H_
diff --git a/tools/lint.py b/tools/lint.py
index 7fe59aefacb..02f65e3ea53 100644
--- a/tools/lint.py
+++ b/tools/lint.py
@@ -41,12 +41,8 @@
 # Specifies, which files are to be excluded
 # These filters are regexes, not typical unix-like path specification
 negative_filters = [
-    ".*core/dynlink_cuda.cc",
     ".*operators/reader/nvdecoder/nvcuvid.h",
     ".*operators/reader/nvdecoder/cuviddec.h",
-    ".*operators/reader/nvdecoder/dynlink_nvcuvid.cc",
-    ".*operators/reader/nvdecoder/dynlink_nvcuvid.h",
-    ".*dali/core/dynlink_cuda.h",
     ".*python/dummy.cu"
 ]
diff --git a/tools/stub_generator/cuda.json b/tools/stub_generator/cuda.json
index a8eaa1f1d10..fd52cbae73a 100644
--- a/tools/stub_generator/cuda.json
+++ b/tools/stub_generator/cuda.json
@@ -3,6 +3,7 @@
     ""
   ],
   "return_type":"CUresult",
+  "calling_conv":"CUDAAPI",
   "not_found_error":"CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND",
   "functions": {
     "cuInit": {},
diff --git a/tools/stub_generator/nvcuvid.json b/tools/stub_generator/nvcuvid.json
index a8d5e10e8b0..0589f0f7547 100644
--- a/tools/stub_generator/nvcuvid.json
+++ b/tools/stub_generator/nvcuvid.json
@@ -5,6 +5,7 @@
     "\"dali/operators/reader/nvdecoder/nvcuvid.h\""
   ],
   "return_type":"CUresult",
+  "calling_conv":"CUDAAPI",
   "not_found_error":"CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND",
   "functions": {
     "cuvidCreateVideoSource": {},
diff --git a/tools/stub_generator/nvml.json b/tools/stub_generator/nvml.json
new file mode 100644
index 00000000000..30b1903e5ce
--- /dev/null
+++ b/tools/stub_generator/nvml.json
@@ -0,0 +1,29 @@
+{
+  "extra_include":[
+    ""
+  ],
+  "return_type":"nvmlReturn_t",
+  "calling_conv":"DECLDIR",
+  "not_found_error":"NVML_ERROR_FUNCTION_NOT_FOUND ",
+  "functions": {
+    "nvmlInit": {},
+    "nvmlInit_v2": {},
+    "nvmlShutdown": {},
+    "nvmlSystemGetDriverVersion": {},
+    "nvmlDeviceGetHandleByPciBusId": {},
+    "nvmlDeviceGetHandleByIndex": {},
+    "nvmlDeviceGetIndex": {},
+    "nvmlDeviceSetCpuAffinity": {},
"nvmlDeviceClearCpuAffinity": {}, + "nvmlDeviceGetCpuAffinity": {}, + "nvmlErrorString": { + "return_type":"char*", + "not_found_error":"const_cast(\"\")" + }, + "nvmlDeviceGetCpuAffinityWithinScope": {}, + "nvmlDeviceGetBrand": {}, + "nvmlDeviceGetCount_v2": {}, + "nvmlDeviceGetHandleByIndex_v2": {}, + "nvmlDeviceGetCudaComputeCapability": {} + } +} diff --git a/tools/stub_generator/stub_codegen.py b/tools/stub_generator/stub_codegen.py index d7c71a52d01..6dacb5a1455 100644 --- a/tools/stub_generator/stub_codegen.py +++ b/tools/stub_generator/stub_codegen.py @@ -52,7 +52,7 @@ def main(): reinterpret_cast(load_symbol_func("{1}")) : {1}NotFound; return func_ptr({3}); -}}\n""" % ('CUDAAPI', 'CUDAAPI') +}}\n""" % (config['calling_conv'], config['calling_conv']) prolog = """ typedef void *tLoadSymbol(const char *name);