Skip to content

Commit

Permalink
Enable CUDA 11.8 and Hopper support (NVIDIA#4308)
Browse files Browse the repository at this point in the history
- adds CUDA 11.8 based build
- extend the image decoder to support H100 Hopper

Signed-off-by: Janusz Lisiecki <[email protected]>
  • Loading branch information
JanuszL authored Oct 5, 2022
1 parent 06e18be commit cd16f63
Show file tree
Hide file tree
Showing 9 changed files with 69 additions and 18 deletions.
4 changes: 2 additions & 2 deletions cmake/CUDA_utils.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -46,12 +46,12 @@ elseif (${ARCH} MATCHES "aarch64")
# from the whole list: "70" "75" "80" "86"
# we pick only major arch as minor should be compatible without JITing, it should
# shrink the output binary
set(CUDA_known_archs "70" "80")
set(CUDA_known_archs "70" "80" "90")
else()
# from the whole list: "35" "50" "52" "60" "61" "70" "75" "80" "86"
# we pick only major arch as minor should be compatible without JITing, it should
# shrink the output binary
set(CUDA_known_archs "35" "50" "60" "70" "80")
set(CUDA_known_archs "35" "50" "60" "70" "80" "90")
endif()

set(CUDA_TARGET_ARCHS ${CUDA_known_archs} CACHE STRING "List of target CUDA architectures")
Expand Down
6 changes: 3 additions & 3 deletions dali/operators/decoder/image_decoder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ the largest allocation value that is printed in the statistics.)code",
.AddOptionalArg("hw_decoder_load",
R"code(The percentage of the image data to be processed by the HW JPEG decoder.
Applies **only** to the ``mixed`` backend type in NVIDIA Ampere GPU architecture.
Applies **only** to the ``mixed`` backend type in NVIDIA Ampere GPU and newer architecture.
Determines the percentage of the workload that will be offloaded to the hardware decoder,
if available. The optimal workload depends on the number of threads that are provided to
Expand All @@ -105,14 +105,14 @@ the DALI pipeline and should be found empirically. More details can be found at
.AddOptionalArg("preallocate_width_hint",
R"code(Image width hint.
Applies **only** to the ``mixed`` backend type in NVIDIA Ampere GPU architecture.
Applies **only** to the ``mixed`` backend type in NVIDIA Ampere GPU and newer architecture.
The hint is used to preallocate memory for the HW JPEG decoder.)code",
0)
.AddOptionalArg("preallocate_height_hint",
R"code(Image height hint.
Applies **only** to the ``mixed`` backend type in NVIDIA Ampere GPU architecture.
Applies **only** to the ``mixed`` backend type in NVIDIA Ampere GPU and newer architecture.
The hint is used to preallocate memory for the HW JPEG decoder.)code",
0)
Expand Down
18 changes: 15 additions & 3 deletions dali/operators/decoder/nvjpeg/nvjpeg_decoder_decoupled_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,9 @@ class nvJPEGDecoder : public Operator<MixedBackend>, CachedDecoderImpl {
spec.GetArgument<int>("device_id"),
spec.GetArgument<bool>("affine"),
"image decoder nvJPEG2k") {
// TODO(ktokarski) TODO(jlisiecki) For now it is unused,
// adjust NVJPEG to (full capacity of) H100
(void) num_hw_engines_;
#if IS_HW_DECODER_COMPATIBLE
// if hw_decoder_load is not present in the schema (crop/sliceDecoder) then it is not supported
bool try_init_hw_decoder = false;
Expand Down Expand Up @@ -115,6 +118,14 @@ class nvJPEGDecoder : public Operator<MixedBackend>, CachedDecoderImpl {
#endif
LOG_LINE << "Using NVJPEG_BACKEND_HARDWARE" << std::endl;
CUDA_CALL(nvjpegJpegStateCreate(handle_, &state_hw_batched_));
if (nvjpegIsSymbolAvailable("nvjpegGetHardwareDecoderInfo")) {
nvjpegGetHardwareDecoderInfo(handle_, &num_hw_engines_, &num_hw_cores_per_engine_);
// ToDo adjust hw_decoder_load_ based on num_hw_engines_ and num_hw_cores_per_engine_
} else {
// assume pre H100 so the defaults are as follows
num_hw_engines_ = 1;
num_hw_cores_per_engine_ = 5;
}
if (!RestrictPinnedMemUsage()) {
hw_decoder_images_staging_.set_pinned(true);
// assume close to the worst case size: 300kb per image
Expand Down Expand Up @@ -1135,10 +1146,9 @@ class nvJPEGDecoder : public Operator<MixedBackend>, CachedDecoderImpl {
if (hw_decoder_load == 0.f) return 0;
auto hw_batch_size = static_cast<int>(std::round(hw_decoder_load * curr_batch_size));

constexpr int kNumHwDecoders = 5;
int tail = hw_batch_size % kNumHwDecoders;
int tail = hw_batch_size % num_hw_cores_per_engine_;
if (tail > 0) {
hw_batch_size = hw_batch_size + kNumHwDecoders - tail;
hw_batch_size = hw_batch_size + num_hw_cores_per_engine_ - tail;
}
if (hw_batch_size > curr_batch_size) {
hw_batch_size = curr_batch_size;
Expand Down Expand Up @@ -1166,6 +1176,8 @@ class nvJPEGDecoder : public Operator<MixedBackend>, CachedDecoderImpl {

// Used to ensure the work in the thread pool is picked FIFO
int64_t task_priority_seq_ = 0;
unsigned int num_hw_engines_ = 1;
unsigned int num_hw_cores_per_engine_ = 1;
};

} // namespace dali
Expand Down
2 changes: 1 addition & 1 deletion dali/operators/sequence/optical_flow/optical_flow.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ The main input for this operator is a sequence of frames. Optionally, the operat
can be provided with external hints for the optical flow calculation. The output format of this operator
matches the output format of the optical flow driver API.
Refer to https://developer.nvidia.com/opticalflow-sdk for more information about the
Turing and Ampere optical flow hardware that is used by DALI.
Turing, Ampere and Hopper optical flow hardware that is used by DALI.
)code")
.NumInput(1, 2)
.NumOutput(1)
Expand Down
12 changes: 12 additions & 0 deletions docker/Dockerfile.cuda118.aarch64.deps
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Build stage "cuda": installs the CUDA 11.8.0 toolkit from the linux_sbsa runfile
# installer on top of TOOLKIT_BASE_IMAGE (ubuntu:20.04 unless overridden at build time).
ARG TOOLKIT_BASE_IMAGE=ubuntu:20.04
FROM ${TOOLKIT_BASE_IMAGE} as cuda

# Keep apt/dpkg from prompting for input during the image build.
ENV DEBIAN_FRONTEND=noninteractive

# Install the packages used by the steps below (curl downloads the installer) and drop
# the apt package lists in the same layer.
# NOTE(review): libxml2, perl and gcc are presumably required by the CUDA runfile
# installer - confirm before trimming this list.
RUN apt update && apt install -y libxml2 curl perl gcc && \
rm -rf /var/lib/apt/lists/*

# Download the CUDA 11.8.0 runfile installer, run it non-interactively with the
# --toolkit and --no-opengl-libs options, then delete the installer in the same layer
# so it does not remain in the image.
RUN curl -LO https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux_sbsa.run && \
chmod +x cuda_*.run && \
./cuda_*.run --silent --no-opengl-libs --toolkit && \
rm -f cuda_*.run;
26 changes: 26 additions & 0 deletions docker/Dockerfile.cuda118.x86_64.deps
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Build stage "cuda": installs the CUDA 11.8.0 toolkit from the x86_64 linux runfile
# installer, plus nvJPEG2000 and cuFile packages, on top of TOOLKIT_BASE_IMAGE
# (ubuntu:20.04 unless overridden at build time).
ARG TOOLKIT_BASE_IMAGE=ubuntu:20.04
FROM ${TOOLKIT_BASE_IMAGE} as cuda

# Keep apt/dpkg from prompting for input during the image build.
ENV DEBIAN_FRONTEND=noninteractive

# Install the packages used by the steps below (curl downloads the installer) and drop
# the apt package lists in the same layer.
# NOTE(review): libxml2, perl and gcc are presumably required by the CUDA runfile
# installer - confirm before trimming this list.
RUN apt update && apt install -y libxml2 curl perl gcc && \
rm -rf /var/lib/apt/lists/*

# Download the CUDA 11.8.0 runfile installer, run it non-interactively with the
# --toolkit and --no-opengl-libs options, then delete the installer in the same layer
# so it does not remain in the image.
RUN curl -LO https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run && \
chmod +x cuda_*.run && \
./cuda_*.run --silent --no-opengl-libs --toolkit && \
rm -f cuda_*.run;

# Install pinned versions of nvJPEG2000 (runtime + dev) and the cuFile dev package from
# the NVIDIA CUDA apt repository for ubuntu2004/x86_64:
#   1. fetch the two repository signing keys (3bf863cc, 7fa2af80) and register the repo,
#   2. install the packages at the versions set in NVJPEG2K_VERSION / CUFILE_VERSION,
#   3. copy the nvjpeg2k headers and libraries into /usr/local/cuda
#      (presumably so the build finds them next to the rest of the toolkit - confirm),
#   4. drop the apt package lists in the same layer.
# NOTE(review): apt-key is deprecated on newer Debian/Ubuntu releases; revisit this step
# if the base image is moved past ubuntu:20.04.
RUN NVJPEG2K_VERSION=0.5.0.25-1 && \
CUFILE_VERSION=1.4.0.31-1 && \
apt-get update && \
apt-get install wget software-properties-common -y && \
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub && \
add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /" && \
apt-get update && \
apt-get install libnvjpeg2k0=${NVJPEG2K_VERSION} libnvjpeg2k-dev=${NVJPEG2K_VERSION} -y && \
apt-get install libcufile-dev-11-8=${CUFILE_VERSION} -y && \
cp /usr/include/nvjpeg2k* /usr/local/cuda/include/ && \
cp /usr/lib/x86_64-linux-gnu/libnvjpeg2k* /usr/local/cuda/lib64/ && \
rm -rf /var/lib/apt/lists/*
8 changes: 4 additions & 4 deletions docker/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ a build environment
To change build configuration please export appropriate env variables (for exact meaning please check the README):
PYVER=[default 3.6, required only by Run image]
CUDA_VERSION=[default 11.7, accepts also 10.2, 11.0 and 11.1, 11.2, 11.3, 11.4, 11.5, 11.6]
CUDA_VERSION=[default 11.8, accepts also 10.2, 11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6 and 11.7]
NVIDIA_BUILD_ID=[default 12345]
CREATE_WHL=[default YES]
CREATE_RUNNER=[default NO]
Expand Down Expand Up @@ -40,16 +40,16 @@ shift $((OPTIND - 1))
export ARCH=${ARCH:-x86_64}
export PYVER=${PYVER:-3.6}
export PYV=${PYVER/./}
export CUDA_VERSION=${CUDA_VERSION:-11.7}
export CUDA_VERSION=${CUDA_VERSION:-11.8}
export CUDA_VER=${CUDA_VERSION//./}

if [ "${CUDA_VERSION%%\.*}" ]
then
if [ $CUDA_VER != "100" ] && [ $CUDA_VER != "102" ] && [ $CUDA_VER != "110" ] && [ $CUDA_VER != "111" ] && \
[ $CUDA_VER != "112" ] && [ $CUDA_VER != "113" ] && [ $CUDA_VER != "114" ] && [ $CUDA_VER != "115" ] && \
[ $CUDA_VER != "116" ]
[ $CUDA_VER != "116" ] && [ $CUDA_VER != "117" ] && [ $CUDA_VER != "118" ]
then
echo "Wrong CUDA_VERSION=$CUDA_VERSION provided. Only 10.0, 10.2, 11.0, 11.1, 11.2, 11.3, 11.4, 11.5 and 11.6 are supported"
echo "Wrong CUDA_VERSION=$CUDA_VERSION provided. Only 10.0, 10.2, 11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7 and 11.8 are supported"
exit 1
fi
else
Expand Down
8 changes: 4 additions & 4 deletions docs/compilation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,10 @@ Building Python Wheel
Change directory (``cd``) into ``docker`` directory and run ``./build.sh``. If needed,
set the following environment variables:

* | CUDA_VERSION - CUDA toolkit version (10.2 and 11.7 are officially supported, 10.0, 11.0, 11.1,
11.2, 11.4, 11.5 and 11.6 are deprecated and may not work).
| The default is ``11.7``. Thanks to CUDA extended compatibility mode, CUDA 11.1, 11.2, 11.3, 11.4
11.5, 11.6 and 11.7 wheels are named as CUDA 11.0 because it can work with the CUDA 11.0 R450.x driver
* | CUDA_VERSION - CUDA toolkit version (10.2 and 11.8 are officially supported, 10.0, 11.0, 11.1,
11.2, 11.4, 11.5, 11.6 and 11.7 are deprecated and may not work).
| The default is ``11.8``. Thanks to CUDA extended compatibility mode, CUDA 11.1, 11.2, 11.3, 11.4,
11.5, 11.6, 11.7 and 11.8 wheels are named as CUDA 11.0 because it can work with the CUDA 11.0 R450.x driver
family. Please update to the latest recommended driver version in that family.
| If the value of the CUDA_VERSION is prefixed with `.` then any value ``.XX.Y`` can be passed,
the supported version check is suppressed, and the user needs to make sure that
Expand Down
3 changes: 2 additions & 1 deletion tools/stub_generator/nvjpeg.json
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
"nvjpegDecodeJpegHost": {},
"nvjpegDecodeJpegDevice": {},
"nvjpegBufferDeviceCreate": {},
"nvjpegGetProperty": {}
"nvjpegGetProperty": {},
"nvjpegGetHardwareDecoderInfo": {}
}
}

0 comments on commit cd16f63

Please sign in to comment.