Skip to content

Commit

Permalink
Merge pull request #4 from NVIDIA/dev21.07
Browse files Browse the repository at this point in the history
Dev21.07
  • Loading branch information
beberg authored Jul 20, 2021
2 parents a3363ad + c5a4897 commit 397688c
Show file tree
Hide file tree
Showing 4 changed files with 65 additions and 55 deletions.
9 changes: 5 additions & 4 deletions dev-env/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,10 +61,11 @@ docker pull nvcr.io/nvidia/magnum-io/magnum-io:TAG
4. Run

```bash
docker run --gpus all --rm -it
--user "$(id -u):$(id -g)"
--volume $HOME:$HOME
--workdir $HOME
docker run --gpus all --rm -it \
--user "$(id -u):$(id -g)" \
--volume $HOME:$HOME \
--volume /run/udev:/run/udev:ro \
--workdir $HOME \
magnum-io:TAG
```

Expand Down
33 changes: 18 additions & 15 deletions dev-env/installer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.

# Global Parameters
MAGNUM_IO_VERSION=21.04-dev
MIN_DRIVER=460.32.03
MIN_CUDA=11.0.228
MIN_DOCKER=19.03.4
MAGNUM_IO_VERSION=21.07
MIN_DRIVER=470.42
MIN_CUDA=11.4.0
MIN_DOCKER=20.10.3

SCRIPT_NAME=$(basename $0)
RUNFROM=$(dirname $(readlink -f $0))
Expand Down Expand Up @@ -105,7 +105,7 @@ EOF
sudo add-apt-repository -y ppa:graphics-drivers/ppa
sudo apt-get -y update
sudo apt-get -y upgrade
sudo apt-get -y install nvidia-driver-455
sudo apt-get -y install nvidia-driver-470
sudo apt-get -y autoremove
REBOOT=1
else
Expand Down Expand Up @@ -191,10 +191,10 @@ install_cuda () {
sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub
sudo add-apt-repository "deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/ /"
sudo apt-get update
sudo apt-get -y install cuda-toolkit-11-0
sudo apt-get -y install cuda-toolkit-11-4

echo "export PATH=/usr/local/cuda/bin/:\$PATH # MAGNUMIO-DEV-ENV-ADDED" >> ${HOME}/.bashrc
echo "export LD_LIBRARY_PATH=/usr/local/cuda-11.0/lib64:/lib:\$LD_LIBRARY_PATH # MAGNUMIO-DEV-ENV-ADDED" >> ${HOME}/.bashrc
echo "export LD_LIBRARY_PATH=/usr/local/cuda-11.4/lib64:/lib:\$LD_LIBRARY_PATH # MAGNUMIO-DEV-ENV-ADDED" >> ${HOME}/.bashrc
source ${HOME}/.bashrc
else
curl https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin \
Expand All @@ -203,25 +203,25 @@ install_cuda () {
sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub
sudo add-apt-repository "deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /"
sudo apt-get update
sudo apt-get -y install cuda-toolkit-11-0
sudo apt-get -y install cuda-toolkit-11-4

echo "export PATH=/usr/local/cuda/bin/:\$PATH # MAGNUMIO-DEV-ENV-ADDED" >> ${HOME}/.bashrc
echo "export LD_LIBRARY_PATH=/usr/local/cuda-11.0/lib64:/lib:\$LD_LIBRARY_PATH # MAGNUMIO-DEV-ENV-ADDED" >> ${HOME}/.bashrc
echo "export LD_LIBRARY_PATH=/usr/local/cuda-11.4/lib64:/lib:\$LD_LIBRARY_PATH # MAGNUMIO-DEV-ENV-ADDED" >> ${HOME}/.bashrc
source ${HOME}/.bashrc
fi
else
if [ $OS_FLAVOR = "redhat7" ]; then
sudo yum-config-manager --add-repo http://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo
sudo yum clean all
sudo yum install -y cuda-toolkit-11-0
sudo yum install -y cuda-toolkit-11-4
else
sudo dnf config-manager --add-repo http://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo
sudo dnf clean all
sudo dnf -y install cuda-toolkit-11-0
sudo dnf -y install cuda-toolkit-11-4
fi

echo "export PATH=/usr/local/cuda/bin/:\$PATH # MAGNUMIO-DEV-ENV-ADDED" >> ${HOME}/.bashrc
echo "export LD_LIBRARY_PATH=/usr/local/cuda-11.0/lib64:/lib:\$LD_LIBRARY_PATH # MAGNUMIO-DEV-ENV-ADDED" >> ${HOME}/.bashrc
echo "export LD_LIBRARY_PATH=/usr/local/cuda-11.4/lib64:/lib:\$LD_LIBRARY_PATH # MAGNUMIO-DEV-ENV-ADDED" >> ${HOME}/.bashrc
source ${HOME}/.bashrc
fi
set +e
Expand Down Expand Up @@ -556,9 +556,10 @@ case "$1" in
;;

build-dockerfile)
nvlog "To rebuild Dockerfile, currently need top-of-tree version of HPC Container Maker (HPCCM)"
nvlog "Clonable from https://github.com/NVIDIA/hpc-container-maker"
~/hpc-container-maker/hpccm.sh --recipe magnum-io-hpccm.py --format docker > magnum-io.Dockerfile
nvlog "To rebuild Dockerfile, HPC Container Maker (HPCCM) is used"
nvlog "Installable with \"pip install hpccm\" or from https://github.com/NVIDIA/hpc-container-maker"
hpccm --recipe magnum-io-hpccm.py --format docker > magnum-io.Dockerfile
nvlog "Finished Dockerfile rebuild"
;;
build-container)
build_container
Expand All @@ -572,11 +573,13 @@ case "$1" in
docker run --gpus all --rm -it \
--user "$(id -u):$(id -g)" \
--volume $HOME:$HOME \
--volume /run/udev:/run/udev:ro \
--workdir $HOME \
magnum-io:'${MAGNUM_IO_VERSION}
docker run --gpus all --rm -it \
--user $(id -u):$(id -g) \
--volume $HOME:$HOME \
--volume /run/udev:/run/udev:ro \
--workdir $HOME \
magnum-io:${MAGNUM_IO_VERSION}
;;
Expand Down
22 changes: 9 additions & 13 deletions dev-env/magnum-io-hpccm.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,27 +2,23 @@

Stage0 += comment('GENERATED FILE, DO NOT EDIT')

Stage0 += baseimage(image='nvcr.io/nvidia/cuda:11.2.2-devel-ubuntu20.04')
Stage0 += baseimage(image='nvcr.io/nvidia/cuda:11.4.0-devel-ubuntu20.04')
# GDS 1.0 is part of the CUDA base image

Stage0 += nsight_systems(cli=True, version='2021.1.1')
Stage0 += mlnx_ofed(version='5.2-2.2.0.0', oslabel='ubuntu20.04')
Stage0 += nsight_systems(cli=True, version='2021.2.1')
Stage0 += mlnx_ofed(version='5.3-1.0.0.1')
Stage0 += gdrcopy(ldconfig=True, version='2.2')
Stage0 += ucx(version='1.10.0', cuda=True,
Stage0 += ucx(version='1.10.1', cuda=True,
gdrcopy='/usr/local/gdrcopy', ldconfig=True,
disable_static=True, enable_mt=True)
Stage0 += nvshmem(version='2.0.2-0')
Stage0 += nvshmem(version='2.2.1') # See hack in installer.sh for 2.2.1 artifact renaming
Stage0 += nccl(cuda='11.4', version='2.10.3-1')

Stage0 += comment('GDS 0.95')
Stage0 += apt_get(
keys=['https://repo.download.nvidia.com/baseos/GPG-KEY-dgx-cosmos-support'],
ospackages=['libcufile-11-2=0.95.0.94-1',
'libcufile-dev-11-2=0.95.0.94-1',
'gds-tools-11-2=0.95.0.94-1'],
repositories=['deb https://repo.download.nvidia.com/baseos/ubuntu/focal/x86_64/ focal-updates preview'])
Stage0 += apt_get(ospackages=['cuda-tools-11-4'])

Stage0 += copy(src=['magnum-io.Dockerfile', 'third_party.txt', 'README.md'], dest='/')

Stage0 += environment(variables={'MAGNUM_IO_VERSION': '21.04'})
Stage0 += environment(variables={'MAGNUM_IO_VERSION': '21.07'})

Stage0 += raw(docker='SHELL ["/bin/bash", "-c"]\n\
CMD ["/bin/bash" ]')
56 changes: 33 additions & 23 deletions dev-env/magnum-io.Dockerfile
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# GENERATED FILE, DO NOT EDIT

FROM nvcr.io/nvidia/cuda:11.2.2-devel-ubuntu20.04
FROM nvcr.io/nvidia/cuda:11.4.0-devel-ubuntu20.04

# NVIDIA Nsight Systems 2021.1.1
# NVIDIA Nsight Systems 2021.2.1
RUN apt-get update -y && \
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
apt-transport-https \
Expand All @@ -14,18 +14,18 @@ RUN wget -qO - https://developer.download.nvidia.com/devtools/repos/ubuntu2004/a
echo "deb https://developer.download.nvidia.com/devtools/repos/ubuntu2004/amd64/ /" >> /etc/apt/sources.list.d/hpccm.list && \
apt-get update -y && \
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
nsight-systems-cli-2021.1.1 && \
nsight-systems-cli-2021.2.1 && \
rm -rf /var/lib/apt/lists/*

# Mellanox OFED version 5.2-2.2.0.0
# Mellanox OFED version 5.3-1.0.0.1
RUN apt-get update -y && \
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
ca-certificates \
gnupg \
wget && \
rm -rf /var/lib/apt/lists/*
RUN wget -qO - https://www.mellanox.com/downloads/ofed/RPM-GPG-KEY-Mellanox | apt-key add - && \
mkdir -p /etc/apt/sources.list.d && wget -q -nc --no-check-certificate -P /etc/apt/sources.list.d https://linux.mellanox.com/public/repo/mlnx_ofed/5.2-2.2.0.0/ubuntu20.04/mellanox_mlnx_ofed.list && \
mkdir -p /etc/apt/sources.list.d && wget -q -nc --no-check-certificate -P /etc/apt/sources.list.d https://linux.mellanox.com/public/repo/mlnx_ofed/5.3-1.0.0.1/ubuntu20.04/mellanox_mlnx_ofed.list && \
apt-get update -y && \
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
ibverbs-providers \
Expand Down Expand Up @@ -56,7 +56,7 @@ RUN mkdir -p /var/tmp && wget -q -nc --no-check-certificate -P /var/tmp https://
ENV CPATH=/usr/local/gdrcopy/include:$CPATH \
LIBRARY_PATH=/usr/local/gdrcopy/lib:$LIBRARY_PATH

# UCX version 1.10.0
# UCX version 1.10.1
RUN apt-get update -y && \
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
binutils-dev \
Expand All @@ -65,49 +65,59 @@ RUN apt-get update -y && \
make \
wget && \
rm -rf /var/lib/apt/lists/*
RUN mkdir -p /var/tmp && wget -q -nc --no-check-certificate -P /var/tmp https://github.com/openucx/ucx/releases/download/v1.10.0/ucx-1.10.0.tar.gz && \
mkdir -p /var/tmp && tar -x -f /var/tmp/ucx-1.10.0.tar.gz -C /var/tmp -z && \
cd /var/tmp/ucx-1.10.0 && ./configure --prefix=/usr/local/ucx --disable-assertions --disable-debug --disable-doxygen-doc --disable-logging --disable-params-check --disable-static --enable-mt --enable-optimizations --with-cuda=/usr/local/cuda --with-gdrcopy=/usr/local/gdrcopy && \
RUN mkdir -p /var/tmp && wget -q -nc --no-check-certificate -P /var/tmp https://github.com/openucx/ucx/releases/download/v1.10.1/ucx-1.10.1.tar.gz && \
mkdir -p /var/tmp && tar -x -f /var/tmp/ucx-1.10.1.tar.gz -C /var/tmp -z && \
cd /var/tmp/ucx-1.10.1 && ./configure --prefix=/usr/local/ucx --disable-assertions --disable-debug --disable-doxygen-doc --disable-logging --disable-params-check --disable-static --enable-mt --enable-optimizations --with-cuda=/usr/local/cuda --with-gdrcopy=/usr/local/gdrcopy && \
make -j$(nproc) && \
make -j$(nproc) install && \
echo "/usr/local/ucx/lib" >> /etc/ld.so.conf.d/hpccm.conf && ldconfig && \
rm -rf /var/tmp/ucx-1.10.0 /var/tmp/ucx-1.10.0.tar.gz
rm -rf /var/tmp/ucx-1.10.1 /var/tmp/ucx-1.10.1.tar.gz
ENV CPATH=/usr/local/ucx/include:$CPATH \
LIBRARY_PATH=/usr/local/ucx/lib:$LIBRARY_PATH \
PATH=/usr/local/ucx/bin:$PATH

# NVSHMEM 2.0.2-0
# NVSHMEM 2.2.1
RUN apt-get update -y && \
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
make \
wget && \
rm -rf /var/lib/apt/lists/*
RUN mkdir -p /var/tmp && wget -q -nc --no-check-certificate -P /var/tmp https://developer.nvidia.com/nvshmem-src-202-0 && \
mkdir -p /var/tmp && tar -x -f /var/tmp/nvshmem-src-202-0 -C /var/tmp && \
cd /var/tmp/nvshmem_src_2.0.2-0 && \
RUN mkdir -p /var/tmp && wget -q -nc --no-check-certificate -P /var/tmp https://developer.download.nvidia.com/compute/redist/nvshmem/2.2.1/source/nvshmem_src_2.2.1-0.txz && \
mkdir -p /var/tmp && tar -x -f /var/tmp/nvshmem_src_2.2.1-0.txz -C /var/tmp -J && \
cd /var/tmp/nvshmem_src_2.2.1-0 && \
CUDA_HOME=/usr/local/cuda NVSHMEM_MPI_SUPPORT=0 NVSHMEM_PREFIX=/usr/local/nvshmem make -j$(nproc) install && \
rm -rf /var/tmp/nvshmem_src_2.0.2-0 /var/tmp/nvshmem-src-202-0
rm -rf /var/tmp/nvshmem_src_2.2.1-0 /var/tmp/nvshmem_src_2.2.1-0.txz
ENV CPATH=/usr/local/nvshmem/include:$CPATH \
LIBRARY_PATH=/usr/local/nvshmem/lib:$LIBRARY_PATH \
PATH=/usr/local/nvshmem/bin:$PATH

# GDS 0.95

RUN wget -qO - https://repo.download.nvidia.com/baseos/GPG-KEY-dgx-cosmos-support | apt-key add - && \
echo "deb https://repo.download.nvidia.com/baseos/ubuntu/focal/x86_64/ focal-updates preview" >> /etc/apt/sources.list.d/hpccm.list && \
# NCCL 2.10.3-1
RUN apt-get update -y && \
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
apt-transport-https \
ca-certificates \
gnupg \
wget && \
rm -rf /var/lib/apt/lists/*
RUN wget -qO - https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub | apt-key add - && \
echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 /" >> /etc/apt/sources.list.d/hpccm.list && \
apt-get update -y && \
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
gds-tools-11-2=0.95.0.94-1 \
libcufile-11-2=0.95.0.94-1 \
libcufile-dev-11-2=0.95.0.94-1 && \
libnccl-dev=2.10.3-1+cuda11.4 \
libnccl2=2.10.3-1+cuda11.4 && \
rm -rf /var/lib/apt/lists/*

RUN apt-get update -y && \
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
cuda-tools-11-4 && \
rm -rf /var/lib/apt/lists/*

COPY magnum-io.Dockerfile \
third_party.txt \
README.md \
/

ENV MAGNUM_IO_VERSION=21.04
ENV MAGNUM_IO_VERSION=21.07

SHELL ["/bin/bash", "-c"]
CMD ["/bin/bash" ]
Expand Down

0 comments on commit 397688c

Please sign in to comment.