Skip to content

Commit

Permalink
[ROCm]: Updates for container and build script
Browse files Browse the repository at this point in the history
	-Updated dockerfile.ms
	-Updated build script to switch building against XLA repo
  	-Update CI script
	-Update jaxlib setup.py to add rocm version
  • Loading branch information
Rahul Batra committed Jun 19, 2023
1 parent 94674b9 commit b0e541a
Show file tree
Hide file tree
Showing 5 changed files with 139 additions and 133 deletions.
66 changes: 46 additions & 20 deletions build/rocm/Dockerfile.ms
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,26 @@
FROM rocm/dev-ubuntu-20.04:5.4-complete as rt_build
MAINTAINER Rahul Batra<[email protected]>
################################################################################
ARG ROCM_PATH=/opt/rocm-5.4.0
ARG ROCM_DEB_REPO=http://repo.radeon.com/rocm/apt/5.5/
ARG ROCM_BUILD_NAME=ubuntu
ARG ROCM_BUILD_NUM=main
ARG ROCM_PATH=/opt/rocm-5.5.0

ARG DEBIAN_FRONTEND=noninteractive
ARG PYTHON_VERSION=3.9.0
ENV HOME /root/
ENV ROCM_PATH=$ROCM_PATH

RUN apt-get --allow-unauthenticated update && apt install -y wget software-properties-common
RUN apt-get clean all
RUN wget -qO - https://repo.radeon.com/rocm/rocm.gpg.key | apt-key add -;
RUN bin/bash -c 'if [[ $ROCM_DEB_REPO == http://repo.radeon.com/rocm/* ]] ; then \
echo "deb [arch=amd64] $ROCM_DEB_REPO $ROCM_BUILD_NAME $ROCM_BUILD_NUM" > /etc/apt/sources.list.d/rocm.list; \
else \
echo "deb [arch=amd64 trusted=yes] $ROCM_DEB_REPO $ROCM_BUILD_NAME $ROCM_BUILD_NUM" > /etc/apt/sources.list.d/rocm.list ; \
fi'


RUN apt-get update --allow-insecure-repositories && DEBIAN_FRONTEND=noninteractive apt-get install -y \
build-essential \
software-properties-common \
Expand All @@ -21,10 +35,34 @@ RUN apt-get update --allow-insecure-repositories && DEBIAN_FRONTEND=noninteracti
virtualenv \
python3-pip \
pciutils \
python-is-python3 \
libffi-dev \
libssl-dev \
build-essential \
zlib1g-dev \
libbz2-dev \
libreadline-dev \
libsqlite3-dev curl \
libncursesw5-dev \
xz-utils \
tk-dev \
libxml2-dev \
libxmlsec1-dev \
libffi-dev \
liblzma-dev \
wget && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# Add to get ppa
RUN apt-get update
RUN apt-get install -y software-properties-common
# Install rocm pkgs
RUN apt-get update --allow-insecure-repositories && \
DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
rocm-dev rocm-libs rccl && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# Set up paths
ENV HCC_HOME=$ROCM_PATH/hcc
Expand Down Expand Up @@ -55,26 +93,14 @@ ENV PATH="/root/bin:/root/.local/bin:$PATH"
# Install python3.9
RUN add-apt-repository ppa:deadsnakes/ppa && \
apt update && \
apt install -y python3.9-dev \
python3-pip \
python3.9-distutils \
python-is-python3
# Install pyenv with different python versions
RUN git clone https://github.com/pyenv/pyenv.git /pyenv
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.9 1
ENV PYENV_ROOT /pyenv
ENV PATH $PYENV_ROOT/shims:$PYENV_ROOT/bin:$PATH
RUN pip3 install --upgrade --force-reinstall setuptools pip
RUN pyenv install $PYTHON_VERSION
RUN pip3 install absl-py numpy==1.20.0 scipy wheel six setuptools pytest pytest-rerunfailures matplotlib
RUN eval "$(pyenv init -)" && pyenv local ${PYTHON_VERSION} && pip3 install --upgrade --force-reinstall setuptools pip==22.0 && pip install numpy==1.21.0 setuptools wheel six auditwheel scipy pytest pytest-rerunfailures matplotlib absl-py
# Get jax and build it with ROCm
RUN git clone https://github.com/google/jax.git
################################################################################
FROM rt_build as ci_build
################################################################################
WORKDIR /jax
RUN ./build/rocm/build_rocm.sh
RUN ./build/rocm/run_single_gpu.py
RUN ./build/rocm/run_multi_gpu.sh
90 changes: 0 additions & 90 deletions build/rocm/Dockerfile.rocm

This file was deleted.

58 changes: 45 additions & 13 deletions build/rocm/build_rocm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,23 +13,55 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# Environment Var Notes
# XLA_CLONE_DIR -
# Specifies filepath to where XLA repo is cloned.
# NOTE:, if this is set then XLA repo is not cloned. Must clone repo before running this script.
# Also, if this is set then setting XLA_REPO and XLA_BRANCH have no effect.
# XLA_REPO
# XLA repo to clone from. Default is https://github.com/ROCmSoftwarePlatform/tensorflow-upstream
# XLA_BRANCH
# XLA branch in the XLA repo. Default is develop-upstream-jax
#

set -eux
python -V

#If XLA_REPO is not set, then use default
if [ ! -v XLA_REPO ]; then
XLA_REPO="https://github.com/ROCmSoftwarePlatform/tensorflow-upstream"
XLA_BRANCH="develop-upstream-jax"
elif [ -z "$XLA_REPO" ]; then
XLA_REPO="https://github.com/ROCmSoftwarePlatform/tensorflow-upstream"
XLA_BRANCH="develop-upstream-jax"
fi

ROCM_TF_FORK_REPO="https://github.com/ROCmSoftwarePlatform/tensorflow-upstream"
ROCM_TF_FORK_BRANCH="develop-upstream"
rm -rf /tmp/tensorflow-upstream || true
git clone -b ${ROCM_TF_FORK_BRANCH} ${ROCM_TF_FORK_REPO} /tmp/tensorflow-upstream
if [ ! -v TENSORFLOW_ROCM_COMMIT ]; then
echo "The TENSORFLOW_ROCM_COMMIT environment variable is not set, using top of branch"
elif [ ! -z "$TENSORFLOW_ROCM_COMMIT" ]
then
echo "Using tensorflow-rocm at commit: $TENSORFLOW_ROCM_COMMIT"
cd /tmp/tensorflow-upstream
git checkout $TENSORFLOW_ROCM_COMMIT
cd -
#If XLA_CLONE_PATH is not set, then use default path.
#Note, setting XLA_CLONE_PATH makes setting XLA_REPO and XLA_BRANCH a no-op
#Set this when XLA repository has been already clone. This is useful in CI
#environments and when doing local development
if [ ! -v XLA_CLONE_DIR ]; then
XLA_CLONE_DIR=/tmp/tensorflow-upstream
rm -rf /tmp/tensorflow-upstream || true
git clone -b ${XLA_BRANCH} ${XLA_REPO} /tmp/tensorflow-upstream
elif [ -z "$XLA_CLONE_DIR" ]; then
XLA_CLONE_DIR=/tmp/tensorflow-upstream
rm -rf /tmp/tensorflow-upstream || true
git clone -b ${XLA_BRANCH} ${XLA_REPO} /tmp/tensorflow-upstream
fi


python3 ./build/build.py --enable_rocm --rocm_path=${ROCM_PATH} --bazel_options=--override_repository=org_tensorflow=/tmp/tensorflow-upstream
#Export JAX_ROCM_VERSION so that it is appened in the wheel name
rocm_version=$(cat /opt/rocm/.info/version | cut -d "-" -f 1)
export JAX_ROCM_VERSION=${rocm_version//./}

#Build and install wheel
python3 ./build/build.py --enable_rocm --rocm_path=${ROCM_PATH} --bazel_options=--override_repository=xla=${XLA_CLONE_DIR}
pip3 install --force-reinstall dist/*.whl # installs jaxlib (includes XLA)
pip3 install --force-reinstall . # installs jax

#This is for CI to read without having to start the container again
if [ -v CI_RUN ]; then
pip3 list | grep jaxlib | tr -s ' ' | cut -d " " -f 2 | cut -d "+" -f 1 > jax_version_installed
cat /opt/rocm/.info/version | cut -d "-" -f 1 > jax_rocm_version
fi
54 changes: 44 additions & 10 deletions build/rocm/ci_build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,26 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# Usage: ci_build.sh [--dockerfile <DOCKERFILE_PATH> --keep_image]
# Usage: ci_build.sh [--dockerfile <DOCKERFILE_PATH> --keep_image --py_version <PYTHON_VERSION>]
# <COMMAND>
#
# DOCKERFILE_PATH: (Optional) Path to the Dockerfile used for docker build.
# DOCKERFILE_PATH: (Optional) Path to the Dockerfile used for docer build.
# If this optional value is not supplied (via the --dockerfile flag)
# Dockerfile.rocm (located in the same directory as this script)
# Dockerfile.ms (located in the same directory as this script)
# will be used.
# KEEP_IMAGE: (Optional) If this flag is set, the container will be committed as an image
#
# PYTHON_VERSION: Python version to use
#
# COMMAND: Command to be executed in the docker container
#
# Environment variables read by this script
# WORKSPACE
# XLA_REPO
# XLA_BRANCH
# XLA_CLONE_DIR
# BUILD_TAG
#

set -eux

Expand All @@ -33,12 +43,17 @@ CONTAINER_TYPE="rocm"
DOCKERFILE_PATH="${SCRIPT_DIR}/Dockerfile.ms"
DOCKER_CONTEXT_PATH="${SCRIPT_DIR}"
KEEP_IMAGE="--rm"
KEEP_CONTAINER="--rm"
POSITIONAL_ARGS=()

RUNTIME_FLAG=0
RUNTIME_FLAG=1

while [[ $# -gt 0 ]]; do
case $1 in
--py_version)
PYTHON_VERSION="$2"
shift 2
;;
--dockerfile)
DOCKERFILE_PATH="$2"
DOCKER_CONTEXT_PATH=$(dirname "${DOCKERFILE_PATH}")
Expand All @@ -52,6 +67,11 @@ while [[ $# -gt 0 ]]; do
RUNTIME_FLAG=1
shift 1
;;
--keep_container)
KEEP_CONTAINER=""
shift 1
;;

*)
POSITIONAL_ARGS+=("$1")
shift
Expand All @@ -78,7 +98,7 @@ WORKSPACE="${WORKSPACE:-$(upsearch WORKSPACE)}"
BUILD_TAG="${BUILD_TAG:-jax}"

# Determine the docker image name and BUILD_TAG.
DOCKER_IMG_NAME="${BUILD_TAG}_${CONTAINER_TYPE}"
DOCKER_IMG_NAME="${BUILD_TAG}.${CONTAINER_TYPE}"

# Under Jenkins matrix build, the build tag may contain characters such as
# commas (,) and equal signs (=), which are not valid inside docker image names.
Expand All @@ -94,13 +114,17 @@ echo "BUILD_TAG: ${BUILD_TAG}"
echo " (docker container name will be ${DOCKER_IMG_NAME})"
echo ""

echo "Building container (${DOCKER_IMG_NAME})..."
echo "Python Version (${PYTHON_VERSION})"
if [[ "${RUNTIME_FLAG}" -eq 1 ]]; then
echo "Building (runtime) container (${DOCKER_IMG_NAME}) with Dockerfile($DOCKERFILE_PATH)..."
docker build --target rt_build --tag ${DOCKER_IMG_NAME} \
--build-arg PYTHON_VERSION=$PYTHON_VERSION \
-f "${DOCKERFILE_PATH}" "${DOCKER_CONTEXT_PATH}"
else
echo "Building (CI) container (${DOCKER_IMG_NAME}) with Dockerfile($DOCKERFILE_PATH)..."
docker build --target ci_build --tag ${DOCKER_IMG_NAME} \
--build-arg PYTHON_VERSION=$PYTHON_VERSION \
-f "${DOCKERFILE_PATH}" "${DOCKER_CONTEXT_PATH}"
fi

Expand All @@ -112,22 +136,32 @@ fi
# Run the command inside the container.
echo "Running '${POSITIONAL_ARGS[*]}' inside ${DOCKER_IMG_NAME}..."

export TENSORFLOW_ROCM_COMMIT="${TENSORFLOW_ROCM_COMMIT:-}"
export XLA_REPO="${XLA_REPO:-}"
export XLA_BRANCH="${XLA_BRANCH:-}"
export XLA_CLONE_DIR="${XLA_CLONE_DIR:-}"
export JAX_RENAME_WHL="${XLA_CLONE_DIR:-}"

if [ ! -z ${XLA_CLONE_DIR} ]; then
ROCM_EXTRA_PARAMS=${ROCM_EXTRA_PARAMS}" -v ${XLA_CLONE_DIR}:${XLA_CLONE_DIR}"
fi

docker run ${KEEP_IMAGE} --name ${DOCKER_IMG_NAME} --pid=host \
-v ${WORKSPACE}:/workspace \
-w /workspace \
-e TENSORFLOW_ROCM_COMMIT=${TENSORFLOW_ROCM_COMMIT} \
-e XLA_REPO=${XLA_REPO} \
-e XLA_BRANCH=${XLA_BRANCH} \
-e XLA_CLONE_DIR=${XLA_CLONE_DIR} \
-e PYTHON_VERSION=$PYTHON_VERSION \
-e CI_RUN=1 \
${ROCM_EXTRA_PARAMS} \
"${DOCKER_IMG_NAME}" \
${POSITIONAL_ARGS[@]}

if [[ "${KEEP_IMAGE}" != "--rm" ]] && [[ $? == "0" ]]; then
echo "Committing the docker container as jax-rocm"
echo "Committing the docker container as ${DOCKER_IMG_NAME}"
docker stop ${DOCKER_IMG_NAME}
docker commit ${DOCKER_IMG_NAME} jax-rocm
docker commit ${DOCKER_IMG_NAME} ${DOCKER_IMG_NAME}
docker rm ${DOCKER_IMG_NAME} # remove this temp container
docker rmi ${DOCKER_IMG_NAME} # remote this temp image
fi

echo "Jax-ROCm build was successful!"
4 changes: 4 additions & 0 deletions jaxlib/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,10 @@
if cuda_version and cudnn_version:
__version__ += f"+cuda{cuda_version.replace('.', '')}-cudnn{cudnn_version.replace('.', '')}"

rocm_version = os.environ.get("JAX_ROCM_VERSION")
if rocm_version:
__version__ += f"+rocm{rocm_version.replace('.', '')}"

class BinaryDistribution(Distribution):
"""This class makes 'bdist_wheel' include an ABI tag on the wheel."""

Expand Down

0 comments on commit b0e541a

Please sign in to comment.