From 64833b5185893cbc71ea80c9b01443f762b5cba4 Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Fri, 17 Mar 2023 12:21:44 -0700 Subject: [PATCH] Add support for building FBGEMM_GPU against Python 3.11 in OSS (#1646) Summary: - Parallelize the FBGEMM CI builds to build and test static and shared libraries independently instead of in serial - Move the FBGEMM CI builds to run inside Docker containers - Add support for building FBGEMM_GPU against Python 3.11 in OSS - Move all FBGEMM_GPU nightly and release build jobs to run inside `amazonlinux:2023` Docker container - Assuming no build errors or resource starvation, the full OSS build process now runs under 30 minutes. Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1646 Reviewed By: shintaro-iwasaki Differential Revision: D44157228 Pulled By: q10 fbshipit-source-id: 6403ea9955856157785c50837b0b8e4c0cd26d53 --- .github/scripts/setup_env.bash | 100 +++++-- .github/workflows/fbgemm_ci.yml | 244 ++++++++---------- .github/workflows/fbgemm_gpu_ci.yml | 32 +-- .github/workflows/fbgemm_nightly_build.yml | 27 +- .../workflows/fbgemm_nightly_build_cpu.yml | 24 +- .github/workflows/fbgemm_release_build.yml | 27 +- .../workflows/fbgemm_release_build_cpu.yml | 24 +- fbgemm_gpu/docs/BuildInstructions.md | 10 +- .../split_table_batched_embeddings_ops.py | 4 +- 9 files changed, 257 insertions(+), 235 deletions(-) diff --git a/.github/scripts/setup_env.bash b/.github/scripts/setup_env.bash index ccdac79097..a22a09b19e 100755 --- a/.github/scripts/setup_env.bash +++ b/.github/scripts/setup_env.bash @@ -264,22 +264,13 @@ print_gpu_info () { if which nvidia-smi; then # If nvidia-smi is installed on a machine without GPUs, this will return error (print_exec nvidia-smi) || true + else + echo "[CHECK] nvidia-smi not found" fi fi } -print_system_info () { - echo "################################################################################" - echo "# Print System Info" - echo "#" - echo "# [TIMESTAMP] $(date --utc 
+%FT%T.%3NZ)" - echo "################################################################################" - echo "" - - echo "################################################################################" - echo "[INFO] Printing environment variables ..." - print_exec printenv - +__print_system_info_linux () { echo "################################################################################" echo "[INFO] Check ldd version ..." print_exec ldd --version @@ -296,6 +287,36 @@ print_system_info () { print_exec cat /etc/os-release } +__print_system_info_macos () { + echo "################################################################################" + echo "[INFO] Check CPU info ..." + sysctl -a | grep machdep.cpu + + echo "################################################################################" + echo "[INFO] Check MacOS version info ..." + print_exec uname -a + print_exec sw_vers +} + +print_system_info () { + echo "################################################################################" + echo "# Print System Info" + echo "#" + echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "################################################################################" + echo "" + + echo "################################################################################" + echo "[INFO] Printing environment variables ..." 
+ print_exec printenv + + if [[ $OSTYPE == 'darwin'* ]]; then + __print_system_info_macos + else + __print_system_info_linux + fi +} + print_ec2_info () { echo "################################################################################" echo "# Print EC2 Instance Info" @@ -316,6 +337,30 @@ print_ec2_info () { echo "instance-type: $(get_ec2_metadata instance-type)" } +print_glibc_info () { + local library_path="$1" + if [ "$library_path" == "" ]; then + echo "Usage: ${FUNCNAME[0]} LIBRARY_PATH" + echo "Example(s):" + echo " ${FUNCNAME[0]} /usr/lib/x86_64-linux-gnu/libstdc++.so.6" + return 1 + fi + + if [ -f "${library_path}" ]; then + echo "[CHECK] Listing out the GLIBC versions referenced by: ${library_path}" + objdump -TC "${library_path}" | grep GLIBC_ | sed 's/.*GLIBC_\([.0-9]*\).*/GLIBC_\1/g' | sort -Vu | cat + echo "" + + echo "[CHECK] Listing out the GLIBCXX versions referenced by: ${library_path}" + objdump -TC "${library_path}" | grep GLIBCXX_ | sed 's/.*GLIBCXX_\([.0-9]*\).*/GLIBCXX_\1/g' | sort -Vu | cat + echo "" + + else + echo "[CHECK] No file at path: ${library_path}" + return 1 + fi +} + ################################################################################ # Miniconda Setup Functions @@ -342,7 +387,7 @@ setup_miniconda () { print_exec mkdir -p "$miniconda_prefix" echo "[SETUP] Downloading the Miniconda installer ..." - print_exec wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh + (exec_with_retries wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh) || return 1 echo "[SETUP] Installing Miniconda ..." print_exec bash miniconda.sh -b -p "$miniconda_prefix" -u @@ -360,9 +405,16 @@ setup_miniconda () { print_exec conda info # These variables will be exported outside + echo "[SETUP] Exporting Miniconda variables ..." 
export PATH="${miniconda_prefix}/bin:${PATH}" export CONDA="${miniconda_prefix}" + if [ -f "${GITHUB_PATH}" ]; then + echo "[SETUP] Saving Miniconda variables to ${GITHUB_PATH} ..." + echo "${miniconda_prefix}/bin" >> "${GITHUB_PATH}" + echo "CONDA=${miniconda_prefix}" >> "${GITHUB_PATH}" + fi + echo "[SETUP] Successfully set up Miniconda at ${miniconda_prefix}" } @@ -448,9 +500,11 @@ install_pytorch_conda () { fi # Install PyTorch packages + # NOTE: Installation of large package might fail due to corrupt package download + # Use --force-reinstall to address this on retries - https://datascience.stackexchange.com/questions/41732/conda-verification-failed echo "[INSTALL] Attempting to install '${pytorch_package}' (${pytorch_version}, CPU=${pytorch_cpu:-0}) through Conda using channel '${pytorch_channel}' ..." # shellcheck disable=SC2086 - (exec_with_retries conda install -n "${env_name}" -y ${pytorch_package} -c "${pytorch_channel}") || return 1 + (exec_with_retries conda install --force-reinstall -n "${env_name}" -y ${pytorch_package} -c "${pytorch_channel}") || return 1 # Run check for GPU variant if [ "$pytorch_cpu" == "" ]; then @@ -612,7 +666,7 @@ install_cuda () { # Install CUDA packages echo "[INSTALL] Installing CUDA ${cuda_version} ..." - (exec_with_retries conda install -n "${env_name}" -y cuda -c "nvidia/label/cuda-${cuda_version}") || return 1 + (exec_with_retries conda install --force-reinstall -n "${env_name}" -y cuda -c "nvidia/label/cuda-${cuda_version}") || return 1 # Ensure that nvcc is properly installed (test_binpath "${env_name}" nvcc) || return 1 @@ -806,15 +860,19 @@ install_cxx_compiler () { install_system_packages gcc gcc-c++ else - # Install gxx_linux-64 from main instead of cxx-compiler from conda-forge, as - # the latter breaks builds: + # Install gxx_linux-64 from conda-forge instead of from anaconda channel. 
+ # sysroot_linux-64 needs to be installed alongside this: + # # https://root-forum.cern.ch/t/error-timespec-get-has-not-been-declared-with-conda-root-package/45712/6 + # https://github.com/conda-forge/conda-forge.github.io/issues/1625 + # https://conda-forge.org/docs/maintainer/knowledge_base.html#using-centos-7 + # https://github.com/conda/conda-build/issues/4371 # - # NOTE: Install g++ 9.x instead of 11.x becaue 11.x builds libraries with - # references to GLIBCXX_3.4.29, which is not available on systems with older + # NOTE: We install g++ 10.x instead of 11.x because 11.x builds binaries that + # reference GLIBCXX_3.4.29, which may not be available on systems with older # versions of libstdc++.so.6 such as CentOS Stream 8 and Ubuntu 20.04 echo "[INSTALL] Installing C/C++ compilers through Conda ..." - (exec_with_retries conda install -n "${env_name}" -y gxx_linux-64=9.3.0) || return 1 + (exec_with_retries conda install -n "${env_name}" -y gxx_linux-64=10.4.0 sysroot_linux-64=2.17 -c conda-forge) || return 1 # The compilers are visible in the PATH as `x86_64-conda-linux-gnu-cc` and # `x86_64-conda-linux-gnu-c++`, so symlinks will need to be created @@ -1055,7 +1113,7 @@ check_fbgemm_gpu_build () { for library in "${fbgemm_gpu_so_files[@]}"; do echo "[CHECK] Listing out the GLIBCXX versions referenced by the library: ${library}" - objdump -TC "${library}" | grep GLIBCXX | sed 's/.*GLIBCXX_\([.0-9]*\).*/GLIBCXX_\1/g' | sort -Vu | cat + print_glibc_info "${library}" echo "[CHECK] Verifying sample subset of symbols in the library ..."
for symbol in "${lib_symbols_to_check[@]}"; do diff --git a/.github/workflows/fbgemm_ci.yml b/.github/workflows/fbgemm_ci.yml index 977b443a2b..9b18dfb884 100644 --- a/.github/workflows/fbgemm_ci.yml +++ b/.github/workflows/fbgemm_ci.yml @@ -19,185 +19,165 @@ concurrency: cancel-in-progress: true jobs: - build-posix: - runs-on: ${{ matrix.os }} + build-linux: + runs-on: linux.12xlarge + container: + image: ${{ matrix.container-image }} + options: --user root + defaults: + run: + shell: bash + env: + PRELUDE: .github/scripts/setup_env.bash + BUILD_DIR: build_${{ matrix.library-type }} + DEBIAN_FRONTEND: noninteractive strategy: + fail-fast: false matrix: - os: [ ubuntu-latest, macos-latest ] + container-image: [ "ubuntu:20.04" ] + library-type: [ static, shared ] steps: - - uses: actions/checkout@v3 - - name: Checkout submodules - shell: bash + - name: Setup Build Container run: | - auth_header="$(git config --local --get http.https://github.com/.extraheader)" - git submodule sync --recursive - git -c "http.extraheader=$auth_header" -c protocol.version=2 submodule update --init --force --recursive --depth=1 + apt update -y + apt install -y binutils build-essential cmake git libblas-dev python3 sudo wget + git config --global --add safe.directory '*' - - name: Get CPU info on Ubuntu - if: contains(runner.os, 'linux') - run: | - cat /proc/cpuinfo + - name: Checkout the Repository + uses: actions/checkout@v3 + with: + submodules: true - - name: Get CPU info on macOS - if: contains(runner.os, 'macOs') - run: | - sysctl -a | grep machdep.cpu + - name: Display System Info + run: . 
$PRELUDE; print_system_info - - name: Get env vars - run: | - echo GITHUB_WORKFLOW = $GITHUB_WORKFLOW - echo HOME = $HOME - echo GITHUB_ACTION = $GITHUB_ACTION - echo GITHUB_ACTIONS = $GITHUB_ACTIONS - echo GITHUB_REPOSITORY = $GITHUB_REPOSITORY - echo GITHUB_EVENT_NAME = $GITHUB_EVENT_NAME - echo GITHUB_EVENT_PATH = $GITHUB_EVENT_PATH - echo GITHUB_WORKSPACE = $GITHUB_WORKSPACE - echo GITHUB_SHA = $GITHUB_SHA - echo GITHUB_REF = $GITHUB_REF - c++ --verbose - - - name: Build static FBGEMM lib + - name: Build FBGEMM Library (${{ matrix.library-type }}) run: | set -e - mkdir build_static - cd build_static - cmake -DUSE_SANITIZER=address -DFBGEMM_LIBRARY_TYPE=static .. - make + mkdir $BUILD_DIR; cd $BUILD_DIR + cmake -DUSE_SANITIZER=address -DFBGEMM_LIBRARY_TYPE=${{ matrix.library-type }} -DPYTHON_EXECUTABLE=/usr/bin/python3 .. + make -j - - name: Test static FBGEMM lib - if: contains(runner.os, 'linux') # not run on macos-latest now due to supporting AVX2 + - name: Test FBGEMM Library (${{ matrix.library-type }}) run: | set -e - cd build_static + cd $BUILD_DIR ctest --rerun-failed --output-on-failure - - name: Build shared FBGEMM lib + + build-macos: + runs-on: ${{ matrix.os }} + defaults: + run: + shell: bash + env: + PRELUDE: .github/scripts/setup_env.bash + BUILD_DIR: build_${{ matrix.library-type }} + strategy: + fail-fast: false + matrix: + os: [ macos-latest ] + library-type: [ static, shared ] + + steps: + - name: Checkout the Repository + uses: actions/checkout@v3 + with: + submodules: true + + - name: Display System Info + run: . $PRELUDE; print_system_info + + # Build but skip tests due to lack of support for AVX2 + - name: Build FBGEMM Library (${{ matrix.library-type }}) run: | set -e - mkdir build_shared - cd build_shared - cmake -DUSE_SANITIZER=address -DFBGEMM_LIBRARY_TYPE=shared .. - make + mkdir $BUILD_DIR; cd $BUILD_DIR + cmake -DUSE_SANITIZER=address -DFBGEMM_LIBRARY_TYPE=${{ matrix.library-type }} .. 
+ make -j + + + build-bazel: + runs-on: ${{ matrix.os }} + defaults: + run: + shell: bash + env: + PRELUDE: .github/scripts/setup_env.bash + strategy: + fail-fast: false + matrix: + os: [ ubuntu-latest ] - - name: Test shared FBGEMM lib - if: contains(runner.os, 'linux') # not run on macos-latest now due to supporting AVX2 + steps: + - name: Checkout the Repository + uses: actions/checkout@v3 + with: + submodules: true + + - name: Display System Info + run: . $PRELUDE; print_system_info + + - name: Download bazel run: | set -e - cd build_shared - ctest --rerun-failed --output-on-failure + wget https://github.com/bazelbuild/bazel/releases/download/2.2.0/bazel-2.2.0-linux-x86_64 -O bazel + # verify content + echo 'b2f002ea0e6194a181af6ac84cd94bd8dc797722eb2354690bebac92dda233ff bazel' | sha256sum --quiet -c + chmod +x bazel + + - name: Build FBGEMM with bazel + run: ./bazel build --verbose_explanations --verbose_failures --compilation_mode opt :* + + - name: Test FBGEMM bazel build + run: ./bazel test --test_output=all --verbose_explanations --verbose_failures --compilation_mode opt :* + build-windows: runs-on: ${{ matrix.os }} + defaults: + run: + shell: cmd + env: + BUILD_DIR: build_${{ matrix.library-type }} strategy: + fail-fast: false matrix: - os: [windows-2019] + os: [ windows-2019 ] + library-type: [ static, shared ] steps: - - uses: actions/checkout@v3 - - name: Checkout submodules - shell: bash - run: | - auth_header="$(git config --local --get http.https://github.com/.extraheader)" - git submodule sync --recursive - git -c "http.extraheader=$auth_header" -c protocol.version=2 submodule update --init --force --recursive --depth=1 + - name: Checkout the Repository + uses: actions/checkout@v3 + with: + submodules: true - name: Get CPU info on Windows shell: cmd run: | wmic cpu list full - - name: Build static FBGEMM lib + - name: Build FBGEMM Library (${{ matrix.library-type }}) shell: cmd run: | call "C:\Program Files (x86)\Microsoft Visual 
Studio\2019\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64 echo "INSTALL NINJA:" pip install ninja which ninja - mkdir build_static - cd build_static + mkdir %BUILD_DIR% + cd %BUILD_DIR% echo "STARTING CMAKE" - cmake -G Ninja -DFBGEMM_BUILD_BENCHMARKS=OFF -DFBGEMM_LIBRARY_TYPE=static -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER="cl.exe" -DCMAKE_CXX_COMPILER="cl.exe" .. + cmake -G Ninja -DFBGEMM_BUILD_BENCHMARKS=OFF -DFBGEMM_LIBRARY_TYPE=${{ matrix.library-type }} -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER="cl.exe" -DCMAKE_CXX_COMPILER="cl.exe" .. ninja all echo "Build Success" - - name: Test static FBGEMM lib - shell: cmd - run: | - echo %cd% - cd build_static - ctest --rerun-failed --output-on-failure - if errorlevel 1 exit /b 1 - - - name: Build shared FBGEMM lib - shell: cmd - run: | - call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64 - echo "INSTALL NINJA:" - pip install ninja - which ninja - mkdir build_shared - cd build_shared - echo "STARTING CMAKE" - cmake -G Ninja -DFBGEMM_BUILD_BENCHMARKS=OFF -DFBGEMM_LIBRARY_TYPE=shared -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER="cl.exe" -DCMAKE_CXX_COMPILER="cl.exe" .. 
- ninja all - if errorlevel 1 exit /b 1 - - - name: Test shared FBGEMM lib + - name: Test FBGEMM Library (${{ matrix.library-type }}) shell: cmd run: | echo %cd% - cd build_shared + cd %BUILD_DIR% set PATH=%PATH%;%cd%;%cd%\asmjit echo %PATH% ctest --rerun-failed --output-on-failure if errorlevel 1 exit /b 1 - - build-bazel: - runs-on: ${{ matrix.os }} - strategy: - matrix: - os: [ ubuntu-latest ] - - steps: - - uses: actions/checkout@v3 - - name: Checkout submodules - shell: bash - run: | - auth_header="$(git config --local --get http.https://github.com/.extraheader)" - git submodule sync --recursive - git -c "http.extraheader=$auth_header" -c protocol.version=2 submodule update --init --force --recursive --depth=1 - - - name: Get env vars - run: | - echo GITHUB_WORKFLOW = $GITHUB_WORKFLOW - echo HOME = $HOME - echo GITHUB_ACTION = $GITHUB_ACTION - echo GITHUB_ACTIONS = $GITHUB_ACTIONS - echo GITHUB_REPOSITORY = $GITHUB_REPOSITORY - echo GITHUB_EVENT_NAME = $GITHUB_EVENT_NAME - echo GITHUB_EVENT_PATH = $GITHUB_EVENT_PATH - echo GITHUB_WORKSPACE = $GITHUB_WORKSPACE - echo GITHUB_SHA = $GITHUB_SHA - echo GITHUB_REF = $GITHUB_REF - c++ --verbose - - - name: Download bazel - run: | - set -e - wget https://github.com/bazelbuild/bazel/releases/download/2.2.0/bazel-2.2.0-linux-x86_64 -O bazel - # verify content - echo 'b2f002ea0e6194a181af6ac84cd94bd8dc797722eb2354690bebac92dda233ff bazel' | sha256sum --quiet -c - chmod +x bazel - - - - name: Build FBGEMM with bazel - run: | - set -e - ./bazel build --verbose_explanations --verbose_failures --compilation_mode opt :* - - - name: Test FBGEMM bazel build - run: | - set -e - ./bazel test --test_output=all --verbose_explanations --verbose_failures --compilation_mode opt :* diff --git a/.github/workflows/fbgemm_gpu_ci.yml b/.github/workflows/fbgemm_gpu_ci.yml index bd62f23761..adf8443eae 100644 --- a/.github/workflows/fbgemm_gpu_ci.yml +++ b/.github/workflows/fbgemm_gpu_ci.yml @@ -20,7 +20,7 @@ concurrency: jobs: 
build_and_test_amd: - runs-on: ${{ matrix.os }} + runs-on: linux.12xlarge container: image: ${{ matrix.container-image }} options: --user root @@ -33,9 +33,8 @@ jobs: strategy: fail-fast: false matrix: - os: [ linux.12xlarge ] container-image: [ "ubuntu:20.04" ] - python-version: [ "3.10" ] + python-version: [ "3.8", "3.9", "3.10" ] rocm-version: [ "5.3" ] steps: @@ -60,10 +59,7 @@ jobs: run: . $PRELUDE; free_disk_space - name: Setup Miniconda - run: | - . $PRELUDE; setup_miniconda $HOME/miniconda - echo "${HOME}/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH + run: . $PRELUDE; setup_miniconda $HOME/miniconda - name: Create Conda Environment run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} @@ -77,7 +73,7 @@ jobs: - name: Install PyTorch-ROCm Nightly run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly rocm ${{ matrix.rocm-version }} - - name: Prepare FBGEMM Build + - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - name: Build FBGEMM_GPU-ROCM Nightly @@ -146,7 +142,10 @@ jobs: build_and_test_cpu: - runs-on: ${{ matrix.os }} + runs-on: linux.12xlarge + container: + image: ${{ matrix.container-image }} + options: --user root defaults: run: shell: bash @@ -156,10 +155,16 @@ jobs: strategy: fail-fast: false matrix: - os: [ ubuntu-20.04, ubuntu-latest ] + container-image: [ "ubuntu:20.04", "ubuntu:22.04" ] python-version: [ "3.8", "3.9", "3.10" ] steps: + - name: Setup Build Container + run: | + apt update -y + apt install -y binutils build-essential git sudo wget + git config --global --add safe.directory '*' + - name: Checkout the Repository uses: actions/checkout@v3 with: @@ -172,10 +177,7 @@ jobs: run: . $PRELUDE; print_gpu_info - name: Setup Miniconda - run: | - . $PRELUDE; setup_miniconda $HOME/miniconda - echo "${HOME}/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH + run: . 
$PRELUDE; setup_miniconda $HOME/miniconda - name: Create Conda Environment run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} @@ -186,7 +188,7 @@ jobs: - name: Install PyTorch run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cpu - - name: Prepare FBGEMM Build + - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - name: Build and Install FBGEMM_GPU (CPU version) diff --git a/.github/workflows/fbgemm_nightly_build.yml b/.github/workflows/fbgemm_nightly_build.yml index bc699ef62b..b0ac76900c 100644 --- a/.github/workflows/fbgemm_nightly_build.yml +++ b/.github/workflows/fbgemm_nightly_build.yml @@ -38,7 +38,10 @@ concurrency: jobs: # Build on CPU hosts and upload to GHA build_artifact: - runs-on: ${{ matrix.os }} + runs-on: linux.24xlarge + container: + image: amazonlinux:2023 + options: --user root defaults: run: shell: bash @@ -49,11 +52,13 @@ jobs: # Don't fast-fail all the other builds if one of the them fails fail-fast: false matrix: - os: [ linux.12xlarge ] - python-version: [ "3.8", "3.9", "3.10" ] + python-version: [ "3.8", "3.9", "3.10", "3.11" ] cuda-version: [ "11.7.1", "11.8.0" ] steps: + - name: Setup Build Container + run: yum update -y; yum install -y binutils findutils git sudo tar wget which + - name: Checkout the Repository uses: actions/checkout@v3 with: @@ -66,10 +71,7 @@ jobs: run: . $PRELUDE; print_gpu_info - name: Setup Miniconda - run: | - . $PRELUDE; setup_miniconda $HOME/miniconda - echo "${HOME}/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH + run: . $PRELUDE; setup_miniconda $HOME/miniconda - name: Create Conda Environment run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} @@ -89,7 +91,7 @@ jobs: - name: Install cuDNN run: . $PRELUDE; install_cudnn $BUILD_ENV "$(pwd)/build_only/cudnn" ${{ matrix.cuda-version }} - - name: Prepare FBGEMM Build + - name: Prepare FBGEMM_GPU Build run: . 
$PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - name: Build FBGEMM_GPU Nightly @@ -116,7 +118,7 @@ jobs: fail-fast: false matrix: os: [ linux.g5.4xlarge.nvidia.gpu ] - python-version: [ "3.8", "3.9", "3.10" ] + python-version: [ "3.8", "3.9", "3.10", "3.11" ] cuda-version: [ "11.7.1", "11.8.0" ] # Specify exactly ONE CUDA version for artifact publish cuda-version-publish: [ "11.7.1" ] @@ -135,10 +137,7 @@ jobs: run: . $PRELUDE; print_gpu_info - name: Setup Miniconda - run: | - . $PRELUDE; setup_miniconda $HOME/miniconda - echo "${HOME}/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH + run: . $PRELUDE; setup_miniconda $HOME/miniconda - name: Create Conda Environment run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} @@ -149,7 +148,7 @@ jobs: - name: Install PyTorch Nightly run: . $PRELUDE; install_pytorch_conda $BUILD_ENV nightly - - name: Prepare FBGEMM Build + - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - name: Download Wheel Artifact from GHA diff --git a/.github/workflows/fbgemm_nightly_build_cpu.yml b/.github/workflows/fbgemm_nightly_build_cpu.yml index 1125b17a0d..d99c3f73ee 100644 --- a/.github/workflows/fbgemm_nightly_build_cpu.yml +++ b/.github/workflows/fbgemm_nightly_build_cpu.yml @@ -39,7 +39,7 @@ concurrency: jobs: # Build on CPU hosts, run tests, and upload to GHA build_artifact: - runs-on: ${{ matrix.os }} + runs-on: linux.4xlarge container: image: amazonlinux:2023 options: --user root @@ -53,8 +53,7 @@ jobs: # Don't fast-fail all the other builds if one of the them fails fail-fast: false matrix: - os: [ linux.4xlarge ] - python-version: [ "3.8", "3.9", "3.10" ] + python-version: [ "3.8", "3.9", "3.10", "3.11" ] steps: - name: Setup Build Container @@ -72,10 +71,7 @@ jobs: run: . $PRELUDE; print_gpu_info - name: Setup Miniconda - run: | - . 
$PRELUDE; setup_miniconda $HOME/miniconda - echo "${HOME}/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH + run: . $PRELUDE; setup_miniconda $HOME/miniconda - name: Create Conda Environment run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} @@ -89,7 +85,7 @@ jobs: - name: Install PyTorch-CPU Nightly run: . $PRELUDE; install_pytorch_conda $BUILD_ENV nightly cpuonly - - name: Prepare FBGEMM Build + - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - name: Build FBGEMM_GPU Nightly (CPU version) @@ -104,7 +100,7 @@ jobs: # Download the built artifact from GHA, test on GPU, and push to PyPI test_and_publish_artifact: - runs-on: ${{ matrix.os }} + runs-on: linux.4xlarge container: image: amazonlinux:2023 options: --user root @@ -117,8 +113,7 @@ jobs: strategy: fail-fast: false matrix: - os: [ linux.4xlarge ] - python-version: [ "3.8", "3.9", "3.10" ] + python-version: [ "3.8", "3.9", "3.10", "3.11" ] needs: build_artifact steps: @@ -137,10 +132,7 @@ jobs: run: . $PRELUDE; print_gpu_info - name: Setup Miniconda - run: | - . $PRELUDE; setup_miniconda $HOME/miniconda - echo "${HOME}/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH + run: . $PRELUDE; setup_miniconda $HOME/miniconda - name: Create Conda Environment run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} @@ -148,7 +140,7 @@ jobs: - name: Install PyTorch Nightly run: . $PRELUDE; install_pytorch_conda $BUILD_ENV nightly cpuonly - - name: Prepare FBGEMM Build + - name: Prepare FBGEMM_GPU Build run: . 
$PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - name: Download Wheel Artifact from GHA diff --git a/.github/workflows/fbgemm_release_build.yml b/.github/workflows/fbgemm_release_build.yml index def6002a76..75d5235b69 100644 --- a/.github/workflows/fbgemm_release_build.yml +++ b/.github/workflows/fbgemm_release_build.yml @@ -30,7 +30,10 @@ concurrency: jobs: # Build on CPU hosts and upload to GHA build_artifact: - runs-on: ${{ matrix.os }} + runs-on: linux.24xlarge + container: + image: amazonlinux:2023 + options: --user root defaults: run: shell: bash @@ -41,11 +44,13 @@ jobs: # Don't fast-fail all the other builds if one of the them fails fail-fast: false matrix: - os: [ linux.12xlarge ] - python-version: [ "3.8", "3.9", "3.10" ] + python-version: [ "3.8", "3.9", "3.10", "3.11" ] cuda-version: [ "11.7.1", "11.8.0" ] steps: + - name: Setup Build Container + run: yum update -y; yum install -y binutils findutils git sudo tar wget which + - name: Checkout the Repository uses: actions/checkout@v3 with: @@ -58,10 +63,7 @@ jobs: run: . $PRELUDE; print_gpu_info - name: Setup Miniconda - run: | - . $PRELUDE; setup_miniconda $HOME/miniconda - echo "${HOME}/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH + run: . $PRELUDE; setup_miniconda $HOME/miniconda - name: Create Conda Environment run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} @@ -81,7 +83,7 @@ jobs: - name: Install cuDNN run: . $PRELUDE; install_cudnn $BUILD_ENV "$(pwd)/build_only/cudnn" ${{ matrix.cuda-version }} - - name: Prepare FBGEMM Build + - name: Prepare FBGEMM_GPU Build run: . 
$PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - name: Build FBGEMM_GPU @@ -108,7 +110,7 @@ jobs: fail-fast: false matrix: os: [ linux.g5.4xlarge.nvidia.gpu ] - python-version: [ "3.8", "3.9", "3.10" ] + python-version: [ "3.8", "3.9", "3.10", "3.11" ] cuda-version: [ "11.7.1", "11.8.0" ] # Specify exactly ONE CUDA version for artifact publish cuda-version-publish: [ "11.7.1" ] @@ -126,10 +128,7 @@ jobs: run: . $PRELUDE; print_gpu_info - name: Setup Miniconda - run: | - . $PRELUDE; setup_miniconda $HOME/miniconda - echo "${HOME}/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH + run: . $PRELUDE; setup_miniconda $HOME/miniconda - name: Create Conda Environment run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} @@ -140,7 +139,7 @@ jobs: - name: Install PyTorch Test run: . $PRELUDE; install_pytorch_conda $BUILD_ENV test - - name: Prepare FBGEMM Build + - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - name: Download Wheel Artifact from GHA diff --git a/.github/workflows/fbgemm_release_build_cpu.yml b/.github/workflows/fbgemm_release_build_cpu.yml index c7fb53cabd..f13ebd32c9 100644 --- a/.github/workflows/fbgemm_release_build_cpu.yml +++ b/.github/workflows/fbgemm_release_build_cpu.yml @@ -30,7 +30,7 @@ concurrency: jobs: # Build on CPU hosts, run tests, and upload to GHA build_artifact: - runs-on: ${{ matrix.os }} + runs-on: linux.4xlarge container: image: amazonlinux:2023 options: --user root @@ -44,8 +44,7 @@ jobs: # Don't fast-fail all the other builds if one of the them fails fail-fast: false matrix: - os: [ linux.4xlarge ] - python-version: [ "3.8", "3.9", "3.10" ] + python-version: [ "3.8", "3.9", "3.10", "3.11" ] steps: - name: Setup Build Container @@ -63,10 +62,7 @@ jobs: run: . $PRELUDE; print_gpu_info - name: Setup Miniconda - run: | - . 
$PRELUDE; setup_miniconda $HOME/miniconda - echo "${HOME}/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH + run: . $PRELUDE; setup_miniconda $HOME/miniconda - name: Create Conda Environment run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} @@ -80,7 +76,7 @@ jobs: - name: Install PyTorch-CPU Test run: . $PRELUDE; install_pytorch_conda $BUILD_ENV test cpuonly - - name: Prepare FBGEMM Build + - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - name: Build FBGEMM_GPU (CPU version) @@ -95,7 +91,7 @@ jobs: # Download the built artifact from GHA, test on GPU, and push to PyPI test_and_publish_artifact: - runs-on: ${{ matrix.os }} + runs-on: linux.4xlarge container: image: amazonlinux:2023 options: --user root @@ -108,8 +104,7 @@ jobs: strategy: fail-fast: false matrix: - os: [ linux.4xlarge ] - python-version: [ "3.8", "3.9", "3.10" ] + python-version: [ "3.8", "3.9", "3.10", "3.11" ] needs: build_artifact steps: @@ -128,10 +123,7 @@ jobs: run: . $PRELUDE; print_gpu_info - name: Setup Miniconda - run: | - . $PRELUDE; setup_miniconda $HOME/miniconda - echo "${HOME}/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH + run: . $PRELUDE; setup_miniconda $HOME/miniconda - name: Create Conda Environment run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} @@ -139,7 +131,7 @@ jobs: - name: Install PyTorch Test run: . $PRELUDE; install_pytorch_conda $BUILD_ENV test cpuonly - - name: Prepare FBGEMM Build + - name: Prepare FBGEMM_GPU Build run: . 
$PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - name: Download Wheel Artifact from GHA diff --git a/fbgemm_gpu/docs/BuildInstructions.md b/fbgemm_gpu/docs/BuildInstructions.md index a90a059b40..56aa780fe3 100644 --- a/fbgemm_gpu/docs/BuildInstructions.md +++ b/fbgemm_gpu/docs/BuildInstructions.md @@ -22,7 +22,7 @@ environment is recommended for reproducible builds: # Set the Miniconda prefix directory miniconda_prefix=$HOME/miniconda -# Download the Miniconfs installer +# Download the Miniconda installer wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh # Run the installer @@ -59,7 +59,7 @@ conda run -n "${env_name}" python -m pip install pyOpenSSL>22.1.0 ### C/C++ Compiler -Install the GCC toolchain. Note that GCC (as opposed to LLVM for example) is +Install the GCC toolchain. Note that GCC (as opposed to Clang for example) is required for GPU (CUDA) builds because NVIDIA's `nvcc` relies on `gcc` and `g++` in the path. @@ -71,7 +71,7 @@ Note that while newer versions of GCC can be used, binaries compiled under newer versions of GCC will not be compatible with older systems such as Ubuntu 20.04 or CentOS Stream 8, because the compiled library will reference symbols from versions of `GLIBCXX` that the system's `libstdc++.so.6` will not support. To -see what versions of GLIBCXX that the available `libstdc++.so.6` supports: +see what versions of GLIBCXX the available `libstdc++.so.6` supports: ```sh libcxx_path=/path/to/libstdc++.so.6 @@ -193,7 +193,7 @@ From there, the rest of the build environment may be constructed through Conda. ### Install ROCm -Install the full ROCm package through the operating system package manger. The +Install the full ROCm package through the operating system package manager. 
The full instructions can be found in the [ROCm installation guide](https://docs.amd.com/bundle/ROCm-Installation-Guide-v5.4.3/page/How_to_Install_ROCm.html): @@ -346,7 +346,7 @@ package_name=fbgemm_gpu # Build for SM70/80 (V100/A100 GPU); update as needed # If not specified, only the CUDA architecture supported by current system will be targeted -# Ifo CUDA device is present either, all CUDA architectures will be targeted +# If no CUDA device is present either, all CUDA architectures will be targeted cuda_arch_list=7.0;8.0 # Build the wheel artifact only diff --git a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py index 87b9b1a559..8120cdcb03 100644 --- a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py +++ b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py @@ -9,7 +9,7 @@ import enum import logging -from dataclasses import dataclass +from dataclasses import dataclass, field from itertools import accumulate from math import log2 from typing import Dict, List, NamedTuple, Optional, Tuple, Type, Union @@ -106,7 +106,7 @@ class CounterBasedRegularizationDefinition: adjustment_ub: float = 1.0 learning_rate_mode: LearningRateMode = LearningRateMode.EQUAL grad_sum_decay: GradSumDecay = GradSumDecay.NO_DECAY - tail_id_threshold: TailIdThreshold = TailIdThreshold(val=0, is_ratio=False) + tail_id_threshold: TailIdThreshold = field(default_factory=TailIdThreshold) max_counter_update_freq: int = 1000