From 64833b5185893cbc71ea80c9b01443f762b5cba4 Mon Sep 17 00:00:00 2001
From: Benson Ma <bensonma415@meta.com>
Date: Fri, 17 Mar 2023 12:21:44 -0700
Subject: [PATCH] Add support for building FBGEMM_GPU against Python 3.11 in
 OSS (#1646)

Summary:
- Parallelize the FBGEMM CI builds to build and test static and shared libraries independently instead of in serial
- Move the FBGEMM CI builds to run inside Docker containers
- Add support for building FBGEMM_GPU against Python 3.11 in OSS
- Move all FBGEMM_GPU nightly and release build jobs to run inside `amazonlinux:2023` Docker container
- Assuming no build errors or resource starvation, the full OSS build process now runs in under 30 minutes.

Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1646

Reviewed By: shintaro-iwasaki

Differential Revision: D44157228

Pulled By: q10

fbshipit-source-id: 6403ea9955856157785c50837b0b8e4c0cd26d53
---
 .github/scripts/setup_env.bash                | 100 +++++--
 .github/workflows/fbgemm_ci.yml               | 244 ++++++++----------
 .github/workflows/fbgemm_gpu_ci.yml           |  32 +--
 .github/workflows/fbgemm_nightly_build.yml    |  27 +-
 .../workflows/fbgemm_nightly_build_cpu.yml    |  24 +-
 .github/workflows/fbgemm_release_build.yml    |  27 +-
 .../workflows/fbgemm_release_build_cpu.yml    |  24 +-
 fbgemm_gpu/docs/BuildInstructions.md          |  10 +-
 .../split_table_batched_embeddings_ops.py     |   4 +-
 9 files changed, 257 insertions(+), 235 deletions(-)

diff --git a/.github/scripts/setup_env.bash b/.github/scripts/setup_env.bash
index ccdac79097..a22a09b19e 100755
--- a/.github/scripts/setup_env.bash
+++ b/.github/scripts/setup_env.bash
@@ -264,22 +264,13 @@ print_gpu_info () {
     if which nvidia-smi; then
       # If nvidia-smi is installed on a machine without GPUs, this will return error
       (print_exec nvidia-smi) || true
+    else
+      echo "[CHECK] nvidia-smi not found"
     fi
   fi
 }
 
-print_system_info () {
-  echo "################################################################################"
-  echo "# Print System Info"
-  echo "#"
-  echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)"
-  echo "################################################################################"
-  echo ""
-
-  echo "################################################################################"
-  echo "[INFO] Printing environment variables ..."
-  print_exec printenv
-
+__print_system_info_linux () {
   echo "################################################################################"
   echo "[INFO] Check ldd version ..."
   print_exec ldd --version
@@ -296,6 +287,36 @@ print_system_info () {
   print_exec cat /etc/os-release
 }
 
+__print_system_info_macos () {  # macOS branch of print_system_info; takes no arguments
+  echo "################################################################################"
+  echo "[INFO] Check CPU info ..."
+  sysctl -a | grep machdep.cpu  # keep only the sysctl lines that match machdep.cpu
+
+  echo "################################################################################"
+  echo "[INFO] Check MacOS version info ..."
+  print_exec uname -a
+  print_exec sw_vers
+}
+
+print_system_info () {  # Usage: print_system_info (no arguments); prints a banner, the environment, then OS-specific details
+  echo "################################################################################"
+  echo "# Print System Info"
+  echo "#"
+  echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ 2>/dev/null || date -u +%FT%TZ)"  # BSD date (macOS) rejects --utc; fall back to the portable form
+  echo "################################################################################"
+  echo ""
+  # Dump every environment variable visible to the current shell
+  echo "################################################################################"
+  echo "[INFO] Printing environment variables ..."
+  print_exec printenv
+  # OSTYPE starts with "darwin" on macOS; every other value takes the Linux path
+  if [[ $OSTYPE == 'darwin'* ]]; then
+    __print_system_info_macos
+  else
+    __print_system_info_linux
+  fi
+}
+
 print_ec2_info () {
   echo "################################################################################"
   echo "# Print EC2 Instance Info"
@@ -316,6 +337,30 @@ print_ec2_info () {
   echo "instance-type: $(get_ec2_metadata instance-type)"
 }
 
+print_glibc_info () {  # Lists the GLIBC / GLIBCXX symbol versions referenced by a library file
+  local library_path="$1"  # $1: path of the library file to inspect
+  if [ "$library_path" == "" ]; then  # no path supplied: print usage and return 1
+    echo "Usage: ${FUNCNAME[0]} LIBRARY_PATH"
+    echo "Example(s):"
+    echo "    ${FUNCNAME[0]} /usr/lib/x86_64-linux-gnu/libstdc++.so.6"
+    return 1
+  fi
+
+  if [ -f "${library_path}" ]; then  # only inspect paths that are regular files
+    echo "[CHECK] Listing out the GLIBC versions referenced by: ${library_path}"
+    objdump -TC "${library_path}" | grep GLIBC_ | sed 's/.*GLIBC_\([.0-9]*\).*/GLIBC_\1/g' | sort -Vu | cat  # reduce each matching line to its GLIBC_<version> tag, then version-sort and de-duplicate
+    echo ""
+
+    echo "[CHECK] Listing out the GLIBCXX versions referenced by: ${library_path}"
+    objdump -TC "${library_path}" | grep GLIBCXX_ | sed 's/.*GLIBCXX_\([.0-9]*\).*/GLIBCXX_\1/g' | sort -Vu | cat  # same pipeline, for GLIBCXX_<version> tags
+    echo ""
+
+  else  # a missing file is reported and treated as a failure
+    echo "[CHECK] No file at path: ${library_path}"
+    return 1
+  fi
+}
+
 
 ################################################################################
 # Miniconda Setup Functions
@@ -342,7 +387,7 @@ setup_miniconda () {
     print_exec mkdir -p "$miniconda_prefix"
 
     echo "[SETUP] Downloading the Miniconda installer ..."
-    print_exec wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
+    (exec_with_retries wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh) || return 1
 
     echo "[SETUP] Installing Miniconda ..."
     print_exec bash miniconda.sh -b -p "$miniconda_prefix" -u
@@ -360,9 +405,16 @@ setup_miniconda () {
   print_exec conda info
 
   # These variables will be exported outside
+  echo "[SETUP] Exporting Miniconda variables ..."
   export PATH="${miniconda_prefix}/bin:${PATH}"
   export CONDA="${miniconda_prefix}"
 
+  if [ -f "${GITHUB_PATH}" ]; then  # persist the Miniconda settings for later workflow steps when running under GitHub Actions
+    echo "[SETUP] Saving Miniconda variables to ${GITHUB_PATH} ..."
+    echo "${miniconda_prefix}/bin" >> "${GITHUB_PATH}"  # each line of GITHUB_PATH is a directory prepended to PATH
+    if [ -f "${GITHUB_ENV}" ]; then echo "CONDA=${miniconda_prefix}" >> "${GITHUB_ENV}"; fi  # NAME=value pairs belong in GITHUB_ENV, not GITHUB_PATH
+  fi
+
   echo "[SETUP] Successfully set up Miniconda at ${miniconda_prefix}"
 }
 
@@ -448,9 +500,11 @@ install_pytorch_conda () {
   fi
 
   # Install PyTorch packages
+  # NOTE: Installation of large packages might fail due to a corrupt package download.
+  # Use --force-reinstall to address this on retries - https://datascience.stackexchange.com/questions/41732/conda-verification-failed
   echo "[INSTALL] Attempting to install '${pytorch_package}' (${pytorch_version}, CPU=${pytorch_cpu:-0}) through Conda using channel '${pytorch_channel}' ..."
   # shellcheck disable=SC2086
-  (exec_with_retries conda install -n "${env_name}" -y ${pytorch_package} -c "${pytorch_channel}") || return 1
+  (exec_with_retries conda install --force-reinstall -n "${env_name}" -y ${pytorch_package} -c "${pytorch_channel}") || return 1
 
   # Run check for GPU variant
   if [ "$pytorch_cpu" == "" ]; then
@@ -612,7 +666,7 @@ install_cuda () {
 
   # Install CUDA packages
   echo "[INSTALL] Installing CUDA ${cuda_version} ..."
-  (exec_with_retries conda install -n "${env_name}" -y cuda -c "nvidia/label/cuda-${cuda_version}") || return 1
+  (exec_with_retries conda install --force-reinstall -n "${env_name}" -y cuda -c "nvidia/label/cuda-${cuda_version}") || return 1
 
   # Ensure that nvcc is properly installed
   (test_binpath "${env_name}" nvcc) || return 1
@@ -806,15 +860,19 @@ install_cxx_compiler () {
     install_system_packages gcc gcc-c++
 
   else
-    # Install gxx_linux-64 from main instead of cxx-compiler from conda-forge, as
-    # the latter breaks builds:
+    # Install gxx_linux-64 from conda-forge instead of from anaconda channel.
+    # sysroot_linux-64 needs to be installed alongside this:
+    #
     #   https://root-forum.cern.ch/t/error-timespec-get-has-not-been-declared-with-conda-root-package/45712/6
+    #   https://github.com/conda-forge/conda-forge.github.io/issues/1625
+    #   https://conda-forge.org/docs/maintainer/knowledge_base.html#using-centos-7
+    #   https://github.com/conda/conda-build/issues/4371
     #
-    # NOTE: Install g++ 9.x instead of 11.x becaue 11.x builds libraries with
-    # references to GLIBCXX_3.4.29, which is not available on systems with older
+    # NOTE: We install g++ 10.x instead of 11.x because 11.x builds binaries that
+    # reference GLIBCXX_3.4.29, which may not be available on systems with older
     # versions of libstdc++.so.6 such as CentOS Stream 8 and Ubuntu 20.04
     echo "[INSTALL] Installing C/C++ compilers through Conda ..."
-    (exec_with_retries conda install -n "${env_name}" -y gxx_linux-64=9.3.0) || return 1
+    (exec_with_retries conda install -n "${env_name}" -y gxx_linux-64=10.4.0 sysroot_linux-64=2.17 -c conda-forge) || return 1
 
     # The compilers are visible in the PATH as `x86_64-conda-linux-gnu-cc` and
     # `x86_64-conda-linux-gnu-c++`, so symlinks will need to be created
@@ -1055,7 +1113,7 @@ check_fbgemm_gpu_build () {
 
   for library in "${fbgemm_gpu_so_files[@]}"; do
     echo "[CHECK] Listing out the GLIBCXX versions referenced by the library: ${library}"
-    objdump -TC "${library}" | grep GLIBCXX | sed 's/.*GLIBCXX_\([.0-9]*\).*/GLIBCXX_\1/g' | sort -Vu | cat
+    print_glibc_info "${library}"
 
     echo "[CHECK] Verifying sample subset of symbols in the library ..."
     for symbol in "${lib_symbols_to_check[@]}"; do
diff --git a/.github/workflows/fbgemm_ci.yml b/.github/workflows/fbgemm_ci.yml
index 977b443a2b..9b18dfb884 100644
--- a/.github/workflows/fbgemm_ci.yml
+++ b/.github/workflows/fbgemm_ci.yml
@@ -19,185 +19,165 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  build-posix:
-    runs-on: ${{ matrix.os }}
+  build-linux:
+    runs-on: linux.12xlarge
+    container:
+      image: ${{ matrix.container-image }}
+      options: --user root
+    defaults:
+      run:
+        shell: bash
+    env:
+      PRELUDE: .github/scripts/setup_env.bash
+      BUILD_DIR: build_${{ matrix.library-type }}
+      DEBIAN_FRONTEND: noninteractive
     strategy:
+      fail-fast: false
       matrix:
-        os: [ ubuntu-latest, macos-latest ]
+        container-image: [ "ubuntu:20.04" ]
+        library-type: [ static, shared ]
 
     steps:
-    - uses: actions/checkout@v3
-    - name: Checkout submodules
-      shell: bash
+    - name: Setup Build Container
       run: |
-        auth_header="$(git config --local --get http.https://github.com/.extraheader)"
-        git submodule sync --recursive
-        git -c "http.extraheader=$auth_header" -c protocol.version=2 submodule update --init --force --recursive --depth=1
+        apt update -y
+        apt install -y binutils build-essential cmake git libblas-dev python3 sudo wget
+        git config --global --add safe.directory '*'
 
-    - name: Get CPU info on Ubuntu
-      if: contains(runner.os, 'linux')
-      run: |
-        cat /proc/cpuinfo
+    - name: Checkout the Repository
+      uses: actions/checkout@v3
+      with:
+        submodules: true
 
-    - name: Get CPU info on macOS
-      if: contains(runner.os, 'macOs')
-      run: |
-        sysctl -a | grep machdep.cpu
+    - name: Display System Info
+      run: . $PRELUDE; print_system_info
 
-    - name: Get env vars
-      run: |
-        echo GITHUB_WORKFLOW   = $GITHUB_WORKFLOW
-        echo HOME              = $HOME
-        echo GITHUB_ACTION     = $GITHUB_ACTION
-        echo GITHUB_ACTIONS    = $GITHUB_ACTIONS
-        echo GITHUB_REPOSITORY = $GITHUB_REPOSITORY
-        echo GITHUB_EVENT_NAME = $GITHUB_EVENT_NAME
-        echo GITHUB_EVENT_PATH = $GITHUB_EVENT_PATH
-        echo GITHUB_WORKSPACE  = $GITHUB_WORKSPACE
-        echo GITHUB_SHA        = $GITHUB_SHA
-        echo GITHUB_REF        = $GITHUB_REF
-        c++ --verbose
-
-    - name: Build static FBGEMM lib
+    - name: Build FBGEMM Library (${{ matrix.library-type }})
       run: |
         set -e
-        mkdir build_static
-        cd build_static
-        cmake -DUSE_SANITIZER=address -DFBGEMM_LIBRARY_TYPE=static ..
-        make
+        mkdir $BUILD_DIR; cd $BUILD_DIR
+        cmake -DUSE_SANITIZER=address -DFBGEMM_LIBRARY_TYPE=${{ matrix.library-type }} -DPYTHON_EXECUTABLE=/usr/bin/python3 ..
+        make -j
 
-    - name: Test static FBGEMM lib
-      if: contains(runner.os, 'linux')   # not run on macos-latest now due to supporting AVX2
+    - name: Test FBGEMM Library (${{ matrix.library-type }})
       run: |
         set -e
-        cd build_static
+        cd $BUILD_DIR
         ctest --rerun-failed --output-on-failure
 
-    - name: Build shared FBGEMM lib
+
+  build-macos:
+    runs-on: ${{ matrix.os }}
+    defaults:
+      run:
+        shell: bash
+    env:
+      PRELUDE: .github/scripts/setup_env.bash
+      BUILD_DIR: build_${{ matrix.library-type }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ macos-latest ]
+        library-type: [ static, shared ]
+
+    steps:
+    - name: Checkout the Repository
+      uses: actions/checkout@v3
+      with:
+        submodules: true
+
+    - name: Display System Info
+      run: . $PRELUDE; print_system_info
+
+    # Build but skip tests due to lack of support for AVX2
+    - name: Build FBGEMM Library (${{ matrix.library-type }})
       run: |
         set -e
-        mkdir build_shared
-        cd build_shared
-        cmake -DUSE_SANITIZER=address -DFBGEMM_LIBRARY_TYPE=shared ..
-        make
+        mkdir $BUILD_DIR; cd $BUILD_DIR
+        cmake -DUSE_SANITIZER=address -DFBGEMM_LIBRARY_TYPE=${{ matrix.library-type }} ..
+        make -j
+
+
+  build-bazel:
+    runs-on: ${{ matrix.os }}
+    defaults:
+      run:
+        shell: bash
+    env:
+      PRELUDE: .github/scripts/setup_env.bash
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ ubuntu-latest ]
 
-    - name: Test shared FBGEMM lib
-      if: contains(runner.os, 'linux')   # not run on macos-latest now due to supporting AVX2
+    steps:
+    - name: Checkout the Repository
+      uses: actions/checkout@v3
+      with:
+        submodules: true
+
+    - name: Display System Info
+      run: . $PRELUDE; print_system_info
+
+    - name: Download bazel
       run: |
         set -e
-        cd build_shared
-        ctest --rerun-failed --output-on-failure
+        wget https://github.com/bazelbuild/bazel/releases/download/2.2.0/bazel-2.2.0-linux-x86_64 -O bazel
+        # verify content
+        echo 'b2f002ea0e6194a181af6ac84cd94bd8dc797722eb2354690bebac92dda233ff bazel' | sha256sum --quiet -c
+        chmod +x bazel
+
+    - name: Build FBGEMM with bazel
+      run: ./bazel build --verbose_explanations --verbose_failures --compilation_mode opt :*
+
+    - name: Test FBGEMM bazel build
+      run: ./bazel test --test_output=all --verbose_explanations --verbose_failures --compilation_mode opt :*
+
 
   build-windows:
     runs-on: ${{ matrix.os }}
+    defaults:
+      run:
+        shell: cmd
+    env:
+      BUILD_DIR: build_${{ matrix.library-type }}
     strategy:
+      fail-fast: false
       matrix:
-        os: [windows-2019]
+        os: [ windows-2019 ]
+        library-type: [ static, shared ]
 
     steps:
-    - uses: actions/checkout@v3
-    - name: Checkout submodules
-      shell: bash
-      run: |
-        auth_header="$(git config --local --get http.https://github.com/.extraheader)"
-        git submodule sync --recursive
-        git -c "http.extraheader=$auth_header" -c protocol.version=2 submodule update --init --force --recursive --depth=1
+    - name: Checkout the Repository
+      uses: actions/checkout@v3
+      with:
+        submodules: true
 
     - name: Get CPU info on Windows
       shell: cmd
       run: |
         wmic cpu list full
 
-    - name: Build static FBGEMM lib
+    - name: Build FBGEMM Library (${{ matrix.library-type }})
       shell: cmd
       run: |
         call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
         echo "INSTALL NINJA:"
         pip install ninja
         which ninja
-        mkdir build_static
-        cd build_static
+        mkdir %BUILD_DIR%
+        cd %BUILD_DIR%
         echo "STARTING CMAKE"
-        cmake -G Ninja -DFBGEMM_BUILD_BENCHMARKS=OFF -DFBGEMM_LIBRARY_TYPE=static -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER="cl.exe" -DCMAKE_CXX_COMPILER="cl.exe" ..
+        cmake -G Ninja -DFBGEMM_BUILD_BENCHMARKS=OFF -DFBGEMM_LIBRARY_TYPE=${{ matrix.library-type }} -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER="cl.exe" -DCMAKE_CXX_COMPILER="cl.exe" ..
         ninja all
         echo "Build Success"
 
-    - name: Test static FBGEMM lib
-      shell: cmd
-      run: |
-        echo %cd%
-        cd build_static
-        ctest --rerun-failed --output-on-failure
-        if errorlevel 1 exit /b 1
-
-    - name: Build shared FBGEMM lib
-      shell: cmd
-      run: |
-        call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
-        echo "INSTALL NINJA:"
-        pip install ninja
-        which ninja
-        mkdir build_shared
-        cd build_shared
-        echo "STARTING CMAKE"
-        cmake -G Ninja -DFBGEMM_BUILD_BENCHMARKS=OFF -DFBGEMM_LIBRARY_TYPE=shared -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER="cl.exe" -DCMAKE_CXX_COMPILER="cl.exe" ..
-        ninja all
-        if errorlevel 1 exit /b 1
-
-    - name: Test shared FBGEMM lib
+    - name: Test FBGEMM Library (${{ matrix.library-type }})
       shell: cmd
       run: |
         echo %cd%
-        cd build_shared
+        cd %BUILD_DIR%
         set PATH=%PATH%;%cd%;%cd%\asmjit
         echo %PATH%
         ctest --rerun-failed --output-on-failure
         if errorlevel 1 exit /b 1
-
-  build-bazel:
-    runs-on: ${{ matrix.os }}
-    strategy:
-      matrix:
-        os: [ ubuntu-latest ]
-
-    steps:
-    - uses: actions/checkout@v3
-    - name: Checkout submodules
-      shell: bash
-      run: |
-        auth_header="$(git config --local --get http.https://github.com/.extraheader)"
-        git submodule sync --recursive
-        git -c "http.extraheader=$auth_header" -c protocol.version=2 submodule update --init --force --recursive --depth=1
-
-    - name: Get env vars
-      run: |
-        echo GITHUB_WORKFLOW   = $GITHUB_WORKFLOW
-        echo HOME              = $HOME
-        echo GITHUB_ACTION     = $GITHUB_ACTION
-        echo GITHUB_ACTIONS    = $GITHUB_ACTIONS
-        echo GITHUB_REPOSITORY = $GITHUB_REPOSITORY
-        echo GITHUB_EVENT_NAME = $GITHUB_EVENT_NAME
-        echo GITHUB_EVENT_PATH = $GITHUB_EVENT_PATH
-        echo GITHUB_WORKSPACE  = $GITHUB_WORKSPACE
-        echo GITHUB_SHA        = $GITHUB_SHA
-        echo GITHUB_REF        = $GITHUB_REF
-        c++ --verbose
-
-    - name: Download bazel
-      run: |
-        set -e
-        wget https://github.com/bazelbuild/bazel/releases/download/2.2.0/bazel-2.2.0-linux-x86_64 -O bazel
-        # verify content
-        echo 'b2f002ea0e6194a181af6ac84cd94bd8dc797722eb2354690bebac92dda233ff bazel' | sha256sum --quiet -c
-        chmod +x bazel
-
-
-    - name: Build FBGEMM with bazel
-      run: |
-        set -e
-        ./bazel build --verbose_explanations --verbose_failures --compilation_mode opt :*
-
-    - name: Test FBGEMM bazel build
-      run: |
-        set -e
-        ./bazel test --test_output=all --verbose_explanations --verbose_failures --compilation_mode opt :*
diff --git a/.github/workflows/fbgemm_gpu_ci.yml b/.github/workflows/fbgemm_gpu_ci.yml
index bd62f23761..adf8443eae 100644
--- a/.github/workflows/fbgemm_gpu_ci.yml
+++ b/.github/workflows/fbgemm_gpu_ci.yml
@@ -20,7 +20,7 @@ concurrency:
 
 jobs:
   build_and_test_amd:
-    runs-on: ${{ matrix.os }}
+    runs-on: linux.12xlarge
     container:
       image: ${{ matrix.container-image }}
       options: --user root
@@ -33,9 +33,8 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: [ linux.12xlarge ]
         container-image: [ "ubuntu:20.04" ]
-        python-version: [ "3.10" ]
+        python-version: [ "3.8", "3.9", "3.10" ]
         rocm-version: [ "5.3" ]
 
     steps:
@@ -60,10 +59,7 @@ jobs:
       run: . $PRELUDE; free_disk_space
 
     - name: Setup Miniconda
-      run: |
-        . $PRELUDE; setup_miniconda $HOME/miniconda
-        echo "${HOME}/miniconda/bin" >> $GITHUB_PATH
-        echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH
+      run: . $PRELUDE; setup_miniconda $HOME/miniconda
 
     - name: Create Conda Environment
       run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}
@@ -77,7 +73,7 @@ jobs:
     - name: Install PyTorch-ROCm Nightly
       run:  . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly rocm ${{ matrix.rocm-version }}
 
-    - name: Prepare FBGEMM Build
+    - name: Prepare FBGEMM_GPU Build
       run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV
 
     - name: Build FBGEMM_GPU-ROCM Nightly
@@ -146,7 +142,10 @@ jobs:
 
 
   build_and_test_cpu:
-    runs-on: ${{ matrix.os }}
+    runs-on: linux.12xlarge
+    container:
+      image: ${{ matrix.container-image }}
+      options: --user root
     defaults:
       run:
         shell: bash
@@ -156,10 +155,16 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: [ ubuntu-20.04, ubuntu-latest ]
+        container-image: [ "ubuntu:20.04", "ubuntu:22.04" ]
         python-version: [ "3.8", "3.9", "3.10" ]
 
     steps:
+    - name: Setup Build Container
+      run: |
+        apt update -y
+        apt install -y binutils build-essential git sudo wget
+        git config --global --add safe.directory '*'
+
     - name: Checkout the Repository
       uses: actions/checkout@v3
       with:
@@ -172,10 +177,7 @@ jobs:
       run: . $PRELUDE; print_gpu_info
 
     - name: Setup Miniconda
-      run: |
-        . $PRELUDE; setup_miniconda $HOME/miniconda
-        echo "${HOME}/miniconda/bin" >> $GITHUB_PATH
-        echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH
+      run: . $PRELUDE; setup_miniconda $HOME/miniconda
 
     - name: Create Conda Environment
       run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}
@@ -186,7 +188,7 @@ jobs:
     - name: Install PyTorch
       run:  . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cpu
 
-    - name: Prepare FBGEMM Build
+    - name: Prepare FBGEMM_GPU Build
       run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV
 
     - name: Build and Install FBGEMM_GPU (CPU version)
diff --git a/.github/workflows/fbgemm_nightly_build.yml b/.github/workflows/fbgemm_nightly_build.yml
index bc699ef62b..b0ac76900c 100644
--- a/.github/workflows/fbgemm_nightly_build.yml
+++ b/.github/workflows/fbgemm_nightly_build.yml
@@ -38,7 +38,10 @@ concurrency:
 jobs:
   # Build on CPU hosts and upload to GHA
   build_artifact:
-    runs-on: ${{ matrix.os }}
+    runs-on: linux.24xlarge
+    container:
+      image: amazonlinux:2023
+      options: --user root
     defaults:
       run:
         shell: bash
@@ -49,11 +52,13 @@ jobs:
       # Don't fast-fail all the other builds if one of the them fails
       fail-fast: false
       matrix:
-        os: [ linux.12xlarge ]
-        python-version: [ "3.8", "3.9", "3.10" ]
+        python-version: [ "3.8", "3.9", "3.10", "3.11" ]
         cuda-version: [ "11.7.1", "11.8.0" ]
 
     steps:
+    - name: Setup Build Container
+      run: yum update -y; yum install -y binutils findutils git sudo tar wget which
+
     - name: Checkout the Repository
       uses: actions/checkout@v3
       with:
@@ -66,10 +71,7 @@ jobs:
       run: . $PRELUDE; print_gpu_info
 
     - name: Setup Miniconda
-      run: |
-        . $PRELUDE; setup_miniconda $HOME/miniconda
-        echo "${HOME}/miniconda/bin" >> $GITHUB_PATH
-        echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH
+      run: . $PRELUDE; setup_miniconda $HOME/miniconda
 
     - name: Create Conda Environment
       run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}
@@ -89,7 +91,7 @@ jobs:
     - name: Install cuDNN
       run: . $PRELUDE; install_cudnn $BUILD_ENV "$(pwd)/build_only/cudnn" ${{ matrix.cuda-version }}
 
-    - name: Prepare FBGEMM Build
+    - name: Prepare FBGEMM_GPU Build
       run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV
 
     - name: Build FBGEMM_GPU Nightly
@@ -116,7 +118,7 @@ jobs:
       fail-fast: false
       matrix:
         os: [ linux.g5.4xlarge.nvidia.gpu ]
-        python-version: [ "3.8", "3.9", "3.10" ]
+        python-version: [ "3.8", "3.9", "3.10", "3.11" ]
         cuda-version: [ "11.7.1", "11.8.0" ]
         # Specify exactly ONE CUDA version for artifact publish
         cuda-version-publish: [ "11.7.1" ]
@@ -135,10 +137,7 @@ jobs:
       run: . $PRELUDE; print_gpu_info
 
     - name: Setup Miniconda
-      run: |
-        . $PRELUDE; setup_miniconda $HOME/miniconda
-        echo "${HOME}/miniconda/bin" >> $GITHUB_PATH
-        echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH
+      run: . $PRELUDE; setup_miniconda $HOME/miniconda
 
     - name: Create Conda Environment
       run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}
@@ -149,7 +148,7 @@ jobs:
     - name: Install PyTorch Nightly
       run: . $PRELUDE; install_pytorch_conda $BUILD_ENV nightly
 
-    - name: Prepare FBGEMM Build
+    - name: Prepare FBGEMM_GPU Build
       run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV
 
     - name: Download Wheel Artifact from GHA
diff --git a/.github/workflows/fbgemm_nightly_build_cpu.yml b/.github/workflows/fbgemm_nightly_build_cpu.yml
index 1125b17a0d..d99c3f73ee 100644
--- a/.github/workflows/fbgemm_nightly_build_cpu.yml
+++ b/.github/workflows/fbgemm_nightly_build_cpu.yml
@@ -39,7 +39,7 @@ concurrency:
 jobs:
   # Build on CPU hosts, run tests, and upload to GHA
   build_artifact:
-    runs-on: ${{ matrix.os }}
+    runs-on: linux.4xlarge
     container:
       image: amazonlinux:2023
       options: --user root
@@ -53,8 +53,7 @@ jobs:
       # Don't fast-fail all the other builds if one of the them fails
       fail-fast: false
       matrix:
-        os: [ linux.4xlarge ]
-        python-version: [ "3.8", "3.9", "3.10" ]
+        python-version: [ "3.8", "3.9", "3.10", "3.11" ]
 
     steps:
     - name: Setup Build Container
@@ -72,10 +71,7 @@ jobs:
       run: . $PRELUDE; print_gpu_info
 
     - name: Setup Miniconda
-      run: |
-        . $PRELUDE; setup_miniconda $HOME/miniconda
-        echo "${HOME}/miniconda/bin" >> $GITHUB_PATH
-        echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH
+      run: . $PRELUDE; setup_miniconda $HOME/miniconda
 
     - name: Create Conda Environment
       run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}
@@ -89,7 +85,7 @@ jobs:
     - name: Install PyTorch-CPU Nightly
       run: . $PRELUDE; install_pytorch_conda $BUILD_ENV nightly cpuonly
 
-    - name: Prepare FBGEMM Build
+    - name: Prepare FBGEMM_GPU Build
       run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV
 
     - name: Build FBGEMM_GPU Nightly (CPU version)
@@ -104,7 +100,7 @@ jobs:
 
   # Download the built artifact from GHA, test on GPU, and push to PyPI
   test_and_publish_artifact:
-    runs-on: ${{ matrix.os }}
+    runs-on: linux.4xlarge
     container:
       image: amazonlinux:2023
       options: --user root
@@ -117,8 +113,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: [ linux.4xlarge ]
-        python-version: [ "3.8", "3.9", "3.10" ]
+        python-version: [ "3.8", "3.9", "3.10", "3.11" ]
     needs: build_artifact
 
     steps:
@@ -137,10 +132,7 @@ jobs:
       run: . $PRELUDE; print_gpu_info
 
     - name: Setup Miniconda
-      run: |
-        . $PRELUDE; setup_miniconda $HOME/miniconda
-        echo "${HOME}/miniconda/bin" >> $GITHUB_PATH
-        echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH
+      run: . $PRELUDE; setup_miniconda $HOME/miniconda
 
     - name: Create Conda Environment
       run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}
@@ -148,7 +140,7 @@ jobs:
     - name: Install PyTorch Nightly
       run: . $PRELUDE; install_pytorch_conda $BUILD_ENV nightly cpuonly
 
-    - name: Prepare FBGEMM Build
+    - name: Prepare FBGEMM_GPU Build
       run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV
 
     - name: Download Wheel Artifact from GHA
diff --git a/.github/workflows/fbgemm_release_build.yml b/.github/workflows/fbgemm_release_build.yml
index def6002a76..75d5235b69 100644
--- a/.github/workflows/fbgemm_release_build.yml
+++ b/.github/workflows/fbgemm_release_build.yml
@@ -30,7 +30,10 @@ concurrency:
 jobs:
   # Build on CPU hosts and upload to GHA
   build_artifact:
-    runs-on: ${{ matrix.os }}
+    runs-on: linux.24xlarge
+    container:
+      image: amazonlinux:2023
+      options: --user root
     defaults:
       run:
         shell: bash
@@ -41,11 +44,13 @@ jobs:
       # Don't fast-fail all the other builds if one of the them fails
       fail-fast: false
       matrix:
-        os: [ linux.12xlarge ]
-        python-version: [ "3.8", "3.9", "3.10" ]
+        python-version: [ "3.8", "3.9", "3.10", "3.11" ]
         cuda-version: [ "11.7.1", "11.8.0" ]
 
     steps:
+    - name: Setup Build Container
+      run: yum update -y; yum install -y binutils findutils git sudo tar wget which
+
     - name: Checkout the Repository
       uses: actions/checkout@v3
       with:
@@ -58,10 +63,7 @@ jobs:
       run: . $PRELUDE; print_gpu_info
 
     - name: Setup Miniconda
-      run: |
-        . $PRELUDE; setup_miniconda $HOME/miniconda
-        echo "${HOME}/miniconda/bin" >> $GITHUB_PATH
-        echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH
+      run: . $PRELUDE; setup_miniconda $HOME/miniconda
 
     - name: Create Conda Environment
       run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}
@@ -81,7 +83,7 @@ jobs:
     - name: Install cuDNN
       run: . $PRELUDE; install_cudnn $BUILD_ENV "$(pwd)/build_only/cudnn" ${{ matrix.cuda-version }}
 
-    - name: Prepare FBGEMM Build
+    - name: Prepare FBGEMM_GPU Build
       run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV
 
     - name: Build FBGEMM_GPU
@@ -108,7 +110,7 @@ jobs:
       fail-fast: false
       matrix:
         os: [ linux.g5.4xlarge.nvidia.gpu ]
-        python-version: [ "3.8", "3.9", "3.10" ]
+        python-version: [ "3.8", "3.9", "3.10", "3.11" ]
         cuda-version: [ "11.7.1", "11.8.0" ]
         # Specify exactly ONE CUDA version for artifact publish
         cuda-version-publish: [ "11.7.1" ]
@@ -126,10 +128,7 @@ jobs:
       run: . $PRELUDE; print_gpu_info
 
     - name: Setup Miniconda
-      run: |
-        . $PRELUDE; setup_miniconda $HOME/miniconda
-        echo "${HOME}/miniconda/bin" >> $GITHUB_PATH
-        echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH
+      run: . $PRELUDE; setup_miniconda $HOME/miniconda
 
     - name: Create Conda Environment
       run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}
@@ -140,7 +139,7 @@ jobs:
     - name: Install PyTorch Test
       run: . $PRELUDE; install_pytorch_conda $BUILD_ENV test
 
-    - name: Prepare FBGEMM Build
+    - name: Prepare FBGEMM_GPU Build
       run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV
 
     - name: Download Wheel Artifact from GHA
diff --git a/.github/workflows/fbgemm_release_build_cpu.yml b/.github/workflows/fbgemm_release_build_cpu.yml
index c7fb53cabd..f13ebd32c9 100644
--- a/.github/workflows/fbgemm_release_build_cpu.yml
+++ b/.github/workflows/fbgemm_release_build_cpu.yml
@@ -30,7 +30,7 @@ concurrency:
 jobs:
   # Build on CPU hosts, run tests, and upload to GHA
   build_artifact:
-    runs-on: ${{ matrix.os }}
+    runs-on: linux.4xlarge
     container:
       image: amazonlinux:2023
       options: --user root
@@ -44,8 +44,7 @@ jobs:
       # Don't fast-fail all the other builds if one of the them fails
       fail-fast: false
       matrix:
-        os: [ linux.4xlarge ]
-        python-version: [ "3.8", "3.9", "3.10" ]
+        python-version: [ "3.8", "3.9", "3.10", "3.11" ]
 
     steps:
     - name: Setup Build Container
@@ -63,10 +62,7 @@ jobs:
       run: . $PRELUDE; print_gpu_info
 
     - name: Setup Miniconda
-      run: |
-        . $PRELUDE; setup_miniconda $HOME/miniconda
-        echo "${HOME}/miniconda/bin" >> $GITHUB_PATH
-        echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH
+      run: . $PRELUDE; setup_miniconda $HOME/miniconda
 
     - name: Create Conda Environment
       run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}
@@ -80,7 +76,7 @@ jobs:
     - name: Install PyTorch-CPU Test
       run: . $PRELUDE; install_pytorch_conda $BUILD_ENV test cpuonly
 
-    - name: Prepare FBGEMM Build
+    - name: Prepare FBGEMM_GPU Build
       run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV
 
     - name: Build FBGEMM_GPU (CPU version)
@@ -95,7 +91,7 @@ jobs:
 
   # Download the built artifact from GHA, test on GPU, and push to PyPI
   test_and_publish_artifact:
-    runs-on: ${{ matrix.os }}
+    runs-on: linux.4xlarge
     container:
       image: amazonlinux:2023
       options: --user root
@@ -108,8 +104,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: [ linux.4xlarge ]
-        python-version: [ "3.8", "3.9", "3.10" ]
+        python-version: [ "3.8", "3.9", "3.10", "3.11" ]
     needs: build_artifact
 
     steps:
@@ -128,10 +123,7 @@ jobs:
       run: . $PRELUDE; print_gpu_info
 
     - name: Setup Miniconda
-      run: |
-        . $PRELUDE; setup_miniconda $HOME/miniconda
-        echo "${HOME}/miniconda/bin" >> $GITHUB_PATH
-        echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH
+      run: . $PRELUDE; setup_miniconda $HOME/miniconda
 
     - name: Create Conda Environment
       run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}
@@ -139,7 +131,7 @@ jobs:
     - name: Install PyTorch Test
       run: . $PRELUDE; install_pytorch_conda $BUILD_ENV test cpuonly
 
-    - name: Prepare FBGEMM Build
+    - name: Prepare FBGEMM_GPU Build
       run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV
 
     - name: Download Wheel Artifact from GHA
diff --git a/fbgemm_gpu/docs/BuildInstructions.md b/fbgemm_gpu/docs/BuildInstructions.md
index a90a059b40..56aa780fe3 100644
--- a/fbgemm_gpu/docs/BuildInstructions.md
+++ b/fbgemm_gpu/docs/BuildInstructions.md
@@ -22,7 +22,7 @@ environment is recommended for reproducible builds:
 # Set the Miniconda prefix directory
 miniconda_prefix=$HOME/miniconda
 
-# Download the Miniconfs installer
+# Download the Miniconda installer
 wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
 
 # Run the installer
@@ -59,7 +59,7 @@ conda run -n "${env_name}" python -m pip install pyOpenSSL>22.1.0
 
 ### C/C++ Compiler
 
-Install the GCC toolchain.  Note that GCC (as opposed to LLVM for example) is
+Install the GCC toolchain.  Note that GCC (as opposed to Clang for example) is
 required for GPU (CUDA) builds because NVIDIA's `nvcc` relies on `gcc` and `g++`
 in the path.
 
@@ -71,7 +71,7 @@ Note that while newer versions of GCC can be used, binaries compiled under newer
 versions of GCC will not be compatible with older systems such as Ubuntu 20.04
 or CentOS Stream 8, because the compiled library will reference symbols from
 versions of `GLIBCXX` that the system's `libstdc++.so.6` will not support.  To
-see what versions of GLIBCXX that the available `libstdc++.so.6` supports:
+see what versions of GLIBCXX the available `libstdc++.so.6` supports:
 
 ```sh
 libcxx_path=/path/to/libstdc++.so.6
@@ -193,7 +193,7 @@ From there, the rest of the build environment may be constructed through Conda.
 
 ### Install ROCm
 
-Install the full ROCm package through the operating system package manger. The
+Install the full ROCm package through the operating system package manager. The
 full instructions can be found in the
 [ROCm installation guide](https://docs.amd.com/bundle/ROCm-Installation-Guide-v5.4.3/page/How_to_Install_ROCm.html):
 
@@ -346,7 +346,7 @@ package_name=fbgemm_gpu
 
 # Build for SM70/80 (V100/A100 GPU); update as needed
 # If not specified, only the CUDA architecture supported by current system will be targeted
-# Ifo CUDA device is present either, all CUDA architectures will be targeted
+# If no CUDA device is present either, all CUDA architectures will be targeted
 cuda_arch_list=7.0;8.0
 
 # Build the wheel artifact only
diff --git a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py
index 87b9b1a559..8120cdcb03 100644
--- a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py
+++ b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py
@@ -9,7 +9,7 @@
 
 import enum
 import logging
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from itertools import accumulate
 from math import log2
 from typing import Dict, List, NamedTuple, Optional, Tuple, Type, Union
@@ -106,7 +106,7 @@ class CounterBasedRegularizationDefinition:
     adjustment_ub: float = 1.0
     learning_rate_mode: LearningRateMode = LearningRateMode.EQUAL
     grad_sum_decay: GradSumDecay = GradSumDecay.NO_DECAY
-    tail_id_threshold: TailIdThreshold = TailIdThreshold(val=0, is_ratio=False)
+    tail_id_threshold: TailIdThreshold = field(default_factory=TailIdThreshold)
     max_counter_update_freq: int = 1000