ddp_launcher.py
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0
"""(Native PyTorch) DistributedDataParallel launcher
Use this entrypoint script to launch training with native PyTorch DDP on SageMaker. You don't need
it if using SageMaker DDP - in which case directly set 'train.py' as your entrypoint.
"""
# Python Built-Ins:
import json
import os
import socket
import subprocess
import sys
# Path to resource config file IF running on SageMaker:
SM_CONFIG_PATH = "/opt/ml/input/config/resourceconfig.json"
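
# For reference, resourceconfig.json on a two-instance job looks roughly like this (a sketch
# based on the SageMaker training environment docs; only the commonly used keys are shown):
#
#     {
#         "current_host": "algo-1",
#         "hosts": ["algo-1", "algo-2"],
#         "network_interface_name": "eth0"
#     }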

if __name__ != "__main__":
    # If the file is imported as a module, we're in inference mode and should pass through the
    # override functions defined in the inference module. This is to support directly deploying
    # the model via the SageMaker SDK's Estimator.deploy(), which will carry over the environment
    # variable SAGEMAKER_PROGRAM=ddp_launcher.py from training - causing the server to try to
    # load handlers from here rather than from inference.py.
    from code.inference import *
else:
    if os.path.exists(SM_CONFIG_PATH):
        # Running on SageMaker: Load distribution configs from the resourceconfig file
        with open(SM_CONFIG_PATH) as file:
            cluster_config = json.load(file)
        host_names = cluster_config["hosts"]
        default_n_nodes = len(host_names)
        default_node_rank = host_names.index(os.environ.get("SM_CURRENT_HOST"))
        # Elect the first listed host as the leader for PyTorch DDP:
        print("CLUSTER HOSTS:")
        host_ips = [socket.gethostbyname(host) for host in host_names]
        for ix, host in enumerate(host_names):
            print(
                " - {}host: {}, IP: {}".format(
                    "(leader) " if ix == 0 else "",
                    host,
                    host_ips[ix],
                )
            )
        leader = host_ips[0]
        # Set the network interface for inter-node communication:
        os.environ["NCCL_SOCKET_IFNAME"] = cluster_config["network_interface_name"]
    else:
        # Seems not to be a SageMaker training job (could be e.g. local testing on a notebook).
        # Default to single-machine setup:
        default_n_nodes = 1
        default_node_rank = 0
        leader = "127.0.0.1"
    # Set up DDP & NCCL environment variables:
    # https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/index.html#ncclknobs
    # https://github.com/aws/sagemaker-pytorch-training-toolkit/blob/88ca48a831bf4f099d4c57f3c18e0ff92fa2b48c/src/sagemaker_pytorch_container/training.py#L103
    #
    # Disable InfiniBand (IB) transport and force use of IP sockets by default:
    os.environ["NCCL_IB_DISABLE"] = "1"
    # Set NCCL log level (could be INFO for more debugging information):
    if not os.environ.get("NCCL_DEBUG"):
        os.environ["NCCL_DEBUG"] = "WARN"
    # Launch PyTorch DDP:
    ddp_cmd = (
        [
            "python",
            "-m",
            "torch.distributed.launch",
            "--nproc_per_node",
            os.environ["SM_NUM_GPUS"],
            "--nnodes",
            str(default_n_nodes),
            "--node_rank",
            str(default_node_rank),
            "--master_addr",
            leader,
            "--master_port",
            "7777",
        ]
        # ...and pass through any remaining arguments to the actual training script:
        + ["train.py"]
        + sys.argv[1:]
    )
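
    # Note: torch.distributed.launch is deprecated on newer PyTorch versions (1.10+) in favor of
    # torchrun, which sets LOCAL_RANK via the environment instead of passing a --local_rank
    # argument. A roughly equivalent invocation (an untested sketch, not used by this script)
    # would be:
    #
    #     torchrun --nproc_per_node "$SM_NUM_GPUS" --nnodes <n_nodes> --node_rank <node_rank> \
    #         --master_addr <leader> --master_port 7777 train.py <training args...>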
print("LAUNCHING: " + " ".join(ddp_cmd))
subprocess.check_call(ddp_cmd)
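
# The launched train.py (not shown in this file) is expected to consume what
# torch.distributed.launch provides: a --local_rank argument, plus MASTER_ADDR, MASTER_PORT,
# RANK, and WORLD_SIZE environment variables. A minimal sketch of that script-side setup
# (MyModel is a hypothetical placeholder) might look like:
#
#     import argparse
#
#     import torch
#     import torch.distributed as dist
#
#     parser = argparse.ArgumentParser()
#     parser.add_argument("--local_rank", type=int, default=0)
#     args, _ = parser.parse_known_args()
#
#     dist.init_process_group(backend="nccl", init_method="env://")
#     torch.cuda.set_device(args.local_rank)
#     model = MyModel().to(args.local_rank)
#     model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank])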