docs/2.0/ddp_comm_hooks.html



<!DOCTYPE html>
<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
<head>
  <meta charset="utf-8">
  <meta name="generator" content="Docutils 0.18.1: http://docutils.sourceforge.net/" />

  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  
  <title>DDP Communication Hooks &mdash; PyTorch 2.0 documentation</title>
  

    <link rel="canonical" href="https://pytorch.org/docs/stable/ddp_comm_hooks.html"/>
  

  <!-- Core stylesheets. theme.css is loaded after pygments.css so theme rules win where the two conflict. -->
  <link rel="stylesheet" href="_static/pygments.css" type="text/css" />
  <link rel="stylesheet" href="_static/css/theme.css" type="text/css" />
  <link rel="stylesheet" href="_static/copybutton.css" type="text/css" />
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.10.0-beta/dist/katex.min.css" type="text/css" />
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.13.11/dist/katex.min.css" type="text/css" />
  <link rel="stylesheet" href="_static/katex-math.css" type="text/css" />
  <link rel="stylesheet" href="_static/sphinx-dropdown.css" type="text/css" />
  <link rel="stylesheet" href="_static/panels-bootstrap.min.css" type="text/css" />
  <link rel="stylesheet" href="_static/css/jit.css" type="text/css" />
    <link rel="index" title="Index" href="genindex.html" />
    <link rel="search" title="Search" href="search.html" />
    <link rel="next" title="Pipeline Parallelism" href="pipeline.html" />
    <link rel="prev" title="Complex Numbers" href="complex_numbers.html" />


  <!-- Google Analytics -->
  
    <script async src="https://www.googletagmanager.com/gtag/js?id=UA-117752657-2"></script>
    <script>
      // Google Analytics (gtag.js) bootstrap.
      // Reuse the dataLayer queue if one is already defined; otherwise start with an empty array.
      window.dataLayer = window.dataLayer || [];
      // gtag() queues a call by pushing its raw `arguments` object onto dataLayer.
      function gtag(){dataLayer.push(arguments);}
      // Queue a 'js' entry stamped with the current time.
      gtag('js', new Date());

      // Queue a 'config' entry for the same ID that the async loader script above requests.
      gtag('config', 'UA-117752657-2');
    </script>
  
  <!-- End Google Analytics -->
  

  <script src="_static/js/modernizr.min.js"></script>

  <!-- Preload the theme fonts -->

<link rel="preload" href="_static/fonts/FreightSans/freight-sans-book.woff2" as="font" type="font/woff2" crossorigin="anonymous">
<link rel="preload" href="_static/fonts/FreightSans/freight-sans-medium.woff2" as="font" type="font/woff2" crossorigin="anonymous">
<link rel="preload" href="_static/fonts/IBMPlexMono/IBMPlexMono-Medium.woff2" as="font" type="font/woff2" crossorigin="anonymous">
<link rel="preload" href="_static/fonts/FreightSans/freight-sans-bold.woff2" as="font" type="font/woff2" crossorigin="anonymous">
<link rel="preload" href="_static/fonts/FreightSans/freight-sans-medium-italic.woff2" as="font" type="font/woff2" crossorigin="anonymous">
<link rel="preload" href="_static/fonts/IBMPlexMono/IBMPlexMono-SemiBold.woff2" as="font" type="font/woff2" crossorigin="anonymous">

<!-- Preload the katex fonts -->

<link rel="preload" href="https://cdn.jsdelivr.net/npm/katex@0.10.0/dist/fonts/KaTeX_Math-Italic.woff2" as="font" type="font/woff2" crossorigin="anonymous">
<link rel="preload" href="https://cdn.jsdelivr.net/npm/katex@0.10.0/dist/fonts/KaTeX_Main-Regular.woff2" as="font" type="font/woff2" crossorigin="anonymous">
<link rel="preload" href="https://cdn.jsdelivr.net/npm/katex@0.10.0/dist/fonts/KaTeX_Main-Bold.woff2" as="font" type="font/woff2" crossorigin="anonymous">
<link rel="preload" href="https://cdn.jsdelivr.net/npm/katex@0.10.0/dist/fonts/KaTeX_Size1-Regular.woff2" as="font" type="font/woff2" crossorigin="anonymous">
<link rel="preload" href="https://cdn.jsdelivr.net/npm/katex@0.10.0/dist/fonts/KaTeX_Size4-Regular.woff2" as="font" type="font/woff2" crossorigin="anonymous">
<link rel="preload" href="https://cdn.jsdelivr.net/npm/katex@0.10.0/dist/fonts/KaTeX_Size2-Regular.woff2" as="font" type="font/woff2" crossorigin="anonymous">
<link rel="preload" href="https://cdn.jsdelivr.net/npm/katex@0.10.0/dist/fonts/KaTeX_Size3-Regular.woff2" as="font" type="font/woff2" crossorigin="anonymous">
<link rel="preload" href="https://cdn.jsdelivr.net/npm/katex@0.10.0/dist/fonts/KaTeX_Caligraphic-Regular.woff2" as="font" type="font/woff2" crossorigin="anonymous">
  <link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.15.2/css/all.css" integrity="sha384-vSIIfh2YWi9wW0r9iZe7RJPrKwp6bG+s9QZMoITbCckVJqGCCRhc+ccxNcdpHuYu" crossorigin="anonymous">
</head>

<div class="container-fluid header-holder tutorials-header" id="header-holder">
  <div class="container">
    <div class="header-container">
      <a class="header-logo" href="https://pytorch.org/" aria-label="PyTorch"></a>

      <div class="main-menu">
        <ul>
          <li>
            <a href="https://pytorch.org/get-started">Get Started</a>
          </li>

          <li>
            <a href="https://pytorch.org/ecosystem">Ecosystem</a>
          </li>

          <li>
            <a href="https://pytorch.org/mobile">Mobile</a>
          </li>

          <li>
            <a href="https://pytorch.org/blog/">Blog</a>
          </li>

          <li>
            <a href="https://pytorch.org/tutorials">Tutorials</a>
          </li>

          <li class="active docs-active">
            <!-- Docs dropdown: links to the documentation sites of PyTorch and its companion libraries. -->
            <!-- NOTE(review): the id below is reused by the Resources dropdown that follows; ids should be
                 document-unique. Confirm which scripts/styles target it before renaming either one. -->
            <div id="resourcesDropdownButton" data-toggle="resources-dropdown" class="resources-dropdown">
              <a class="resource-option with-down-orange-arrow">
                Docs
              </a>
              <div class="resources-dropdown-menu">
                <a class="doc-dropdown-option nav-dropdown-item" href="https://pytorch.org/docs/stable/index.html">
                  <span class="dropdown-title">PyTorch</span>
                  <p></p>
                </a>
                <a class="doc-dropdown-option nav-dropdown-item" href="https://pytorch.org/audio/stable/index.html">
                  <span class="dropdown-title">torchaudio</span>
                  <p></p>
                </a>
                <a class="doc-dropdown-option nav-dropdown-item" href="https://pytorch.org/text/stable/index.html">
                  <span class="dropdown-title">torchtext</span>
                  <p></p>
                </a>
                <a class="doc-dropdown-option nav-dropdown-item" href="https://pytorch.org/vision/stable/index.html">
                  <span class="dropdown-title">torchvision</span>
                  <p></p>
                </a>
                <a class="doc-dropdown-option nav-dropdown-item" href="https://pytorch.org/torcharrow">
                  <span class="dropdown-title">torcharrow</span>
                  <p></p>
                </a>
                <a class="doc-dropdown-option nav-dropdown-item" href="https://pytorch.org/data">
                  <span class="dropdown-title">TorchData</span>
                  <p></p>
                </a>
                <a class="doc-dropdown-option nav-dropdown-item" href="https://pytorch.org/torchrec">
                  <span class="dropdown-title">TorchRec</span>
                  <p></p>
                </a>
                <a class="doc-dropdown-option nav-dropdown-item" href="https://pytorch.org/serve/">
                  <span class="dropdown-title">TorchServe</span>
                  <p></p>
                </a>
                <a class="doc-dropdown-option nav-dropdown-item" href="https://pytorch.org/torchx/">
                  <span class="dropdown-title">TorchX</span>
                  <p></p>
                </a>
                <a class="doc-dropdown-option nav-dropdown-item" href="https://pytorch.org/xla">
                  <span class="dropdown-title">PyTorch on XLA Devices</span>
                  <p></p>
                </a>
              </div>
            </div>
          </li>

          <li>
            <div id="resourcesDropdownButton" data-toggle="resources-dropdown" class="resources-dropdown">
              <a class="resource-option with-down-arrow">
                Resources
              </a>
              <div class="resources-dropdown-menu">
                <a class="nav-dropdown-item" href="https://pytorch.org/features">
                  <span class="dropdown-title">About</span>
                  <p>Learn about PyTorch’s features and capabilities</p>
                </a>
                <a class="nav-dropdown-item" href="https://pytorch.org/foundation">
                  <span class="dropdown-title">PyTorch Foundation</span>
                  <p>Learn about the PyTorch foundation</p>
                </a>
                <a class="nav-dropdown-item" href="https://pytorch.org/#community-module">
                  <span class="dropdown-title">Community</span>
                  <p>Join the PyTorch developer community to contribute, learn, and get your questions answered.</p>
                </a>
                <a class="nav-dropdown-item" href="https://pytorch.org/community-stories">
                  <span class="dropdown-title">Community Stories</span>
                  <p>Learn how our community solves real, everyday machine learning problems with PyTorch.</p>
                </a>
                <a class="nav-dropdown-item" href="https://pytorch.org/resources">
                  <span class="dropdown-title">Developer Resources</span>
                  <p>Find resources and get questions answered</p>
                </a>
                <a class="nav-dropdown-item" href="https://pytorch.org/events">
                  <span class="dropdown-title">Events</span>
                  <p>Find events, webinars, and podcasts</p>
                </a>
                <!-- Opens in a new tab; rel="noopener" keeps the opened page from reaching back via window.opener. -->
                <a class="nav-dropdown-item" href="https://discuss.pytorch.org/" target="_blank" rel="noopener">
                  <span class="dropdown-title">Forums</span>
                  <p>A place to discuss PyTorch code, issues, install, research</p>
                </a>
                <a class="nav-dropdown-item" href="https://pytorch.org/hub">
                  <span class="dropdown-title">Models (Beta)</span>
                  <p>Discover, publish, and reuse pre-trained models</p>
                </a>
              </div>
            </div>
          </li>

          <li>
            <a href="https://github.com/pytorch/pytorch">GitHub</a>
          </li>
        </ul>
      </div>

      <a class="main-menu-open-button" href="#" data-behavior="open-mobile-menu"></a>
    </div>
  </div>
</div>

<body class="pytorch-body">

   
    <div class="table-of-contents-link-wrapper">
      <span>Table of Contents</span>
      <a href="#" class="toggle-table-of-contents" data-behavior="toggle-table-of-contents"></a>
    </div>

    <nav data-toggle="wy-nav-shift" class="pytorch-left-menu" id="pytorch-left-menu">
      <div class="pytorch-side-scroll">
        <div class="pytorch-menu pytorch-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
          <div class="pytorch-left-menu-search">
            
    <!-- Current docs version; links to the documentation versions page. -->
    <div class="version">
      <a href="https://pytorch.org/docs/versions.html">2.0 &#x25BC;</a>
    </div>
    

<div role="search">
  <form id="rtd-search-form" class="wy-form" action="search.html" method="get">
    <!-- This field has no visible label, so aria-label supplies its accessible name
         (a placeholder alone is not a reliable label). -->
    <input type="text" name="q" placeholder="Search Docs" aria-label="Search Docs" />
    <input type="hidden" name="check_keywords" value="yes" />
    <input type="hidden" name="area" value="default" />
  </form>
</div>

          </div>

          
              <p class="caption" role="heading"><span class="caption-text">Community</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="community/build_ci_governance.html">PyTorch Governance | Build + CI</a></li>
<li class="toctree-l1"><a class="reference internal" href="community/contribution_guide.html">PyTorch Contribution Guide</a></li>
<li class="toctree-l1"><a class="reference internal" href="community/design.html">PyTorch Design Philosophy</a></li>
<li class="toctree-l1"><a class="reference internal" href="community/governance.html">PyTorch Governance | Mechanics</a></li>
<li class="toctree-l1"><a class="reference internal" href="community/persons_of_interest.html">PyTorch Governance | Maintainers</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Developer Notes</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="notes/amp_examples.html">CUDA Automatic Mixed Precision examples</a></li>
<li class="toctree-l1"><a class="reference internal" href="notes/autograd.html">Autograd mechanics</a></li>
<li class="toctree-l1"><a class="reference internal" href="notes/broadcasting.html">Broadcasting semantics</a></li>
<li class="toctree-l1"><a class="reference internal" href="notes/cpu_threading_torchscript_inference.html">CPU threading and TorchScript inference</a></li>
<li class="toctree-l1"><a class="reference internal" href="notes/cuda.html">CUDA semantics</a></li>
<li class="toctree-l1"><a class="reference internal" href="notes/ddp.html">Distributed Data Parallel</a></li>
<li class="toctree-l1"><a class="reference internal" href="notes/extending.html">Extending PyTorch</a></li>
<li class="toctree-l1"><a class="reference internal" href="notes/extending.func.html">Extending torch.func with autograd.Function</a></li>
<li class="toctree-l1"><a class="reference internal" href="notes/faq.html">Frequently Asked Questions</a></li>
<li class="toctree-l1"><a class="reference internal" href="notes/gradcheck.html">Gradcheck mechanics</a></li>
<li class="toctree-l1"><a class="reference internal" href="notes/hip.html">HIP (ROCm) semantics</a></li>
<li class="toctree-l1"><a class="reference internal" href="notes/large_scale_deployments.html">Features for large-scale deployments</a></li>
<li class="toctree-l1"><a class="reference internal" href="notes/modules.html">Modules</a></li>
<li class="toctree-l1"><a class="reference internal" href="notes/mps.html">MPS backend</a></li>
<li class="toctree-l1"><a class="reference internal" href="notes/multiprocessing.html">Multiprocessing best practices</a></li>
<li class="toctree-l1"><a class="reference internal" href="notes/numerical_accuracy.html">Numerical accuracy</a></li>
<li class="toctree-l1"><a class="reference internal" href="notes/randomness.html">Reproducibility</a></li>
<li class="toctree-l1"><a class="reference internal" href="notes/serialization.html">Serialization semantics</a></li>
<li class="toctree-l1"><a class="reference internal" href="notes/windows.html">Windows FAQ</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">torch.compile</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="dynamo/index.html">TorchDynamo Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="dynamo/installation.html">Installing TorchDynamo</a></li>
<li class="toctree-l1"><a class="reference internal" href="dynamo/get-started.html">Getting Started</a></li>
<li class="toctree-l1"><a class="reference internal" href="dynamo/guards-overview.html">Guards Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="dynamo/custom-backends.html">Custom Backends</a></li>
<li class="toctree-l1"><a class="reference internal" href="dynamo/deep-dive.html">TorchDynamo Deeper Dive</a></li>
<li class="toctree-l1"><a class="reference internal" href="dynamo/troubleshooting.html">TorchDynamo Troubleshooting</a></li>
<li class="toctree-l1"><a class="reference internal" href="dynamo/faq.html">Frequently Asked Questions</a></li>
<li class="toctree-l1"><a class="reference internal" href="ir.html">IRs</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Language Bindings</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="cpp_index.html">C++</a></li>
<li class="toctree-l1"><a class="reference external" href="https://pytorch.org/javadoc/">Javadoc</a></li>
<li class="toctree-l1"><a class="reference internal" href="deploy.html">torch::deploy</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Python API</span></p>
<ul class="current">
<li class="toctree-l1"><a class="reference internal" href="torch.html">torch</a></li>
<li class="toctree-l1"><a class="reference internal" href="nn.html">torch.nn</a></li>
<li class="toctree-l1"><a class="reference internal" href="nn.functional.html">torch.nn.functional</a></li>
<li class="toctree-l1"><a class="reference internal" href="tensors.html">torch.Tensor</a></li>
<li class="toctree-l1"><a class="reference internal" href="tensor_attributes.html">Tensor Attributes</a></li>
<li class="toctree-l1"><a class="reference internal" href="tensor_view.html">Tensor Views</a></li>
<li class="toctree-l1"><a class="reference internal" href="amp.html">torch.amp</a></li>
<li class="toctree-l1"><a class="reference internal" href="autograd.html">torch.autograd</a></li>
<li class="toctree-l1"><a class="reference internal" href="library.html">torch.library</a></li>
<li class="toctree-l1"><a class="reference internal" href="cuda.html">torch.cuda</a></li>
<li class="toctree-l1"><a class="reference internal" href="mps.html">torch.mps</a></li>
<li class="toctree-l1"><a class="reference internal" href="backends.html">torch.backends</a></li>
<li class="toctree-l1"><a class="reference internal" href="distributed.html">torch.distributed</a></li>
<li class="toctree-l1"><a class="reference internal" href="distributed.algorithms.join.html">torch.distributed.algorithms.join</a></li>
<li class="toctree-l1"><a class="reference internal" href="distributed.elastic.html">torch.distributed.elastic</a></li>
<li class="toctree-l1"><a class="reference internal" href="fsdp.html">torch.distributed.fsdp</a></li>
<li class="toctree-l1"><a class="reference internal" href="distributed.optim.html">torch.distributed.optim</a></li>
<li class="toctree-l1"><a class="reference internal" href="distributed.tensor.parallel.html">torch.distributed.tensor.parallel</a></li>
<li class="toctree-l1"><a class="reference internal" href="distributed.checkpoint.html">torch.distributed.checkpoint</a></li>
<li class="toctree-l1"><a class="reference internal" href="distributions.html">torch.distributions</a></li>
<li class="toctree-l1"><a class="reference internal" href="_dynamo.html">torch._dynamo</a></li>
<li class="toctree-l1"><a class="reference internal" href="fft.html">torch.fft</a></li>
<li class="toctree-l1"><a class="reference internal" href="func.html">torch.func</a></li>
<li class="toctree-l1"><a class="reference internal" href="futures.html">torch.futures</a></li>
<li class="toctree-l1"><a class="reference internal" href="fx.html">torch.fx</a></li>
<li class="toctree-l1"><a class="reference internal" href="hub.html">torch.hub</a></li>
<li class="toctree-l1"><a class="reference internal" href="jit.html">torch.jit</a></li>
<li class="toctree-l1"><a class="reference internal" href="linalg.html">torch.linalg</a></li>
<li class="toctree-l1"><a class="reference internal" href="monitor.html">torch.monitor</a></li>
<li class="toctree-l1"><a class="reference internal" href="signal.html">torch.signal</a></li>
<li class="toctree-l1"><a class="reference internal" href="special.html">torch.special</a></li>
<li class="toctree-l1"><a class="reference internal" href="torch.overrides.html">torch.overrides</a></li>
<li class="toctree-l1"><a class="reference internal" href="package.html">torch.package</a></li>
<li class="toctree-l1"><a class="reference internal" href="profiler.html">torch.profiler</a></li>
<li class="toctree-l1"><a class="reference internal" href="nn.init.html">torch.nn.init</a></li>
<li class="toctree-l1"><a class="reference internal" href="onnx.html">torch.onnx</a></li>
<li class="toctree-l1"><a class="reference internal" href="onnx_diagnostics.html">torch.onnx diagnostics</a></li>
<li class="toctree-l1"><a class="reference internal" href="optim.html">torch.optim</a></li>
<li class="toctree-l1"><a class="reference internal" href="complex_numbers.html">Complex Numbers</a></li>
<li class="toctree-l1 current"><a class="current reference internal" href="#">DDP Communication Hooks</a></li>
<li class="toctree-l1"><a class="reference internal" href="pipeline.html">Pipeline Parallelism</a></li>
<li class="toctree-l1"><a class="reference internal" href="quantization.html">Quantization</a></li>
<li class="toctree-l1"><a class="reference internal" href="rpc.html">Distributed RPC Framework</a></li>
<li class="toctree-l1"><a class="reference internal" href="random.html">torch.random</a></li>
<li class="toctree-l1"><a class="reference internal" href="masked.html">torch.masked</a></li>
<li class="toctree-l1"><a class="reference internal" href="nested.html">torch.nested</a></li>
<li class="toctree-l1"><a class="reference internal" href="sparse.html">torch.sparse</a></li>
<li class="toctree-l1"><a class="reference internal" href="storage.html">torch.Storage</a></li>
<li class="toctree-l1"><a class="reference internal" href="testing.html">torch.testing</a></li>
<li class="toctree-l1"><a class="reference internal" href="benchmark_utils.html">torch.utils.benchmark</a></li>
<li class="toctree-l1"><a class="reference internal" href="bottleneck.html">torch.utils.bottleneck</a></li>
<li class="toctree-l1"><a class="reference internal" href="checkpoint.html">torch.utils.checkpoint</a></li>
<li class="toctree-l1"><a class="reference internal" href="cpp_extension.html">torch.utils.cpp_extension</a></li>
<li class="toctree-l1"><a class="reference internal" href="data.html">torch.utils.data</a></li>
<li class="toctree-l1"><a class="reference internal" href="jit_utils.html">torch.utils.jit</a></li>
<li class="toctree-l1"><a class="reference internal" href="dlpack.html">torch.utils.dlpack</a></li>
<li class="toctree-l1"><a class="reference internal" href="mobile_optimizer.html">torch.utils.mobile_optimizer</a></li>
<li class="toctree-l1"><a class="reference internal" href="model_zoo.html">torch.utils.model_zoo</a></li>
<li class="toctree-l1"><a class="reference internal" href="tensorboard.html">torch.utils.tensorboard</a></li>
<li class="toctree-l1"><a class="reference internal" href="type_info.html">Type Info</a></li>
<li class="toctree-l1"><a class="reference internal" href="named_tensor.html">Named Tensors</a></li>
<li class="toctree-l1"><a class="reference internal" href="name_inference.html">Named Tensors operator coverage</a></li>
<li class="toctree-l1"><a class="reference internal" href="config_mod.html">torch.__config__</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Libraries</span></p>
<ul>
<li class="toctree-l1"><a class="reference external" href="https://pytorch.org/audio/stable">torchaudio</a></li>
<li class="toctree-l1"><a class="reference external" href="https://pytorch.org/data">TorchData</a></li>
<li class="toctree-l1"><a class="reference external" href="https://pytorch.org/torchrec">TorchRec</a></li>
<li class="toctree-l1"><a class="reference external" href="https://pytorch.org/serve">TorchServe</a></li>
<li class="toctree-l1"><a class="reference external" href="https://pytorch.org/text/stable">torchtext</a></li>
<li class="toctree-l1"><a class="reference external" href="https://pytorch.org/vision/stable">torchvision</a></li>
<li class="toctree-l1"><a class="reference external" href="https://pytorch.org/xla/">PyTorch on XLA Devices</a></li>
</ul>

            
        </div>
      </div>
    </nav>

    <div class="pytorch-container">
      <div class="pytorch-page-level-bar" id="pytorch-page-level-bar">
        <div class="pytorch-breadcrumbs-wrapper">
          

<div role="navigation" aria-label="breadcrumbs navigation">

  <ul class="pytorch-breadcrumbs">
    
      <li>
        <a href="index.html">
          
            Docs
          
        </a> &gt;
      </li>

        
      <li>DDP Communication Hooks</li>
    
    
      <li class="pytorch-breadcrumbs-aside">
        <!-- Icon-only link to this page's reStructuredText source; the image's alt text is the link's accessible name. -->
            <a href="_sources/ddp_comm_hooks.rst.txt" rel="nofollow"><img src="_static/images/view-page-source-icon.svg" alt="View page source"></a>
      </li>
    
  </ul>

  
</div>
        </div>

        <div class="pytorch-shortcuts-wrapper" id="pytorch-shortcuts-wrapper">
          Shortcuts
        </div>
      </div>

      <section data-toggle="wy-nav-shift" id="pytorch-content-wrap" class="pytorch-content-wrap">
        <div class="pytorch-content-left">

        
          <div class="rst-content">
          
            <div role="main" class="main-content" itemscope="itemscope" itemtype="http://schema.org/Article">
             <article itemprop="articleBody" id="pytorch-article" class="pytorch-article">
              
  <section id="ddp-communication-hooks">
<h1>DDP Communication Hooks<a class="headerlink" href="#ddp-communication-hooks" title="Permalink to this heading">¶</a></h1>
<p>A DDP communication hook is a generic interface for controlling how gradients are
communicated across workers by overriding the vanilla allreduce in
<a class="reference external" href="https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html#torch.nn.parallel.DistributedDataParallel">DistributedDataParallel</a>.
A few built-in communication hooks are provided,
and users can easily apply any of these hooks to optimize communication.
Besides, the hook interface can also support user-defined communication
strategies for more advanced use cases.</p>
<section id="how-to-use-a-communication-hook">
<h2>How to Use a Communication Hook?<a class="headerlink" href="#how-to-use-a-communication-hook" title="Permalink to this heading">¶</a></h2>
<p>To use a communication hook, the user just needs to let the DDP model register
the hook before the training loop as below.</p>
<p><a class="reference internal" href="generated/torch.nn.parallel.DistributedDataParallel.html#torch.nn.parallel.DistributedDataParallel.register_comm_hook" title="torch.nn.parallel.DistributedDataParallel.register_comm_hook"><code class="xref py py-func docutils literal notranslate"><span class="pre">torch.nn.parallel.DistributedDataParallel.register_comm_hook()</span></code></a></p>
</section>
<section id="what-does-a-communication-hook-operate-on">
<h2>What Does a Communication Hook Operate On?<a class="headerlink" href="#what-does-a-communication-hook-operate-on" title="Permalink to this heading">¶</a></h2>
<p>A communication hook provides a flexible way to allreduce gradients.
Therefore, it mainly operates on the gradients on each replica before allreduce,
which are bucketized to increase the overlap between communication and computation.
Particularly, <a class="reference internal" href="#torch.distributed.GradBucket" title="torch.distributed.GradBucket"><code class="xref py py-class docutils literal notranslate"><span class="pre">torch.distributed.GradBucket</span></code></a> represents a bucket of gradient tensors to be allreduced.</p>
<dl class="py class">
<dt class="sig sig-object py" id="torch.distributed.GradBucket">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">torch.distributed.</span></span><span class="sig-name descname"><span class="pre">GradBucket</span></span><a class="headerlink" href="#torch.distributed.GradBucket" title="Permalink to this definition">¶</a></dt>
<dd><p>This class mainly passes a flattened gradient tensor
(returned by <a class="reference internal" href="#torch.distributed.GradBucket.buffer" title="torch.distributed.GradBucket.buffer"><code class="xref py py-meth docutils literal notranslate"><span class="pre">buffer()</span></code></a>)
to DDP communication hook.
This tensor can be further decomposed into a list of per-parameter tensors within this bucket
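(returned by <a class="reference internal" href="#torch.distributed.GradBucket.gradients" title="torch.distributed.GradBucket.gradients"><code class="xref py py-meth docutils literal notranslate"><span class="pre">gradients()</span></code></a>)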
to apply layer-wise operations.</p>
</dd></dl>

<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.GradBucket.index">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.GradBucket.</span></span><span class="sig-name descname"><span class="pre">index</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">self</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#torch.distributed.GradBucket" title="torch._C._distributed_c10d.GradBucket"><span class="pre">torch._C._distributed_c10d.GradBucket</span></a></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.11)"><span class="pre">int</span></a></span></span><a class="headerlink" href="#torch.distributed.GradBucket.index" title="Permalink to this definition">¶</a></dt>
<dd><div class="admonition warning">
<p class="admonition-title">Warning</p>
<p>Since the buckets are rebuilt after the first iteration, users should not rely on the indices at the beginning of training.</p>
</div>
<dl class="field-list simple">
<dt class="field-odd">Returns<span class="colon">:</span></dt>
<dd class="field-odd"><p>The index of a bucket that stores gradients of a few contiguous layers.
All the gradients are bucketized.</p>
</dd>
</dl>
</dd></dl>

<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.GradBucket.buffer">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.GradBucket.</span></span><span class="sig-name descname"><span class="pre">buffer</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">self</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#torch.distributed.GradBucket" title="torch._C._distributed_c10d.GradBucket"><span class="pre">torch._C._distributed_c10d.GradBucket</span></a></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference internal" href="tensors.html#torch.Tensor" title="torch.Tensor"><span class="pre">torch.Tensor</span></a></span></span><a class="headerlink" href="#torch.distributed.GradBucket.buffer" title="Permalink to this definition">¶</a></dt>
<dd><dl class="field-list simple">
<dt class="field-odd">Returns<span class="colon">:</span></dt>
<dd class="field-odd"><p>A flattened 1D <code class="docutils literal notranslate"><span class="pre">torch.Tensor</span></code> buffer,
which can be further decomposed into a list of per-parameter tensors within this bucket.</p>
</dd>
</dl>
</dd></dl>

<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.GradBucket.gradients">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.GradBucket.</span></span><span class="sig-name descname"><span class="pre">gradients</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">self</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#torch.distributed.GradBucket" title="torch._C._distributed_c10d.GradBucket"><span class="pre">torch._C._distributed_c10d.GradBucket</span></a></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><a class="reference internal" href="tensors.html#torch.Tensor" title="torch.Tensor"><span class="pre">torch.Tensor</span></a><span class="p"><span class="pre">]</span></span></span></span><a class="headerlink" href="#torch.distributed.GradBucket.gradients" title="Permalink to this definition">¶</a></dt>
<dd><dl class="field-list simple">
<dt class="field-odd">Returns<span class="colon">:</span></dt>
<dd class="field-odd"><p>A list of <code class="docutils literal notranslate"><span class="pre">torch.Tensor</span></code>. Each tensor in the list corresponds to a gradient.</p>
</dd>
</dl>
</dd></dl>

<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.GradBucket.is_last">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.GradBucket.</span></span><span class="sig-name descname"><span class="pre">is_last</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">self</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#torch.distributed.GradBucket" title="torch._C._distributed_c10d.GradBucket"><span class="pre">torch._C._distributed_c10d.GradBucket</span></a></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.11)"><span class="pre">bool</span></a></span></span><a class="headerlink" href="#torch.distributed.GradBucket.is_last" title="Permalink to this definition">¶</a></dt>
<dd><dl class="field-list simple">
<dt class="field-odd">Returns<span class="colon">:</span></dt>
<dd class="field-odd"><p>Whether this bucket is the last bucket to allreduce in an iteration.
This also means that this bucket corresponds to the first few layers in the forward pass.</p>
</dd>
</dl>
</dd></dl>

<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.GradBucket.set_buffer">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.GradBucket.</span></span><span class="sig-name descname"><span class="pre">set_buffer</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">self</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#torch.distributed.GradBucket" title="torch._C._distributed_c10d.GradBucket"><span class="pre">torch._C._distributed_c10d.GradBucket</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">buffer</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="tensors.html#torch.Tensor" title="torch.Tensor"><span class="pre">torch.Tensor</span></a></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference external" href="https://docs.python.org/3/library/constants.html#None" title="(in Python v3.11)"><span class="pre">None</span></a></span></span><a class="headerlink" href="#torch.distributed.GradBucket.set_buffer" title="Permalink to this definition">¶</a></dt>
<dd><p>Replaces the tensor in the bucket with the input tensor buffer.</p>
</dd></dl>

<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.GradBucket.parameters">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.GradBucket.</span></span><span class="sig-name descname"><span class="pre">parameters</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">self</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#torch.distributed.GradBucket" title="torch._C._distributed_c10d.GradBucket"><span class="pre">torch._C._distributed_c10d.GradBucket</span></a></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><a class="reference internal" href="tensors.html#torch.Tensor" title="torch.Tensor"><span class="pre">torch.Tensor</span></a><span class="p"><span class="pre">]</span></span></span></span><a class="headerlink" href="#torch.distributed.GradBucket.parameters" title="Permalink to this definition">¶</a></dt>
<dd><dl class="field-list simple">
<dt class="field-odd">Returns<span class="colon">:</span></dt>
<dd class="field-odd"><p>A list of <code class="docutils literal notranslate"><span class="pre">torch.Tensor</span></code>. Each tensor in the list corresponds to a model
parameter.</p>
</dd>
</dl>
</dd></dl>

</section>
<section id="default-communication-hooks">
<h2>Default Communication Hooks<a class="headerlink" href="#default-communication-hooks" title="Permalink to this heading">¶</a></h2>
<p>Default communication hooks are simple <strong>stateless</strong> hooks, so the input state
in <code class="docutils literal notranslate"><span class="pre">register_comm_hook</span></code> is either a process group or <code class="docutils literal notranslate"><span class="pre">None</span></code>.
The input <code class="docutils literal notranslate"><span class="pre">bucket</span></code> is a <a class="reference internal" href="#torch.distributed.GradBucket" title="torch.distributed.GradBucket"><code class="xref py py-class docutils literal notranslate"><span class="pre">torch.distributed.GradBucket</span></code></a> object.</p>
<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.algorithms.ddp_comm_hooks.default_hooks.allreduce_hook">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.algorithms.ddp_comm_hooks.default_hooks.</span></span><span class="sig-name descname"><span class="pre">allreduce_hook</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">process_group</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">bucket</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/torch/distributed/algorithms/ddp_comm_hooks/default_hooks.html#allreduce_hook"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#torch.distributed.algorithms.ddp_comm_hooks.default_hooks.allreduce_hook" title="Permalink to this definition">¶</a></dt>
<dd><p>This DDP communication hook just calls <code class="docutils literal notranslate"><span class="pre">allreduce</span></code> using <code class="docutils literal notranslate"><span class="pre">GradBucket</span></code>
tensors. Once gradient tensors are aggregated across all workers, its <code class="docutils literal notranslate"><span class="pre">then</span></code>
callback takes the mean and returns the result. If the user registers this hook,
the DDP results are expected to be the same as in the case where no hook was registered.
Hence, this won’t change the behavior of DDP, and the user can use this as a reference
or modify this hook to log useful information or for any other purpose
without affecting DDP behavior.</p>
<dl>
<dt>Example::</dt><dd><div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">ddp_model</span><span class="o">.</span><span class="n">register_comm_hook</span><span class="p">(</span><span class="n">process_group</span><span class="p">,</span> <span class="n">allreduce_hook</span><span class="p">)</span>
</pre></div>
</div>
</dd>
</dl>
<dl class="field-list simple">
<dt class="field-odd">Return type<span class="colon">:</span></dt>
<dd class="field-odd"><p><a class="reference internal" href="futures.html#torch.futures.Future" title="torch.jit.Future"><em>Future</em></a>[<a class="reference internal" href="tensors.html#torch.Tensor" title="torch.Tensor"><em>Tensor</em></a>]</p>
</dd>
</dl>
</dd></dl>

<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.algorithms.ddp_comm_hooks.default_hooks.fp16_compress_hook">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.algorithms.ddp_comm_hooks.default_hooks.</span></span><span class="sig-name descname"><span class="pre">fp16_compress_hook</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">process_group</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">bucket</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/torch/distributed/algorithms/ddp_comm_hooks/default_hooks.html#fp16_compress_hook"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#torch.distributed.algorithms.ddp_comm_hooks.default_hooks.fp16_compress_hook" title="Permalink to this definition">¶</a></dt>
<dd><p>This DDP communication hook implements a simple gradient compression
approach that casts <code class="docutils literal notranslate"><span class="pre">GradBucket</span></code> tensor to half-precision floating-point format (<code class="docutils literal notranslate"><span class="pre">torch.float16</span></code>)
and then divides it by the process group size.
It allreduces those <code class="docutils literal notranslate"><span class="pre">float16</span></code> gradient tensors. Once compressed gradient
tensors are allreduced, the chained callback <code class="docutils literal notranslate"><span class="pre">decompress</span></code> casts it back to the input data type (such as <code class="docutils literal notranslate"><span class="pre">float32</span></code>).</p>
<dl>
<dt>Example::</dt><dd><div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">ddp_model</span><span class="o">.</span><span class="n">register_comm_hook</span><span class="p">(</span><span class="n">process_group</span><span class="p">,</span> <span class="n">fp16_compress_hook</span><span class="p">)</span>
</pre></div>
</div>
</dd>
</dl>
<dl class="field-list simple">
<dt class="field-odd">Return type<span class="colon">:</span></dt>
<dd class="field-odd"><p><a class="reference internal" href="futures.html#torch.futures.Future" title="torch.jit.Future"><em>Future</em></a>[<a class="reference internal" href="tensors.html#torch.Tensor" title="torch.Tensor"><em>Tensor</em></a>]</p>
</dd>
</dl>
</dd></dl>

<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.algorithms.ddp_comm_hooks.default_hooks.bf16_compress_hook">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.algorithms.ddp_comm_hooks.default_hooks.</span></span><span class="sig-name descname"><span class="pre">bf16_compress_hook</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">process_group</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">bucket</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/torch/distributed/algorithms/ddp_comm_hooks/default_hooks.html#bf16_compress_hook"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#torch.distributed.algorithms.ddp_comm_hooks.default_hooks.bf16_compress_hook" title="Permalink to this definition">¶</a></dt>
<dd><p>Warning: This API is experimental, and it requires NCCL version later than 2.9.6.</p>
<p>This DDP communication hook implements a simple gradient compression
approach that casts <code class="docutils literal notranslate"><span class="pre">GradBucket</span></code> tensor to half-precision
<a class="reference external" href="https://en.wikipedia.org/wiki/Bfloat16_floating-point_format">Brain floating point format</a> (<code class="docutils literal notranslate"><span class="pre">torch.bfloat16</span></code>)
and then divides it by the process group size.
It allreduces those <code class="docutils literal notranslate"><span class="pre">bfloat16</span></code> gradient tensors. Once compressed gradient
tensors are allreduced, the chained callback <code class="docutils literal notranslate"><span class="pre">decompress</span></code> casts it back to the input data type (such as <code class="docutils literal notranslate"><span class="pre">float32</span></code>).</p>
<dl>
<dt>Example::</dt><dd><div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">ddp_model</span><span class="o">.</span><span class="n">register_comm_hook</span><span class="p">(</span><span class="n">process_group</span><span class="p">,</span> <span class="n">bf16_compress_hook</span><span class="p">)</span>
</pre></div>
</div>
</dd>
</dl>
<dl class="field-list simple">
<dt class="field-odd">Return type<span class="colon">:</span></dt>
<dd class="field-odd"><p><a class="reference internal" href="futures.html#torch.futures.Future" title="torch.jit.Future"><em>Future</em></a>[<a class="reference internal" href="tensors.html#torch.Tensor" title="torch.Tensor"><em>Tensor</em></a>]</p>
</dd>
</dl>
</dd></dl>

<p>Additionally, a communication hook wrapper is provided to support <a class="reference internal" href="#torch.distributed.algorithms.ddp_comm_hooks.default_hooks.fp16_compress_hook" title="torch.distributed.algorithms.ddp_comm_hooks.default_hooks.fp16_compress_hook"><code class="xref py py-meth docutils literal notranslate"><span class="pre">fp16_compress_hook()</span></code></a> or <a class="reference internal" href="#torch.distributed.algorithms.ddp_comm_hooks.default_hooks.bf16_compress_hook" title="torch.distributed.algorithms.ddp_comm_hooks.default_hooks.bf16_compress_hook"><code class="xref py py-meth docutils literal notranslate"><span class="pre">bf16_compress_hook()</span></code></a> as a wrapper,
which can be combined with other communication hooks.</p>
<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.algorithms.ddp_comm_hooks.default_hooks.fp16_compress_wrapper">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.algorithms.ddp_comm_hooks.default_hooks.</span></span><span class="sig-name descname"><span class="pre">fp16_compress_wrapper</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">hook</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/torch/distributed/algorithms/ddp_comm_hooks/default_hooks.html#fp16_compress_wrapper"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#torch.distributed.algorithms.ddp_comm_hooks.default_hooks.fp16_compress_wrapper" title="Permalink to this definition">¶</a></dt>
<dd><p>This wrapper casts the input gradient tensor of a given DDP communication hook to half-precision
floating point format (<code class="docutils literal notranslate"><span class="pre">torch.float16</span></code>), and casts the resulting tensor of the given hook back to
the input data type, such as <code class="docutils literal notranslate"><span class="pre">float32</span></code>.</p>
<p>Therefore, <code class="docutils literal notranslate"><span class="pre">fp16_compress_hook</span></code> is equivalent to <code class="docutils literal notranslate"><span class="pre">fp16_compress_wrapper(allreduce_hook)</span></code>.</p>
<dl>
<dt>Example::</dt><dd><div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">state</span> <span class="o">=</span> <span class="n">PowerSGDState</span><span class="p">(</span><span class="n">process_group</span><span class="o">=</span><span class="n">process_group</span><span class="p">,</span> <span class="n">matrix_approximation_rank</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span> <span class="n">start_powerSGD_iter</span><span class="o">=</span><span class="mi">10</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">ddp_model</span><span class="o">.</span><span class="n">register_comm_hook</span><span class="p">(</span><span class="n">state</span><span class="p">,</span> <span class="n">fp16_compress_wrapper</span><span class="p">(</span><span class="n">powerSGD_hook</span><span class="p">))</span>
</pre></div>
</div>
</dd>
</dl>
<dl class="field-list simple">
<dt class="field-odd">Return type<span class="colon">:</span></dt>
<dd class="field-odd"><p><a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Callable" title="(in Python v3.11)"><em>Callable</em></a>[[<a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Any" title="(in Python v3.11)"><em>Any</em></a>, <a class="reference internal" href="#torch.distributed.GradBucket" title="torch._C._distributed_c10d.GradBucket"><em>GradBucket</em></a>], <a class="reference internal" href="futures.html#torch.futures.Future" title="torch.jit.Future"><em>Future</em></a>[<a class="reference internal" href="tensors.html#torch.Tensor" title="torch.Tensor"><em>Tensor</em></a>]]</p>
</dd>
</dl>
</dd></dl>

<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.algorithms.ddp_comm_hooks.default_hooks.bf16_compress_wrapper">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.algorithms.ddp_comm_hooks.default_hooks.</span></span><span class="sig-name descname"><span class="pre">bf16_compress_wrapper</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">hook</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/torch/distributed/algorithms/ddp_comm_hooks/default_hooks.html#bf16_compress_wrapper"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#torch.distributed.algorithms.ddp_comm_hooks.default_hooks.bf16_compress_wrapper" title="Permalink to this definition">¶</a></dt>
<dd><p>Warning: This API is experimental, and it requires NCCL version later than 2.9.6.</p>
<p>This wrapper casts the input gradient tensor of a given DDP communication hook to half-precision
<a class="reference external" href="https://en.wikipedia.org/wiki/Bfloat16_floating-point_format">Brain floating point format</a> (<code class="docutils literal notranslate"><span class="pre">torch.bfloat16</span></code>),
and casts the resulting tensor of the given hook back to the input data type, such as <code class="docutils literal notranslate"><span class="pre">float32</span></code>.</p>
<p>Therefore, <code class="docutils literal notranslate"><span class="pre">bf16_compress_hook</span></code> is equivalent to <code class="docutils literal notranslate"><span class="pre">bf16_compress_wrapper(allreduce_hook)</span></code>.</p>
<dl>
<dt>Example::</dt><dd><div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">state</span> <span class="o">=</span> <span class="n">PowerSGDState</span><span class="p">(</span><span class="n">process_group</span><span class="o">=</span><span class="n">process_group</span><span class="p">,</span> <span class="n">matrix_approximation_rank</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span> <span class="n">start_powerSGD_iter</span><span class="o">=</span><span class="mi">10</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">ddp_model</span><span class="o">.</span><span class="n">register_comm_hook</span><span class="p">(</span><span class="n">state</span><span class="p">,</span> <span class="n">bf16_compress_wrapper</span><span class="p">(</span><span class="n">powerSGD_hook</span><span class="p">))</span>
</pre></div>
</div>
</dd>
</dl>
<dl class="field-list simple">
<dt class="field-odd">Return type<span class="colon">:</span></dt>
<dd class="field-odd"><p><a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Callable" title="(in Python v3.11)"><em>Callable</em></a>[[<a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Any" title="(in Python v3.11)"><em>Any</em></a>, <a class="reference internal" href="#torch.distributed.GradBucket" title="torch._C._distributed_c10d.GradBucket"><em>GradBucket</em></a>], <a class="reference internal" href="futures.html#torch.futures.Future" title="torch.jit.Future"><em>Future</em></a>[<a class="reference internal" href="tensors.html#torch.Tensor" title="torch.Tensor"><em>Tensor</em></a>]]</p>
</dd>
</dl>
</dd></dl>

</section>
<section id="powersgd-communication-hook">
<h2>PowerSGD Communication Hook<a class="headerlink" href="#powersgd-communication-hook" title="Permalink to this heading">¶</a></h2>
<p>PowerSGD (<a class="reference external" href="https://arxiv.org/abs/1905.13727">Vogels et al., NeurIPS 2019</a>)
is a gradient compression algorithm, which can provide very high compression
rates and accelerate bandwidth-bound distributed training.
This algorithm needs to maintain both some hyperparameters and the internal
state. Therefore, PowerSGD communication hook is a <strong>stateful</strong> hook,
and the user needs to provide a state object defined as below.</p>
<section id="powersgd-state">
<h3>PowerSGD State<a class="headerlink" href="#powersgd-state" title="Permalink to this heading">¶</a></h3>
<dl class="py class">
<dt class="sig sig-object py" id="torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook.PowerSGDState">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook.</span></span><span class="sig-name descname"><span class="pre">PowerSGDState</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">process_group</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">matrix_approximation_rank</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">start_powerSGD_iter</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">1000</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">min_compression_rate</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">2</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">use_error_feedback</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">warm_start</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">orthogonalization_epsilon</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">random_seed</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">compression_stats_logging_frequency</span></span><span class="o"><span 
class="pre">=</span></span><span class="default_value"><span class="pre">10000</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">batch_tensors_with_same_shape</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.html#PowerSGDState"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook.PowerSGDState" title="Permalink to this definition">¶</a></dt>
<dd><p>Stores both the algorithm’s hyperparameters and the internal state for all the gradients during the training.
Particularly, <code class="docutils literal notranslate"><span class="pre">matrix_approximation_rank</span></code> and <code class="docutils literal notranslate"><span class="pre">start_powerSGD_iter</span></code> are the main hyperparameters that should be tuned by the user.
For performance, we suggest keeping the binary hyperparameters <code class="docutils literal notranslate"><span class="pre">use_error_feedback</span></code> and <code class="docutils literal notranslate"><span class="pre">warm_start</span></code> on.</p>
<ol class="arabic">
<li><p><code class="docutils literal notranslate"><span class="pre">matrix_approximation_rank</span></code> controls the size of compressed low-rank tensors, which determines the compression rate. The lower the rank, the stronger the compression.</p>
<blockquote>
<div><p>1.1. If <code class="docutils literal notranslate"><span class="pre">matrix_approximation_rank</span></code> is too low, the full model quality will take more training steps to reach, or may never be reached, resulting in a loss of accuracy.</p>
<p>1.2. The increase of <code class="docutils literal notranslate"><span class="pre">matrix_approximation_rank</span></code> can substantially increase the computation costs of the compression, and the accuracy may not be further improved beyond a certain <code class="docutils literal notranslate"><span class="pre">matrix_approximation_rank</span></code> threshold.</p>
</div></blockquote>
</li>
</ol>
<p>To tune <code class="docutils literal notranslate"><span class="pre">matrix_approximation_rank</span></code>, we suggest starting from 1 and increasing by factors of 2 (like an exponential grid search: 1, 2, 4, …) until a satisfactory accuracy is reached. Typically, only a small value of 1-4 is used. For some NLP tasks (as shown in Appendix D of the original paper), this value has been increased to 32.</p>
<ol class="arabic simple" start="2">
<li><p><code class="docutils literal notranslate"><span class="pre">start_powerSGD_iter</span></code> defers PowerSGD compression until step <code class="docutils literal notranslate"><span class="pre">start_powerSGD_iter</span></code>, and vanilla allreduce runs prior to step <code class="docutils literal notranslate"><span class="pre">start_powerSGD_iter</span></code>. This hybrid scheme of <strong>vanilla allreduce + PowerSGD</strong> can effectively improve the accuracy, even if a relatively small <code class="docutils literal notranslate"><span class="pre">matrix_approximation_rank</span></code> is used. This is because the beginning of the training phase is usually very sensitive to inaccurate gradients, and compressing gradients too early may make the training quickly take a suboptimal trajectory, which can result in an irrecoverable impact on the accuracy.</p></li>
</ol>
<p>To tune <code class="docutils literal notranslate"><span class="pre">start_powerSGD_iter</span></code>, we suggest starting with 10% of the total training steps and increasing it until a satisfactory accuracy is reached. If there is a warm-up stage in the training, <code class="docutils literal notranslate"><span class="pre">start_powerSGD_iter</span></code> typically should be no less than the number of warm-up steps.</p>
<ol class="arabic simple" start="3">
<li><p><code class="docutils literal notranslate"><span class="pre">min_compression_rate</span></code> is the minimum compression rate required when a layer is compressed. Due to the computation overheads incurred by the compression, a tensor is worth compressing only if there can be sufficient saving in bandwidth, where <code class="docutils literal notranslate"><span class="pre">(num_rows</span> <span class="pre">+</span> <span class="pre">num_cols)</span> <span class="pre">*</span> <span class="pre">matrix_approximation_rank</span> <span class="pre">*</span> <span class="pre">min_compression_rate</span> <span class="pre">&lt;</span> <span class="pre">num_rows</span> <span class="pre">*</span> <span class="pre">num_cols</span></code>. If the specified compression rate threshold cannot be satisfied, the tensor will be directly allreduced without compression.</p></li>
</ol>
<p>Compression statistics are logged every <code class="docutils literal notranslate"><span class="pre">compression_stats_logging_frequency</span></code> iterations once PowerSGD compression starts.</p>
<ol class="arabic simple" start="4">
<li><p><code class="docutils literal notranslate"><span class="pre">orthogonalization_epsilon</span></code> can be a very small value (e.g., 1e-8) added to every normalized matrix column in orthogonalization step, to prevent div-by-zero error if any column has all 0s. If this can already be prevented (e.g., by batch normalization), an epsilon of 0 is recommended for accuracy.</p></li>
<li><p><code class="docutils literal notranslate"><span class="pre">batch_tensors_with_same_shape</span></code> controls whether to compress and decompress tensors with same shape in a batched operation to achieve higher parallelism. Note that you should also increase the bucket size (i.e., <code class="docutils literal notranslate"><span class="pre">bucket_cap_mb</span></code> arg in DDP constructor) to make more same-shaped tensors appear in the same bucket, however this may reduce the overlap between computation and communication, and increase the memory footprint due to stacking the tensors of the same shape. Set to <code class="docutils literal notranslate"><span class="pre">True</span></code> if the compression / decompression computation is a bottleneck.</p></li>
</ol>
<div class="admonition warning">
<p class="admonition-title">Warning</p>
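<p>If error feedback (<code class="docutils literal notranslate"><span class="pre">use_error_feedback</span></code>) or warm start (<code class="docutils literal notranslate"><span class="pre">warm_start</span></code>) is enabled, the minimum value of <code class="docutils literal notranslate"><span class="pre">start_powerSGD_iter</span></code> allowed in DDP is 2.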
This is because there is another internal optimization that rebuilds buckets at iteration 1 in DDP,
and this can conflict with any tensor memorized before the rebuild process.</p>
</div>
<dl class="field-list simple">
</dl>
</dd></dl>

</section>
<section id="powersgd-hooks">
<h3>PowerSGD Hooks<a class="headerlink" href="#powersgd-hooks" title="Permalink to this heading">¶</a></h3>
<div class="admonition warning">
<p class="admonition-title">Warning</p>
<p>PowerSGD typically requires extra memory of the same size as the model’s
gradients to enable error feedback, which can compensate for biased
compressed communication and improve accuracy.</p>
</div>
<div class="admonition warning">
<p class="admonition-title">Warning</p>
<p>PowerSGD hooks may conflict with <a class="reference external" href="https://github.com/NVIDIA/apex">Apex automatic mixed precision package</a>.
Please use PyTorch <a class="reference external" href="https://pytorch.org/docs/stable/amp.html">native automatic mixed precision package</a>
instead.</p>
</div>
<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook.powerSGD_hook">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook.</span></span><span class="sig-name descname"><span class="pre">powerSGD_hook</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">state</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">bucket</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.html#powerSGD_hook"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook.powerSGD_hook" title="Permalink to this definition">¶</a></dt>
<dd><p>This DDP communication hook implements PowerSGD gradient compression
algorithm described in the <a class="reference external" href="https://arxiv.org/abs/1905.13727">paper</a>.
Once gradient tensors are aggregated across all workers, this hook applies
compression as follows:</p>
<ol class="arabic">
<li><p>Views the input flattened 1D gradient tensor as a list of per-parameter tensors, and divides all the tensors into two groups:</p>
<blockquote>
<div><p>1.1 The tensors that should be compressed before allreduce, because the compression can give enough saving in bandwidth.</p>
<p>1.2 Rest of the tensors will be directly allreduced without compression, including all the vector tensors (for biases).</p>
</div></blockquote>
</li>
<li><p>Handles uncompressed tensors:</p>
<blockquote>
<div><p>2.1. Allocates contiguous memory for those uncompressed tensors, and allreduces all the uncompressed tensors as a batch, without compression;</p>
<p>2.2. Copies the individual uncompressed tensors from the contiguous memory back to the input tensor.</p>
</div></blockquote>
</li>
<li><p>Handles the tensors that should be compressed by PowerSGD compression:</p>
<blockquote>
<div><p>3.1. For each tensor M, creates two low-rank tensors P and Q for decomposing M,
such that M = PQ^T, where Q is initialized from a standard normal distribution and orthogonalized;</p>
<p>3.2. Computes each P in Ps, which is equal to MQ;</p>
<p>3.3. Allreduces Ps as a batch;</p>
<p>3.4. Orthogonalizes each P in Ps;</p>
<p>3.5. Computes each Q in Qs, which is approximately equal to M^TP;</p>
<p>3.6. Allreduces Qs as a batch;</p>
<p>3.7. Computes each M among all the compressed tensors, which is approximately equal to PQ^T.</p>
</div></blockquote>
</li>
</ol>
<p>Note that this communication hook enforces vanilla allreduce for the first <code class="docutils literal notranslate"><span class="pre">state.start_powerSGD_iter</span></code> iterations.
This not only gives the user more control over the tradeoff between speedup and accuracy,
but also helps abstract away some complexity of the internal optimization of DDP for future communication hook developers.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>state</strong> (<a class="reference internal" href="#torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook.PowerSGDState" title="torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook.PowerSGDState"><em>PowerSGDState</em></a>) – State information to configure the compression rate and support error feedback, warm start, etc.
To tune the compression configs, one mainly needs to tune <code class="docutils literal notranslate"><span class="pre">matrix_approximation_rank</span></code>, <code class="docutils literal notranslate"><span class="pre">start_powerSGD_iter</span></code>
and <code class="docutils literal notranslate"><span class="pre">min_compression_rate</span></code>.</p></li>
<li><p><strong>bucket</strong> (<em>dist.GradBucket</em>) – Bucket that stores a 1D flattened gradient tensor that batches multiple per-variable tensors.
Note that since DDP comm hook only supports single process single device mode,
only exactly one tensor is stored in this bucket.</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>Future handler of the communication, which updates the gradients in place.</p>
</dd>
<dt class="field-odd">Return type<span class="colon">:</span></dt>
<dd class="field-odd"><p><a class="reference internal" href="futures.html#torch.futures.Future" title="torch.jit.Future"><em>Future</em></a>[<a class="reference internal" href="tensors.html#torch.Tensor" title="torch.Tensor"><em>Tensor</em></a>]</p>
</dd>
</dl>
<dl>
<dt>Example::</dt><dd><div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">state</span> <span class="o">=</span> <span class="n">PowerSGDState</span><span class="p">(</span><span class="n">process_group</span><span class="o">=</span><span class="n">process_group</span><span class="p">,</span> <span class="n">matrix_approximation_rank</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="go">                          start_powerSGD_iter=10, min_compression_rate=0.5)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">ddp_model</span><span class="o">.</span><span class="n">register_comm_hook</span><span class="p">(</span><span class="n">state</span><span class="p">,</span> <span class="n">powerSGD_hook</span><span class="p">)</span>
</pre></div>
</div>
</dd>
</dl>
</dd></dl>

<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook.batched_powerSGD_hook">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook.</span></span><span class="sig-name descname"><span class="pre">batched_powerSGD_hook</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">state</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">bucket</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.html#batched_powerSGD_hook"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook.batched_powerSGD_hook" title="Permalink to this definition">¶</a></dt>
<dd><p>This DDP communication hook implements a simplified PowerSGD gradient compression
algorithm described in the <a class="reference external" href="https://arxiv.org/abs/1905.13727">paper</a>.
This variant does not compress the gradients layer by layer,
but instead compresses the flattened input tensor that batches all the gradients.
Therefore, it is <strong>faster</strong> than <a class="reference internal" href="#torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook.powerSGD_hook" title="torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook.powerSGD_hook"><code class="xref py py-meth docutils literal notranslate"><span class="pre">powerSGD_hook()</span></code></a>,
but usually results in a <strong>much lower accuracy</strong>, unless <code class="docutils literal notranslate"><span class="pre">matrix_approximation_rank</span></code> is 1.</p>
<div class="admonition warning">
<p class="admonition-title">Warning</p>
<p>Increasing <code class="docutils literal notranslate"><span class="pre">matrix_approximation_rank</span></code> here may not necessarily increase the accuracy,
because batching per-parameter tensors without column/row alignment can destroy low-rank structure.
Therefore, the user should always consider <a class="reference internal" href="#torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook.powerSGD_hook" title="torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook.powerSGD_hook"><code class="xref py py-meth docutils literal notranslate"><span class="pre">powerSGD_hook()</span></code></a> first,
and only consider this variant when a satisfactory accuracy can be achieved when <code class="docutils literal notranslate"><span class="pre">matrix_approximation_rank</span></code> is 1.</p>
</div>
<p>Once gradient tensors are aggregated across all workers, this hook applies
compression as follows:</p>
<ol class="arabic simple">
<li><p>Views the input flattened 1D gradient tensor as a square-shaped tensor M with zero padding;</p></li>
<li><p>Creates two low-rank tensors P and Q for decomposing M, such that M = PQ^T, where Q is initialized from a standard normal distribution and orthogonalized;</p></li>
<li><p>Computes P, which is equal to MQ;</p></li>
<li><p>Allreduces P;</p></li>
<li><p>Orthogonalizes P;</p></li>
<li><p>Computes Q, which is approximately equal to M^TP;</p></li>
<li><p>Allreduces Q;</p></li>
<li><p>Computes M, which is approximately equal to PQ^T.</p></li>
<li><p>Truncates the input tensor to the original length.</p></li>
</ol>
<p>Note that this communication hook enforces vanilla allreduce for the first <code class="docutils literal notranslate"><span class="pre">state.start_powerSGD_iter</span></code> iterations.
This not only gives the user more control over the tradeoff between speedup and accuracy,
but also helps abstract away some complexity of the internal optimization of DDP for future communication hook developers.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>state</strong> (<a class="reference internal" href="#torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook.PowerSGDState" title="torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook.PowerSGDState"><em>PowerSGDState</em></a>) – State information to configure the compression rate and support error feedback, warm start, etc.
To tune the compression configs, one mainly needs to tune <code class="docutils literal notranslate"><span class="pre">matrix_approximation_rank</span></code> and <code class="docutils literal notranslate"><span class="pre">start_powerSGD_iter</span></code>.</p></li>
<li><p><strong>bucket</strong> (<em>dist.GradBucket</em>) – Bucket that stores a 1D flattened gradient tensor that batches multiple per-variable tensors.
Note that since DDP comm hook only supports single process single device mode,
only exactly one tensor is stored in this bucket.</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>Future handler of the communication, which updates the gradients in place.</p>
</dd>
<dt class="field-odd">Return type<span class="colon">:</span></dt>
<dd class="field-odd"><p><a class="reference internal" href="futures.html#torch.futures.Future" title="torch.jit.Future"><em>Future</em></a>[<a class="reference internal" href="tensors.html#torch.Tensor" title="torch.Tensor"><em>Tensor</em></a>]</p>
</dd>
</dl>
<dl>
<dt>Example::</dt><dd><div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">state</span> <span class="o">=</span> <span class="n">PowerSGDState</span><span class="p">(</span><span class="n">process_group</span><span class="o">=</span><span class="n">process_group</span><span class="p">,</span> <span class="n">matrix_approximation_rank</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">ddp_model</span><span class="o">.</span><span class="n">register_comm_hook</span><span class="p">(</span><span class="n">state</span><span class="p">,</span> <span class="n">batched_powerSGD_hook</span><span class="p">)</span>
</pre></div>
</div>
</dd>
</dl>
</dd></dl>

</section>
</section>
<section id="debugging-communication-hooks">
<h2>Debugging Communication Hooks<a class="headerlink" href="#debugging-communication-hooks" title="Permalink to this heading">¶</a></h2>
<p>As the name implies, debugging communication hooks are <strong>only</strong> used for debugging and performance optimization purposes.</p>
<div class="admonition warning">
<p class="admonition-title">Warning</p>
<p>Debugging communication hooks do not necessarily output the correct results.</p>
</div>
<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.algorithms.ddp_comm_hooks.debugging_hooks.noop_hook">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.algorithms.ddp_comm_hooks.debugging_hooks.</span></span><span class="sig-name descname"><span class="pre">noop_hook</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">_</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">bucket</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/torch/distributed/algorithms/ddp_comm_hooks/debugging_hooks.html#noop_hook"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#torch.distributed.algorithms.ddp_comm_hooks.debugging_hooks.noop_hook" title="Permalink to this definition">¶</a></dt>
<dd><p>This DDP communication hook returns a future that wraps the input,
so it is a noop that does not incur any communication overheads.</p>
<p>This hook should <strong>only</strong> be used for headroom analysis of allreduce optimization,
instead of the normal gradient synchronization.
For example, if only less than 10% speedup of training time can be observed after this hook is registered,
it usually implies that allreduce is not a performance bottleneck for this case.
Such instrumentation can be particularly useful
if GPU traces cannot be easily retrieved or the trace analysis is complicated
by some factors such as the overlap between allreduce and computation or the desynchronization across ranks.</p>
<dl>
<dt>Example::</dt><dd><div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">ddp_model</span><span class="o">.</span><span class="n">register_comm_hook</span><span class="p">(</span><span class="kc">None</span><span class="p">,</span> <span class="n">noop_hook</span><span class="p">)</span>
</pre></div>
</div>
</dd>
</dl>
<dl class="field-list simple">
<dt class="field-odd">Return type<span class="colon">:</span></dt>
<dd class="field-odd"><p><a class="reference internal" href="futures.html#torch.futures.Future" title="torch.jit.Future"><em>Future</em></a>[<a class="reference internal" href="tensors.html#torch.Tensor" title="torch.Tensor"><em>Tensor</em></a>]</p>
</dd>
</dl>
</dd></dl>

</section>
<section id="checkpointing-of-communication-hooks">
<h2>Checkpointing of Communication Hooks<a class="headerlink" href="#checkpointing-of-communication-hooks" title="Permalink to this heading">¶</a></h2>
<p>A stateful communication hook can be saved as a part of model checkpointing to enable trainer restarts.
To make a hook serializable, <code class="docutils literal notranslate"><span class="pre">__setstate__</span></code> and <code class="docutils literal notranslate"><span class="pre">__getstate__</span></code> should be defined.</p>
<div class="admonition warning">
<p class="admonition-title">Warning</p>
<p><code class="docutils literal notranslate"><span class="pre">__getstate__</span></code> should exclude non-serializable attributes from a returned dictionary.</p>
</div>
<div class="admonition warning">
<p class="admonition-title">Warning</p>
<p><code class="docutils literal notranslate"><span class="pre">__setstate__</span></code> should properly initialize non-serializable attributes, excluded from a provided <code class="docutils literal notranslate"><span class="pre">state</span></code>.</p>
</div>
<p><a class="reference internal" href="#torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook.PowerSGDState" title="torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook.PowerSGDState"><code class="xref py py-class docutils literal notranslate"><span class="pre">PowerSGDState</span></code></a> has <code class="docutils literal notranslate"><span class="pre">__setstate__</span></code> and <code class="docutils literal notranslate"><span class="pre">__getstate__</span></code> implemented and can be used as a reference.</p>
<dl class="py class">
<dt class="sig sig-object py">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook.</span></span><span class="sig-name descname"><span class="pre">PowerSGDState</span></span><a class="reference internal" href="_modules/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.html#PowerSGDState"><span class="viewcode-link"><span class="pre">[source]</span></span></a></dt>
<dd><dl class="field-list simple">
</dl>
<dl class="py method">
<dt class="sig sig-object py" id="torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook.PowerSGDState.__getstate__">
<span class="sig-name descname"><span class="pre">__getstate__</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.html#PowerSGDState.__getstate__"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook.PowerSGDState.__getstate__" title="Permalink to this definition">¶</a></dt>
<dd><p>Returns a <code class="docutils literal notranslate"><span class="pre">Dict[str,</span> <span class="pre">Any]</span></code> which will be pickled and saved.
<code class="docutils literal notranslate"><span class="pre">process_group</span></code> is not serializable and excluded from
a returned state.</p>
</dd></dl>

<dl class="py method">
<dt class="sig sig-object py" id="torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook.PowerSGDState.__setstate__">
<span class="sig-name descname"><span class="pre">__setstate__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">state</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.html#PowerSGDState.__setstate__"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook.PowerSGDState.__setstate__" title="Permalink to this definition">¶</a></dt>
<dd><p>Takes a provided <code class="docutils literal notranslate"><span class="pre">state</span></code> and retrieves <code class="docutils literal notranslate"><span class="pre">PowerSGDState</span></code>.
<code class="docutils literal notranslate"><span class="pre">process_group</span></code> is set to default.</p>
</dd></dl>

</dd></dl>

<p>Here is a simple, end-to-end example of saving and reloading PowerSGD state and hook.</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">os</span>
<span class="kn">import</span> <span class="nn">sys</span>
<span class="kn">import</span> <span class="nn">tempfile</span>
<span class="kn">import</span> <span class="nn">torch</span>
<span class="kn">import</span> <span class="nn">torch.distributed</span> <span class="k">as</span> <span class="nn">dist</span>
<span class="kn">import</span> <span class="nn">torch.nn</span> <span class="k">as</span> <span class="nn">nn</span>
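<span class="kn">import</span> <span class="nn">torch.multiprocessing</span> <span class="k">as</span> <span class="nn">mp</span>
<span class="kn">import</span> <span class="nn">torch.optim</span> <span class="k">as</span> <span class="nn">optim</span>

<span class="kn">from</span> <span class="nn">torch.distributed.algorithms.ddp_comm_hooks</span> <span class="kn">import</span> <span class="n">powerSGD_hook</span> <span class="k">as</span> <span class="n">powerSGD</span>
<span class="kn">from</span> <span class="nn">torch.nn.parallel</span> <span class="kn">import</span> <span class="n">DistributedDataParallel</span>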

<span class="k">class</span> <span class="nc">SimpleModel</span><span class="p">(</span><span class="n">nn</span><span class="o">.</span><span class="n">Module</span><span class="p">):</span>
    <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
        <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
        <span class="bp">self</span><span class="o">.</span><span class="n">fc1</span> <span class="o">=</span> <span class="n">nn</span><span class="o">.</span><span class="n">Linear</span><span class="p">(</span><span class="mi">24</span><span class="p">,</span><span class="mi">24</span><span class="p">)</span>
        <span class="bp">self</span><span class="o">.</span><span class="n">relu</span> <span class="o">=</span> <span class="n">nn</span><span class="o">.</span><span class="n">ReLU</span><span class="p">()</span>
        <span class="bp">self</span><span class="o">.</span><span class="n">fc2</span> <span class="o">=</span> <span class="n">nn</span><span class="o">.</span><span class="n">Linear</span><span class="p">(</span><span class="mi">24</span><span class="p">,</span><span class="mi">12</span><span class="p">)</span>

    <span class="k">def</span> <span class="nf">forward</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">x</span><span class="p">):</span>
        <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">fc2</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">relu</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">fc1</span><span class="p">(</span><span class="n">x</span><span class="p">)))</span>

<span class="k">def</span> <span class="nf">setup</span><span class="p">(</span><span class="n">rank</span><span class="p">,</span> <span class="n">world_size</span><span class="p">):</span>
    <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="p">[</span><span class="s1">&#39;MASTER_ADDR&#39;</span><span class="p">]</span> <span class="o">=</span> <span class="s1">&#39;localhost&#39;</span>
    <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="p">[</span><span class="s1">&#39;MASTER_PORT&#39;</span><span class="p">]</span> <span class="o">=</span> <span class="s1">&#39;12355&#39;</span>

    <span class="c1"># initialize the process group</span>
    <span class="n">dist</span><span class="o">.</span><span class="n">init_process_group</span><span class="p">(</span><span class="s2">&quot;nccl&quot;</span><span class="p">,</span> <span class="n">rank</span><span class="o">=</span><span class="n">rank</span><span class="p">,</span> <span class="n">world_size</span><span class="o">=</span><span class="n">world_size</span><span class="p">)</span>

<span class="k">def</span> <span class="nf">cleanup</span><span class="p">():</span>
    <span class="n">dist</span><span class="o">.</span><span class="n">destroy_process_group</span><span class="p">()</span>

<span class="k">def</span> <span class="nf">run_demo</span><span class="p">(</span><span class="n">demo_fn</span><span class="p">,</span> <span class="n">world_size</span><span class="p">):</span>
    <span class="n">mp</span><span class="o">.</span><span class="n">spawn</span><span class="p">(</span>
        <span class="n">demo_fn</span><span class="p">,</span>
        <span class="n">args</span><span class="o">=</span><span class="p">(</span><span class="n">world_size</span><span class="p">,),</span>
        <span class="n">nprocs</span><span class="o">=</span><span class="n">world_size</span><span class="p">,</span>
        <span class="n">join</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>

<span class="k">def</span> <span class="nf">demo_serialization</span><span class="p">(</span><span class="n">rank</span><span class="p">,</span> <span class="n">world_size</span><span class="p">):</span>
    <span class="n">setup</span><span class="p">(</span><span class="n">rank</span><span class="p">,</span> <span class="n">world_size</span><span class="p">)</span>

    <span class="n">CHECKPOINT</span> <span class="o">=</span> <span class="n">tempfile</span><span class="o">.</span><span class="n">gettempdir</span><span class="p">()</span> <span class="o">+</span> <span class="s2">&quot;/checkpoint.pt&quot;</span>

    <span class="n">model</span> <span class="o">=</span> <span class="n">SimpleModel</span><span class="p">()</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">rank</span><span class="p">)</span>
    <span class="n">ddp_model</span> <span class="o">=</span> <span class="n">DistributedDataParallel</span><span class="p">(</span><span class="n">model</span><span class="p">,</span> <span class="n">device_ids</span><span class="o">=</span><span class="p">[</span><span class="n">rank</span><span class="p">])</span>

    <span class="n">powersgd_hook</span> <span class="o">=</span> <span class="n">powerSGD</span><span class="o">.</span><span class="n">powerSGD_hook</span>
    <span class="n">powersgd_state</span> <span class="o">=</span> <span class="n">powerSGD</span><span class="o">.</span><span class="n">PowerSGDState</span><span class="p">(</span><span class="n">process_group</span><span class="o">=</span><span class="kc">None</span><span class="p">)</span>

    <span class="n">optimizer</span> <span class="o">=</span> <span class="n">optim</span><span class="o">.</span><span class="n">SGD</span><span class="p">(</span><span class="n">ddp_model</span><span class="o">.</span><span class="n">parameters</span><span class="p">(),</span> <span class="n">lr</span><span class="o">=</span><span class="mf">0.001</span><span class="p">)</span>
    <span class="n">ddp_model</span><span class="o">.</span><span class="n">register_comm_hook</span><span class="p">(</span><span class="n">powersgd_state</span><span class="p">,</span> <span class="n">powersgd_hook</span><span class="p">)</span>

    <span class="n">state</span> <span class="o">=</span> <span class="p">{</span>
        <span class="s1">&#39;state_dict&#39;</span><span class="p">:</span> <span class="n">ddp_model</span><span class="o">.</span><span class="n">state_dict</span><span class="p">(),</span>
        <span class="s1">&#39;comm_hook&#39;</span><span class="p">:</span> <span class="n">hook</span><span class="p">,</span>
        <span class="s1">&#39;comm_hook_state&#39;</span><span class="p">:</span> <span class="n">hook_state</span><span class="p">}</span>

    <span class="k">if</span> <span class="n">rank</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
        <span class="n">torch</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">state</span><span class="p">,</span> <span class="n">CHECKPOINT</span><span class="p">)</span>

    <span class="n">dist</span><span class="o">.</span><span class="n">barrier</span><span class="p">()</span>
    <span class="n">map_location</span> <span class="o">=</span> <span class="p">{</span><span class="s1">&#39;cuda:</span><span class="si">%d</span><span class="s1">&#39;</span> <span class="o">%</span> <span class="mi">0</span><span class="p">:</span> <span class="s1">&#39;cuda:</span><span class="si">%d</span><span class="s1">&#39;</span> <span class="o">%</span> <span class="n">rank</span><span class="p">}</span>
    <span class="n">checkpoint</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">CHECKPOINT</span><span class="p">,</span> <span class="n">map_location</span><span class="o">=</span><span class="n">map_location</span><span class="p">)</span>

    <span class="n">ddp_model</span><span class="o">.</span><span class="n">load_state_dict</span><span class="p">(</span><span class="n">checkpoint</span><span class="p">[</span><span class="s1">&#39;state_dict&#39;</span><span class="p">])</span>
    <span class="n">powersgd_hook</span> <span class="o">=</span> <span class="n">checkpoint</span><span class="p">[</span><span class="s1">&#39;comm_hook&#39;</span><span class="p">]</span>
    <span class="n">powersgd_state</span> <span class="o">=</span> <span class="n">checkpoint</span><span class="p">[</span><span class="s1">&#39;comm_hook_state&#39;</span><span class="p">]</span>

    <span class="n">ddp_model</span><span class="o">.</span><span class="n">register_comm_hook</span><span class="p">(</span><span class="n">powersgd_state</span><span class="p">,</span> <span class="n">powersgd_hook</span><span class="p">)</span>

    <span class="k">if</span> <span class="n">rank</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
        <span class="n">os</span><span class="o">.</span><span class="n">remove</span><span class="p">(</span><span class="n">CHECKPOINT</span><span class="p">)</span>

    <span class="n">cleanup</span><span class="p">()</span>

<span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s2">&quot;__main__&quot;</span><span class="p">:</span>
    <span class="n">n_gpus</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">device_count</span><span class="p">()</span>
    <span class="k">assert</span> <span class="n">n_gpus</span> <span class="o">&gt;=</span> <span class="mi">2</span><span class="p">,</span> <span class="sa">f</span><span class="s2">&quot;Requires at least 2 GPUs to run, but got </span><span class="si">{</span><span class="n">n_gpus</span><span class="si">}</span><span class="s2">&quot;</span>
    <span class="n">world_size</span> <span class="o">=</span> <span class="n">n_gpus</span>
    <span class="n">run_demo</span><span class="p">(</span><span class="n">demo_serialization</span><span class="p">,</span> <span class="n">world_size</span><span class="p">)</span>
</pre></div>
</div>
</section>
<section id="acknowledgements">
<h2>Acknowledgements<a class="headerlink" href="#acknowledgements" title="Permalink to this heading">¶</a></h2>
<p>Many thanks to PowerSGD paper author <strong>Thijs Vogels</strong> for the code review on
PowerSGD communication hook, as well as the
<a class="reference external" href="https://observablehq.com/&#64;tvogels/powersgd-benchmark">comparison experiments</a>,
which show that the performance of PowerSGD communication hook is on par with
the implementation in the original <a class="reference external" href="https://arxiv.org/abs/1905.13727">paper</a>.</p>
</section>
</section>


             </article>
             
            </div>
            <footer>
  
    <div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
      
        <a href="pipeline.html" class="btn btn-neutral float-right" title="Pipeline Parallelism" accesskey="n" rel="next">Next <img src="_static/images/chevron-right-orange.svg" class="next-page" alt=""></a>
      
      
        <a href="complex_numbers.html" class="btn btn-neutral" title="Complex Numbers" accesskey="p" rel="prev"><img src="_static/images/chevron-right-orange.svg" class="previous-page" alt=""> Previous</a>
      
    </div>
  

    <hr>

  
  <div role="contentinfo">
    <p>
        &copy; Copyright 2023, PyTorch Contributors.

    </p>
  </div>
    
      <div>
        Built with <a href="http://sphinx-doc.org/">Sphinx</a> using a <a href="https://github.com/rtfd/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
      </div>
     

</footer>

          </div>
<script>

// Prepend an "internal API" note to the article when the page URL contains a
// segment matched by the pattern below ("/_<name>.html"-style paths or "_dynamo").
var match = window.location.href.match(/\/_[a-zA-Z0-9_]*.html|_dynamo/gi);

// String.prototype.match returns null when nothing matches, which is the case
// on ordinary pages; indexing into null would throw a TypeError. Fall back to
// -1, the same "not found" value lastIndexOf itself reports.
var url = match ? window.location.href.lastIndexOf(match[match.length-1]) : -1;

// Only inject the note when a match exists and the target container is present,
// so a missing #pytorch-article element cannot raise either.
if (match && url && document.getElementById("pytorch-article"))
  {
    var div = '<div class="admonition note"><p class="admonition-title">Note</p><p><i class="fa fa-exclamation-circle" aria-hidden="true">&nbsp</i> This page describes an internal API which is not intended to be used outside of the PyTorch codebase and can be modified or removed without notice.</p></div>'
    document.getElementById("pytorch-article").insertAdjacentHTML('afterBegin', div)
  }
</script>
        </div>

        <div class="pytorch-content-right" id="pytorch-content-right">
          <div class="pytorch-right-menu" id="pytorch-right-menu">
            <div class="pytorch-side-scroll" id="pytorch-side-scroll-right">
              <ul>
<li><a class="reference internal" href="#">DDP Communication Hooks</a><ul>
<li><a class="reference internal" href="#how-to-use-a-communication-hook">How to Use a Communication Hook?</a></li>
<li><a class="reference internal" href="#what-does-a-communication-hook-operate-on">What Does a Communication Hook Operate On?</a></li>
<li><a class="reference internal" href="#default-communication-hooks">Default Communication Hooks</a></li>
<li><a class="reference internal" href="#powersgd-communication-hook">PowerSGD Communication Hook</a><ul>
<li><a class="reference internal" href="#powersgd-state">PowerSGD State</a></li>
<li><a class="reference internal" href="#powersgd-hooks">PowerSGD Hooks</a></li>
</ul>
</li>
<li><a class="reference internal" href="#debugging-communication-hooks">Debugging Communication Hooks</a></li>
<li><a class="reference internal" href="#checkpointing-of-communication-hooks">Checkpointing of Communication Hooks</a></li>
<li><a class="reference internal" href="#acknowledgements">Acknowledgements</a></li>
</ul>
</li>
</ul>

            </div>
          </div>
        </div>
      </section>
    </div>

  
       <!-- Loaded exactly once: the element id must be unique in the document. -->
         <script data-url_root="./" id="documentation_options" src="_static/documentation_options.js"></script>
         <script src="_static/jquery.js"></script>
         <script src="_static/underscore.js"></script>
         <script src="_static/_sphinx_javascript_frameworks_compat.js"></script>
         <script src="_static/doctools.js"></script>
         <script src="_static/clipboard.min.js"></script>
         <script src="_static/copybutton.js"></script>
     

  <script type="text/javascript" src="_static/js/vendor/popper.min.js"></script>
  <script type="text/javascript" src="_static/js/vendor/bootstrap.min.js"></script>
  <script src="https://cdnjs.cloudflare.com/ajax/libs/list.js/1.5.0/list.min.js"></script>
  <script type="text/javascript" src="_static/js/theme.js"></script>

  <script type="text/javascript">
      // When the DOM is ready, call the theme's Navigation.enable with `true`.
      jQuery(document).ready(function () {
          SphinxRtdTheme.Navigation.enable(true);
      });
  </script>
 
<script type="text/javascript">
  // Global list of section titles.
  // NOTE(review): presumably read by the theme scripts to decide which
  // left-nav sections start collapsed; confirm against theme.js.
  var collapsedSections = ['Developer Notes', 'Language Bindings', 'Libraries', 'Community'];
</script>

<img height="1" width="1" style="border-style:none;" alt="" src="https://www.googleadservices.com/pagead/conversion/795629140/?label=txkmCPmdtosBENSssfsC&amp;guid=ON&amp;script=0"/>


  <!-- Begin Footer -->

  <div class="container-fluid docs-tutorials-resources" id="docs-tutorials-resources">
    <div class="container">
      <div class="row">
        <div class="col-md-4 text-center">
          <h2>Docs</h2>
          <p>Access comprehensive developer documentation for PyTorch</p>
          <a class="with-right-arrow" href="https://pytorch.org/docs/stable/index.html">View Docs</a>
        </div>

        <div class="col-md-4 text-center">
          <h2>Tutorials</h2>
          <p>Get in-depth tutorials for beginners and advanced developers</p>
          <a class="with-right-arrow" href="https://pytorch.org/tutorials">View Tutorials</a>
        </div>

        <div class="col-md-4 text-center">
          <h2>Resources</h2>
          <p>Find development resources and get your questions answered</p>
          <a class="with-right-arrow" href="https://pytorch.org/resources">View Resources</a>
        </div>
      </div>
    </div>
  </div>

  <footer class="site-footer">
    <div class="container footer-container">
      <div class="footer-logo-wrapper">
        <a href="https://pytorch.org/" class="footer-logo" aria-label="PyTorch"></a>
      </div>

      <div class="footer-links-wrapper">
        <div class="footer-links-col">
          <ul>
            <li class="list-title"><a href="https://pytorch.org/">PyTorch</a></li>
            <li><a href="https://pytorch.org/get-started">Get Started</a></li>
            <li><a href="https://pytorch.org/features">Features</a></li>
            <li><a href="https://pytorch.org/ecosystem">Ecosystem</a></li>
            <li><a href="https://pytorch.org/blog/">Blog</a></li>
            <li><a href="https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md">Contributing</a></li>
          </ul>
        </div>

        <div class="footer-links-col">
          <ul>
            <li class="list-title"><a href="https://pytorch.org/resources">Resources</a></li>
            <li><a href="https://pytorch.org/tutorials">Tutorials</a></li>
            <li><a href="https://pytorch.org/docs/stable/index.html">Docs</a></li>
            <li><a href="https://discuss.pytorch.org" target="_blank">Discuss</a></li>
            <li><a href="https://github.com/pytorch/pytorch/issues" target="_blank">Github Issues</a></li>
            <li><a href="https://pytorch.org/assets/brand-guidelines/PyTorch-Brand-Guidelines.pdf" target="_blank">Brand Guidelines</a></li>
          </ul>
        </div>

        <div class="footer-links-col">
          <ul>
            <li class="list-title">Stay up to date</li>
            <li><a href="https://www.facebook.com/pytorch" target="_blank">Facebook</a></li>
            <li><a href="https://twitter.com/pytorch" target="_blank">Twitter</a></li>
            <li><a href="https://www.youtube.com/pytorch" target="_blank">YouTube</a></li>
            <li><a href="https://www.linkedin.com/company/pytorch" target="_blank">LinkedIn</a></li>
          </ul>  
          </div>

        <div class="footer-links-col">
          <ul>
            <li class="list-title">PyTorch Podcasts</li>
            <li><a href="https://open.spotify.com/show/6UzHKeiy368jKfQMKKvJY5" target="_blank">Spotify</a></li>
            <li><a href="https://podcasts.apple.com/us/podcast/pytorch-developer-podcast/id1566080008" target="_blank">Apple</a></li>
            <li><a href="https://www.google.com/podcasts?feed=aHR0cHM6Ly9mZWVkcy5zaW1wbGVjYXN0LmNvbS9PQjVGa0lsOA%3D%3D" target="_blank">Google</a></li>
            <li><a href="https://music.amazon.com/podcasts/7a4e6f0e-26c2-49e9-a478-41bd244197d0/PyTorch-Developer-Podcast?" target="_blank">Amazon</a></li>
          </ul>
         </div>
        </div>
        
        <div class="privacy-policy">
          <ul>
            <li class="privacy-policy-links"><a href="https://www.linuxfoundation.org/terms/" target="_blank">Terms</a></li>
            <li class="privacy-policy-links">|</li>
            <li class="privacy-policy-links"><a href="https://www.linuxfoundation.org/privacy-policy/" target="_blank">Privacy</a></li>
          </ul>
        </div>
        <div class="copyright">
        <p>© Copyright The Linux Foundation. The PyTorch Foundation is a project of The Linux Foundation.
          For web site terms of use, trademark policy and other policies applicable to The PyTorch Foundation please see
          <a href="https://www.linuxfoundation.org/policies/">www.linuxfoundation.org/policies/</a>. The PyTorch Foundation supports the PyTorch open source
          project, which has been established as PyTorch Project a Series of LF Projects, LLC. For policies applicable to the PyTorch Project a Series of LF Projects, LLC,
          please see <a href="https://www.lfprojects.org/policies/">www.lfprojects.org/policies/</a>.</p>
      </div>
     </div>

  </footer>

  <div class="cookie-banner-wrapper">
  <div class="container">
    <p class="gdpr-notice">To analyze traffic and optimize your experience, we serve cookies on this site. By clicking or navigating, you agree to allow our usage of cookies. As the current maintainers of this site, Facebook’s Cookies Policy applies. Learn more, including about available controls: <a href="https://www.facebook.com/policies/cookies/">Cookies Policy</a>.</p>
    <img class="close-button" src="_static/images/pytorch-x.svg" alt="Close">
  </div>
</div>

  <!-- End Footer -->

  <!-- Begin Mobile Menu -->

  <div class="mobile-main-menu">
    <div class="container-fluid">
      <div class="container">
        <div class="mobile-main-menu-header-container">
          <a class="header-logo" href="https://pytorch.org/" aria-label="PyTorch"></a>
          <a class="main-menu-close-button" href="#" data-behavior="close-mobile-menu" aria-label="Close menu"></a>
        </div>
      </div>
    </div>

    <div class="mobile-main-menu-links-container">
      <div class="main-menu">
        <ul>
          <li>
            <a href="https://pytorch.org/get-started">Get Started</a>
          </li>

          <li>
            <a href="https://pytorch.org/ecosystem">Ecosystem</a>
          </li>
            
          <li>
            <a href="https://pytorch.org/mobile">Mobile</a>
          </li>

          <li>
            <a href="https://pytorch.org/blog/">Blog</a>
          </li>

          <li>
            <a href="https://pytorch.org/tutorials">Tutorials</a>
          </li>

          <li class="resources-mobile-menu-title active">
            Docs
          </li>

          <ul class="resources-mobile-menu-items">
            <li>
              <a href="https://pytorch.org/docs/stable/index.html">PyTorch</a>
            </li>

            <li>
              <a href="https://pytorch.org/audio/stable/index.html">torchaudio</a>
            </li>

            <li>
              <a href="https://pytorch.org/text/stable/index.html">torchtext</a>
            </li>

            <li>
              <a href="https://pytorch.org/vision/stable/index.html">torchvision</a>
            </li>

            <li>
              <a href="https://pytorch.org/torcharrow">torcharrow</a>
            </li>

            <li>
              <a href="https://pytorch.org/data">TorchData</a>
            </li>

            <li>
              <a href="https://pytorch.org/torchrec">TorchRec</a>
            </li>

            <li>
              <a href="https://pytorch.org/serve/">TorchServe</a>
            </li>

            <li>
              <a href="https://pytorch.org/torchx/">TorchX</a>
            </li>

            <li>
              <a href="https://pytorch.org/xla">PyTorch on XLA Devices</a>
            </li>
          </ul>

          <li class="resources-mobile-menu-title">
            Resources
          </li>
            
           <ul class="resources-mobile-menu-items">

            <li>
              <a href="https://pytorch.org/features">About</a>
            </li>

            <li>
              <a href="https://pytorch.org/foundation">PyTorch Foundation</a>
            </li>

            <li>
              <a href="https://pytorch.org/#community-module">Community</a>
            </li>

            <li>
              <a href="https://pytorch.org/community-stories">Community Stories</a>
            </li>

            <li>
              <a href="https://pytorch.org/resources">Developer Resources</a>
            </li>

            <li>
              <a href="https://pytorch.org/events">Events</a>
            </li>

            <li>
              <a href="https://discuss.pytorch.org/">Forums</a>
            </li>

            <li>
              <a href="https://pytorch.org/hub">Models (Beta)</a>
            </li>
          </ul>

          <li>
            <a href="https://github.com/pytorch/pytorch">Github</a>
          </li>
        </ul>
      </div>
    </div>
  </div>

  <!-- End Mobile Menu -->

  <script type="text/javascript" src="_static/js/vendor/anchor.min.js"></script>

  <script type="text/javascript">
    // Runs once the DOM is ready: call bind() on each theme component in turn,
    // then tag the links that wrap inline code.
    $(function () {
      mobileMenu.bind();
      mobileTOC.bind();
      pytorchAnchors.bind();
      sideMenus.bind();
      scrollToAnchor.bind();
      highlightNavigation.bind();
      mainMenuDropdown.bind();
      filterTags.bind();

      // Links cannot be created inside code blocks, so instead mark every
      // article link that contains a code span.
      $("article.pytorch-article a span.pre").closest("a").addClass("has-code");
    });
  </script>
</body>
</html>