
Commit

Comment in pretrain_t5.py to explain how pipeline parallelism is implemented for T5 model
deepakn94 committed Jul 30, 2021
1 parent 46c74b4 commit 2babcaf
Showing 2 changed files with 39 additions and 0 deletions.
3 changes: 3 additions & 0 deletions megatron/schedules.py
@@ -71,6 +71,9 @@ def forward_step(forward_step_func, data_iterator, model, input_tensor, losses_reduced):
        losses_reduced.append(loss_reduced)
    timers('forward-compute').stop()

    # If T5 model (or other model with encoder and decoder)
    # and in decoder stack, then send encoder_hidden_state
    # downstream as well.
    if mpu.is_pipeline_stage_after_split() and \
            args.model_type == ModelType.encoder_and_decoder:
        return [output_tensor, input_tensor[-1]]
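To make the intent of the new branch concrete, here is a small self-contained sketch (not the Megatron-LM code; the tensor names, shapes, and toy decoder block are assumed for illustration) of why a decoder-section stage returns two tensors: every downstream decoder layer still needs the finished encoder activation, so it travels alongside the stage's own output.

import torch

# Assumed hidden-state layout for illustration: [sequence, batch, hidden].
SEQ_LEN, BATCH, HIDDEN = 128, 4, 1024

def decoder_section_forward(decoder_block, decoder_hidden, encoder_hidden):
    # Run one decoder-section stage and hand back both tensors the next
    # stage needs, mirroring `return [output_tensor, input_tensor[-1]]`.
    output_tensor = decoder_block(decoder_hidden, encoder_hidden)
    return [output_tensor, encoder_hidden]

# Toy stand-in for a decoder block (hypothetical, for shape checking only).
toy_block = lambda dec, enc: dec + enc.mean(dim=0, keepdim=True)
dec_h = torch.randn(SEQ_LEN, BATCH, HIDDEN)
enc_h = torch.randn(SEQ_LEN, BATCH, HIDDEN)
sent = decoder_section_forward(toy_block, dec_h, enc_h)
assert len(sent) == 2 and sent[1] is enc_h  # encoder state is forwarded untouched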
36 changes: 36 additions & 0 deletions pretrain_t5.py
@@ -31,6 +31,42 @@
from megatron.utils import average_losses_across_data_parallel_group


"""
Pipeline parallelism for T5
===========================
T5 is a model architecture with both encoder and decoder blocks.
Consequently, pipeline parallelism is implemented slightly differently
compared to architectures like GPT and BERT.
In particular, when pipeline_model_parallel_world_size > 1, each stage
either executes an encoder block or a decoder block. The
--pipeline-model-parallel-split-rank argument controls the rank at which
the split happens: all ranks lower than this argument execute the
encoder block, and all ranks equal to or higher than this argument value
execute the decoder block.
In the encoder section of the model, only one tensor is sent downstream:
the intermediate encoder_hidden_state. In the decoder section of the
model, two tensors are sent downstream in the forward pass: the fully
computed encoder_hidden_state, and the intermediate decoder_hidden_state.
In particular, these are the shapes of the tensors sent between
different workers:
If rank is in decoder section:
intermediate decoder_hidden_state (pre-transpose),
complete encoder_hidden_state (post-transpose).
If rank is at boundary between encoder and decoder sections:
complete encoder_hidden_state (post-transpose).
If rank is in encoder section:
intermediate encoder_hidden_state (pre-transpose).
Additionally, we have code in the backward_step function in schedules.py
to accumulate the encoder_hidden_state gradient across skip connections
(encoder_hidden_state fed in as input to each layer in the decoder).
"""


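The per-rank list in the docstring can also be read as a small lookup. The sketch below is one possible reading (a hypothetical helper that treats the "boundary" rank as the last encoder stage), returning which activations a stage ships downstream:

def tensors_sent_downstream(pipeline_rank, split_rank, num_stages):
    # Which activations a stage sends to the next stage, per the docstring.
    if pipeline_rank == num_stages - 1:
        return []  # last decoder stage computes the loss, nothing to send
    if pipeline_rank < split_rank - 1:
        return ["intermediate encoder_hidden_state (pre-transpose)"]
    if pipeline_rank == split_rank - 1:
        return ["complete encoder_hidden_state (post-transpose)"]
    return ["intermediate decoder_hidden_state (pre-transpose)",
            "complete encoder_hidden_state (post-transpose)"]

# Example with 4 stages split at rank 2: rank 0 sends the encoder
# intermediate, rank 1 the complete encoder state, rank 2 both decoder
# intermediate and complete encoder state, rank 3 computes the loss.
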
def model_provider(pre_process=True, post_process=True,
                   add_encoder=True, add_decoder=True):
    """Build the model."""
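The docstring's last paragraph notes that backward_step in schedules.py accumulates the encoder_hidden_state gradient across the skip connections into the decoder. Below is a minimal sketch of that accumulation idea, with toy tensors standing in for the real buffers (the names here are assumptions, not the schedules.py variables):

import torch

# Toy encoder activation consumed by this stage's decoder layer and, via the
# skip connection, by downstream stages as well.
encoder_hidden_state = torch.randn(8, 2, 16, requires_grad=True)

# Local consumer on this stage: a stand-in for a decoder layer.
local_out = (encoder_hidden_state * 3).sum()
local_out.backward()
local_grad = encoder_hidden_state.grad.clone()

# Gradient w.r.t. the same activation received from the next pipeline stage
# (a random tensor standing in for the communicated buffer).
grad_from_downstream = torch.randn_like(encoder_hidden_state)

# The gradient sent upstream is the sum of both contributions; this is the
# accumulation the docstring attributes to backward_step.
total_grad = local_grad + grad_from_downstream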
