update T5 to use methods from common
huvunvidia committed Oct 29, 2023
1 parent 37009e5 commit c426f19
Showing 24 changed files with 264 additions and 377 deletions.
75 changes: 66 additions & 9 deletions .gitlab-ci.yml
@@ -114,6 +114,20 @@ train.t5_core.220m_tp2_pp1_1node_100steps:
TEST_LEVEL: L0
PYTORCH_IMAGE: nvcr.io/nvidia/pytorch:23.07-py3

train.t5_core.220m_tp4_pp1_1node_100steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: t5
USE_TE: 0
TP_SIZE: 4
PP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 100
TIME_LIMIT: "30:00"
TEST_LEVEL: L0
PYTORCH_IMAGE: nvcr.io/nvidia/pytorch:23.07-py3
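Each job above is assembled with YAML anchors: `<<: *selene-test-launcher` merges the shared launcher template, and `<<: [*VARS]` merges the common variable set before the job-specific keys override it. A minimal stdlib-only sketch of those merge-key semantics (the template contents below are hypothetical stand-ins; the real anchors are defined elsewhere in `.gitlab-ci.yml`):

```python
def merge_keys(mapping):
    """Apply YAML merge-key ('<<') semantics: merged sources provide
    defaults, and keys defined directly in the mapping win."""
    mapping = dict(mapping)  # leave the caller's dict untouched
    sources = mapping.pop("<<", [])
    if isinstance(sources, dict):  # '<<: *one_anchor' form
        sources = [sources]
    merged = {}
    for src in sources:            # '<<: [*a, *b]' form, merged left to right
        merged.update(src)
    merged.update(mapping)         # local keys override merged defaults
    return merged

# Hypothetical stand-in for the *VARS anchor plus one job's overrides.
VARS = {"TEST_LEVEL": "L1", "MAX_STEPS": 50}
job = merge_keys({"<<": [VARS], "RUN_MODEL": "t5", "MAX_STEPS": 100})
print(job["MAX_STEPS"])  # job-level value (100) overrides the shared default
```

This is why each job can restate only the handful of variables that differ from the shared template.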

train.t5_core.220m_te_tp1_pp1_1node_100steps:
<<: *selene-test-launcher
variables:
@@ -142,42 +156,71 @@ train.t5_core.220m_te_tp2_pp1_1node_100steps:
TEST_LEVEL: L0
PYTORCH_IMAGE: nvcr.io/nvidia/pytorch:23.07-py3

train.t5_core.220m_do_tp1_pp1_1node_100steps:
train.t5_core.220m_te_tp4_pp1_1node_100steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: t5
USE_TE: 0
USE_TE: 1
TP_SIZE: 4
PP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 100
TIME_LIMIT: "30:00"
TEST_LEVEL: L0
PYTORCH_IMAGE: nvcr.io/nvidia/pytorch:23.07-py3

train.t5_core.220m_te_nofa_tp1_pp1_1node_100steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: t5
USE_TE: 1
NO_FA: 1
TP_SIZE: 1
PP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 100
TIME_LIMIT: "30:00"
TEST_LEVEL: L0
PYTORCH_IMAGE: nvcr.io/nvidia/pytorch:23.07-py3
ADDITIONAL_PARAMS: "--use-distributed-optimizer"

train.t5_core.220m_do_tp2_pp1_1node_100steps:
train.t5_core.220m_tp4_pp1_sp_1node_100steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: t5
USE_TE: 0
TP_SIZE: 2
TP_SIZE: 4
PP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 100
TIME_LIMIT: "30:00"
TEST_LEVEL: L0
PYTORCH_IMAGE: nvcr.io/nvidia/pytorch:23.07-py3
ADDITIONAL_PARAMS: "--use-distributed-optimizer"
ADDITIONAL_PARAMS: "--sequence-parallel"

train.t5_core.220m_te_do_tp1_pp1_1node_100steps:
train.t5_core.220m_te_tp4_pp1_sp_1node_100steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: t5
USE_TE: 1
TP_SIZE: 4
PP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 100
TIME_LIMIT: "30:00"
TEST_LEVEL: L0
PYTORCH_IMAGE: nvcr.io/nvidia/pytorch:23.07-py3
ADDITIONAL_PARAMS: "--sequence-parallel"
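For context on the `TP_SIZE`/`PP_SIZE`/`NUM_NODES` knobs in these jobs: the remaining data-parallel degree is world_size / (TP × PP). Assuming 8 GPUs per node (a Selene-specific assumption, not stated in this file), a quick sanity-check sketch:

```python
def data_parallel_size(num_nodes, gpus_per_node, tp, pp):
    """Derive the data-parallel size left over after tensor and
    pipeline parallelism claim their share of the GPUs."""
    world = num_nodes * gpus_per_node
    assert world % (tp * pp) == 0, "TP*PP must divide the world size"
    return world // (tp * pp)

# One node of 8 GPUs with TP_SIZE=4, PP_SIZE=1 leaves DP=2.
print(data_parallel_size(num_nodes=1, gpus_per_node=8, tp=4, pp=1))  # 2
```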

train.t5_core.220m_do_tp1_pp1_1node_100steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: t5
USE_TE: 0
TP_SIZE: 1
PP_SIZE: 1
NUM_NODES: 1
@@ -187,13 +230,13 @@ train.t5_core.220m_te_do_tp1_pp1_1node_100steps:
PYTORCH_IMAGE: nvcr.io/nvidia/pytorch:23.07-py3
ADDITIONAL_PARAMS: "--use-distributed-optimizer"

train.t5_core.220m_te_do_tp2_pp1_1node_100steps:
train.t5_core.220m_te_do_tp1_pp1_1node_100steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: t5
USE_TE: 1
TP_SIZE: 2
TP_SIZE: 1
PP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 100
@@ -229,6 +272,20 @@ resume.checkpoint.t5_core.220m_tp1_pp1_1node:
TEST_LEVEL: L0
PYTORCH_IMAGE: nvcr.io/nvidia/pytorch:23.07-py3

resume.checkpoint.t5_core.220m_te_tp1_pp1_1node:
<<: *selene-test-resume-checkpoint-launcher
variables:
<<: [*VARS]
RUN_MODEL: t5
USE_TE: 1
TP_SIZE: 1
PP_SIZE: 1
NUM_NODES: 1
TIME_LIMIT: "30:00"
TEST_LEVEL: L0
PYTORCH_IMAGE: nvcr.io/nvidia/pytorch:23.07-py3



# train.t5_core.220m_tp1_pp1_rope_1node_100steps:
# <<: *selene-test-launcher
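The quotes around the `TIME_LIMIT` values above matter: YAML 1.1 resolves an unquoted plain scalar like `30:00` as a base-60 (sexagesimal) integer, so it would load as `1800` rather than the string `"30:00"`. A stdlib-only sketch of that resolution rule (the regex mirrors the YAML 1.1 int resolver; no PyYAML dependency assumed):

```python
import re

# YAML 1.1 resolves plain scalars of this shape as base-60 integers.
SEXAGESIMAL = re.compile(r"^[-+]?[0-9][0-9_]*(:[0-5]?[0-9])+$")

def resolve_scalar(text, quoted=False):
    """Mimic YAML 1.1 scalar resolution for the TIME_LIMIT case:
    quoted values stay strings; unquoted h:mm values become integers."""
    if quoted or not SEXAGESIMAL.match(text):
        return text
    total = 0
    for part in text.replace("_", "").lstrip("+-").split(":"):
        total = total * 60 + int(part)
    return -total if text.startswith("-") else total

print(resolve_scalar("30:00"))               # unquoted: 30*60 + 0 = 1800
print(resolve_scalar("30:00", quoted=True))  # quoted: stays "30:00"
```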
Empty files modified (mode 100755 → 100644):
examples/detxoify_lm/generate-1.3b.sh
examples/evaluate_retriever_nq.sh
examples/msdp/data_processing.sh
examples/msdp/eval_knwl_generation.sh
examples/msdp/eval_resp_generation.sh
examples/msdp/prep_resp_gen.sh
examples/msdp/prompt_knwl_gen.sh
examples/msdp/prompt_resp_gen.sh
examples/pretrain_t5.sh
examples/pretrain_t5_distributed_with_mp.sh
29 changes: 22 additions & 7 deletions examples/t5/README.md
@@ -10,12 +10,12 @@
To run the model on Selene
```
PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:23.09-py3
ACCOUNT_NAME=""
PARTITION=""
JOB_NAME=""
NUM_NODES=1
CHECKPOINT_PATH="" #<Specify path to checkpoint>
TENSORBOARD_LOGS_PATH="" #<Specify path to tensorboard log>
VOCAB_FILE="" #<Specify path to file>/bert-large-cased-vocab.txt
DATA_PATH="" #<Specify path and file prefix>_text_document
@@ -27,7 +27,7 @@ srun -N $NUM_NODES --container-image $PYTORCH_IMAGE --container-mounts "/path/to
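The `srun` launch line can also be assembled programmatically from the variables set above. A hypothetical sketch (variable names mirror the shell setup; the mount path is a placeholder, not a real location):

```python
import shlex

# Stand-ins for the environment variables exported in the shell snippet above.
cfg = {
    "PYTORCH_IMAGE": "nvcr.io/nvidia/pytorch:23.09-py3",
    "NUM_NODES": 1,
}

def build_srun(cfg, mounts="/path/to/data:/path/to/data"):
    """Assemble the Selene srun prefix; quoting guards against
    spaces or shell metacharacters in the mount spec."""
    return (
        f"srun -N {cfg['NUM_NODES']} "
        f"--container-image {shlex.quote(cfg['PYTORCH_IMAGE'])} "
        f"--container-mounts {shlex.quote(mounts)}"
    )

print(build_srun(cfg))
```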

## 2. Configurations
<a id="markdown-configurations" name="configurations"></a>
The architecture arguments below show the configuration for the T5 220M model.

### 220M
```
@@ -47,7 +47,22 @@

## 3. Training Results
<a id="markdown-training-results" name="training-results"></a>
Below is the training curve for the 220M model on the Pile dataset. Training takes 4 days on 32 GPUs with a batch size of 2048.

After finetuning on the SQuAD dataset, the validation result is 63.44%.
<p align="center">
<img src="training_curve.png" width="700" height="500" alt="Training loss curve for the T5 220M model on the Pile dataset (batch size 2048)">
</p>

<!-- ## 4. Functional supports
The table below show current T5 functional supports.
| | Transformer engine | Flash-attention | Tensor parallel | Pipeline parallel | Sequence parallel | Distributed optimizer |
| ------------- | :---: | :---: | :---: | :---: | :---: | :---: |
| **Transformer engine** | Content Cell | Content Cell | Content Cell | Content Cell | Content Cell | Content Cell |
| **Flash-attention** | Content Cell | Content Cell | Content Cell | Content Cell | Content Cell | Content Cell |
| **Tensor parallel** | Content Cell | Content Cell | Content Cell | Content Cell | Content Cell | Content Cell |
| **Pipeline parallel** | Content Cell | Content Cell | Content Cell | Content Cell | Content Cell | Content Cell |
| **Sequence parallel** | Content Cell | Content Cell | Content Cell | Content Cell | Content Cell | Content Cell |
| **Distributed optimizer** | Content Cell | Content Cell | Content Cell | Content Cell | Content Cell | Content Cell | -->
122 changes: 0 additions & 122 deletions megatron/core/models/T5/t5_embedding.py

This file was deleted.

