NVIDIA · ananthsub · Feb 7, 2025 · Feb 7, 2025 · Feb 7, 2025 · farhadrgh
diff --git a/nemo/lightning/fabric/plugins.py b/nemo/lightning/fabric/plugins.py
@@ -146,9 +146,10 @@ def convert_module(self, module: nn.Module) -> nn.Module:
             config = get_model_config(module.module)
             config.fp16 = self.dtype_config.fp16
             config.bf16 = self.dtype_config.bf16
-            if hasattr(module, 'module'):
+            # Avoid rewrapping the module if it's already of type Float16Module
+            if hasattr(module, 'module') and not isinstance(module.module, Float16Module):
                 module.module = Float16Module(config, module.module)
-            else:
+            elif not isinstance(module, Float16Module):
                 module = Float16Module(config, module)
 
         return module

diff --git a/nemo/lightning/megatron_parallel.py b/nemo/lightning/megatron_parallel.py
@@ -649,15 +649,16 @@ def init_ddp(self):
             disable_bucketing = (model_chunk_idx > 0) or overlap_param_gather_with_optimizer_step
 
             with init_ddp_context():
-                if HAVE_CUSTOM_FSDP and self.ddp_config.use_custom_fsdp:
+                # Avoid rewrapping the module if it's already wrapped with FSDP
+                if HAVE_CUSTOM_FSDP and self.ddp_config.use_custom_fsdp and not isinstance(module, FullyShardedDataParallel):
                     FSDP = FullyShardedDataParallel
                     dist_module = FSDP(
                         module.config,
                         self.ddp_config,
                         module,
                         disable_bucketing=disable_bucketing,
                     )
-                else:
+                elif not isinstance(module, DDP):
                     dist_module = DDP(
                         module.config,
                         self.ddp_config,