support tp initialization, mapping and utils
lvyufeng committed Dec 2, 2024
1 parent 97a8389 commit b66eaa9
Showing 25 changed files with 668 additions and 1,467 deletions.
8 changes: 4 additions & 4 deletions megatron/core/fusions/fused_layer_norm.py
@@ -10,7 +10,7 @@
from torch.nn.parameter import Parameter

from megatron.core.transformer import TransformerConfig
-from megatron.core.utils import make_viewless_tensor
+# from megatron.core.utils import make_viewless_tensor

try:
    from apex.contrib.layer_norm.layer_norm import FastLayerNormFN
@@ -144,9 +144,9 @@ def forward(self, input: Tensor) -> Tensor:
            # a populated '_base' field). This will result in schedule.py's
            # deallocate_output_tensor() throwing an error, so a viewless tensor is
            # created to prevent this.
-            output = make_viewless_tensor(
-                inp=output, requires_grad=input.requires_grad, keep_graph=True
-            )
+            # output = make_viewless_tensor(
+            #     inp=output, requires_grad=input.requires_grad, keep_graph=True
+            # )

        else:
            if (
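
For context on the comment block kept in this hunk: a tensor produced by a PyTorch view op carries a reference to its source in `._base`, which is exactly what `make_viewless_tensor` stripped before this change. A minimal illustration of that `_base` behaviour (plain PyTorch, not code from this commit):

```python
import torch

x = torch.randn(4, 4)
y = x.view(16)            # y is a view of x, so its '_base' field is populated
print(y._base is x)       # True

z = x.clone().view(16)    # cloning first does not help: z is a view of the clone
print(z._base is None)    # False
```

With the wrapper commented out, the layer-norm output stays a view; that is presumably acceptable here because this fork no longer runs the pipeline-schedule deallocation path that checks `_base`.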
41 changes: 3 additions & 38 deletions megatron/core/models/gpt/gpt_layer_specs.py
@@ -4,51 +4,16 @@
from megatron.core.fusions.fused_layer_norm import FusedLayerNorm
from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
-from megatron.core.transformer.custom_layers.transformer_engine import (
-    TEDotProductAttention,
-    TELayerNormColumnParallelLinear,
-    TENorm,
-    TERowParallelLinear,
-)
from megatron.core.transformer.dot_product_attention import DotProductAttention
from megatron.core.transformer.enums import AttnMaskType
from megatron.core.transformer.identity_op import IdentityOp
from megatron.core.transformer.mlp import MLP, MLPSubmodules
from megatron.core.transformer.moe.moe_layer import MoELayer
from megatron.core.transformer.spec_utils import ModuleSpec
-from megatron.core.transformer.transformer_block import TransformerBlockSubmodules
+# from megatron.core.transformer.transformer_block import TransformerBlockSubmodules
from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules


-# Use this spec to use lower level Transformer Engine modules (required for fp8 training)
-def get_gpt_layer_with_transformer_engine_spec(
-    num_experts: int = None, moe_grouped_gemm: bool = False, qk_layernorm: bool = False
-) -> ModuleSpec:
-    mlp = _get_mlp_module_spec(
-        use_te=True, num_experts=num_experts, moe_grouped_gemm=moe_grouped_gemm
-    )
-    return ModuleSpec(
-        module=TransformerLayer,
-        submodules=TransformerLayerSubmodules(
-            self_attention=ModuleSpec(
-                module=SelfAttention,
-                params={"attn_mask_type": AttnMaskType.causal},
-                submodules=SelfAttentionSubmodules(
-                    linear_qkv=TELayerNormColumnParallelLinear,
-                    core_attention=TEDotProductAttention,
-                    linear_proj=TERowParallelLinear,
-                    q_layernorm=TENorm if qk_layernorm else IdentityOp,
-                    k_layernorm=TENorm if qk_layernorm else IdentityOp,
-                ),
-            ),
-            self_attn_bda=get_bias_dropout_add,
-            pre_mlp_layernorm=TENorm if num_experts else IdentityOp,
-            mlp=mlp,
-            mlp_bda=get_bias_dropout_add,
-        ),
-    )
-
-
# Use this spec for an implementation using only modules in megatron core
def get_gpt_layer_local_spec(
    num_experts: int = None, moe_grouped_gemm: bool = False, qk_layernorm: bool = False
@@ -92,8 +57,8 @@ def _get_mlp_module_spec(
        return ModuleSpec(
            module=MLP,
            submodules=MLPSubmodules(
-                linear_fc1=TELayerNormColumnParallelLinear if use_te else ColumnParallelLinear,
-                linear_fc2=TERowParallelLinear if use_te else RowParallelLinear,
+                linear_fc1=ColumnParallelLinear,
+                linear_fc2=RowParallelLinear,
            ),
        )
    else:
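
A hedged usage sketch (not part of the commit): with the Transformer Engine spec removed, `get_gpt_layer_local_spec` is the only layer-spec factory left in this module, and its MLP submodules now always resolve to Megatron-core's ColumnParallelLinear/RowParallelLinear.

```python
from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec

# Build the local (non-TE) spec; argument names follow the signature shown in the diff.
layer_spec = get_gpt_layer_local_spec(num_experts=None, moe_grouped_gemm=False, qk_layernorm=False)
mlp_spec = layer_spec.submodules.mlp

print(mlp_spec.submodules.linear_fc1.__name__)  # ColumnParallelLinear
print(mlp_spec.submodules.linear_fc2.__name__)  # RowParallelLinear
```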
52 changes: 2 additions & 50 deletions megatron/core/models/retro/decoder_spec.py
@@ -8,7 +8,6 @@
from megatron.core.fusions.fused_layer_norm import FusedLayerNorm
from megatron.core.models.gpt.gpt_layer_specs import (
    get_gpt_layer_local_spec,
-    get_gpt_layer_with_transformer_engine_spec,
)
from megatron.core.models.retro.config import RetroConfig
from megatron.core.models.retro.decoder_attention import (
@@ -19,52 +18,13 @@
from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
from megatron.core.transformer import ModuleSpec
from megatron.core.transformer.attention import CrossAttentionSubmodules
-from megatron.core.transformer.custom_layers.transformer_engine import (
-    TEColumnParallelLinear,
-    TEDotProductAttention,
-    TENorm,
-    TERowParallelLinear,
-)
from megatron.core.transformer.dot_product_attention import DotProductAttention
from megatron.core.transformer.transformer_block import (
    TransformerBlockSubmodules,
    get_num_layers_to_build,
)


-def get_retro_decoder_layer_te_spec(
-    encoder_block_spec: typing.Union[ModuleSpec, TransformerBlockSubmodules, None] = None
-) -> ModuleSpec:
-    """Retro decoder TE spec (uses Transformer Engine components).
-
-    A Retro decoder layer uses custom attention and bias-dropout-add operators
-    to perform chunked-cross attention. Additionally, the first Retro decoder
-    layer instantiates an entire encoder transformer block. As such, the decoder
-    cross attention module takes an optional encoder block spec, which is only
-    provided for the first Retro decoder layer.
-
-    Args:
-        encoder_block_spec (ModuleSpec): Retro encoder block spec, to be provided for the first Retro decoder layer.
-
-    Returns:
-        A module spec with Transformer Engine modules.
-    """
-    spec = get_gpt_layer_with_transformer_engine_spec()
-    spec.submodules.pre_cross_attn_layernorm = TENorm
-    spec.submodules.cross_attention = ModuleSpec(
-        module=RetroDecoderCrossAttention,
-        params={"encoder_block_spec": encoder_block_spec,},
-        submodules=CrossAttentionSubmodules(
-            linear_q=TEColumnParallelLinear,
-            linear_kv=TEColumnParallelLinear,
-            core_attention=TEDotProductAttention,
-            linear_proj=TERowParallelLinear,
-        ),
-    )
-    spec.submodules.cross_attn_bda = ModuleSpec(module=RetroDecoderBiasDropoutAdd)
-    return spec
-
-
def get_retro_decoder_layer_local_spec(
    encoder_block_spec: typing.Optional[ModuleSpec] = None,
) -> ModuleSpec:
@@ -131,16 +91,8 @@ def get_retro_decoder_block_spec(
    retro_layer_numbers = list(range(retro_layer_start, num_layers + 1, 3))

    # Layer specs.
-    gpt_layer_spec = (
-        get_gpt_layer_with_transformer_engine_spec()
-        if use_transformer_engine
-        else get_gpt_layer_local_spec()
-    )
-    get_retro_decoder_layer_spec = (
-        get_retro_decoder_layer_te_spec
-        if use_transformer_engine
-        else get_retro_decoder_layer_local_spec
-    )
+    gpt_layer_spec = get_gpt_layer_local_spec()
+    get_retro_decoder_layer_spec = get_retro_decoder_layer_local_spec
    retro_layer_spec = get_retro_decoder_layer_spec()
    retro_layer_spec_with_retriever = get_retro_decoder_layer_spec(
        get_retro_encoder_block_spec(config, use_transformer_engine)
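
A small worked example of the `retro_layer_numbers` expression visible in the context lines above (the values are illustrative, not taken from this commit): retro cross-attention layers land on every third layer starting from `retro_layer_start`.

```python
retro_layer_start = 6
num_layers = 24
retro_layer_numbers = list(range(retro_layer_start, num_layers + 1, 3))
print(retro_layer_numbers)  # [6, 9, 12, 15, 18, 21, 24]
```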
7 changes: 5 additions & 2 deletions megatron/core/tensor_parallel/layers.py
@@ -136,8 +136,11 @@ def _initialize_affine_weight_cpu(

    with torch.no_grad():
        # all tensors must live on the same device
-        cpu_weight = torch.cat(my_weight_list, dim=partition_dim).to_dense()
-        weight.data.copy_(cpu_weight)
+        # cpu_weight = torch.cat(my_weight_list, dim=partition_dim).to_dense()
+        # weight.data.copy_(cpu_weight)
+        cpu_weight = torch.cat(my_weight_list, dim=partition_dim)
+        weight.assign_value(cpu_weight)
+
    if return_master_weight:
        return master_weight
    return None
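
The hunk above swaps `weight.data.copy_(...)` for `weight.assign_value(...)`, which suggests `weight` is now a MindSpore-style Parameter rather than a torch.nn.Parameter (an inference from the call, not something the diff states). The surrounding function follows Megatron's usual CPU-side initialization pattern; a simplified sketch of that pattern in plain torch, under those assumptions:

```python
import torch

def init_local_shard(master_weight: torch.Tensor, rank: int, world_size: int,
                     partition_dim: int, stride: int = 1) -> torch.Tensor:
    """Split the full master weight along partition_dim and keep this rank's stripes."""
    per_piece = master_weight.size(partition_dim) // (world_size * stride)
    pieces = torch.split(master_weight, per_piece, dim=partition_dim)
    my_pieces = pieces[rank::world_size]              # stride-interleaved assignment to ranks
    return torch.cat(my_pieces, dim=partition_dim)    # the local shard that gets copied into `weight`

master = torch.arange(8 * 4, dtype=torch.float32).reshape(8, 4)
print(init_local_shard(master, rank=0, world_size=2, partition_dim=0).shape)  # torch.Size([4, 4])
print(init_local_shard(master, rank=1, world_size=2, partition_dim=0).shape)  # torch.Size([4, 4])
```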