nemo_automodel.components.distributed.config

Strategy-specific distributed training configuration classes.

Design principle:

Size params (dp_size, dp_replicate_size, tp_size, pp_size, cp_size, ep_size) are grouped in ParallelismSizes.
dp_replicate_size is FSDP2-only: an assertion error is raised if it is passed with a non-FSDP2 config.
Strategy-specific configs contain only the additional flags unique to each strategy.
Managers become normal classes that accept (config, device_mesh).

Module Contents

Classes

Name	Description
`DDPConfig`	Additional configuration for DDP distributed training.
`DistributedSetup`	Resolved distributed topology and execution policies.
`FSDP2Config`	Additional configuration for FSDP2 distributed training.
`MegatronFSDPConfig`	Additional configuration for MegatronFSDP distributed training.
`MoEParallelizerConfig`	Configuration for MoE model parallelization (EP + FSDP settings).

Functions

Name	Description
`_resolve_strategy_config`	Resolve a setup-level strategy name or config object.

Data

ActivationCheckpointingMode

DistributedConfig

DistributedStrategyConfig

_STRATEGY_MAP

_StrategyConfigClass

__all__

API

class nemo_automodel.components.distributed.config.DDPConfig(
    activation_checkpointing: bool = False,
    broadcast_buffers: bool = False,
    find_unused_parameters: bool = False,
    static_graph: bool = False,
    bucket_cap_mb: typing.Optional[float] = None,
    gradient_as_bucket_view: bool = False,
    autocast_dtype: typing.Optional[torch.dtype] = None
)

Dataclass

Additional configuration for DDP distributed training.

Note: DDP does not support tensor parallelism, pipeline parallelism, or expert parallelism. Only dp_size is relevant (inferred from world_size).

activation_checkpointing

bool = False

autocast_dtype

Optional[dtype] = None

broadcast_buffers

bool = False

bucket_cap_mb

Optional[float] = None

find_unused_parameters

bool = False

gradient_as_bucket_view

bool = False

static_graph

bool = False

nemo_automodel.components.distributed.config.DDPConfig.to_dict() -> typing.Dict[str, typing.Any]

Convert config to dictionary.

class nemo_automodel.components.distributed.config.DistributedSetup(
    mesh_context: 'MeshContext',
    strategy_config: nemo_automodel.components.distributed.config.DistributedStrategyConfig | None = None,
    pipeline_config: 'PipelineConfig | None' = None,
    moe_parallel_config: 'MoEParallelizerConfig | None' = None,
    activation_checkpointing: nemo_automodel.components.distributed.config.ActivationCheckpointingMode = False
)

Dataclass

Resolved distributed topology and execution policies.

activation_checkpointing

ActivationCheckpointingMode = False

mesh_context

'MeshContext'

moe_parallel_config

'MoEParallelizerConfig | None' = None

pipeline_config

'PipelineConfig | None' = None

strategy_config

DistributedStrategyConfig | None = None

nemo_automodel.components.distributed.config.DistributedSetup.build(
    strategy: str | nemo_automodel.components.distributed.config.DistributedStrategyConfig = 'fsdp2',
    parallelism_sizes: 'ParallelismSizes | None' = None,
    pipeline_config: 'PipelineConfig | dict | None' = None,
    moe_parallel_config: 'MoEParallelizerConfig | dict | None' = None,
    activation_checkpointing: nemo_automodel.components.distributed.config.ActivationCheckpointingMode = False,
    world_size: int | None = None
) -> 'DistributedSetup'

classmethod

Create a resolved distributed setup from sizes and policy configs.

This method is intentionally forgiving about input types: the strategy may be given as a string, and the pipeline and MoE configs may be given as dicts.

class nemo_automodel.components.distributed.config.FSDP2Config(
    sequence_parallel: bool = False,
    tp_plan: typing.Optional[dict] = None,
    patch_is_packed_sequence: bool = False,
    mp_policy: typing.Optional[torch.distributed.fsdp.MixedPrecisionPolicy] = (lambda: MixedPrecisionPoli...,
    offload_policy: typing.Optional[torch.distributed.fsdp.CPUOffloadPolicy] = None,
    autocast_dtype: typing.Optional[torch.dtype] = None,
    activation_checkpointing: nemo_automodel.components.distributed.config.ActivationCheckpointingMode = False,
    defer_fsdp_grad_sync: bool = True,
    reshard_after_forward: typing.Optional[bool] = None,
    enable_async_tensor_parallel: bool = False,
    enable_compile: bool = False,
    enable_fsdp2_prefetch: bool = False,
    fsdp2_backward_prefetch_depth: int = 2,
    fsdp2_forward_prefetch_depth: int = 1
)

Dataclass

Additional configuration for FSDP2 distributed training.

Note: Size parameters (dp_size, dp_replicate_size, tp_size, pp_size, cp_size, ep_size) are grouped separately in ParallelismSizes.

activation_checkpointing

ActivationCheckpointingMode = False

autocast_dtype

Optional[dtype] = None

defer_fsdp_grad_sync

bool = True

enable_async_tensor_parallel

bool = False

enable_compile

bool = False

enable_fsdp2_prefetch

bool = False

fsdp2_backward_prefetch_depth

int = 2

fsdp2_forward_prefetch_depth

int = 1

mp_policy

Optional[MixedPrecisionPolicy]

offload_policy

Optional[CPUOffloadPolicy] = None

patch_is_packed_sequence

bool = False

reshard_after_forward

Optional[bool] = None

sequence_parallel

bool = False

tp_plan

Optional[dict] = None

nemo_automodel.components.distributed.config.FSDP2Config.__post_init__()

nemo_automodel.components.distributed.config.FSDP2Config.to_dict() -> typing.Dict[str, typing.Any]

Convert config to dictionary (shallow, preserves policy objects).

class nemo_automodel.components.distributed.config.MegatronFSDPConfig(
    megatron_fsdp_unit_modules: typing.List[str] = (lambda: ['transformers.mod...,
    zero_dp_strategy: int = 3,
    init_fsdp_with_meta_device: bool = False,
    grad_reduce_in_fp32: bool = False,
    preserve_fp32_weights: bool = False,
    overlap_grad_reduce: bool = True,
    overlap_param_gather: bool = True,
    check_for_nan_in_grad: bool = True,
    average_in_collective: bool = False,
    disable_bucketing: bool = False,
    calculate_per_token_loss: bool = False,
    keep_fp8_transpose_cache: bool = False,
    nccl_ub: bool = False,
    fsdp_double_buffer: bool = False,
    activation_checkpointing: bool = False
)

Dataclass

Additional configuration for MegatronFSDP distributed training.

Note: Size parameters (dp_size, tp_size, cp_size) are grouped separately in ParallelismSizes. MegatronFSDP does not support pp_size, dp_replicate_size, or ep_size.

activation_checkpointing

bool = False

average_in_collective

bool = False

calculate_per_token_loss

bool = False

check_for_nan_in_grad

bool = True

disable_bucketing

bool = False

fsdp_double_buffer

bool = False

grad_reduce_in_fp32

bool = False

init_fsdp_with_meta_device

bool = False

keep_fp8_transpose_cache

bool = False

megatron_fsdp_unit_modules

List[str]

nccl_ub

bool = False

overlap_grad_reduce

bool = True

overlap_param_gather

bool = True

preserve_fp32_weights

bool = False

zero_dp_strategy

int = 3

nemo_automodel.components.distributed.config.MegatronFSDPConfig.to_dict() -> typing.Dict[str, typing.Any]

Convert config to dictionary (shallow, preserves objects).

class nemo_automodel.components.distributed.config.MoEParallelizerConfig(
    ignore_router_for_ac: bool = True,
    reshard_after_forward: bool = False,
    lm_head_precision: typing.Optional[typing.Union[str, torch.dtype]] = None,
    wrap_outer_model: bool = True,
    mp_policy: typing.Optional[torch.distributed.fsdp.MixedPrecisionPolicy] = None
)

Dataclass

Configuration for MoE model parallelization (EP + FSDP settings).

ignore_router_for_ac

bool = True

lm_head_precision

Optional[Union[str, dtype]] = None

mp_policy

Optional[MixedPrecisionPolicy] = None

reshard_after_forward

bool = False

wrap_outer_model

bool = True

nemo_automodel.components.distributed.config.MoEParallelizerConfig.to_dict() -> typing.Dict[str, typing.Any]

nemo_automodel.components.distributed.config._resolve_strategy_config(
    strategy: str | nemo_automodel.components.distributed.config.DistributedStrategyConfig,
    strategy_kwargs: typing.Any = {}
) -> nemo_automodel.components.distributed.config.DistributedStrategyConfig

Resolve a setup-level strategy name or config object.

nemo_automodel.components.distributed.config.ActivationCheckpointingMode = Union[bool, Literal['selective']]

nemo_automodel.components.distributed.config.DistributedConfig = DistributedStrategyConfig

nemo_automodel.components.distributed.config.DistributedStrategyConfig = Union['FSDP2Config', 'MegatronFSDPConfig', 'DDPConfig']

nemo_automodel.components.distributed.config._STRATEGY_MAP: Dict[str, _StrategyConfigClass] = {'fsdp2': FSDP2Config, 'megatron_fsdp': MegatronFSDPConfig, 'megatron-fsdp': Meg...

nemo_automodel.components.distributed.config._StrategyConfigClass = type[FSDP2Config] | type[MegatronFSDPConfig] | type[DDPConfig]

nemo_automodel.components.distributed.config.__all__ = ['DDPConfig', 'DistributedSetup', 'DistributedStrategyConfig', 'FSDP2Config', 'M...