nemo_automodel.components.distributed.config

View as Markdown

Strategy-specific distributed training configuration classes.

Design principle:

  • Size params (dp_size, dp_replicate_size, tp_size, pp_size, cp_size, ep_size) are grouped in ParallelismSizes.
  • dp_replicate_size is FSDP2-only: an assertion error is raised if it is passed together with a non-FSDP2 config.
  • Strategy-specific configs contain only additional flags unique to each strategy
  • Managers become normal classes that accept (config, device_mesh)

Module Contents

Classes

| Name | Description |
| --- | --- |
| DDPConfig | Additional configuration for DDP distributed training. |
| DistributedSetup | Resolved distributed topology and execution policies. |
| FSDP2Config | Additional configuration for FSDP2 distributed training. |
| MegatronFSDPConfig | Additional configuration for MegatronFSDP distributed training. |
| MoEParallelizerConfig | Configuration for MoE model parallelization (EP + FSDP settings). |

Functions

| Name | Description |
| --- | --- |
| _resolve_strategy_config | Resolve a setup-level strategy name or config object. |

Data

ActivationCheckpointingMode

DistributedConfig

DistributedStrategyConfig

_STRATEGY_MAP

_StrategyConfigClass

__all__

API

class nemo_automodel.components.distributed.config.DDPConfig(
activation_checkpointing: bool = False,
broadcast_buffers: bool = False,
find_unused_parameters: bool = False,
static_graph: bool = False,
bucket_cap_mb: typing.Optional[float] = None,
gradient_as_bucket_view: bool = False,
autocast_dtype: typing.Optional[torch.dtype] = None
)
Dataclass

Additional configuration for DDP distributed training.

Note: DDP does not support tensor parallelism, pipeline parallelism, or expert parallelism. Only dp_size is relevant (inferred from world_size).

activation_checkpointing
bool = False
autocast_dtype
Optional[dtype] = None
broadcast_buffers
bool = False
bucket_cap_mb
Optional[float] = None
find_unused_parameters
bool = False
gradient_as_bucket_view
bool = False
static_graph
bool = False
nemo_automodel.components.distributed.config.DDPConfig.to_dict() -> typing.Dict[str, typing.Any]

Convert config to dictionary.

class nemo_automodel.components.distributed.config.DistributedSetup(
mesh_context: 'MeshContext',
strategy_config: nemo_automodel.components.distributed.config.DistributedStrategyConfig | None = None,
pipeline_config: 'PipelineConfig | None' = None,
moe_parallel_config: 'MoEParallelizerConfig | None' = None,
activation_checkpointing: nemo_automodel.components.distributed.config.ActivationCheckpointingMode = False
)
Dataclass

Resolved distributed topology and execution policies.

activation_checkpointing
ActivationCheckpointingMode = False
mesh_context
'MeshContext'
moe_parallel_config
'MoEParallelizerConfig | None' = None
pipeline_config
'PipelineConfig | None' = None
strategy_config
DistributedStrategyConfig | None = None
nemo_automodel.components.distributed.config.DistributedSetup.build(
strategy: str | nemo_automodel.components.distributed.config.DistributedStrategyConfig = 'fsdp2',
parallelism_sizes: 'ParallelismSizes | None' = None,
pipeline_config: 'PipelineConfig | dict | None' = None,
moe_parallel_config: 'MoEParallelizerConfig | dict | None' = None,
activation_checkpointing: nemo_automodel.components.distributed.config.ActivationCheckpointingMode = False,
world_size: int | None = None
) -> 'DistributedSetup'
classmethod

Create a resolved distributed setup from sizes and policy configs.

This function is intentionally forgiving with respect to its input types: the strategy may be given as a string, and the pipeline and MoE configs may be given as dicts.

class nemo_automodel.components.distributed.config.FSDP2Config(
sequence_parallel: bool = False,
tp_plan: typing.Optional[dict] = None,
patch_is_packed_sequence: bool = False,
mp_policy: typing.Optional[torch.distributed.fsdp.MixedPrecisionPolicy] = (lambda: MixedPrecisionPoli...,
offload_policy: typing.Optional[torch.distributed.fsdp.CPUOffloadPolicy] = None,
autocast_dtype: typing.Optional[torch.dtype] = None,
activation_checkpointing: nemo_automodel.components.distributed.config.ActivationCheckpointingMode = False,
defer_fsdp_grad_sync: bool = True,
reshard_after_forward: typing.Optional[bool] = None,
enable_async_tensor_parallel: bool = False,
enable_compile: bool = False,
enable_fsdp2_prefetch: bool = False,
fsdp2_backward_prefetch_depth: int = 2,
fsdp2_forward_prefetch_depth: int = 1
)
Dataclass

Additional configuration for FSDP2 distributed training.

Note: Size parameters (dp_size, dp_replicate_size, tp_size, pp_size, cp_size, ep_size) are grouped separately in ParallelismSizes.

activation_checkpointing
ActivationCheckpointingMode = False
autocast_dtype
Optional[dtype] = None
defer_fsdp_grad_sync
bool = True
enable_async_tensor_parallel
bool = False
enable_compile
bool = False
enable_fsdp2_prefetch
bool = False
fsdp2_backward_prefetch_depth
int = 2
fsdp2_forward_prefetch_depth
int = 1
mp_policy
Optional[MixedPrecisionPolicy]
offload_policy
Optional[CPUOffloadPolicy] = None
patch_is_packed_sequence
bool = False
reshard_after_forward
Optional[bool] = None
sequence_parallel
bool = False
tp_plan
Optional[dict] = None
nemo_automodel.components.distributed.config.FSDP2Config.__post_init__()
nemo_automodel.components.distributed.config.FSDP2Config.to_dict() -> typing.Dict[str, typing.Any]

Convert config to dictionary (shallow, preserves policy objects).

class nemo_automodel.components.distributed.config.MegatronFSDPConfig(
megatron_fsdp_unit_modules: typing.List[str] = (lambda: ['transformers.mod...,
zero_dp_strategy: int = 3,
init_fsdp_with_meta_device: bool = False,
grad_reduce_in_fp32: bool = False,
preserve_fp32_weights: bool = False,
overlap_grad_reduce: bool = True,
overlap_param_gather: bool = True,
check_for_nan_in_grad: bool = True,
average_in_collective: bool = False,
disable_bucketing: bool = False,
calculate_per_token_loss: bool = False,
keep_fp8_transpose_cache: bool = False,
nccl_ub: bool = False,
fsdp_double_buffer: bool = False,
activation_checkpointing: bool = False
)
Dataclass

Additional configuration for MegatronFSDP distributed training.

Note: Size parameters (dp_size, tp_size, cp_size) are grouped separately in ParallelismSizes. MegatronFSDP does not support pp_size, dp_replicate_size, or ep_size.

activation_checkpointing
bool = False
average_in_collective
bool = False
calculate_per_token_loss
bool = False
check_for_nan_in_grad
bool = True
disable_bucketing
bool = False
fsdp_double_buffer
bool = False
grad_reduce_in_fp32
bool = False
init_fsdp_with_meta_device
bool = False
keep_fp8_transpose_cache
bool = False
megatron_fsdp_unit_modules
List[str]
nccl_ub
bool = False
overlap_grad_reduce
bool = True
overlap_param_gather
bool = True
preserve_fp32_weights
bool = False
zero_dp_strategy
int = 3
nemo_automodel.components.distributed.config.MegatronFSDPConfig.to_dict() -> typing.Dict[str, typing.Any]

Convert config to dictionary (shallow, preserves objects).

class nemo_automodel.components.distributed.config.MoEParallelizerConfig(
ignore_router_for_ac: bool = True,
reshard_after_forward: bool = False,
lm_head_precision: typing.Optional[typing.Union[str, torch.dtype]] = None,
wrap_outer_model: bool = True,
mp_policy: typing.Optional[torch.distributed.fsdp.MixedPrecisionPolicy] = None
)
Dataclass

Configuration for MoE model parallelization (EP + FSDP settings).

ignore_router_for_ac
bool = True
lm_head_precision
Optional[Union[str, dtype]] = None
mp_policy
Optional[MixedPrecisionPolicy] = None
reshard_after_forward
bool = False
wrap_outer_model
bool = True
nemo_automodel.components.distributed.config.MoEParallelizerConfig.to_dict() -> typing.Dict[str, typing.Any]
nemo_automodel.components.distributed.config._resolve_strategy_config(
strategy: str | nemo_automodel.components.distributed.config.DistributedStrategyConfig,
strategy_kwargs: typing.Any = {}
) -> nemo_automodel.components.distributed.config.DistributedStrategyConfig

Resolve a setup-level strategy name or config object.

nemo_automodel.components.distributed.config.ActivationCheckpointingMode = Union[bool, Literal['selective']]
nemo_automodel.components.distributed.config.DistributedConfig = DistributedStrategyConfig
nemo_automodel.components.distributed.config.DistributedStrategyConfig = Union['FSDP2Config', 'MegatronFSDPConfig', 'DDPConfig']
nemo_automodel.components.distributed.config._STRATEGY_MAP: Dict[str, _StrategyConfigClass] = {'fsdp2': FSDP2Config, 'megatron_fsdp': MegatronFSDPConfig, 'megatron-fsdp': Meg...
nemo_automodel.components.distributed.config._StrategyConfigClass = type[FSDP2Config] | type[MegatronFSDPConfig] | type[DDPConfig]
nemo_automodel.components.distributed.config.__all__ = ['DDPConfig', 'DistributedSetup', 'DistributedStrategyConfig', 'FSDP2Config', 'M...