nemo_automodel.components.moe.config#
MoE parallelizer configuration.
Module Contents#
Classes#
- `MoEParallelizerConfig` — Configuration for MoE model parallelization (EP + FSDP settings).
- `MoEConfig`
API#
- class nemo_automodel.components.moe.config.MoEParallelizerConfig#
Configuration for MoE model parallelization (EP + FSDP settings).
- ignore_router_for_ac: bool#
False
- reshard_after_forward: bool#
False
- lm_head_precision: Optional[Union[str, torch.dtype]]#
None
- wrap_outer_model: bool#
True
- to_dict() Dict[str, Any]#
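The fields and defaults above can be illustrated with a minimal stand-in dataclass (a sketch only; the real class is `nemo_automodel.components.moe.config.MoEParallelizerConfig`, and its `to_dict()` may differ from the simple `asdict` shown here):

```python
from dataclasses import dataclass, asdict
from typing import Any, Dict, Optional

# Stand-in mirroring the documented fields of MoEParallelizerConfig.
# lm_head_precision also accepts a torch.dtype in the real class; a plain
# string stands in here to keep the sketch dependency-free.
@dataclass
class MoEParallelizerConfig:
    ignore_router_for_ac: bool = False
    reshard_after_forward: bool = False
    lm_head_precision: Optional[str] = None
    wrap_outer_model: bool = True

    def to_dict(self) -> Dict[str, Any]:
        # Assumed behavior: serialize all fields to a plain dict.
        return asdict(self)

cfg = MoEParallelizerConfig(reshard_after_forward=True)
print(cfg.to_dict())
```

Passing the config around as a dict (e.g. for logging or YAML round-trips) is the typical use of `to_dict()`.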
- class nemo_automodel.components.moe.config.MoEConfig#
- n_routed_experts: int#
None
- n_activated_experts: int#
None
- n_expert_groups: int#
None
- n_limited_groups: int#
None
- train_gate: bool#
None
- gate_bias_update_factor: float#
None
- aux_loss_coeff: float#
None
- score_func: str#
None
- route_scale: float#
None
- dim: int#
None
- inter_dim: int#
None
- moe_inter_dim: int#
None
- norm_topk_prob: bool#
None
- router_bias: bool#
False
- expert_bias: bool#
False
- expert_activation: Literal['swiglu', 'quick_geglu', 'relu2']#
'swiglu'
- activation_alpha: float#
1.702
- activation_limit: float#
7.0
- softmax_before_topk: bool#
False
- dtype: str | torch.dtype#
None
- force_e_score_correction_bias: bool#
False
- __post_init__()#
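A minimal sketch of how `MoEConfig` might be constructed, using only a subset of the documented fields. The real class lives in `nemo_automodel.components.moe.config`; the `__post_init__` body below is an assumed sanity check, since the actual validation logic is not shown in this reference:

```python
from dataclasses import dataclass

# Stand-in mirroring part of MoEConfig. Fields without documented defaults
# (n_routed_experts, n_activated_experts, dim, moe_inter_dim) are treated
# as required here; defaults shown match the documentation above.
@dataclass
class MoEConfig:
    n_routed_experts: int
    n_activated_experts: int
    dim: int
    moe_inter_dim: int
    expert_activation: str = "swiglu"
    activation_alpha: float = 1.702
    activation_limit: float = 7.0
    router_bias: bool = False
    expert_bias: bool = False
    softmax_before_topk: bool = False
    force_e_score_correction_bias: bool = False

    def __post_init__(self):
        # Assumed validation: cannot route to more experts than exist.
        if self.n_activated_experts > self.n_routed_experts:
            raise ValueError("n_activated_experts must be <= n_routed_experts")

# Example: 64 experts, top-8 routing (illustrative numbers, not defaults).
cfg = MoEConfig(n_routed_experts=64, n_activated_experts=8,
                dim=2048, moe_inter_dim=1408)
```

A `__post_init__` hook like this is the idiomatic dataclass place for cross-field validation, which is presumably why the real class defines one.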