nemo_automodel.components.models.minimax_m2.state_dict_adapter#

Module Contents#

Classes#

MiniMaxM2StateDictAdapter

Convert between MiniMax-M2 HF checkpoints and native grouped-expert format.

Functions#

should_quantize_key

Data#

NON_QUANTIZED_KEY_PATTERNS

API#

nemo_automodel.components.models.minimax_m2.state_dict_adapter.NON_QUANTIZED_KEY_PATTERNS#

['input_layernorm.weight', 'post_attention_layernorm.weight', 'norm.weight', 'lm_head.weight', 'embe…

nemo_automodel.components.models.minimax_m2.state_dict_adapter.should_quantize_key(key: str) → bool#
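
A minimal usage sketch, assuming should_quantize_key returns False for keys matching NON_QUANTIZED_KEY_PATTERNS (norms, lm_head, embeddings) and True otherwise; the key names below are illustrative, not taken from a real checkpoint.

```python
from nemo_automodel.components.models.minimax_m2.state_dict_adapter import (
    should_quantize_key,
)

keys = [
    "model.layers.0.input_layernorm.weight",   # matches a NON_QUANTIZED_KEY_PATTERNS entry
    "model.layers.0.self_attn.q_proj.weight",  # hypothetical quantizable weight
]
# Keep only the keys the adapter would consider quantizable.
quantizable = [k for k in keys if should_quantize_key(k)]
```
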
class nemo_automodel.components.models.minimax_m2.state_dict_adapter.MiniMaxM2StateDictAdapter(
config: Any,
moe_config: nemo_automodel.components.moe.layers.MoEConfig,
backend: nemo_automodel.components.models.common.BackendConfig,
dtype: torch.dtype = torch.float32,
)#

Bases: nemo_automodel.components.moe.state_dict_mixin.MoESplitExpertsStateDictMixin, nemo_automodel.components.checkpoint.state_dict_adapter.StateDictAdapter

Convert between MiniMax-M2 HF checkpoints and native grouped-expert format.

Initialization
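
A construction sketch. Only the constructor signature is taken from this page; the checkpoint id is hypothetical, and the MoEConfig/BackendConfig values are assumed to be built by the surrounding model-setup code.

```python
import torch
from transformers import AutoConfig

from nemo_automodel.components.models.minimax_m2.state_dict_adapter import (
    MiniMaxM2StateDictAdapter,
)

hf_config = AutoConfig.from_pretrained(
    "MiniMaxAI/MiniMax-M2",  # hypothetical checkpoint id
    trust_remote_code=True,
)
moe_config = ...  # nemo_automodel.components.moe.layers.MoEConfig, built elsewhere
backend = ...     # nemo_automodel.components.models.common.BackendConfig, built elsewhere

adapter = MiniMaxM2StateDictAdapter(
    config=hf_config,
    moe_config=moe_config,
    backend=backend,
    dtype=torch.bfloat16,  # default is torch.float32
)
```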

property _expert_path_segment: str#
_dequantize(
state_dict: dict[str, Any],
) → dict[str, Any]#
_hf_key_to_native(key: str) → str#
_native_key_to_hf(key: str) → str#
to_hf(
state_dict: dict[str, Any],
exclude_key_regex: Optional[str] = None,
quantization: bool = False,
**kwargs,
) → dict[str, Any]#
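
A call sketch for to_hf, assuming the adapter and a native grouped-expert state dict from the sketches above; the exclusion regex is illustrative, not a documented default.

```python
hf_state_dict = adapter.to_hf(
    native_state_dict,                       # native grouped-expert weights, obtained elsewhere
    exclude_key_regex=r".*\._extra_state$",  # illustrative: drop keys matching this pattern
    quantization=False,
)
```
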
from_hf(
hf_state_dict: dict[str, Any],
device_mesh: Optional[torch.distributed.device_mesh.DeviceMesh] = None,
**kwargs,
) → dict[str, Any]#
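
A loading sketch for from_hf, assuming an HF checkpoint shard read with safetensors (the file name is hypothetical); the optional device_mesh argument is omitted here.

```python
from safetensors.torch import load_file

hf_state_dict = load_file("model.safetensors")      # hypothetical shard path
native_state_dict = adapter.from_hf(hf_state_dict)  # HF per-expert keys -> grouped experts
```
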
convert_single_tensor_to_hf(
fqn: str,
tensor: Any,
**kwargs,
) → list[tuple[str, Any]]#
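
A per-tensor sketch: the return type above (list[tuple[str, Any]]) indicates that a single native FQN may fan out into several HF entries, presumably one grouped expert tensor into many per-expert tensors, which suits streaming conversion without materializing a full dict.

```python
# Stream-convert tensors one at a time instead of calling to_hf on a full dict.
for fqn, tensor in native_state_dict.items():
    for hf_key, hf_tensor in adapter.convert_single_tensor_to_hf(fqn, tensor):
        print(hf_key, tuple(hf_tensor.shape))
```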