nemo_automodel.components.models.deepseek_v3.state_dict_adapter#

Module Contents#

Classes#

DeepSeekV3StateDictAdapter

Functions#

calculate_scale_shape
dequantize_from_fp8

Data#

logger
BLOCK_SIZE

API#

nemo_automodel.components.models.deepseek_v3.state_dict_adapter.logger#

‘getLogger(…)’

nemo_automodel.components.models.deepseek_v3.state_dict_adapter.BLOCK_SIZE#

128

class nemo_automodel.components.models.deepseek_v3.state_dict_adapter.DeepSeekV3StateDictAdapter(
config: transformers.DeepseekV3Config,
moe_config: nemo_automodel.components.moe.layers.MoEConfig,
backend: nemo_automodel.components.moe.utils.BackendConfig,
dtype: torch.dtype = torch.float32,
)#

Bases: nemo_automodel.components.moe.state_dict_mixin.MoESplitExpertsStateDictMixin, nemo_automodel.components.checkpoint.state_dict_adapter.StateDictAdapter
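A minimal construction sketch (not taken from the module itself): the `moe_config` and `backend` objects normally come from nemo_automodel's model-building code, so their constructor fields are elided here, and the `torch.bfloat16` dtype is only an assumption.

```python
import torch
from transformers import DeepseekV3Config

from nemo_automodel.components.models.deepseek_v3.state_dict_adapter import (
    DeepSeekV3StateDictAdapter,
)

# moe_config / backend are assumed to be the MoEConfig and BackendConfig instances
# used to build the native model; their fields are not shown here.
adapter = DeepSeekV3StateDictAdapter(
    config=DeepseekV3Config(),   # HF model configuration
    moe_config=moe_config,       # expert layout of the native model
    backend=backend,             # backend.enable_deepep selects the expert weight format
    dtype=torch.bfloat16,        # assumption: dtype for dequantized weights
)
```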

_dequantize(
state_dict: dict[str, Any],
) → dict[str, Any]#

_add_quantization_scale_inv_tensors(
state_dict: dict[str, Any],
) → dict[str, Any]#
to_hf(
state_dict: dict[str, Any],
exclude_key_regex: Optional[str] = None,
quantization: bool = False,
**kwargs,
) → dict[str, Any]#

Convert from the native model state dict to HuggingFace format. The source format is detected automatically based on the backend.enable_deepep configuration.
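A hedged usage sketch for to_hf, assuming `adapter` is an already-constructed DeepSeekV3StateDictAdapter and `model` is the native DeepSeek V3 module; the `exclude_key_regex` pattern is purely illustrative.

```python
native_sd = model.state_dict()

# Map native keys/tensors to the HuggingFace layout. Presumably, quantization=True also
# re-attaches the *_scale_inv tensors (see _add_quantization_scale_inv_tensors above).
hf_sd = adapter.to_hf(
    native_sd,
    exclude_key_regex=r".*rotary_emb\.inv_freq$",  # illustrative filter, not required
    quantization=False,
)
```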

from_hf(
hf_state_dict: dict[str, Any],
device_mesh: Optional[torch.distributed.device_mesh.DeviceMesh] = None,
**kwargs,
) → dict[str, Any]#

Convert an HF checkpoint to the native format. See the usage sketch after the list below.

  • Dequantize FP8 tensors if scale_inv buffers are provided

  • Aggregate per-expert weights into grouped tensors

  • If device_mesh is provided, only load experts needed for the current rank
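A usage sketch under the same assumptions as above; loading the HF shards with `safetensors` is illustrative, and `device_mesh` is only needed when expert loading should be restricted to the current rank.

```python
from pathlib import Path
from safetensors.torch import load_file

# Merge the HF checkpoint shards into one dict (illustrative; a production loader
# would typically stream shards to limit peak memory).
hf_sd = {}
for shard in sorted(Path("/path/to/DeepSeek-V3-hf").glob("*.safetensors")):
    hf_sd.update(load_file(shard))

# Dequantizes FP8 weights where *_scale_inv buffers exist, groups per-expert weights,
# and, if a device_mesh is passed, keeps only the experts owned by this rank.
native_sd = adapter.from_hf(hf_sd, device_mesh=None)
```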

nemo_automodel.components.models.deepseek_v3.state_dict_adapter.calculate_scale_shape(
weight: torch.Tensor,
BLOCK_SIZE: int = BLOCK_SIZE,
) → torch.Size#
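With 128×128 block quantization, the scale tensor carries one value per block, so its shape is the ceiling division of each weight dimension by BLOCK_SIZE. A reference sketch of that computation (not the module's implementation):

```python
import math
import torch

def calculate_scale_shape_sketch(weight: torch.Tensor, block_size: int = 128) -> torch.Size:
    # One scale per (block_size x block_size) tile: ceil-divide each dimension.
    rows, cols = weight.shape
    return torch.Size((math.ceil(rows / block_size), math.ceil(cols / block_size)))

# e.g. a (7168, 2048) weight maps to a (56, 16) scale_inv tensor
assert calculate_scale_shape_sketch(torch.empty(7168, 2048)) == torch.Size((56, 16))
```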
nemo_automodel.components.models.deepseek_v3.state_dict_adapter.dequantize_from_fp8(
weight: torch.Tensor,
scale_inv: torch.Tensor,
dtype=torch.bfloat16,
BLOCK_SIZE: int = BLOCK_SIZE,
) → torch.Tensor#
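A reference sketch of the corresponding block-wise dequantization, assuming one inverse scale per 128×128 tile that is multiplied back into the FP8 weight; the real function may handle edge cases differently.

```python
import torch

def dequantize_from_fp8_sketch(
    weight: torch.Tensor,      # FP8 weight of shape (rows, cols)
    scale_inv: torch.Tensor,   # per-block scales of shape (ceil(rows/128), ceil(cols/128))
    dtype: torch.dtype = torch.bfloat16,
    block_size: int = 128,
) -> torch.Tensor:
    rows, cols = weight.shape
    out = weight.to(torch.float32)  # upcast; this copies because the dtype changes
    for i in range(scale_inv.shape[0]):
        for j in range(scale_inv.shape[1]):
            r0, r1 = i * block_size, min((i + 1) * block_size, rows)
            c0, c1 = j * block_size, min((j + 1) * block_size, cols)
            # Rescale each 128x128 tile by its own inverse scale.
            out[r0:r1, c0:c1] *= scale_inv[i, j].to(torch.float32)
    return out.to(dtype)
```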