nemo_automodel.components.models.gpt_oss.state_dict_adapter#

Module Contents#

Classes#

GPTOSSStateDictAdapter

Data#

FP4_VALUES

API#

nemo_automodel.components.models.gpt_oss.state_dict_adapter.FP4_VALUES#

None

class nemo_automodel.components.models.gpt_oss.state_dict_adapter.GPTOSSStateDictAdapter(
config: transformers.GptOssConfig,
moe_config: nemo_automodel.components.moe.layers.MoEConfig,
backend: nemo_automodel.components.moe.utils.BackendConfig,
dtype: torch.dtype = torch.bfloat16,
)#

Bases: nemo_automodel.components.checkpoint.state_dict_adapter.StateDictAdapter
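
A minimal instantiation sketch is shown below. The `MoEConfig` and `BackendConfig` constructors are assumed to be default-constructible here purely for illustration; their actual required fields are defined in `nemo_automodel.components.moe`.

```python
# Illustrative wiring sketch; MoEConfig/BackendConfig arguments are assumptions.
import torch
from transformers import GptOssConfig

from nemo_automodel.components.models.gpt_oss.state_dict_adapter import GPTOSSStateDictAdapter
from nemo_automodel.components.moe.layers import MoEConfig
from nemo_automodel.components.moe.utils import BackendConfig

config = GptOssConfig()      # HF model hyperparameters
moe_config = MoEConfig()     # assumed default-constructible for this sketch
backend = BackendConfig()    # assumed default-constructible for this sketch

adapter = GPTOSSStateDictAdapter(config, moe_config, backend, dtype=torch.bfloat16)
```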

_apply_key_mapping(
state_dict: dict[str, Any],
mapping: dict[str, str],
) → dict[str, Any]#
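
The helper's body is not reproduced here; the sketch below only illustrates the general regex-based key-renaming pattern such a method typically implements. The mapping entries are hypothetical, not the adapter's real mapping table.

```python
import re
from typing import Any

def apply_key_mapping(state_dict: dict[str, Any], mapping: dict[str, str]) -> dict[str, Any]:
    """Illustrative stand-in: rename checkpoint keys via regex substitution."""
    out: dict[str, Any] = {}
    for key, value in state_dict.items():
        new_key = key
        for pattern, replacement in mapping.items():
            new_key = re.sub(pattern, replacement, new_key)
        out[new_key] = value
    return out

# Hypothetical HF-to-internal mapping entries, for illustration only:
mapping = {r"^model\.layers\.": "layers.", r"\.self_attn\.": ".attention."}
```
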
_add_quantization_block_scale_tensors(
state_dict: dict[str, Any],
) → dict[str, Any]#
_dequantize_block_scale_tensors(
state_dict: dict[str, Any],
) → dict[str, Any]#
_convert_moe_packed_tensors(
blocks,
scales,
dtype: torch.dtype = torch.bfloat16,
rows_per_chunk: int = 32768 * 1024,
) → torch.Tensor#

Convert the mxfp4 weights to bfloat16.

Source: https://github.com/huggingface/transformers/blob/869735d37d0f929311ac6611728c482a4414ba8c/src/transformers/integrations/mxfp4.py#L77
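
The referenced Transformers helper unpacks two 4-bit codes per byte, looks them up in the FP4 value table, and applies per-block power-of-two scales. Below is a simplified sketch of that pattern, not the chunked, memory-aware implementation linked above. The lookup table is the standard E2M1 (mxfp4) value set and is assumed to match this module's `FP4_VALUES`; the expected shapes are `blocks: (..., num_blocks, bytes_per_block)` (uint8) and `scales: (..., num_blocks)` (uint8, exponent bias 127).

```python
import torch

# E2M1 (mxfp4) representable values; assumed to mirror FP4_VALUES in this module.
FP4_VALUES = [
    +0.0, +0.5, +1.0, +1.5, +2.0, +3.0, +4.0, +6.0,
    -0.0, -0.5, -1.0, -1.5, -2.0, -3.0, -4.0, -6.0,
]

def dequant_mxfp4(blocks: torch.Tensor, scales: torch.Tensor,
                  dtype: torch.dtype = torch.bfloat16) -> torch.Tensor:
    """Simplified sketch: each uint8 in `blocks` packs two 4-bit codes;
    `scales` holds per-block uint8 exponents stored with a bias of 127."""
    lut = torch.tensor(FP4_VALUES, dtype=dtype, device=blocks.device)
    lo = blocks & 0x0F                               # low nibble -> even positions
    hi = blocks >> 4                                 # high nibble -> odd positions
    vals = torch.stack([lut[lo.long()], lut[hi.long()]], dim=-1).flatten(-2)
    exp = scales.to(torch.int32) - 127               # remove exponent bias
    return torch.ldexp(vals, exp.unsqueeze(-1)).to(dtype)  # vals * 2**exp per block
```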

to_hf(
state_dict: dict[str, Any],
exclude_key_regex: Optional[str] = None,
quantization: bool = False,
**kwargs,
) → dict[str, Any]#

Convert from native model state dict to HuggingFace format.
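
Typical usage is to run the adapter (constructed above) over a native state dict before saving a HuggingFace-compatible checkpoint. The `model` variable and the exclusion regex below are illustrative.

```python
# Illustrative usage; `model` is a native GPT-OSS module (construction not shown).
native_sd = model.state_dict()

hf_sd = adapter.to_hf(
    native_sd,
    exclude_key_regex=r".*rotary_emb.*",  # hypothetical pattern for keys to drop
    quantization=False,                   # assumption: True also emits block/scale tensors
)
```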

from_hf(
hf_state_dict: dict[str, Any],
device_mesh: Optional[torch.distributed.device_mesh.DeviceMesh] = None,
**kwargs,
) → dict[str, Any]#

Convert HF checkpoint to native format.

  • Apply key mappings from HF to internal format

  • Add quantization block and scale tensors
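
In the other direction, an HF-format state dict can be remapped into the native format with the same adapter. The loading path below is only one way to obtain an HF state dict; the checkpoint id and `device_mesh=None` are illustrative.

```python
from transformers import AutoModelForCausalLM

# Illustrative: load an HF-format gpt-oss checkpoint to obtain its state dict.
hf_model = AutoModelForCausalLM.from_pretrained("openai/gpt-oss-20b")
hf_sd = hf_model.state_dict()

# Remap keys to the internal naming scheme; device_mesh is optional.
native_sd = adapter.from_hf(hf_sd, device_mesh=None)
```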