nemo_automodel.components.models.mistral3.model#
Module Contents#
Classes#
Ministral3Config: Configuration for Ministral3 text decoder.
Functions#
Data#
API#
- nemo_automodel.components.models.mistral3.model.logger#
‘get_logger(…)’
- class nemo_automodel.components.models.mistral3.model.Ministral3Config(
- vocab_size: Optional[int] = 131072,
- hidden_size: Optional[int] = 4096,
- intermediate_size: Optional[int] = 14336,
- num_hidden_layers: Optional[int] = 34,
- num_attention_heads: Optional[int] = 32,
- num_key_value_heads: Optional[int] = 8,
- head_dim: Optional[int] = 128,
- hidden_act: Optional[str] = 'silu',
- max_position_embeddings: Optional[int] = 262144,
- initializer_range: Optional[float] = 0.02,
- rms_norm_eps: Optional[float] = 1e-05,
- use_cache: Optional[bool] = True,
- pad_token_id: Optional[int] = 11,
- bos_token_id: Optional[int] = 1,
- eos_token_id: Optional[int] = 2,
- tie_word_embeddings: Optional[bool] = False,
- rope_parameters: Optional[dict] = None,
- sliding_window: Optional[int] = None,
- attention_dropout: Optional[float] = 0.0,
- **kwargs,
- )#
Bases:
transformers.configuration_utils.PretrainedConfig
Configuration for Ministral3 text decoder.
Initialization
- model_type#
‘ministral3’
- keys_to_ignore_at_inference#
[‘past_key_values’]
- base_model_tp_plan#
None
- base_model_pp_plan#
None
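For orientation, the config can be constructed directly with the keyword arguments listed above. The sketch below simply restates a few documented defaults explicitly, so omitting any of them gives the same result.

```python
from nemo_automodel.components.models.mistral3.model import Ministral3Config

# Restating a few documented defaults; omitted fields keep their defaults.
config = Ministral3Config(
    vocab_size=131072,
    hidden_size=4096,
    intermediate_size=14336,
    num_hidden_layers=34,
    num_attention_heads=32,
    num_key_value_heads=8,   # grouped-query attention: 32 query heads share 8 KV heads
    head_dim=128,
    max_position_embeddings=262144,
    rms_norm_eps=1e-05,
    tie_word_embeddings=False,
)

print(config.model_type)  # 'ministral3'
```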
- nemo_automodel.components.models.mistral3.model.rotate_half(x)#
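No docstring is rendered for rotate_half. In Hugging Face-style rotary embeddings this helper conventionally splits the last dimension in half and returns (-x2, x1); a minimal sketch of that convention, assuming this implementation follows it:

```python
import torch

def rotate_half_sketch(x: torch.Tensor) -> torch.Tensor:
    """Rotate half the hidden dims of the input: (x1, x2) -> (-x2, x1) on the last axis."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)
```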
- class nemo_automodel.components.models.mistral3.model.Ministral3PreTrainedModel#
Bases:
transformers.modeling_utils.PreTrainedModel
- base_model_prefix#
‘model’
- supports_gradient_checkpointing#
True
- _no_split_modules#
[‘Ministral3DecoderLayer’]
- _skip_keys_device_placement#
[‘past_key_values’]
- _supports_flash_attn#
True
- _supports_sdpa#
True
- _supports_flex_attn#
True
- _can_compile_fullgraph#
True
- _supports_attention_backend#
True
- _can_record_outputs#
None
- class nemo_automodel.components.models.mistral3.model.Ministral3ModelOutputWithPast#
Bases:
transformers.modeling_outputs.BaseModelOutputWithPast
- class nemo_automodel.components.models.mistral3.model.Ministral3CausalLMOutputWithPast#
Bases:
transformers.modeling_outputs.CausalLMOutputWithPast
- class nemo_automodel.components.models.mistral3.model.Ministral3RotaryEmbedding(
- config: nemo_automodel.components.models.mistral3.model.Ministral3Config,
- device=None,
- )#
Bases:
torch.nn.Module
Initialization
- inv_freq: torch.Tensor#
None
- static compute_default_rope_parameters(
- config: Optional[nemo_automodel.components.models.mistral3.model.Ministral3Config] = None,
- device: Optional[torch.device] = None,
- seq_len: Optional[int] = None,
- )#
- forward(x, position_ids)#
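A usage sketch for the rotary embedding, assuming the usual Transformers convention that forward(x, position_ids) returns a (cos, sin) pair (x is consulted only for device/dtype) and that the default config populates the RoPE parameters:

```python
import torch
from nemo_automodel.components.models.mistral3.model import (
    Ministral3Config,
    Ministral3RotaryEmbedding,
)

config = Ministral3Config()
rope = Ministral3RotaryEmbedding(config)

batch, seq_len = 2, 16
x = torch.randn(batch, seq_len, config.hidden_size)
position_ids = torch.arange(seq_len).unsqueeze(0).expand(batch, -1)

# Assumed return: a (cos, sin) pair ready for apply_rotary_pos_emb.
cos, sin = rope(x, position_ids)
```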
- nemo_automodel.components.models.mistral3.model.apply_rotary_pos_emb(
- q,
- k,
- cos,
- sin,
- position_ids=None,
- unsqueeze_dim=1,
- )#
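apply_rotary_pos_emb conventionally applies the (cos, sin) pair to queries and keys via rotate_half, with unsqueeze_dim=1 broadcasting over the head dimension; a sketch of that convention (assumed, not a verbatim copy of this function):

```python
import torch
from nemo_automodel.components.models.mistral3.model import rotate_half

def apply_rotary_pos_emb_sketch(q, k, cos, sin, unsqueeze_dim=1):
    # Broadcast (batch, seq, head_dim) cos/sin over the head axis of (batch, heads, seq, head_dim).
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed
```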
- nemo_automodel.components.models.mistral3.model.repeat_kv(hidden_states: torch.Tensor, n_rep: int) → torch.Tensor#
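repeat_kv expands grouped key/value heads so each query head has a matching KV head; the standard Transformers implementation, which this function presumably mirrors, is:

```python
import torch

def repeat_kv_sketch(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """(batch, num_kv_heads, seq, head_dim) -> (batch, num_kv_heads * n_rep, seq, head_dim)."""
    batch, num_kv_heads, seq_len, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(
        batch, num_kv_heads, n_rep, seq_len, head_dim
    )
    return hidden_states.reshape(batch, num_kv_heads * n_rep, seq_len, head_dim)
```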
- nemo_automodel.components.models.mistral3.model.eager_attention_forward(
- module: torch.nn.Module,
- query: torch.Tensor,
- key: torch.Tensor,
- value: torch.Tensor,
- attention_mask: Optional[torch.Tensor],
- scaling: float,
- dropout: float = 0.0,
- **kwargs: transformers.processing_utils.Unpack[transformers.utils.TransformersKwargs],
- )#
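This signature matches the eager attention path used across recent Transformers models; the sketch below follows that path and assumes module exposes num_key_value_groups and training, as Hugging Face attention modules do:

```python
import torch
from torch import nn
from nemo_automodel.components.models.mistral3.model import repeat_kv

def eager_attention_sketch(module, query, key, value, attention_mask, scaling, dropout=0.0):
    # Expand grouped KV heads to line up with the query heads.
    key_states = repeat_kv(key, module.num_key_value_groups)
    value_states = repeat_kv(value, module.num_key_value_groups)

    # Scaled dot-product scores, additive mask, softmax in fp32, optional dropout.
    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask[:, :, :, : key_states.shape[-2]]
    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    attn_output = torch.matmul(attn_weights, value_states)
    # (batch, heads, seq, head_dim) -> (batch, seq, heads, head_dim) for the output projection.
    attn_output = attn_output.transpose(1, 2).contiguous()
    return attn_output, attn_weights
```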
- nemo_automodel.components.models.mistral3.model._get_llama_4_attn_scale(
- positions_ids: torch.Tensor,
- beta: float,
- max_position_embeddings: int,
- )#
- class nemo_automodel.components.models.mistral3.model.Ministral3Attention(
- config: nemo_automodel.components.models.mistral3.model.Ministral3Config,
- layer_idx: int,
- )#
Bases:
torch.nn.Module
Initialization
- forward(
- hidden_states: torch.Tensor,
- position_embeddings: tuple[torch.Tensor, torch.Tensor],
- attention_mask: Optional[torch.Tensor],
- past_key_values: Optional[transformers.cache_utils.Cache] = None,
- cache_position: Optional[torch.LongTensor] = None,
- **kwargs: transformers.processing_utils.Unpack[transformers.modeling_flash_attention_utils.FlashAttentionKwargs],
- )#
- class nemo_automodel.components.models.mistral3.model.Ministral3MLP( )#
Bases:
torch.nn.Module
Initialization
- forward(x)#
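The rendered signature hides the constructor arguments, but Mistral-family MLPs are conventionally SiLU-gated (matching hidden_act='silu' in the config). The sketch below shows that conventional structure; the layer names gate_proj, up_proj, and down_proj are assumptions for illustration:

```python
import torch
from torch import nn

class GatedSiluMLP(nn.Module):
    """Illustrative stand-in for Ministral3MLP (assumed structure, not the actual class)."""

    def __init__(self, hidden_size: int, intermediate_size: int):
        super().__init__()
        self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
        self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
        self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
        self.act_fn = nn.SiLU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # SwiGLU-style gating: silu(gate(x)) * up(x), then project back to hidden_size.
        return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
```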
- class nemo_automodel.components.models.mistral3.model.Ministral3RMSNorm(hidden_size, eps=1e-06)#
Bases:
torch.nn.Module
Initialization
- forward(hidden_states)#
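Ministral3RMSNorm(hidden_size, eps=1e-06) follows the standard RMSNorm recipe: normalize by the root mean square over the last dimension, then scale by a learned weight. A sketch, assumed to match this implementation:

```python
import torch
from torch import nn

class RMSNormSketch(nn.Module):
    """Root-mean-square layer norm: x / sqrt(mean(x^2) + eps), scaled by a learned weight."""

    def __init__(self, hidden_size: int, eps: float = 1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)  # compute statistics in fp32
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)
```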
- class nemo_automodel.components.models.mistral3.model.GradientCheckpointingLayer#
Bases:
torch.nn.Module
- abstractmethod forward(*args, **kwargs)#
- class nemo_automodel.components.models.mistral3.model.Ministral3DecoderLayer(
- config: nemo_automodel.components.models.mistral3.model.Ministral3Config,
- layer_idx: int,
- )#
Bases:
nemo_automodel.components.models.mistral3.model.GradientCheckpointingLayer
Initialization
- forward(
- hidden_states: torch.Tensor,
- attention_mask: Optional[torch.Tensor] = None,
- position_ids: Optional[torch.LongTensor] = None,
- past_key_values: Optional[transformers.cache_utils.Cache] = None,
- use_cache: Optional[bool] = False,
- cache_position: Optional[torch.LongTensor] = None,
- position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
- **kwargs: transformers.processing_utils.Unpack[transformers.utils.TransformersKwargs],
- )#
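A minimal call sketch based on the forward signature above. It assumes a small config is acceptable, that the rotary embedding returns a (cos, sin) pair, and that the layer's return value is (or contains) the updated hidden states:

```python
import torch
from nemo_automodel.components.models.mistral3.model import (
    Ministral3Config,
    Ministral3DecoderLayer,
    Ministral3RotaryEmbedding,
)

# Deliberately tiny config for illustration; field names come from the documented constructor.
config = Ministral3Config(
    hidden_size=256, intermediate_size=512, num_hidden_layers=1,
    num_attention_heads=8, num_key_value_heads=4, head_dim=32,
)
layer = Ministral3DecoderLayer(config, layer_idx=0)
rope = Ministral3RotaryEmbedding(config)

batch, seq_len = 1, 8
hidden_states = torch.randn(batch, seq_len, config.hidden_size)
position_ids = torch.arange(seq_len).unsqueeze(0)
cos, sin = rope(hidden_states, position_ids)  # assumed (cos, sin) convention

layer_out = layer(
    hidden_states,
    position_ids=position_ids,
    position_embeddings=(cos, sin),
    use_cache=False,
)  # may be a tensor or a tuple whose first element is the hidden states
```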
- class nemo_automodel.components.models.mistral3.model.Ministral3Model( )#
Bases:
nemo_automodel.components.models.mistral3.model.Ministral3PreTrainedModel
Initialization
- forward(
- input_ids: Optional[torch.LongTensor] = None,
- attention_mask: Optional[torch.Tensor] = None,
- position_ids: Optional[torch.LongTensor] = None,
- past_key_values: Optional[transformers.cache_utils.Cache] = None,
- inputs_embeds: Optional[torch.FloatTensor] = None,
- use_cache: Optional[bool] = None,
- cache_position: Optional[torch.LongTensor] = None,
- **kwargs: transformers.processing_utils.Unpack[transformers.utils.TransformersKwargs],
- )#
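A usage sketch for the bare decoder stack, assuming the usual PreTrainedModel constructor that takes a config and that forward returns a BaseModelOutputWithPast-style object; the tiny config values are illustrative only:

```python
import torch
from nemo_automodel.components.models.mistral3.model import Ministral3Config, Ministral3Model

config = Ministral3Config(
    hidden_size=256, intermediate_size=512, num_hidden_layers=2,
    num_attention_heads=8, num_key_value_heads=4, head_dim=32,
)
model = Ministral3Model(config)

input_ids = torch.randint(0, config.vocab_size, (1, 8))
attention_mask = torch.ones_like(input_ids)

outputs = model(input_ids=input_ids, attention_mask=attention_mask, use_cache=False)
hidden = outputs.last_hidden_state  # (1, 8, hidden_size), per BaseModelOutputWithPast
```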
- class nemo_automodel.components.models.mistral3.model.Ministral3ForCausalLM( )#
Bases:
nemo_automodel.components.models.mistral3.model.Ministral3PreTrainedModel, transformers.generation.GenerationMixin
Initialization
- _tied_weights_keys#
None
- _tp_plan#
None
- _pp_plan#
None
- forward(
- input_ids: Optional[torch.LongTensor] = None,
- attention_mask: Optional[torch.Tensor] = None,
- position_ids: Optional[torch.LongTensor] = None,
- past_key_values: Optional[transformers.cache_utils.Cache] = None,
- inputs_embeds: Optional[torch.FloatTensor] = None,
- labels: Optional[torch.LongTensor] = None,
- use_cache: Optional[bool] = None,
- cache_position: Optional[torch.LongTensor] = None,
- logits_to_keep: Union[int, torch.Tensor] = 0,
- **kwargs: transformers.processing_utils.Unpack[transformers.utils.TransformersKwargs],
- )#
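A training-style call sketch for the causal LM head, assuming the usual Transformers semantics: passing labels yields a loss, and logits_to_keep=1 keeps logits only for the final position. The tiny config is illustrative:

```python
import torch
from nemo_automodel.components.models.mistral3.model import Ministral3Config, Ministral3ForCausalLM

config = Ministral3Config(
    hidden_size=256, intermediate_size=512, num_hidden_layers=2,
    num_attention_heads=8, num_key_value_heads=4, head_dim=32,
)
model = Ministral3ForCausalLM(config)

input_ids = torch.randint(0, config.vocab_size, (1, 8))
labels = input_ids.clone()

out = model(input_ids=input_ids, labels=labels, logits_to_keep=1)
print(out.loss)          # scalar LM loss (CausalLMOutputWithPast) when labels are given
print(out.logits.shape)  # logits for the last position only when logits_to_keep=1
```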
- nemo_automodel.components.models.mistral3.model.mod_pkg#
‘setdefault(…)’
- nemo_automodel.components.models.mistral3.model.config_mod#
‘ModuleType(…)’
- nemo_automodel.components.models.mistral3.model.modeling_mod#
‘ModuleType(…)’
- nemo_automodel.components.models.mistral3.model._orig_auto_from_config#
None
- nemo_automodel.components.models.mistral3.model._patched_from_config(cls, config, *model_args, **kwargs)#
- nemo_automodel.components.models.mistral3.model.ModelClass#
None