nemo_automodel.components.models.mistral3.model#

Module Contents#

API#

nemo_automodel.components.models.mistral3.model.logger#

'get_logger(...)'

class nemo_automodel.components.models.mistral3.model.Ministral3Config(
vocab_size: Optional[int] = 131072,
hidden_size: Optional[int] = 4096,
intermediate_size: Optional[int] = 14336,
num_hidden_layers: Optional[int] = 34,
num_attention_heads: Optional[int] = 32,
num_key_value_heads: Optional[int] = 8,
head_dim: Optional[int] = 128,
hidden_act: Optional[str] = 'silu',
max_position_embeddings: Optional[int] = 262144,
initializer_range: Optional[float] = 0.02,
rms_norm_eps: Optional[float] = 1e-05,
use_cache: Optional[bool] = True,
pad_token_id: Optional[int] = 11,
bos_token_id: Optional[int] = 1,
eos_token_id: Optional[int] = 2,
tie_word_embeddings: Optional[bool] = False,
rope_parameters: Optional[dict] = None,
sliding_window: Optional[int] = None,
attention_dropout: Optional[float] = 0.0,
**kwargs,
)#

Bases: transformers.configuration_utils.PretrainedConfig

Configuration for the Ministral3 text decoder.

Initialization

model_type#

'ministral3'

keys_to_ignore_at_inference#

['past_key_values']

base_model_tp_plan#

None

base_model_pp_plan#

None
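
The signature above lists the full set of constructor arguments. A minimal usage sketch, assuming only the parameters documented in that signature, with illustrative values:

```python
from nemo_automodel.components.models.mistral3.model import Ministral3Config

# Construct a smaller-than-default configuration; every keyword below comes
# from the documented signature.
config = Ministral3Config(
    hidden_size=1024,          # default is 4096
    intermediate_size=4096,    # default is 14336
    num_hidden_layers=4,
    num_attention_heads=8,
    num_key_value_heads=2,     # grouped-query attention: 4 query heads share each KV head
    head_dim=128,
)
assert config.model_type == "ministral3"
```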

nemo_automodel.components.models.mistral3.model.rotate_half(x)#
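
A minimal sketch of the conventional `rotate_half` used for rotary embeddings in Hugging Face-style models, assuming this module follows the same convention (split the last dimension in half, negate the second half, swap the halves):

```python
import torch

def rotate_half_sketch(x: torch.Tensor) -> torch.Tensor:
    # Rotate half of the hidden dims: (x1, x2) -> (-x2, x1).
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)
```
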
class nemo_automodel.components.models.mistral3.model.Ministral3PreTrainedModel#

Bases: transformers.modeling_utils.PreTrainedModel

config: nemo_automodel.components.models.mistral3.model.Ministral3Config#

None

base_model_prefix#

'model'

supports_gradient_checkpointing#

True

_no_split_modules#

['Ministral3DecoderLayer']

_skip_keys_device_placement#

['past_key_values']

_supports_flash_attn#

True

_supports_sdpa#

True

_supports_flex_attn#

True

_can_compile_fullgraph#

True

_supports_attention_backend#

True

_can_record_outputs#

None

class nemo_automodel.components.models.mistral3.model.Ministral3ModelOutputWithPast#

Bases: transformers.modeling_outputs.BaseModelOutputWithPast

image_hidden_states: Optional[torch.FloatTensor]#

None

class nemo_automodel.components.models.mistral3.model.Ministral3CausalLMOutputWithPast#

Bases: transformers.modeling_outputs.CausalLMOutputWithPast

class nemo_automodel.components.models.mistral3.model.Ministral3RotaryEmbedding(
config: nemo_automodel.components.models.mistral3.model.Ministral3Config,
device=None,
)#

Bases: torch.nn.Module

Initialization

inv_freq: torch.Tensor#

None

static compute_default_rope_parameters(
config: Optional[nemo_automodel.components.models.mistral3.model.Ministral3Config] = None,
device: Optional[torch.device] = None,
seq_len: Optional[int] = None,
) → tuple[torch.Tensor, float]#
forward(x, position_ids)#
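
A hedged sketch of what `compute_default_rope_parameters` typically produces: inverse frequencies `1 / base**(2i / head_dim)`, from which `forward` derives the `(cos, sin)` pair. The `head_dim` and `base` values below are illustrative placeholders; the real values come from the config (e.g. `head_dim` and `rope_parameters`):

```python
import torch

def default_inv_freq(head_dim: int, base: float) -> torch.Tensor:
    # inv_freq[i] = 1 / base**(2i / head_dim), one frequency per channel pair
    exponents = torch.arange(0, head_dim, 2, dtype=torch.float32) / head_dim
    return 1.0 / (base ** exponents)

inv_freq = default_inv_freq(head_dim=128, base=1_000_000.0)   # base is a placeholder value
position_ids = torch.arange(16, dtype=torch.float32)
freqs = torch.outer(position_ids, inv_freq)                   # (seq_len, head_dim // 2)
emb = torch.cat((freqs, freqs), dim=-1)                       # (seq_len, head_dim)
cos, sin = emb.cos(), emb.sin()                               # consumed by apply_rotary_pos_emb
```
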
nemo_automodel.components.models.mistral3.model.apply_rotary_pos_emb(
q,
k,
cos,
sin,
position_ids=None,
unsqueeze_dim=1,
)#
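
A minimal sketch of the usual rotary application this function implements in Hugging Face-style models, reusing the module's own `rotate_half`; the documented signature also accepts a `position_ids` argument, which is not needed in this sketch:

```python
from nemo_automodel.components.models.mistral3.model import rotate_half

def apply_rotary_sketch(q, k, cos, sin, unsqueeze_dim=1):
    # Broadcast cos/sin over the head dimension, then mix each tensor with its
    # half-rotated counterpart: x_embed = x * cos + rotate_half(x) * sin.
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed
```
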
nemo_automodel.components.models.mistral3.model.repeat_kv(hidden_states: torch.Tensor, n_rep: int) → torch.Tensor#
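
A hedged sketch of the standard `repeat_kv` behavior: expand `(batch, num_kv_heads, seq, head_dim)` to `(batch, num_kv_heads * n_rep, seq, head_dim)` so grouped-query attention can reuse each key/value head for `n_rep` query heads:

```python
import torch

def repeat_kv_sketch(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    batch, num_kv_heads, seq_len, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    # Insert a repeat axis, broadcast, then fold it into the head axis.
    hidden_states = hidden_states[:, :, None, :, :].expand(
        batch, num_kv_heads, n_rep, seq_len, head_dim
    )
    return hidden_states.reshape(batch, num_kv_heads * n_rep, seq_len, head_dim)
```
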
nemo_automodel.components.models.mistral3.model.eager_attention_forward(
module: torch.nn.Module,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
attention_mask: Optional[torch.Tensor],
scaling: float,
dropout: float = 0.0,
**kwargs: transformers.processing_utils.Unpack[transformers.utils.TransformersKwargs],
)#
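
A hedged sketch of the standard eager attention path (scaled dot product, additive mask, float32 softmax, dropout, weighted sum). The real `eager_attention_forward` typically also expands key/value heads with `repeat_kv` and reads attributes such as `num_key_value_groups` from `module`; those details (and the attribute name) are assumptions omitted here:

```python
import torch
from torch import nn

def eager_attention_sketch(query, key, value, attention_mask, scaling, dropout=0.0, training=False):
    attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling
    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask[:, :, :, : key.shape[-2]]
    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=training)
    attn_output = torch.matmul(attn_weights, value)          # (batch, heads, seq, head_dim)
    return attn_output.transpose(1, 2).contiguous(), attn_weights
```
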
nemo_automodel.components.models.mistral3.model._get_llama_4_attn_scale(
positions_ids: torch.Tensor,
beta: float,
max_position_embeddings: int,
) → torch.Tensor#
class nemo_automodel.components.models.mistral3.model.Ministral3Attention(
config: nemo_automodel.components.models.mistral3.model.Ministral3Config,
layer_idx: int,
)#

Bases: torch.nn.Module

Initialization

forward(
hidden_states: torch.Tensor,
position_embeddings: tuple[torch.Tensor, torch.Tensor],
attention_mask: Optional[torch.Tensor],
past_key_values: Optional[transformers.cache_utils.Cache] = None,
cache_position: Optional[torch.LongTensor] = None,
**kwargs: transformers.processing_utils.Unpack[transformers.modeling_flash_attention_utils.FlashAttentionKwargs],
) → tuple[torch.Tensor, Optional[torch.Tensor]]#
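
A usage sketch for running a single attention layer in isolation, assuming the rotary embedding module produces the `(cos, sin)` pair the layer expects. The tiny config values are illustrative only:

```python
import torch
from nemo_automodel.components.models.mistral3.model import (
    Ministral3Attention,
    Ministral3Config,
    Ministral3RotaryEmbedding,
)

config = Ministral3Config(
    hidden_size=512, intermediate_size=1024, num_hidden_layers=2,
    num_attention_heads=8, num_key_value_heads=2, head_dim=64,
)
attn = Ministral3Attention(config, layer_idx=0)
rope = Ministral3RotaryEmbedding(config)

hidden_states = torch.randn(1, 16, config.hidden_size)
position_ids = torch.arange(16).unsqueeze(0)
cos, sin = rope(hidden_states, position_ids)

attn_out, attn_weights = attn(
    hidden_states,
    position_embeddings=(cos, sin),
    attention_mask=None,
)
print(attn_out.shape)  # expected: (1, 16, 512)
```
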
class nemo_automodel.components.models.mistral3.model.Ministral3MLP(
config: nemo_automodel.components.models.mistral3.model.Ministral3Config,
)#

Bases: torch.nn.Module

Initialization

forward(x)#
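
Given `hidden_act='silu'` in the config above, the MLP is presumably the usual Mistral-style gated block. A minimal sketch, with the projection names (`gate_proj`, `up_proj`, `down_proj`) assumed from that convention rather than taken from this page:

```python
import torch
from torch import nn

class GatedMLPSketch(nn.Module):
    def __init__(self, hidden_size: int, intermediate_size: int):
        super().__init__()
        self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
        self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
        self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
        self.act_fn = nn.SiLU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # down_proj(silu(gate_proj(x)) * up_proj(x))
        return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
```
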
class nemo_automodel.components.models.mistral3.model.Ministral3RMSNorm(hidden_size, eps=1e-06)#

Bases: torch.nn.Module

Initialization

forward(hidden_states)#
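
A minimal sketch of RMS normalization as it is usually implemented for this model family: compute the reciprocal root mean square over the last dimension in float32, rescale, then apply a learned per-channel weight. The documented layer uses `eps=1e-06` by default:

```python
import torch
from torch import nn

class RMSNormSketch(nn.Module):
    def __init__(self, hidden_size: int, eps: float = 1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)
```
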
class nemo_automodel.components.models.mistral3.model.GradientCheckpointingLayer#

Bases: torch.nn.Module

abstractmethod forward(*args, **kwargs)#
class nemo_automodel.components.models.mistral3.model.Ministral3DecoderLayer(
config: nemo_automodel.components.models.mistral3.model.Ministral3Config,
layer_idx: int,
)#

Bases: nemo_automodel.components.models.mistral3.model.GradientCheckpointingLayer

Initialization

forward(
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[transformers.cache_utils.Cache] = None,
use_cache: Optional[bool] = False,
cache_position: Optional[torch.LongTensor] = None,
position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
**kwargs: transformers.processing_utils.Unpack[transformers.utils.TransformersKwargs],
) → torch.Tensor#
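
A hedged sketch of the pre-norm data flow a decoder layer of this kind normally follows; the submodule names (`input_layernorm`, `self_attn`, `post_attention_layernorm`, `mlp`) are the common Mistral-style names and are assumptions here, not taken from this page:

```python
def decoder_layer_flow_sketch(layer, hidden_states, position_embeddings, attention_mask=None):
    # Self-attention sub-block with residual connection.
    residual = hidden_states
    hidden_states = layer.input_layernorm(hidden_states)
    hidden_states, _ = layer.self_attn(
        hidden_states,
        position_embeddings=position_embeddings,
        attention_mask=attention_mask,
    )
    hidden_states = residual + hidden_states

    # Feed-forward sub-block with residual connection.
    residual = hidden_states
    hidden_states = layer.post_attention_layernorm(hidden_states)
    hidden_states = layer.mlp(hidden_states)
    return residual + hidden_states
```
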
class nemo_automodel.components.models.mistral3.model.Ministral3Model(
config: nemo_automodel.components.models.mistral3.model.Ministral3Config,
)#

Bases: nemo_automodel.components.models.mistral3.model.Ministral3PreTrainedModel

Initialization

forward(
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[transformers.cache_utils.Cache] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
cache_position: Optional[torch.LongTensor] = None,
**kwargs: transformers.processing_utils.Unpack[transformers.utils.TransformersKwargs],
) → transformers.modeling_outputs.BaseModelOutputWithPast#
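
A usage sketch for the bare decoder, assuming a tiny illustrative configuration and leaving `attention_mask`, `position_ids`, and the cache at their documented defaults:

```python
import torch
from nemo_automodel.components.models.mistral3.model import Ministral3Config, Ministral3Model

config = Ministral3Config(
    vocab_size=1024, hidden_size=256, intermediate_size=512,
    num_hidden_layers=2, num_attention_heads=4, num_key_value_heads=2, head_dim=64,
)
model = Ministral3Model(config)

input_ids = torch.randint(0, config.vocab_size, (1, 8))
outputs = model(input_ids=input_ids, use_cache=False)
print(outputs.last_hidden_state.shape)  # expected: (1, 8, 256)
```
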
class nemo_automodel.components.models.mistral3.model.Ministral3ForCausalLM(
config: nemo_automodel.components.models.mistral3.model.Ministral3Config,
)#

Bases: nemo_automodel.components.models.mistral3.model.Ministral3PreTrainedModel, transformers.generation.GenerationMixin

Initialization

_tied_weights_keys#

None

_tp_plan#

None

_pp_plan#

None

forward(
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[transformers.cache_utils.Cache] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
cache_position: Optional[torch.LongTensor] = None,
logits_to_keep: Union[int, torch.Tensor] = 0,
**kwargs: transformers.processing_utils.Unpack[transformers.utils.TransformersKwargs],
) → transformers.modeling_outputs.CausalLMOutputWithPast#
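
A training-style usage sketch: passing `labels` makes the standard causal-LM loss available on the output, and `logits_to_keep=0` (the default) keeps logits for every position. The tiny config values are illustrative only:

```python
import torch
from nemo_automodel.components.models.mistral3.model import Ministral3Config, Ministral3ForCausalLM

config = Ministral3Config(
    vocab_size=1024, hidden_size=256, intermediate_size=512,
    num_hidden_layers=2, num_attention_heads=4, num_key_value_heads=2, head_dim=64,
)
lm = Ministral3ForCausalLM(config)

input_ids = torch.randint(0, config.vocab_size, (1, 8))
out = lm(input_ids=input_ids, labels=input_ids, use_cache=False)
print(out.loss, out.logits.shape)  # logits expected shape: (1, 8, 1024)
```
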
nemo_automodel.components.models.mistral3.model.mod_pkg#

'setdefault(...)'

nemo_automodel.components.models.mistral3.model.config_mod#

'ModuleType(...)'

nemo_automodel.components.models.mistral3.model.modeling_mod#

'ModuleType(...)'

nemo_automodel.components.models.mistral3.model._orig_auto_from_config#

None

nemo_automodel.components.models.mistral3.model._patched_from_config(cls, config, *model_args, **kwargs)#
nemo_automodel.components.models.mistral3.model.ModelClass#

None
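
The module-level shims above (`mod_pkg`, `config_mod`, `modeling_mod`, `_patched_from_config`, `ModelClass`) suggest the model is wired into the transformers auto-class machinery. The snippet below shows the generic public registration pattern for a custom config/model pair; it illustrates that pattern only and is not necessarily how this module performs its patching internally:

```python
from transformers import AutoConfig, AutoModelForCausalLM

from nemo_automodel.components.models.mistral3.model import Ministral3Config, Ministral3ForCausalLM

# Register the config under its model_type, then map the config class to the
# causal-LM class. Registration raises if the model type is already known.
AutoConfig.register("ministral3", Ministral3Config)
AutoModelForCausalLM.register(Ministral3Config, Ministral3ForCausalLM)

config = AutoConfig.for_model("ministral3")
model = AutoModelForCausalLM.from_config(config)
```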