nemo_automodel.components.models.deepseek_v3.model#

Module Contents#

Classes#

Block

DeepseekV3Model

DeepseekV3ForCausalLM

API#

class nemo_automodel.components.models.deepseek_v3.model.Block(
layer_idx: int,
config: transformers.models.deepseek_v3.configuration_deepseek_v3.DeepseekV3Config,
moe_config: nemo_automodel.components.moe.layers.MoEConfig,
backend: nemo_automodel.components.moe.utils.BackendConfig,
)#

Bases: torch.nn.Module

Initialization

forward(
x: torch.Tensor,
freqs_cis: torch.Tensor,
attention_mask: torch.Tensor | None = None,
padding_mask: torch.Tensor | None = None,
seq_lens: torch.Tensor | None = None,
**attn_kwargs: Any,
) tuple[torch.Tensor, torch.Tensor | None]#

Forward pass for the Transformer block.

Parameters:
  • x (torch.Tensor) – Input tensor.

  • freqs_cis (torch.Tensor) – Precomputed complex exponential values for rotary embeddings.

  • attention_mask (torch.Tensor | None) – Optional attention mask forwarded to the attention layer.

  • padding_mask (torch.Tensor | None) – Optional boolean tensor indicating padding positions.

  • seq_lens (torch.Tensor | None) – Optional per-sample sequence lengths passed through to the attention implementation.

  • attn_kwargs (Any) – Additional keyword arguments forwarded to the attention layer.

Returns:

A tuple of the output tensor after block computation and the auxiliary load-balancing loss, which is None when not applicable.

Return type:

tuple[torch.Tensor, torch.Tensor | None]

_mlp(x: torch.Tensor, padding_mask: torch.Tensor) torch.Tensor#

init_weights(buffer_device: torch.device)#
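
A minimal usage sketch for Block follows. It assumes MoEConfig and BackendConfig are default-constructible (check nemo_automodel.components.moe for their actual required fields) and leaves freqs_cis as a placeholder, since in practice the parent DeepseekV3Model precomputes it from its rotary embedding; the sketch is illustrative, not runnable as-is.

```python
# Illustrative sketch only -- in practice Block is built and driven by
# DeepseekV3Model, which supplies freqs_cis from its rotary-embedding precompute.
import torch
from transformers.models.deepseek_v3.configuration_deepseek_v3 import DeepseekV3Config
from nemo_automodel.components.moe.layers import MoEConfig
from nemo_automodel.components.moe.utils import BackendConfig
from nemo_automodel.components.models.deepseek_v3.model import Block

config = DeepseekV3Config()   # HF config with library defaults
moe_config = MoEConfig()      # assumption: default-constructible
backend = BackendConfig()     # assumption: default-constructible

block = Block(layer_idx=0, config=config, moe_config=moe_config, backend=backend)
block.init_weights(buffer_device=torch.device("cpu"))

x = torch.randn(2, 16, config.hidden_size)  # [batch, seq, hidden]
freqs_cis = ...  # precomputed rotary values, normally provided by the parent model

# forward returns a tuple; aux_loss is None when load balancing does not apply
hidden, aux_loss = block(x, freqs_cis)
```
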
class nemo_automodel.components.models.deepseek_v3.model.DeepseekV3Model(
config: transformers.models.deepseek_v3.configuration_deepseek_v3.DeepseekV3Config,
backend: nemo_automodel.components.moe.utils.BackendConfig,
*,
moe_config: nemo_automodel.components.moe.layers.MoEConfig | None = None,
)#

Bases: torch.nn.Module

Initialization

forward(
input_ids: torch.Tensor,
*,
position_ids: torch.Tensor | None = None,
attention_mask: torch.Tensor | None = None,
padding_mask: torch.Tensor | None = None,
seq_lens: torch.Tensor | None = None,
**attn_kwargs: Any,
) tuple[torch.Tensor, torch.Tensor | None]#

update_moe_gate_bias() None#

init_weights(buffer_device: torch.device | None = None) None#
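
A minimal sketch of standalone use follows. BackendConfig() as a bare constructor is an assumption, the DeepseekV3Config field names follow the Hugging Face configuration, and the deliberately tiny sizes exist only to keep instantiation cheap; moe_config is optional and left at its default here.

```python
# A minimal sketch, not a definitive recipe. Config values are illustrative.
import torch
from transformers.models.deepseek_v3.configuration_deepseek_v3 import DeepseekV3Config
from nemo_automodel.components.moe.utils import BackendConfig
from nemo_automodel.components.models.deepseek_v3.model import DeepseekV3Model

config = DeepseekV3Config(
    num_hidden_layers=2,
    hidden_size=256,
    intermediate_size=512,
    moe_intermediate_size=128,
    n_routed_experts=8,
    num_experts_per_tok=2,
    n_group=1,
    topk_group=1,
    first_k_dense_replace=0,  # make every layer an MoE layer in this toy setup
    vocab_size=1024,
)
model = DeepseekV3Model(config, BackendConfig())  # BackendConfig(): assumption
model.init_weights(buffer_device=torch.device("cpu"))

input_ids = torch.randint(0, config.vocab_size, (2, 16))
hidden, aux_loss = model(input_ids)  # tuple: final hidden states, optional aux loss
```
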
class nemo_automodel.components.models.deepseek_v3.model.DeepseekV3ForCausalLM(
config: transformers.models.deepseek_v3.configuration_deepseek_v3.DeepseekV3Config,
moe_config: nemo_automodel.components.moe.layers.MoEConfig | None = None,
backend: nemo_automodel.components.moe.utils.BackendConfig | None = None,
)#

Bases: torch.nn.Module

Initialization

classmethod from_config(
pretrained_model_name_or_path: str | transformers.models.deepseek_v3.configuration_deepseek_v3.DeepseekV3Config,
moe_config: nemo_automodel.components.moe.layers.MoEConfig | None = None,
backend: nemo_automodel.components.moe.utils.BackendConfig | None = None,
trust_remote_code: bool = False,
**kwargs,
)#
forward(
input_ids: torch.Tensor,
*,
position_ids: torch.Tensor | None = None,
attention_mask: torch.Tensor | None = None,
padding_mask: torch.Tensor | None = None,
seq_lens: torch.Tensor | None = None,
**attn_kwargs: Any,
) torch.Tensor#

update_moe_gate_bias() None#

initialize_weights(
buffer_device: torch.device | None = None,
dtype: torch.dtype = torch.bfloat16,
) None#
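
The usual entry point is from_config, which accepts either a checkpoint id/path or an already-built DeepseekV3Config. The sketch below passes a deliberately small config and assumes from_config only materializes the architecture, with weights initialized separately via initialize_weights; note that, unlike DeepseekV3Model.forward, this forward returns logits only.

```python
# Hedged sketch. Passing a small config keeps instantiation cheap; a hub id such
# as "deepseek-ai/DeepSeek-V3" would build the full-size architecture instead.
import torch
from transformers.models.deepseek_v3.configuration_deepseek_v3 import DeepseekV3Config
from nemo_automodel.components.models.deepseek_v3.model import DeepseekV3ForCausalLM

cfg = DeepseekV3Config(
    num_hidden_layers=2, hidden_size=256, intermediate_size=512, vocab_size=1024
)
model = DeepseekV3ForCausalLM.from_config(cfg)
model.initialize_weights(buffer_device=torch.device("cpu"), dtype=torch.bfloat16)

input_ids = torch.randint(0, cfg.vocab_size, (1, 8))
logits = model(input_ids)  # [batch, seq, vocab_size]

# During MoE training, update_moe_gate_bias() can be called between optimizer
# steps to rebalance expert routing via the gate bias; the exact schedule is
# left to the training loop.
model.update_moe_gate_bias()
```
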