nemo_automodel.components.models.deepseek_v3.layers

View as Markdown

Module Contents

Classes

| Name | Description |
| --- | --- |
| MLA | - |

Data

logger

API

class nemo_automodel.components.models.deepseek_v3.layers.MLA(
config: transformers.models.deepseek_v3.configuration_deepseek_v3.DeepseekV3Config,
backend: nemo_automodel.components.models.common.BackendConfig
)

Bases: Module

kv_a_layernorm
kv_a_proj_with_mqa
kv_b_proj
kv_lora_rank = config.kv_lora_rank
n_heads = config.num_attention_heads
o_proj
q_a_layernorm
q_a_proj
q_b_proj
q_lora_rank = config.q_lora_rank
q_proj
qk_head_dim
qk_nope_head_dim = config.qk_nope_head_dim
qk_rope_head_dim = config.qk_rope_head_dim
rope_fusion = backend.rope_fusion
softmax_scale = self.qk_head_dim ** -0.5
v_head_dim = config.v_head_dim
nemo_automodel.components.models.deepseek_v3.layers.MLA._forward_impl(
x: torch.Tensor,
freqs_cis: torch.Tensor,
attention_mask: torch.Tensor | None = None,
attn_kwargs: typing.Any = {}
)
nemo_automodel.components.models.deepseek_v3.layers.MLA.forward(
x: torch.Tensor,
freqs_cis: torch.Tensor,
attention_mask: torch.Tensor | None = None,
attn_kwargs: typing.Any = {}
)
nemo_automodel.components.models.deepseek_v3.layers.MLA.init_weights(
buffer_device: torch.device,
init_std: float = 0.02
)
nemo_automodel.components.models.deepseek_v3.layers.logger = logging.getLogger(__name__)