nemo_automodel.components.models.deepseek_v32.config#

Module Contents#

Classes#

DeepseekV32Config

API#

class nemo_automodel.components.models.deepseek_v32.config.DeepseekV32Config(
vocab_size: int = 129280,
hidden_size: int = 7168,
intermediate_size: int = 18432,
moe_intermediate_size: int = 2048,
num_hidden_layers: int = 61,
num_attention_heads: int = 128,
num_key_value_heads: int = 128,
n_shared_experts: int = 1,
n_routed_experts: int = 256,
ep_size: int = 1,
routed_scaling_factor: float = 2.5,
kv_lora_rank: int = 512,
q_lora_rank: int = 1536,
qk_head_dim: int = 192,
qk_nope_head_dim: int = 128,
qk_rope_head_dim: int = 64,
v_head_dim: int = 128,
hidden_act: str = 'silu',
max_position_embeddings: int = 163840,
initializer_range: float = 0.02,
rms_norm_eps: float = 1e-06,
use_cache: bool = True,
pad_token_id: int | None = None,
bos_token_id: int = 0,
eos_token_id: int = 1,
pretraining_tp: int = 1,
tie_word_embeddings: bool = False,
rope_theta: float = 10000.0,
rope_scaling: dict | None = None,
attention_bias: bool = False,
attention_dropout: float = 0.0,
num_experts_per_tok: int = 8,
n_group: int = 8,
topk_group: int = 4,
first_k_dense_replace: int = 3,
norm_topk_prob: bool = True,
scoring_func: str = 'sigmoid',
aux_loss_alpha: float = 0.001,
seq_aux: bool = True,
index_n_heads: int = 64,
index_head_dim: int = 128,
index_topk: int = 2048,
torch_dtype: str = 'bfloat16',
**kwargs,
)#

Bases: transformers.PretrainedConfig

Initialization

model_type#

'deepseek_v32'

keys_to_ignore_at_inference#

['past_key_values']
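
A minimal usage sketch, based only on the signature above (the override values and save path are illustrative, not part of the API). Since the class subclasses transformers.PretrainedConfig, it inherits standard (de)serialization; note also that the default head dimensions satisfy qk_head_dim = qk_nope_head_dim + qk_rope_head_dim (192 = 128 + 64), the split between the non-positional and RoPE components of the query/key heads.

```python
from nemo_automodel.components.models.deepseek_v32.config import DeepseekV32Config

# Default configuration (DeepSeek-V3.2 sizes: 61 layers, 7168 hidden, 256 routed experts).
config = DeepseekV32Config()
assert config.model_type == "deepseek_v32"
assert config.qk_head_dim == config.qk_nope_head_dim + config.qk_rope_head_dim  # 192 = 128 + 64

# Override hyperparameters for a smaller debug-sized model; any extra kwargs
# are forwarded to transformers.PretrainedConfig via **kwargs.
small = DeepseekV32Config(
    hidden_size=1024,
    num_hidden_layers=4,
    n_routed_experts=16,
    num_experts_per_tok=2,
)

# Serialization comes from the PretrainedConfig base class.
small.save_pretrained("./deepseek_v32_small")  # writes config.json (illustrative path)
reloaded = DeepseekV32Config.from_pretrained("./deepseek_v32_small")
assert reloaded.num_hidden_layers == 4
```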