nemo_automodel.components.models.deepseek_v32.config
nemo_automodel.components.models.deepseek_v32.config
Module Contents
Classes
| Name | Description |
|---|---|
| DeepseekV32Config | - |
API
class nemo_automodel.components.models.deepseek_v32.config.DeepseekV32Config( vocab_size: int = 129280, hidden_size: int = 7168, intermediate_size: int = 18432, moe_intermediate_size: int = 2048, num_hidden_layers: int = 61, num_attention_heads: int = 128, num_key_value_heads: int = 128, n_shared_experts: int = 1, n_routed_experts: int = 256, ep_size: int = 1, routed_scaling_factor: float = 2.5, kv_lora_rank: int = 512, q_lora_rank: int = 1536, qk_head_dim: int = 192, qk_nope_head_dim: int = 128, qk_rope_head_dim: int = 64, v_head_dim: int = 128, hidden_act: str = 'silu', max_position_embeddings: int = 163840, initializer_range: float = 0.02, rms_norm_eps: float = 1e-06, use_cache: bool = True, pad_token_id: int | None = None, bos_token_id: int = 0, eos_token_id: int = 1, pretraining_tp: int = 1, tie_word_embeddings: bool = False, rope_theta: float = 10000.0, rope_scaling: dict | None = None, attention_bias: bool = False, attention_dropout: float = 0.0, num_experts_per_tok: int = 8, n_group: int = 8, topk_group: int = 4, first_k_dense_replace: int = 3, norm_topk_prob: bool = True, scoring_func: str = 'sigmoid', aux_loss_alpha: float = 0.001, seq_aux: bool = True, index_n_heads: int = 64, index_head_dim: int = 128, index_topk: int = 2048, torch_dtype: str = 'bfloat16', kwargs = {} )
Bases: PretrainedConfig
keys_to_ignore_at_inference = ['past_key_values']
model_type = 'deepseek_v32'