nemo_automodel.components.models.deepseek_v32.config#
Module Contents#
Classes#
API#
- class nemo_automodel.components.models.deepseek_v32.config.DeepseekV32Config(
- vocab_size: int = 129280,
- hidden_size: int = 7168,
- intermediate_size: int = 18432,
- moe_intermediate_size: int = 2048,
- num_hidden_layers: int = 61,
- num_attention_heads: int = 128,
- num_key_value_heads: int = 128,
- n_shared_experts: int = 1,
- n_routed_experts: int = 256,
- ep_size: int = 1,
- routed_scaling_factor: float = 2.5,
- kv_lora_rank: int = 512,
- q_lora_rank: int = 1536,
- qk_head_dim: int = 192,
- qk_nope_head_dim: int = 128,
- qk_rope_head_dim: int = 64,
- v_head_dim: int = 128,
- hidden_act: str = 'silu',
- max_position_embeddings: int = 163840,
- initializer_range: float = 0.02,
- rms_norm_eps: float = 1e-06,
- use_cache: bool = True,
- pad_token_id: int | None = None,
- bos_token_id: int = 0,
- eos_token_id: int = 1,
- pretraining_tp: int = 1,
- tie_word_embeddings: bool = False,
- rope_theta: float = 10000.0,
- rope_scaling: dict | None = None,
- attention_bias: bool = False,
- attention_dropout: float = 0.0,
- num_experts_per_tok: int = 8,
- n_group: int = 8,
- topk_group: int = 4,
- first_k_dense_replace: int = 3,
- norm_topk_prob: bool = True,
- scoring_func: str = 'sigmoid',
- aux_loss_alpha: float = 0.001,
- seq_aux: bool = True,
- index_n_heads: int = 64,
- index_head_dim: int = 128,
- index_topk: int = 2048,
- torch_dtype: str = 'bfloat16',
- **kwargs,
Bases:
transformers.PretrainedConfig

Initialization
- model_type#
'deepseek_v32'
- keys_to_ignore_at_inference#
['past_key_values']