bridge.models.sarvam.sarvam_provider#
Module Contents#
Classes#
| Class | Description |
|---|---|
| SarvamMoEModelProvider | Sarvam 30B model provider. |
| SarvamMLAModelProvider | Sarvam 105B model provider. |
Data#
API#
- bridge.models.sarvam.sarvam_provider.logger#
‘getLogger(…)’
- class bridge.models.sarvam.sarvam_provider.SarvamMoEModelProvider#
Bases: megatron.bridge.models.gpt_provider.GPTModelProvider

Sarvam 30B model provider.
- transformer_layer_spec: Union[megatron.core.transformer.ModuleSpec, Callable[[megatron.bridge.models.gpt_provider.GPTModelProvider], megatron.core.transformer.ModuleSpec]]#
‘partial(…)’
- normalization: str#
‘RMSNorm’
- activation_func: Callable#
None
- gated_linear_unit: bool#
True
- position_embedding_type: str#
‘rope’
- add_bias_linear: bool#
False
- share_embeddings_and_output_weights: bool#
False
- make_vocab_size_divisible_by: int#
128
- add_qkv_bias: bool#
False
- qk_layernorm: bool#
True
- init_method_std: float#
0.006
- hidden_dropout: float#
0.0
- attention_dropout: float#
0.0
- layernorm_epsilon: float#
1e-06
- moe_aux_loss_coeff: float#
0
- moe_router_pre_softmax: bool#
True
- moe_router_enable_expert_bias: bool#
True
- moe_router_bias_update_rate: float#
0.001
- moe_grouped_gemm: bool#
True
- moe_permute_fusion: bool#
True
- moe_router_topk_scaling_factor: float#
2.5
False
- moe_router_dtype: Optional[str]#
‘fp32’
- moe_router_score_function: str#
‘sigmoid’
- moe_token_dispatcher_type: str#
‘alltoall’
- attention_softmax_in_fp32: bool#
True
- persist_layer_norm: bool#
True
- cross_entropy_fusion_impl: str#
‘te’
- cp_comm_type: str#
‘p2p’
- recompute_granularity: str#
‘selective’
- recompute_modules: List[str]#
‘field(…)’
- kv_channels: Optional[int]#
64
- seq_length: int#
131072
- rotary_base: float#
8000000.0
- vocab_size: int#
262144
- num_moe_experts: int#
128
- moe_router_topk: int#
6
- num_layers: int#
19
- hidden_size: int#
4096
- num_attention_heads: int#
64
- ffn_hidden_size: int#
8192
- moe_ffn_hidden_size: int#
1024
- moe_shared_expert_intermediate_size: int#
1024
- moe_layer_freq: Union[int, List[int]]#
‘field(…)’
- bf16: bool#
True
- num_query_groups: int#
4
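The snippet below is a minimal usage sketch for the attributes listed above. It assumes the provider behaves as a plain dataclass whose documented defaults can be overridden at construction time, and that the import path mirrors the fully qualified module name shown in the base-class references; the downstream call that turns the provider into a model is intentionally not shown.

```python
from megatron.bridge.models.sarvam.sarvam_provider import SarvamMoEModelProvider

# Minimal sketch (assumed dataclass-style construction): start from the
# documented Sarvam 30B defaults and override a couple of fields, e.g. for a
# shorter-context run. Only fields documented on this page are touched.
provider = SarvamMoEModelProvider(
    seq_length=8192,  # documented default is 131072
    bf16=True,        # matches the documented default
)

# The defaults listed above are available as ordinary attributes.
print(provider.num_moe_experts)            # 128
print(provider.moe_router_topk)            # 6
print(provider.moe_router_score_function)  # 'sigmoid'
```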
- class bridge.models.sarvam.sarvam_provider.SarvamMLAModelProvider#
Bases: megatron.bridge.models.transformer_config.MLATransformerConfig, megatron.bridge.models.gpt_provider.GPTModelProvider

Sarvam 105B model provider.
- transformer_layer_spec: Union[megatron.core.transformer.ModuleSpec, Callable[[megatron.bridge.models.gpt_provider.GPTModelProvider], megatron.core.transformer.ModuleSpec]]#
‘partial(…)’
- normalization: str#
‘RMSNorm’
- activation_func: Callable#
None
- gated_linear_unit: bool#
True
- position_embedding_type: str#
‘rope’
- add_bias_linear: bool#
False
- share_embeddings_and_output_weights: bool#
False
- make_vocab_size_divisible_by: int#
128
- add_qkv_bias: bool#
False
- qk_layernorm: bool#
True
- init_method_std: float#
0.006
- hidden_dropout: float#
0.0
- attention_dropout: float#
0.0
- layernorm_epsilon: float#
1e-06
- moe_aux_loss_coeff: float#
0
- moe_router_pre_softmax: bool#
True
- moe_router_enable_expert_bias: bool#
True
- moe_router_bias_update_rate: float#
0.001
- moe_grouped_gemm: bool#
True
- moe_permute_fusion: bool#
True
- moe_router_topk_scaling_factor: float#
2.5
False
- moe_router_dtype: Optional[str]#
‘fp32’
- moe_router_score_function: str#
‘sigmoid’
- moe_token_dispatcher_type: str#
‘alltoall’
- attention_softmax_in_fp32: bool#
True
- persist_layer_norm: bool#
True
- cross_entropy_fusion_impl: str#
‘te’
- cp_comm_type: str#
‘p2p’
- recompute_granularity: str#
‘selective’
- recompute_modules: List[str]#
‘field(…)’
- multi_latent_attention: bool#
True
- rope_type: str#
‘yarn’
- rotary_scaling_factor: float#
40
- original_max_position_embeddings: int#
4096
- beta_fast: float#
32.0
- beta_slow: float#
1.0
- mscale: float#
1.0
- mscale_all_dim: float#
1.0
- kv_channels: Optional[int]#
64
- seq_length: int#
131072
- rotary_base: float#
10000.0
- vocab_size: int#
262144
- num_moe_experts: int#
128
- moe_router_topk: int#
8
- num_layers: int#
32
- hidden_size: int#
4096
- num_attention_heads: int#
64
- ffn_hidden_size: int#
16384
- moe_ffn_hidden_size: int#
2048
- moe_shared_expert_intermediate_size: int#
2048
- moe_layer_freq: Union[int, List[int]]#
‘field(…)’
- bf16: bool#
True
- q_lora_rank: Optional[int]#
None
- kv_lora_rank: int#
512
- qk_head_dim: int#
128
- qk_pos_emb_head_dim: int#
64
- v_head_dim: int#
128
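As a companion to the attribute list above, here is a hedged sketch of deriving a variant configuration. It assumes the provider is a standard dataclass, so `dataclasses.replace` applies, and it only reads or overrides fields documented on this page, including the YaRN RoPE and multi-latent-attention settings specific to this class.

```python
from dataclasses import replace

from megatron.bridge.models.sarvam.sarvam_provider import SarvamMLAModelProvider

# Sketch (assumed dataclass semantics): build the Sarvam 105B MLA config with
# its documented defaults, then derive a shorter-context variant. The YaRN
# RoPE fields govern long-context extrapolation for MLA attention.
base = SarvamMLAModelProvider()
short_ctx = replace(base, seq_length=16384)  # documented default is 131072

print(base.multi_latent_attention)                 # True
print(base.rope_type, base.rotary_scaling_factor)  # 'yarn' 40
print(base.kv_lora_rank, base.v_head_dim)          # 512 128
print(short_ctx.seq_length)                        # 16384
```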