bridge.models.sarvam.sarvam_provider#

Module Contents#

Classes#

SarvamMoEModelProvider: Sarvam 30B model provider.

SarvamMLAModelProvider: Sarvam 105B model provider.

Data#

API#

bridge.models.sarvam.sarvam_provider.logger#

‘getLogger(…)’

class bridge.models.sarvam.sarvam_provider.SarvamMoEModelProvider#

Bases: megatron.bridge.models.gpt_provider.GPTModelProvider

Sarvam 30B model provider.

transformer_layer_spec: Union[megatron.core.transformer.ModuleSpec, Callable[[megatron.bridge.models.gpt_provider.GPTModelProvider], megatron.core.transformer.ModuleSpec]]#

‘partial(…)’

normalization: str#

‘RMSNorm’

activation_func: Callable#

None

gated_linear_unit: bool#

True

position_embedding_type: str#

‘rope’

add_bias_linear: bool#

False

share_embeddings_and_output_weights: bool#

False

make_vocab_size_divisible_by: int#

128

add_qkv_bias: bool#

False

qk_layernorm: bool#

True

init_method_std: float#

0.006

hidden_dropout: float#

0.0

attention_dropout: float#

0.0

layernorm_epsilon: float#

1e-06

moe_aux_loss_coeff: float#

0

moe_router_pre_softmax: bool#

True

moe_router_enable_expert_bias: bool#

True

moe_router_bias_update_rate: float#

0.001

moe_grouped_gemm: bool#

True

moe_permute_fusion: bool#

True

moe_router_topk_scaling_factor: float#

2.5

moe_shared_expert_overlap: bool#

False

moe_router_dtype: Optional[str]#

‘fp32’

moe_router_score_function: str#

‘sigmoid’

moe_token_dispatcher_type: str#

‘alltoall’

attention_softmax_in_fp32: bool#

True

persist_layer_norm: bool#

True

cross_entropy_fusion_impl: str#

‘te’

cp_comm_type: str#

‘p2p’

recompute_granularity: str#

‘selective’

recompute_modules: List[str]#

‘field(…)’

kv_channels: Optional[int]#

64

seq_length: int#

131072

rotary_base: float#

8000000.0

vocab_size: int#

262144

num_moe_experts: int#

128

moe_router_topk: int#

6

num_layers: int#

19

hidden_size: int#

4096

num_attention_heads: int#

64

ffn_hidden_size: int#

8192

moe_ffn_hidden_size: int#

1024

moe_shared_expert_intermediate_size: int#

1024

moe_layer_freq: Union[int, List[int]]#

‘field(…)’

bf16: bool#

True

num_query_groups: int#

4
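
The provider is a dataclass-style configuration whose defaults are the values listed above. As a minimal usage sketch (not taken from the library's documentation), any of these fields can be overridden at construction time; the import path and the exact constructor keywords accepted depend on the installed Megatron Bridge version, so treat them as assumptions.

```python
# Hypothetical sketch: construct the Sarvam 30B provider and override a few
# documented fields. The import path below assumes the module is packaged under
# the "megatron.bridge" namespace, as the base-class references above suggest.
from megatron.bridge.models.sarvam.sarvam_provider import SarvamMoEModelProvider

provider = SarvamMoEModelProvider(
    seq_length=8192,                     # shorter context than the 131072 default
    recompute_granularity="selective",   # matches the documented default
)

# Fields not overridden keep the documented defaults.
assert provider.num_moe_experts == 128
assert provider.moe_router_topk == 6
assert provider.normalization == "RMSNorm"
```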

class bridge.models.sarvam.sarvam_provider.SarvamMLAModelProvider#

Bases: megatron.bridge.models.transformer_config.MLATransformerConfig, megatron.bridge.models.gpt_provider.GPTModelProvider

Sarvam 105B model provider.

transformer_layer_spec: Union[megatron.core.transformer.ModuleSpec, Callable[[megatron.bridge.models.gpt_provider.GPTModelProvider], megatron.core.transformer.ModuleSpec]]#

‘partial(…)’

normalization: str#

‘RMSNorm’

activation_func: Callable#

None

gated_linear_unit: bool#

True

position_embedding_type: str#

‘rope’

add_bias_linear: bool#

False

share_embeddings_and_output_weights: bool#

False

make_vocab_size_divisible_by: int#

128

add_qkv_bias: bool#

False

qk_layernorm: bool#

True

init_method_std: float#

0.006

hidden_dropout: float#

0.0

attention_dropout: float#

0.0

layernorm_epsilon: float#

1e-06

moe_aux_loss_coeff: float#

0

moe_router_pre_softmax: bool#

True

moe_router_enable_expert_bias: bool#

True

moe_router_bias_update_rate: float#

0.001

moe_grouped_gemm: bool#

True

moe_permute_fusion: bool#

True

moe_router_topk_scaling_factor: float#

2.5

moe_shared_expert_overlap: bool#

False

moe_router_dtype: Optional[str]#

‘fp32’

moe_router_score_function: str#

‘sigmoid’

moe_token_dispatcher_type: str#

‘alltoall’

attention_softmax_in_fp32: bool#

True

persist_layer_norm: bool#

True

cross_entropy_fusion_impl: str#

‘te’

cp_comm_type: str#

‘p2p’

recompute_granularity: str#

‘selective’

recompute_modules: List[str]#

‘field(…)’

multi_latent_attention: bool#

True

rope_type: str#

‘yarn’

rotary_scaling_factor: float#

40

original_max_position_embeddings: int#

4096

beta_fast: float#

32.0

beta_slow: float#

1.0

mscale: float#

1.0

mscale_all_dim: float#

1.0

kv_channels: Optional[int]#

64

seq_length: int#

131072

rotary_base: float#

10000.0

vocab_size: int#

262144

num_moe_experts: int#

128

moe_router_topk: int#

8

num_layers: int#

32

hidden_size: int#

4096

num_attention_heads: int#

64

ffn_hidden_size: int#

16384

moe_ffn_hidden_size: int#

2048

moe_shared_expert_intermediate_size: int#

2048

moe_layer_freq: Union[int, List[int]]#

‘field(…)’

bf16: bool#

True

q_lora_rank: Optional[int]#

None

kv_lora_rank: int#

512

qk_head_dim: int#

128

qk_pos_emb_head_dim: int#

64

v_head_dim: int#

128
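
Because SarvamMLAModelProvider mixes MLATransformerConfig into GPTModelProvider, the MoE routing fields and the multi-latent-attention geometry live on one config object. The sketch below illustrates this under the same assumptions as above (import path and constructor keywords depend on the installed version); the parallelism fields shown are assumed to be inherited from the base Megatron config rather than documented on this page.

```python
# Hypothetical sketch: construct the Sarvam 105B provider and inspect the
# MLA-specific fields listed above. Parallelism kwargs are assumptions taken
# from the Megatron-Core base config, not from this module's documentation.
from megatron.bridge.models.sarvam.sarvam_provider import SarvamMLAModelProvider

provider = SarvamMLAModelProvider(
    tensor_model_parallel_size=8,     # assumed knob from the inherited base config
    pipeline_model_parallel_size=4,   # assumed knob from the inherited base config
)

# MLA geometry from the documented defaults.
print(provider.multi_latent_attention)                                   # True
print(provider.kv_lora_rank, provider.qk_head_dim, provider.v_head_dim)  # 512 128 128
print(provider.rope_type, provider.rotary_scaling_factor)                # 'yarn' 40
```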