bridge.models.kimi.kimi_provider#

Module Contents#

Classes#

KimiK2Provider

Model provider for Moonshot AI's Kimi K2 Mixture-of-Experts model (https://moonshotai.github.io/Kimi-K2/).

API#

class bridge.models.kimi.kimi_provider.KimiK2Provider#

Bases: megatron.bridge.models.transformer_config.MLATransformerConfig, megatron.bridge.models.gpt_provider.GPTModelProvider

Model provider for Moonshot AI's Kimi K2 Mixture-of-Experts model (https://moonshotai.github.io/Kimi-K2/).
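
A minimal usage sketch follows, assuming the provider can be constructed directly as a standard dataclass-style config with keyword overrides; only fields documented on this page are referenced, and the model-build/training entry points are intentionally not shown.

```python
# Sketch: construct the Kimi K2 provider config and override a few fields.
# Assumption: KimiK2Provider accepts keyword overrides for the dataclass
# fields listed on this page; all other defaults are taken as documented.
from bridge.models.kimi.kimi_provider import KimiK2Provider

# Override only the sequence-length-related fields for a longer-context run.
provider = KimiK2Provider(
    seq_length=8192,
    max_position_embeddings=8192,
)

# The documented defaults are available as ordinary attributes.
assert provider.num_layers == 61
assert provider.num_moe_experts == 384
assert provider.moe_router_topk == 8
print(provider.hidden_size, provider.ffn_hidden_size)  # 7168 18432
```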

transformer_layer_spec: Union[megatron.core.transformer.ModuleSpec, Callable[[megatron.bridge.models.gpt_provider.GPTModelProvider], megatron.core.transformer.ModuleSpec]]#

'partial(…)'

num_layers: int#

61

hidden_size: int#

7168

ffn_hidden_size: int#

18432

num_moe_experts: int#

384

moe_ffn_hidden_size: int#

2048

moe_shared_expert_intermediate_size: int#

2048

moe_layer_freq: Union[int, List[int]]#

'field(…)'

normalization: str#

'RMSNorm'

activation_func: Callable#

None

gated_linear_unit: bool#

True

position_embedding_type: str#

'rope'

add_bias_linear: bool#

False

share_embeddings_and_output_weights: bool#

False

num_attention_heads: int#

64

kv_channels: int#

64

max_position_embeddings: int#

4096

seq_length: int#

4096

rotary_base: float#

50000.0

make_vocab_size_divisible_by: int#

1280

mtp_num_layers: Optional[int]#

None

mtp_loss_scaling_factor: Optional[float]#

None

attention_dropout: float#

0.0

hidden_dropout: float#

0.0

qk_layernorm: bool#

True

moe_router_topk: int#

8

moe_router_num_groups: int#

1

moe_router_group_topk: int#

1

moe_router_topk_scaling_factor: float#

2.827

moe_aux_loss_coeff: float#

0.001

moe_router_score_function: str#

'sigmoid'

moe_router_enable_expert_bias: bool#

True

moe_router_bias_update_rate: float#

0.001

moe_grouped_gemm: bool#

True

moe_router_pre_softmax: bool#

True

moe_token_dispatcher_type: str#

'alltoall'

moe_router_load_balancing_type: str#

'seq_aux_loss'

moe_shared_expert_overlap: bool#

True

moe_router_dtype: Optional[str]#

'fp32'

multi_latent_attention: bool#

True

q_lora_rank: int#

1536

kv_lora_rank: int#

512

qk_head_dim: int#

128

qk_pos_emb_head_dim: int#

64

v_head_dim: int#

128

rotary_scaling_factor: float#

32

beta_fast: float#

1.0

beta_slow: float#

1.0

mscale: float#

1.0

mscale_all_dim: float#

1.0

init_method_std: float#

0.006

layernorm_epsilon: float#

1e-06

bf16: bool#

True

params_dtype: torch.dtype#

None

async_tensor_model_parallel_allreduce: bool#

True

attention_softmax_in_fp32: bool#

False

persist_layer_norm: bool#

True

num_layers_in_first_pipeline_stage: Optional[int]#

None

num_layers_in_last_pipeline_stage: Optional[int]#

None

account_for_embedding_in_pipeline_split: bool#

False

account_for_loss_in_pipeline_split: bool#

False

vocab_size: int#

163840

apply_rope_fusion: bool#

False

bias_activation_fusion: bool#

True

bias_dropout_fusion: bool#

True

masked_softmax_fusion: bool#

True

gradient_accumulation_fusion: bool#

True

cross_entropy_loss_fusion: bool#

True

cross_entropy_fusion_impl: str#

'te'

moe_permute_fusion: bool#

None
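
As a back-of-the-envelope check of the multi-latent-attention fields above, the sketch below derives the effective per-head query/key width, assuming the usual MLA split of each query/key head into a non-positional part and a rotary positional part; this is an illustrative calculation, not additional API.

```python
# Sanity check of the MLA head dimensions documented above.
# Assumption: each query/key head concatenates a non-positional part
# (qk_head_dim) with a rotary positional part (qk_pos_emb_head_dim).
qk_head_dim = 128          # non-positional query/key head dim
qk_pos_emb_head_dim = 64   # rotary (positional) query/key head dim
v_head_dim = 128
num_attention_heads = 64

qk_total_per_head = qk_head_dim + qk_pos_emb_head_dim  # 192
value_width = num_attention_heads * v_head_dim         # 8192
print(qk_total_per_head, value_width)
```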