bridge.models.deepseek.deepseek_provider#

Module Contents#

Classes#

DeepSeekModelProvider

Deprecated alias for MLAModelProvider.

DeepSeekV2ModelProvider

DeepSeek-V2 Model: https://github.com/deepseek-ai/DeepSeek-V2

DeepSeekV2LiteModelProvider

DeepSeek-V2-Lite Model: https://github.com/deepseek-ai/DeepSeek-V2 HuggingFace: https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite

DeepSeekV3ModelProvider

DeepSeek-V3 Model: https://github.com/deepseek-ai/DeepSeek-V3

MoonlightModelProvider16B

Moonlight-16B-A3B Model: https://github.com/moonshotai/Moonlight-16B-A3B

Functions#

Data#

API#

bridge.models.deepseek.deepseek_provider._warn_deprecated(
old_cls: str,
new_cls: str = 'MLAModelProvider',
) -> None#
class bridge.models.deepseek.deepseek_provider.DeepSeekModelProvider#

Bases: megatron.bridge.models.mla_provider.MLAModelProvider

Deprecated alias for MLAModelProvider.

Deprecated: This alias remains for backward compatibility and will be removed in a future release. Use MLAModelProvider instead.

normalization: str#

‘RMSNorm’

activation_func: Callable#

None

gated_linear_unit: bool#

True

position_embedding_type: str#

‘rope’

add_bias_linear: bool#

False

share_embeddings_and_output_weights: bool#

False

qk_layernorm: bool#

True

bf16: bool#

True

params_dtype: torch.dtype#

None

moe_grouped_gemm: bool#

True

moe_token_dispatcher_type: str#

‘alltoall’

q_lora_rank: Optional[int]#

1536

kv_lora_rank: int#

512

__post_init__() -> None#
class bridge.models.deepseek.deepseek_provider.DeepSeekV2ModelProvider#

Bases: megatron.bridge.models.mla_provider.MLAModelProvider

DeepSeek-V2 Model: https://github.com/deepseek-ai/DeepSeek-V2

num_layers: int#

60

hidden_size: int#

5120

ffn_hidden_size: int#

12288

num_moe_experts: int#

160

moe_ffn_hidden_size: int#

1536

moe_shared_expert_intermediate_size: int#

3072

moe_layer_freq: Union[int, List[int]]#

‘field(…)’

moe_router_topk: int#

6

moe_router_num_groups: int#

8

moe_router_group_topk: int#

3

moe_router_topk_scaling_factor: float#

16.0

moe_aux_loss_coeff: float#

0.001

mscale: float#

0.707

mscale_all_dim: float#

0.707

vocab_size: int#

102400

__post_init__() -> None#
class bridge.models.deepseek.deepseek_provider.DeepSeekV2LiteModelProvider#

Bases: megatron.bridge.models.mla_provider.MLAModelProvider

DeepSeek-V2-Lite Model: https://github.com/deepseek-ai/DeepSeek-V2 HuggingFace: https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite

num_layers: int#

27

hidden_size: int#

2048

ffn_hidden_size: int#

10944

num_attention_heads: int#

16

kv_channels: int#

16

q_lora_rank: Optional[int]#

None

num_moe_experts: int#

64

moe_ffn_hidden_size: int#

1408

moe_shared_expert_intermediate_size: int#

2816

moe_layer_freq: Union[int, List[int]]#

‘field(…)’

moe_router_topk: int#

6

moe_router_num_groups: int#

1

moe_router_group_topk: int#

1

moe_router_topk_scaling_factor: float#

1.0

mscale: float#

0.707

mscale_all_dim: float#

0.707

vocab_size: int#

102400

__post_init__() -> None#
class bridge.models.deepseek.deepseek_provider.DeepSeekV3ModelProvider#

Bases: megatron.bridge.models.mla_provider.MLAModelProvider

DeepSeek-V3 Model: https://github.com/deepseek-ai/DeepSeek-V3

num_layers: int#

61

hidden_size: int#

7168

ffn_hidden_size: int#

18432

kv_channels: int#

128

num_moe_experts: int#

256

moe_ffn_hidden_size: int#

2048

moe_shared_expert_intermediate_size: int#

2048

moe_layer_freq: Union[int, List[int]]#

‘field(…)’

moe_router_topk: int#

8

moe_router_num_groups: int#

8

moe_router_group_topk: int#

4

moe_router_topk_scaling_factor: float#

2.5

moe_aux_loss_coeff: float#

0.0001

make_vocab_size_divisible_by: int#

1280

moe_router_score_function: str#

‘sigmoid’

moe_router_enable_expert_bias: bool#

True

moe_router_bias_update_rate: float#

0.001

mscale: float#

1.0

mscale_all_dim: float#

1.0

vocab_size: int#

129280

__post_init__() -> None#
class bridge.models.deepseek.deepseek_provider.MoonlightModelProvider16B#

Bases: megatron.bridge.models.mla_provider.MLAModelProvider

Moonlight-16B-A3B Model: https://github.com/moonshotai/Moonlight-16B-A3B

Moonlight is based on DeepSeek-V3.

max_position_embeddings: int#

4096

num_layers: int#

27

hidden_size: int#

2048

ffn_hidden_size: int#

11264

num_attention_heads: int#

16

kv_channels: int#

16

num_moe_experts: int#

64

moe_ffn_hidden_size: int#

1408

moe_shared_expert_intermediate_size: int#

2816

moe_layer_freq: Union[int, List[int]]#

‘field(…)’

moe_router_topk: int#

6

moe_router_num_groups: int#

1

moe_router_group_topk: int#

1

moe_router_topk_scaling_factor: float#

2.446

moe_aux_loss_coeff: float#

0.001

make_vocab_size_divisible_by: int#

1280

moe_router_score_function: str#

‘sigmoid’

moe_router_enable_expert_bias: bool#

True

rotary_scaling_factor: float#

1.0

mscale: float#

1.0

mscale_all_dim: float#

1.0

rotary_base: float#

50000

layernorm_epsilon: float#

1e-05

q_lora_rank: Optional[int]#

None

init_method_std: float#

0.02

moe_router_bias_update_rate: float#

0.001

rotary_percent: float#

1.0

vocab_size: int#

163842

__post_init__() -> None#
bridge.models.deepseek.deepseek_provider.DeepSeekProvider#

None

bridge.models.deepseek.deepseek_provider.DeepSeekV2Provider#

None

bridge.models.deepseek.deepseek_provider.DeepSeekV2LiteProvider#

None

bridge.models.deepseek.deepseek_provider.DeepSeekV3Provider#

None

bridge.models.deepseek.deepseek_provider.MoonlightProvider#

None