bridge.models.mamba.nemotron_h_provider#
Module Contents#
Classes#
| Class | Description |
|---|---|
| NemotronHModelProvider | Configuration for Nemotron-H models. |
| NemotronHModel4BProvider | Configuration for a 4B parameter Nemotron-H model. |
| NemotronHModel8BProvider | Configuration for an 8B parameter Nemotron-H model. |
| NemotronHModel47BProvider | Configuration for a 47B parameter Nemotron-H model. |
| NemotronHModel56BProvider | Configuration for a 56B parameter Nemotron-H model. |
| NemotronNano9Bv2Provider | Configuration for a 9B parameter Nemotron Nano v2 model. |
| NemotronNano12Bv2Provider | Configuration for a 12B parameter Nemotron Nano v2 model. |
Data#
API#
- bridge.models.mamba.nemotron_h_provider.logger#
'getLogger(...)'
- class bridge.models.mamba.nemotron_h_provider.NemotronHModelProvider#
Bases:
megatron.bridge.models.mamba.mamba_provider.MambaProvider
Configuration for Nemotron-H models.
- seq_length: int#
8192
- mamba_num_groups: int#
8
- mamba_head_dim: int#
64
- num_query_groups: int#
8
- make_vocab_size_divisible_by: int#
128
- activation_func: callable#
None
- masked_softmax_fusion: bool#
True
- apply_query_key_layer_scaling: bool#
False
- persist_layer_norm: bool#
True
- attention_softmax_in_fp32: bool#
False
- first_last_layers_bf16: bool#
True
- is_hybrid_model: bool#
True
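Subclasses below override these defaults per model size. As a minimal usage sketch (assuming the provider is a dataclass whose listed fields all carry defaults, so it can be constructed without arguments):

```python
from megatron.bridge.models.mamba.nemotron_h_provider import NemotronHModelProvider

# Construct with the documented defaults.
provider = NemotronHModelProvider()
print(provider.seq_length)       # 8192
print(provider.is_hybrid_model)  # True

# Any field can be overridden per run, e.g. a longer context window:
long_ctx = NemotronHModelProvider(seq_length=16384)
```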
- class bridge.models.mamba.nemotron_h_provider.NemotronHModel4BProvider#
Bases:
bridge.models.mamba.nemotron_h_provider.NemotronHModelProvider
Configuration for a 4B parameter Nemotron-H model.
- hybrid_override_pattern: str#
'M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-'
- num_layers: int#
52
- hidden_size: int#
3072
- mamba_num_heads: int#
112
- kv_channels: int#
128
- mamba_state_dim: int#
128
- ffn_hidden_size: int#
12288
- num_attention_heads: int#
32
- use_mamba_mem_eff_path: bool#
False
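A hedged sketch of instantiating the 4B configuration and checking its documented defaults (field names taken from this page):

```python
from megatron.bridge.models.mamba.nemotron_h_provider import NemotronHModel4BProvider

provider = NemotronHModel4BProvider()

# Defaults documented above.
assert provider.num_layers == 52
assert provider.hidden_size == 3072
assert provider.use_mamba_mem_eff_path is False  # 4B opts out of the mem-efficient Mamba path

# The hybrid pattern carries one symbol per layer.
assert len(provider.hybrid_override_pattern) == provider.num_layers
```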
- class bridge.models.mamba.nemotron_h_provider.NemotronHModel8BProvider#
Bases:
bridge.models.mamba.nemotron_h_provider.NemotronHModelProvider
Configuration for an 8B parameter Nemotron-H model.
- hybrid_override_pattern: str#
'M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-'
- num_layers: int#
52
- hidden_size: int#
4096
- mamba_state_dim: int#
128
- ffn_hidden_size: int#
21504
- num_attention_heads: int#
32
- class bridge.models.mamba.nemotron_h_provider.NemotronHModel47BProvider#
Bases:
bridge.models.mamba.nemotron_h_provider.NemotronHModelProvider
Configuration for a 47B parameter Nemotron-H model.
- hybrid_override_pattern: str#
'M-M-M-M-M-M-M-M-M*-M-M-M-M-M-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-M-M---MM---M-M*-M-M-M-M-M-'
- num_layers: int#
98
- hidden_size: int#
8192
- mamba_state_dim: int#
256
- ffn_hidden_size: int#
30720
- num_attention_heads: int#
64
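In Megatron's hybrid-model convention, each character of `hybrid_override_pattern` describes one layer: `M` for a Mamba mixer, `*` for self-attention, and `-` for an MLP block. A standalone sanity check of the 47B pattern against its `num_layers` (pure Python; no package required):

```python
from collections import Counter

# 47B pattern from the class above: one symbol per layer.
pattern = (
    "M-M-M-M-M-M-M-M-M*-M-M-M-M-M-M-M-M-M-M*-M-M-M-M-M*"
    "-M-M-M-M-M*-M-M-M-M-M-M-M---MM---M-M*-M-M-M-M-M-"
)
num_layers = 98

assert len(pattern) == num_layers

print(Counter(pattern))
# Counter({'-': 48, 'M': 45, '*': 5}) -> 45 Mamba, 5 attention, 48 MLP layers
```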
- class bridge.models.mamba.nemotron_h_provider.NemotronHModel56BProvider#
Bases:
bridge.models.mamba.nemotron_h_provider.NemotronHModelProvider
Configuration for a 56B parameter Nemotron-H model.
- hybrid_override_pattern: str#
'M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M...'
- num_layers: int#
118
- hidden_size: int#
8192
- mamba_state_dim: int#
256
- ffn_hidden_size: int#
32768
- num_attention_heads: int#
64
- class bridge.models.mamba.nemotron_h_provider.NemotronNano9Bv2Provider#
Bases:
bridge.models.mamba.nemotron_h_provider.NemotronHModelProvider
Configuration for a 9B parameter Nemotron Nano v2 model.
- hybrid_override_pattern: str#
'M-M-M-MM-M-M-M*-M-M-M*-M-M-M-M*-M-M-M-M*-M-MM-M-M-M-M-M-'
- num_layers: int#
56
- hidden_size: int#
4480
- mamba_num_heads: int#
128
- kv_channels: int#
128
- mamba_state_dim: int#
128
- ffn_hidden_size: int#
15680
- num_attention_heads: int#
40
- mamba_head_dim: int#
80
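Note how the attention geometry is pinned explicitly here: with `hidden_size` 4480 and 40 heads, the naive per-head size would be 112, so `kv_channels` is set to 128 by hand, and the inherited `num_query_groups` of 8 makes this a grouped-query layout. Illustrative arithmetic only:

```python
# Grouped-query attention geometry of the 9B Nano v2 config.
hidden_size = 4480
num_attention_heads = 40
num_query_groups = 8   # inherited from NemotronHModelProvider
kv_channels = 128      # set explicitly: hidden_size / heads would give 112

print(hidden_size // num_attention_heads)       # 112 (why kv_channels is explicit)
print(num_attention_heads // num_query_groups)  # 5 query heads share each KV group
```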
- class bridge.models.mamba.nemotron_h_provider.NemotronNano12Bv2Provider#
Bases:
bridge.models.mamba.nemotron_h_provider.NemotronHModelProvider
Configuration for a 12B parameter Nemotron Nano v2 model.
- hybrid_override_pattern: str#
'M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M-'
- num_layers: int#
62
- hidden_size: int#
5120
- mamba_num_heads: int#
128
- kv_channels: int#
128
- mamba_state_dim: int#
128
- ffn_hidden_size: int#
20480
- num_attention_heads: int#
40
- mamba_head_dim: int#
80
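All of these classes are interchangeable configurations, so selecting one by size reduces to picking a class. A hypothetical helper (the `nano_v2_provider` function below is illustrative, not part of this module):

```python
from megatron.bridge.models.mamba.nemotron_h_provider import (
    NemotronNano9Bv2Provider,
    NemotronNano12Bv2Provider,
)

def nano_v2_provider(size: str):
    """Illustrative helper: map a parameter budget to a Nano v2 provider."""
    providers = {"9b": NemotronNano9Bv2Provider, "12b": NemotronNano12Bv2Provider}
    return providers[size.lower()]()

provider = nano_v2_provider("12b")
assert provider.num_layers == 62 and provider.hidden_size == 5120
```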