bridge.models.mamba.nemotron_h_provider#

Module Contents#

Classes#

NemotronHModelProvider

Configuration for Nemotron-H models.

NemotronHModel4BProvider

Configuration for a 4B parameter Nemotron-H model.

NemotronHModel8BProvider

Configuration for an 8B parameter Nemotron-H model.

NemotronHModel47BProvider

Configuration for a 47B parameter Nemotron-H model.

NemotronHModel56BProvider

Configuration for a 56B parameter Nemotron-H model.

NemotronNano9Bv2Provider

Configuration for a 9B parameter Nemotron Nano v2 model.

NemotronNano12Bv2Provider

Configuration for a 12B parameter Nemotron Nano v2 model.

Data#

API#

bridge.models.mamba.nemotron_h_provider.logger#

‘getLogger(…)’

class bridge.models.mamba.nemotron_h_provider.NemotronHModelProvider#

Bases: megatron.bridge.models.mamba.mamba_provider.MambaProvider

Configuration for Nemotron-H models.

seq_length: int#

8192

mamba_num_groups: int#

8

mamba_head_dim: int#

64

num_query_groups: int#

8

make_vocab_size_divisible_by: int#

128

activation_func: callable#

None

masked_softmax_fusion: bool#

True

apply_query_key_layer_scaling: bool#

False

persist_layer_norm: bool#

True

attention_softmax_in_fp32: bool#

False

first_last_layers_bf16: bool#

True

is_hybrid_model: bool#

True
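
The fields above are shared defaults; each size-specific provider below only overrides the architecture fields that differ. As a minimal sketch (assuming the providers behave as plain Python dataclasses, so any field documented on this page can be overridden as a keyword argument; the values below are illustrative, not recommended settings):

```python
# Sketch: override NemotronHModelProvider defaults at construction time.
# Assumption: the providers are dataclass-style configs, so every field
# documented on this page can be passed as a keyword argument.
from megatron.bridge.models.mamba.nemotron_h_provider import NemotronHModel8BProvider

provider = NemotronHModel8BProvider(
    seq_length=4096,                 # shorten the default 8192-token context
    attention_softmax_in_fp32=True,  # flip a numeric-precision default
)
print(provider.seq_length, provider.hidden_size)  # 4096 4096
```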

class bridge.models.mamba.nemotron_h_provider.NemotronHModel4BProvider#

Bases: bridge.models.mamba.nemotron_h_provider.NemotronHModelProvider

Configuration for a 4B parameter Nemotron-H model.

hybrid_override_pattern: str#

‘M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-’

num_layers: int#

52

hidden_size: int#

3072

mamba_num_heads: int#

112

kv_channels: int#

128

mamba_state_dim: int#

128

ffn_hidden_size: int#

12288

num_attention_heads: int#

32

use_mamba_mem_eff_path: bool#

False
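
The hybrid_override_pattern string encodes one character per layer, in order. Following Megatron-Core's hybrid layer allocation convention, 'M' marks a Mamba (SSM) layer, '*' an attention layer, and '-' an MLP layer, so the pattern length must equal num_layers. A small sketch that checks this for the 4B provider (same dataclass assumption as above):

```python
# Sketch: decode the per-layer pattern of the 4B provider.
# Assumed symbol meanings (Megatron-Core hybrid layer allocation):
#   'M' = Mamba (SSM) layer, '*' = self-attention layer, '-' = MLP layer.
from collections import Counter

from megatron.bridge.models.mamba.nemotron_h_provider import NemotronHModel4BProvider

provider = NemotronHModel4BProvider()
pattern = provider.hybrid_override_pattern

# One character per layer, so the pattern must cover all 52 layers.
assert len(pattern) == provider.num_layers

counts = Counter(pattern)
print(f"Mamba: {counts['M']}, attention: {counts['*']}, MLP: {counts['-']}")
```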

class bridge.models.mamba.nemotron_h_provider.NemotronHModel8BProvider#

Bases: bridge.models.mamba.nemotron_h_provider.NemotronHModelProvider

Configuration for an 8B parameter Nemotron-H model.

hybrid_override_pattern: str#

‘M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-’

num_layers: int#

52

hidden_size: int#

4096

mamba_state_dim: int#

128

ffn_hidden_size: int#

21504

num_attention_heads: int#

32

class bridge.models.mamba.nemotron_h_provider.NemotronHModel47BProvider#

Bases: bridge.models.mamba.nemotron_h_provider.NemotronHModelProvider

Configuration for a 47B parameter Nemotron-H model.

hybrid_override_pattern: str#

‘M-M-M-M-M-M-M-M-M*-M-M-M-M-M-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-M-M---MM---M-M*-M-M-M-M-M-’

num_layers: int#

98

hidden_size: int#

8192

mamba_state_dim: int#

256

ffn_hidden_size: int#

30720

num_attention_heads: int#

64

class bridge.models.mamba.nemotron_h_provider.NemotronHModel56BProvider#

Bases: bridge.models.mamba.nemotron_h_provider.NemotronHModelProvider

Configuration for a 56B parameter Nemotron-H model.

hybrid_override_pattern: str#

‘M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M…’

num_layers: int#

118

hidden_size: int#

8192

mamba_state_dim: int#

256

ffn_hidden_size: int#

32768

num_attention_heads: int#

64

class bridge.models.mamba.nemotron_h_provider.NemotronNano9Bv2Provider#

Bases: bridge.models.mamba.nemotron_h_provider.NemotronHModelProvider

Configuration for a 9B parameter Nemotron Nano v2 model.

hybrid_override_pattern: str#

‘M-M-M-MM-M-M-M*-M-M-M*-M-M-M-M*-M-M-M-M*-M-MM-M-M-M-M-M-’

num_layers: int#

56

hidden_size: int#

4480

mamba_num_heads: int#

128

kv_channels: int#

128

mamba_state_dim: int#

128

ffn_hidden_size: int#

15680

num_attention_heads: int#

40

mamba_head_dim: int#

80

class bridge.models.mamba.nemotron_h_provider.NemotronNano12Bv2Provider#

Bases: bridge.models.mamba.nemotron_h_provider.NemotronHModelProvider

Configuration for a 12B parameter Nemotron Nano v2 model.

hybrid_override_pattern: str#

‘M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M-’

num_layers: int#

62

hidden_size: int#

5120

mamba_num_heads: int#

128

kv_channels: int#

128

mamba_state_dim: int#

128

ffn_hidden_size: int#

20480

num_attention_heads: int#

40

mamba_head_dim: int#

80
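
The two Nemotron Nano v2 providers differ only in num_layers, hidden_size, ffn_hidden_size, and the hybrid pattern. A quick way to see exactly which defaults change between any two providers on this page is to diff their fields; a sketch, again assuming standard dataclasses:

```python
# Sketch: report every field whose default differs between two providers.
# Assumption: the providers are standard dataclasses, so dataclasses.fields() applies.
from dataclasses import fields

from megatron.bridge.models.mamba.nemotron_h_provider import (
    NemotronNano9Bv2Provider,
    NemotronNano12Bv2Provider,
)

small, large = NemotronNano9Bv2Provider(), NemotronNano12Bv2Provider()

for f in fields(small):
    a, b = getattr(small, f.name), getattr(large, f.name)
    if a != b:
        print(f"{f.name}: {a!r} -> {b!r}")
```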