bridge.models.nemotron.nemotron_provider#

Module Contents#

Classes#

NemotronModelProvider

Configuration class for Nemotron models.

Nemotron3ModelProvider4B

Configuration class for the Nemotron3 4B model, inheriting from NemotronModelProvider.

Nemotron3ModelProvider8B

Configuration class for the Nemotron3 8B model, inheriting from NemotronModelProvider.

Nemotron3ModelProvider22B

Configuration class for the Nemotron3 22B model, inheriting from NemotronModelProvider.

Nemotron4ModelProvider15B

Configuration class for the Nemotron4 15B model, inheriting from NemotronModelProvider.

Nemotron4ModelProvider340B

Configuration class for the Nemotron4 340B model, inheriting from NemotronModelProvider.

Functions#

squared_relu

Squared ReLU activation function.

Data#

API#

bridge.models.nemotron.nemotron_provider.logger#

'getLogger(...)'

bridge.models.nemotron.nemotron_provider.squared_relu(x)#

Squared ReLU activation function: squared_relu(x) = relu(x) ** 2.

class bridge.models.nemotron.nemotron_provider.NemotronModelProvider#

Bases: megatron.bridge.models.gpt_provider.GPTModelProvider

Configuration class for Nemotron models.

normalization: str#

'LayerNorm'

activation_func: Callable#

None

position_embedding_type: str#

'rope'

share_embeddings_and_output_weights: bool#

False

add_bias_linear: bool#

False

hidden_dropout: float#

0.0

attention_dropout: float#

0.0

rotary_percent: float#

0.5

masked_softmax_fusion: bool#

'field(...)'

persist_layer_norm: bool#

True

bias_dropout_add_fusion: bool#

False

layernorm_zero_centered_gamma: bool#

True

cross_entropy_loss_fusion: bool#

True

apply_rope_fusion: bool#

'field(...)'

num_layers: int#

32

seq_length: int#

4096

hidden_size: int#

3072

ffn_hidden_size: int#

9216

num_attention_heads: int#

24

num_query_groups: Optional[int]#

8

kv_channels: Optional[int]#

128

init_method_std: float#

0.0134

bf16: bool#

True

fp16: bool#

False

params_dtype: torch.dtype#

None

autocast_dtype: torch.dtype#

None

class bridge.models.nemotron.nemotron_provider.Nemotron3ModelProvider4B#

Bases: bridge.models.nemotron.nemotron_provider.NemotronModelProvider

Configuration class for the Nemotron3 4B model, inheriting from NemotronModelProvider.

num_layers: int#

32

seq_length: int#

4096

hidden_size: int#

3072

ffn_hidden_size: int#

9216

num_attention_heads: int#

24

num_query_groups: int#

8

kv_channels: Optional[int]#

128

init_method_std: float#

0.0134

class bridge.models.nemotron.nemotron_provider.Nemotron3ModelProvider8B#

Bases: bridge.models.nemotron.nemotron_provider.NemotronModelProvider

Configuration class for the Nemotron3 8B model, inheriting from NemotronModelProvider.

num_layers: int#

32

seq_length: int#

4096

hidden_size: int#

4096

ffn_hidden_size: int#

16384

num_attention_heads: int#

32

num_query_groups: Optional[int]#

None

kv_channels: Optional[int]#

None

init_method_std: float#

0.01

class bridge.models.nemotron.nemotron_provider.Nemotron3ModelProvider22B#

Bases: bridge.models.nemotron.nemotron_provider.NemotronModelProvider

Configuration class for the Nemotron3 22B model, inheriting from NemotronModelProvider.

num_layers: int#

40

seq_length: int#

4096

hidden_size: int#

6144

ffn_hidden_size: int#

24576

num_attention_heads: int#

48

num_query_groups: Optional[int]#

None

kv_channels: Optional[int]#

None

init_method_std: float#

0.008

class bridge.models.nemotron.nemotron_provider.Nemotron4ModelProvider15B#

Bases: bridge.models.nemotron.nemotron_provider.NemotronModelProvider

Configuration class for the Nemotron4 15B model, inheriting from NemotronModelProvider.

num_layers: int#

32

seq_length: int#

4096

hidden_size: int#

6144

ffn_hidden_size: int#

24576

num_attention_heads: int#

48

num_query_groups: Optional[int]#

8

kv_channels: Optional[int]#

None

init_method_std: float#

0.0134

class bridge.models.nemotron.nemotron_provider.Nemotron4ModelProvider340B#

Bases: bridge.models.nemotron.nemotron_provider.NemotronModelProvider

Configuration class for the Nemotron4 340B model, inheriting from NemotronModelProvider.

num_layers: int#

96

seq_length: int#

4096

hidden_size: int#

18432

ffn_hidden_size: int#

73728

num_attention_heads: int#

96

num_query_groups: Optional[int]#

8

kv_channels: Optional[int]#

None

init_method_std: float#

0.0063