bridge.models.olmoe.olmoe_provider#

Module Contents#

Classes#

OlMoEModelProvider

Base provider for OLMoE models.

OLMoESelfAttention

Custom self-attention module for OLMoE models.

Functions#

olmoe_layer_spec

Layer spec for OLMoE models.

API#

bridge.models.olmoe.olmoe_provider.olmoe_layer_spec(
config: megatron.bridge.models.gpt_provider.GPTModelProvider,
) → megatron.core.transformer.ModuleSpec#

Layer spec for OLMoE models.
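
A minimal usage sketch, assuming the import path matches this module's documented location and that OlMoEModelProvider (documented below) can be constructed from its defaults:

```python
# Sketch only: build the OLMoE layer spec from a provider's configuration.
# The import path below is assumed from this module's documented location.
from megatron.bridge.models.olmoe.olmoe_provider import (
    OlMoEModelProvider,
    olmoe_layer_spec,
)

provider = OlMoEModelProvider()          # defaults documented below
layer_spec = olmoe_layer_spec(provider)  # returns a megatron.core.transformer.ModuleSpec
```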

class bridge.models.olmoe.olmoe_provider.OlMoEModelProvider#

Bases: megatron.bridge.models.gpt_provider.GPTModelProvider

Base provider for OLMoE models. A configuration sketch follows the attribute list below.

transformer_layer_spec: Union[megatron.core.transformer.ModuleSpec, Callable[[megatron.bridge.models.gpt_provider.GPTModelProvider], megatron.core.transformer.ModuleSpec]]#

None

normalization: str#

'RMSNorm'

activation_func: Callable#

None

gated_linear_unit: bool#

True

add_bias_linear: bool#

False

add_qkv_bias: bool#

False

seq_length: int#

4096

init_method_std: float#

0.02

hidden_dropout: float#

0.0

vocab_size: int#

50304

share_embeddings_and_output_weights: Optional[bool]#

False

layernorm_epsilon: float#

1e-05

autocast_dtype: torch.dtype#

None

params_dtype: torch.dtype#

None

bf16: bool#

False

num_layers: int#

16

hidden_size: int#

2048

ffn_hidden_size: int#

1024

moe_ffn_hidden_size: int#

1024

kv_channels: int#

None

num_query_groups: int#

16

num_attention_heads: int#

16

attention_dropout: float#

0.0

qk_layernorm: bool#

True

position_embedding_type: str#

'rope'

rotary_base: float#

10000.0

num_moe_experts: int#

64

moe_router_topk: int#

8

moe_token_dispatcher_type: str#

'alltoall'

moe_router_load_balancing_type: str#

'seq_aux_loss'

moe_aux_loss_coeff: float#

0.01

moe_router_pre_softmax: bool#

True

moe_grouped_gemm: bool#

True

moe_router_score_function: str#

'softmax'

moe_permute_fusion: bool#

True

moe_router_dtype: str#

'fp32'

persist_layer_norm: bool#

True
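
A minimal configuration sketch, assuming the provider is a dataclass whose fields match the attributes above and accept keyword overrides at construction time; the specific overrides shown are illustrative, not recommendations:

```python
from megatron.bridge.models.olmoe.olmoe_provider import OlMoEModelProvider

# Start from the documented defaults and override a few fields.
provider = OlMoEModelProvider(
    seq_length=4096,      # matches the documented default
    num_moe_experts=64,   # 64 experts with top-8 routing, per the defaults above
    moe_router_topk=8,
    bf16=True,            # enable bf16 (documented default is False)
)
print(provider.normalization, provider.num_layers)  # 'RMSNorm' 16
```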

class bridge.models.olmoe.olmoe_provider.OLMoESelfAttention(
config: megatron.core.transformer.transformer_config.TransformerConfig,
submodules: megatron.core.transformer.attention.SelfAttentionSubmodules,
layer_number: int,
attn_mask_type=AttnMaskType.padding,
cp_comm_type: str = None,
pg_collection: megatron.core.process_groups_config.ProcessGroupCollection = None,
**kwargs,
)#

Bases: megatron.core.transformer.attention.SelfAttention

Custom self-attention module for OLMoE models.

Initialization

get_query_key_value_tensors(hidden_states, key_value_states=None)#

Derives query, key and value tensors from hidden_states.
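
An illustrative sketch of how the QKV derivation is typically exercised. The [sequence, batch, hidden] tensor layout and the three-tensor return value follow Megatron-Core's SelfAttention convention and are assumptions here; `attention` stands for an already-constructed OLMoESelfAttention instance.

```python
import torch


def derive_qkv(attention, hidden_states: torch.Tensor):
    """Derive Q/K/V from hidden_states using an OLMoESelfAttention instance.

    hidden_states is assumed to be [sequence, batch, hidden], following
    Megatron-Core conventions.
    """
    # Self-attention case: key_value_states stays None, so the key and value
    # tensors are projected from hidden_states as well.
    query, key, value = attention.get_query_key_value_tensors(hidden_states)
    return query, key, value
```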