bridge.models.gpt_oss.gpt_oss_provider#

Module Contents#

Classes#

GPTOSSProvider: Base config for GPT-OSS

GPTOSSProvider120B: Config for GPT-OSS 120B

GPTOSSProvider20B: Config for GPT-OSS 20B

Data#

logger

API#

bridge.models.gpt_oss.gpt_oss_provider.logger#

'getLogger(…)'

class bridge.models.gpt_oss.gpt_oss_provider.GPTOSSProvider#

Bases: megatron.bridge.models.gpt_provider.GPTModelProvider

Base config for GPT-OSS

hidden_size: int#

2880

num_attention_heads: int#

64

num_query_groups: int#

8

ffn_hidden_size: int#

2880

kv_channels: Optional[int]#

64

normalization: str#

'RMSNorm'

gated_linear_unit: bool#

True

add_bias_linear: bool#

True

share_embeddings_and_output_weights: bool#

False

vocab_size: int#

201088

hidden_dropout: float#

0.0

attention_dropout: float#

0.0

bf16: bool#

True

params_dtype: torch.dtype#

None

position_embedding_type: str#

'yarn'

rotary_base: int#

150000

yarn_rotary_scaling_factor: float#

32.0

yarn_original_max_position_embeddings: int#

4096

yarn_beta_fast: float#

32.0

yarn_beta_slow: float#

1.0

yarn_correction_range_round_to_int: bool#

False

yarn_mscale: float#

1.0

yarn_mscale_all_dim: float#

1.0

moe_router_topk: int#

4

moe_router_pre_softmax: bool#

False

moe_grouped_gemm: bool#

True

moe_token_dispatcher_type: str#

'alltoall'

moe_permute_fusion: bool#

True

moe_ffn_hidden_size: int#

2880

moe_router_load_balancing_type: str#

'none'

seq_length: int#

131072

window_size: Optional[Tuple[int, int]]#

(128, 0)

softmax_type: Literal['vanilla', 'off-by-one', 'learnable']#

'learnable'

activation_func: Callable#

None

glu_linear_offset: float#

1.0

bias_activation_fusion: bool#

True

bias_dropout_fusion: bool#

False

window_attn_skip_freq: Optional[Union[int, List[int]]]#

2

activation_func_clamp_value: Optional[float]#

7.0

provide(pre_process=None, post_process=None, vp_stage=None) → megatron.core.models.gpt.GPTModel#
class bridge.models.gpt_oss.gpt_oss_provider.GPTOSSProvider120B#

Bases: bridge.models.gpt_oss.gpt_oss_provider.GPTOSSProvider

Config for GPT-OSS 120B

num_layers: int#

36

num_moe_experts: int#

128

class bridge.models.gpt_oss.gpt_oss_provider.GPTOSSProvider20B#

Bases: bridge.models.gpt_oss.gpt_oss_provider.GPTOSSProvider

Config for GPT-OSS 20B

num_layers: int#

24

num_moe_experts: int#

32
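
To contrast the two presets, a short sketch follows (import path assumed as above; only fields documented on this page are referenced). The subclasses differ only in depth and expert count; everything else is inherited from the base config.

```python
# Illustrative only: the import path is an assumption, field values are
# taken from the defaults documented on this page.
from megatron.bridge.models.gpt_oss.gpt_oss_provider import (
    GPTOSSProvider20B,
    GPTOSSProvider120B,
)

small, large = GPTOSSProvider20B(), GPTOSSProvider120B()

# The presets override only num_layers and num_moe_experts.
assert (small.num_layers, small.num_moe_experts) == (24, 32)
assert (large.num_layers, large.num_moe_experts) == (36, 128)

# All other fields (hidden_size, MoE router settings, YaRN parameters, ...)
# are shared via the GPTOSSProvider base config.
assert small.hidden_size == large.hidden_size == 2880
```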