bridge.models.gpt_oss.gpt_oss_provider#
Module Contents#
Classes#

| Class | Description |
|---|---|
| GPTOSSProvider | Base config for GPT-OSS |
| GPTOSSProvider120B | Config for GPT-OSS 120B |
| GPTOSSProvider20B | Config for GPT-OSS 20B |
Data#
API#
- bridge.models.gpt_oss.gpt_oss_provider.logger#
'getLogger(…)'
- class bridge.models.gpt_oss.gpt_oss_provider.GPTOSSProvider#
Bases: megatron.bridge.models.gpt_provider.GPTModelProvider
Base config for GPT-OSS
- hidden_size: int#
2880
- num_attention_heads: int#
64
- num_query_groups: int#
8
- ffn_hidden_size: int#
2880
- kv_channels: Optional[int]#
64
- normalization: str#
'RMSNorm'
- gated_linear_unit: bool#
True
- add_bias_linear: bool#
True
- share_embeddings_and_output_weights: bool#
False
- vocab_size: int#
201088
- hidden_dropout: float#
0.0
- attention_dropout: float#
0.0
- bf16: bool#
True
- params_dtype: torch.dtype#
None
- position_embedding_type: str#
'yarn'
- rotary_base: int#
150000
- yarn_rotary_scaling_factor: float#
32.0
- yarn_original_max_position_embeddings: int#
4096
- yarn_beta_fast: float#
32.0
- yarn_beta_slow: float#
1.0
- yarn_correction_range_round_to_int: bool#
False
- yarn_mscale: float#
1.0
- yarn_mscale_all_dim: float#
1.0
- moe_router_topk: int#
4
- moe_router_pre_softmax: bool#
False
- moe_grouped_gemm: bool#
True
- moe_token_dispatcher_type: str#
'alltoall'
- moe_permute_fusion: bool#
True
- moe_ffn_hidden_size: int#
2880
- moe_router_load_balancing_type: str#
'none'
- seq_length: int#
131072
- window_size: Optional[Tuple[int, int]]#
(128, 0)
- softmax_type: Literal['vanilla', 'off-by-one', 'learnable']#
'learnable'
- activation_func: Callable#
None
- glu_linear_offset: float#
1.0
- bias_activation_fusion: bool#
True
- bias_dropout_fusion: bool#
False
- window_attn_skip_freq: Optional[Union[int, List[int]]]#
2
- activation_func_clamp_value: Optional[float]#
7.0
- provide(pre_process=None, post_process=None, vp_stage=None)#
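These defaults are internally consistent: YaRN stretches the rotary embeddings' original 4096-token context by the 32x scaling factor, which yields exactly the 131072-token default seq_length. A minimal sketch checking this (it uses the GPTOSSProvider20B preset documented below, since the base class leaves num_layers to its subclasses):

```python
from megatron.bridge.models.gpt_oss.gpt_oss_provider import GPTOSSProvider20B

cfg = GPTOSSProvider20B()

# YaRN context extension: 4096 original positions * 32.0 scaling factor
# == 131072 tokens, matching the default seq_length above.
assert (
    cfg.yarn_original_max_position_embeddings * cfg.yarn_rotary_scaling_factor
    == cfg.seq_length
)
```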
- class bridge.models.gpt_oss.gpt_oss_provider.GPTOSSProvider120B#
Bases: bridge.models.gpt_oss.gpt_oss_provider.GPTOSSProvider
Config for GPT-OSS 120B
- num_layers: int#
36
- num_moe_experts: int#
128
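A usage sketch for the 120B preset, assuming the provider follows standard dataclass semantics so any inherited default can be overridden at construction time:

```python
from megatron.bridge.models.gpt_oss.gpt_oss_provider import GPTOSSProvider120B

# The preset supplies 36 layers and 128 experts; base-class defaults
# such as seq_length can still be overridden per field.
cfg = GPTOSSProvider120B(seq_length=8192)
print(cfg.num_layers, cfg.num_moe_experts)  # 36 128
```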
- class bridge.models.gpt_oss.gpt_oss_provider.GPTOSSProvider20B#
Bases: bridge.models.gpt_oss.gpt_oss_provider.GPTOSSProvider
Config for GPT-OSS 20B
- num_layers: int#
24
- num_moe_experts: int#
32
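And a minimal end-to-end sketch for the 20B preset. This assumes provide() returns a ready-to-use Megatron Core GPT model and is called inside an initialized Megatron parallel context; pre_process and post_process mark whether this pipeline stage owns the embedding and output layers, per the signature above:

```python
from megatron.bridge.models.gpt_oss.gpt_oss_provider import GPTOSSProvider20B

cfg = GPTOSSProvider20B()  # 24 layers, 32 experts, otherwise base defaults
model = cfg.provide(pre_process=True, post_process=True)
```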