nemo_automodel.components.models.gpt_oss.layers#
Module Contents#
Classes#
RotaryEmbedding, GptOssAttention
Functions#
_apply_rotary_emb
API#
- nemo_automodel.components.models.gpt_oss.layers._apply_rotary_emb(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor)#
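Applies precomputed rotary cosine/sine tables to a tensor. A minimal sketch of the rotation this signature implies, assuming the half-split RoPE convention; everything beyond the parameter names is illustrative, not the library's verified implementation:

```python
import torch

def apply_rotary_emb_sketch(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
    # Assumed shapes: x is (..., seq, head_dim); cos/sin are (seq, head_dim // 2).
    x1, x2 = torch.chunk(x, 2, dim=-1)  # split the head dimension in half
    out1 = x1 * cos - x2 * sin          # rotate each (x1, x2) pair by the
    out2 = x2 * cos + x1 * sin          # position-dependent angle
    return torch.cat((out1, out2), dim=-1)
```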
- class nemo_automodel.components.models.gpt_oss.layers.RotaryEmbedding(head_dim: int, base: int, dtype: torch.dtype, initial_context_length: int = 4096, scaling_factor: float = 1.0, ntk_alpha: float = 1.0, ntk_beta: float = 32.0, device: torch.device | None = None)#
Bases: torch.nn.Module
Initialization
- _compute_concentration_and_inv_freq() -> torch.Tensor#
See YaRN paper: https://arxiv.org/abs/2309.00071
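A hedged sketch of the NTK-by-parts scheme from the YaRN paper, written in terms of the constructor parameters above (`base`, `head_dim`, `initial_context_length`, `scaling_factor`, `ntk_alpha`, `ntk_beta`); the free function, its exact constants, and the (concentration, inv_freq) tuple return are assumptions, not the method's verified body:

```python
import math
import torch

def yarn_concentration_and_inv_freq(
    head_dim: int, base: float, initial_context_length: int,
    scaling_factor: float, ntk_alpha: float, ntk_beta: float,
) -> tuple[float, torch.Tensor]:
    # Plain RoPE frequencies: base^(2i / head_dim) for i in [0, head_dim / 2).
    freq = base ** (torch.arange(0, head_dim, 2, dtype=torch.float32) / head_dim)
    if scaling_factor > 1.0:
        # Attention-temperature ("concentration") term from the YaRN paper.
        concentration = 0.1 * math.log(scaling_factor) + 1.0
        # Dimension indices where the per-dimension rotation count crosses
        # ntk_beta (keep extrapolating) and ntk_alpha (start interpolating).
        d_half = head_dim // 2
        low = d_half * math.log(initial_context_length / (ntk_beta * 2 * math.pi)) / math.log(base)
        high = d_half * math.log(initial_context_length / (ntk_alpha * 2 * math.pi)) / math.log(base)
        ramp = ((torch.arange(d_half, dtype=torch.float32) - low) / (high - low)).clamp(0, 1)
        interpolation = 1.0 / (scaling_factor * freq)  # position-interpolated
        extrapolation = 1.0 / freq                     # unscaled frequencies
        inv_freq = extrapolation * (1 - ramp) + interpolation * ramp
    else:
        concentration = 1.0
        inv_freq = 1.0 / freq
    return concentration, inv_freq
```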
- _compute_cos_sin(num_tokens: int)#
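Turning those values into per-position tables is then an outer product of positions and inverse frequencies; a sketch under the same assumptions (folding the concentration into both tables is carried over from the sketch above):

```python
import torch

def cos_sin_tables(num_tokens: int, concentration: float, inv_freq: torch.Tensor):
    # angles[t, i] = t * inv_freq[i], one row per token position
    t = torch.arange(num_tokens, dtype=torch.float32)
    angles = torch.outer(t, inv_freq)
    return angles.cos() * concentration, angles.sin() * concentration
```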
- forward(query: torch.Tensor, key: torch.Tensor)#
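A hypothetical end-to-end usage, assuming a (batch, seq, heads, head_dim) layout and that forward returns the rotated (query, key) pair; the constructor arguments are illustrative, not the GPT-OSS checkpoint's actual values:

```python
import torch
from nemo_automodel.components.models.gpt_oss.layers import RotaryEmbedding

rope = RotaryEmbedding(head_dim=64, base=150000, dtype=torch.float32)
q = torch.randn(2, 128, 8, 64)  # (batch, seq, query heads, head_dim)
k = torch.randn(2, 128, 1, 64)  # fewer key heads, e.g. grouped-query attention
q_rot, k_rot = rope(q, k)       # assumed to return the rotated pair
```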
- class nemo_automodel.components.models.gpt_oss.layers.GptOssAttention(config: transformers.models.gpt_oss.configuration_gpt_oss.GptOssConfig, backend: nemo_automodel.components.moe.utils.BackendConfig, use_sliding_attention: bool = False)#
Bases: torch.nn.Module
Initialization
- forward(x: torch.Tensor, freqs_cis: torch.Tensor)#
- init_weights(buffer_device: torch.device, init_std: float = 0.02)#
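A sketch of how the pieces above might be wired together; it assumes BackendConfig is constructible with defaults and that a default GptOssConfig yields usable toy hyperparameters, neither of which this page confirms:

```python
import torch
from transformers.models.gpt_oss.configuration_gpt_oss import GptOssConfig
from nemo_automodel.components.moe.utils import BackendConfig
from nemo_automodel.components.models.gpt_oss.layers import GptOssAttention

cfg = GptOssConfig()  # real runs would load the checkpoint's config instead
attn = GptOssAttention(config=cfg, backend=BackendConfig(), use_sliding_attention=False)
attn.init_weights(buffer_device=torch.device("cpu"), init_std=0.02)
# forward(x, freqs_cis) then takes (batch, seq, hidden) hidden states plus the
# rotary table that the surrounding model computes and shares across layers.
```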