nemo_automodel.components.models.gpt_oss.layers#

Module Contents#

Classes#

Functions#

API#

nemo_automodel.components.models.gpt_oss.layers._apply_rotary_emb(
x: torch.Tensor,
cos: torch.Tensor,
sin: torch.Tensor,
) → torch.Tensor#
class nemo_automodel.components.models.gpt_oss.layers.RotaryEmbedding(
head_dim: int,
base: int,
dtype: torch.dtype,
initial_context_length: int = 4096,
scaling_factor: float = 1.0,
ntk_alpha: float = 1.0,
ntk_beta: float = 32.0,
device: torch.device | None = None,
)#

Bases: torch.nn.Module

Initialization

_compute_concentration_and_inv_freq() → torch.Tensor#

See YaRN paper: https://arxiv.org/abs/2309.00071

_compute_cos_sin(num_tokens: int)#
forward(
query: torch.Tensor,
key: torch.Tensor,
) → tuple[torch.Tensor, torch.Tensor]#
class nemo_automodel.components.models.gpt_oss.layers.GptOssAttention(
config: transformers.models.gpt_oss.configuration_gpt_oss.GptOssConfig,
backend: nemo_automodel.components.moe.utils.BackendConfig,
use_sliding_attention: bool = False,
)#

Bases: torch.nn.Module

Initialization

forward(x: torch.Tensor, freqs_cis: torch.Tensor)#
init_weights(buffer_device: torch.device, init_std: float = 0.02)#