nemo_automodel.components.models.glm4_moe.model
nemo_automodel.components.models.glm4_moe.model
Module Contents
Classes
| Name | Description |
|---|---|
| Block | - |
| Glm4MoeForCausalLM | - |
| Glm4MoeModel | - |
Data
API
class nemo_automodel.components.models.glm4_moe.model.Block( layer_idx: int, config: transformers.models.glm4_moe.configuration_glm4_moe.Glm4MoeConfig, moe_config: nemo_automodel.components.moe.config.MoEConfig, backend: nemo_automodel.components.models.common.BackendConfig )
Bases: Module
input_layernorm
mlp
= MoE(moe_config, backend)
post_attention_layernorm
self_attn
= Glm4MoeAttention(config, backend)
nemo_automodel.components.models.glm4_moe.model.Block._mlp( x: torch.Tensor, padding_mask: torch.Tensor | None ) -> torch.Tensor
nemo_automodel.components.models.glm4_moe.model.Block.forward( x: torch.Tensor, freqs_cis: torch.Tensor, attention_mask: torch.Tensor | None = None, padding_mask: torch.Tensor | None = None, **attn_kwargs: typing.Any ) -> torch.Tensor
nemo_automodel.components.models.glm4_moe.model.Block.init_weights( buffer_device: torch.device )
class nemo_automodel.components.models.glm4_moe.model.Glm4MoeForCausalLM( config: transformers.models.glm4_moe.configuration_glm4_moe.Glm4MoeConfig, moe_config: nemo_automodel.components.moe.config.MoEConfig | None = None, backend: nemo_automodel.components.models.common.BackendConfig | None = None, **kwargs )
Bases: HFCheckpointingMixin, Module, MoEFSDPSyncMixin
_keep_in_fp32_modules_strict
= ['e_score_correction_bias']
backend
= backend or BackendConfig()
lm_head
model
state_dict_adapter
nemo_automodel.components.models.glm4_moe.model.Glm4MoeForCausalLM.forward( input_ids: torch.Tensor, position_ids: torch.Tensor | None = None, attention_mask: torch.Tensor | None = None, padding_mask: torch.Tensor | None = None, logits_to_keep: typing.Union[int, torch.Tensor] = 0, output_hidden_states: typing.Optional[bool] = None, **attn_kwargs: typing.Any ) -> transformers.modeling_outputs.CausalLMOutputWithPast
nemo_automodel.components.models.glm4_moe.model.Glm4MoeForCausalLM.from_config( config: transformers.models.glm4_moe.configuration_glm4_moe.Glm4MoeConfig, moe_config: nemo_automodel.components.moe.config.MoEConfig | None = None, backend: nemo_automodel.components.models.common.BackendConfig | None = None, **kwargs )
classmethod
nemo_automodel.components.models.glm4_moe.model.Glm4MoeForCausalLM.from_pretrained( pretrained_model_name_or_path: str, *model_args, **kwargs )
classmethod
nemo_automodel.components.models.glm4_moe.model.Glm4MoeForCausalLM.get_input_embeddings()
nemo_automodel.components.models.glm4_moe.model.Glm4MoeForCausalLM.get_output_embeddings()
nemo_automodel.components.models.glm4_moe.model.Glm4MoeForCausalLM.initialize_weights( buffer_device: torch.device | None = None, dtype: torch.dtype = torch.bfloat16 ) -> None
nemo_automodel.components.models.glm4_moe.model.Glm4MoeForCausalLM.set_input_embeddings( value )
nemo_automodel.components.models.glm4_moe.model.Glm4MoeForCausalLM.set_output_embeddings( new_embeddings )
class nemo_automodel.components.models.glm4_moe.model.Glm4MoeModel( config: transformers.models.glm4_moe.configuration_glm4_moe.Glm4MoeConfig, backend: nemo_automodel.components.models.common.BackendConfig, moe_config: nemo_automodel.components.moe.config.MoEConfig | None = None, moe_overrides: dict | None = None )
Bases: Module
embed_tokens
head_dim
layers
= torch.nn.ModuleDict()
max_seq_len
= config.max_position_embeddings
moe_config
= moe_config or MoEConfig(**moe_defaults)
norm
rotary_emb
nemo_automodel.components.models.glm4_moe.model.Glm4MoeModel.forward( input_ids: torch.Tensor, position_ids: torch.Tensor | None = None, attention_mask: torch.Tensor | None = None, padding_mask: torch.Tensor | None = None, **attn_kwargs: typing.Any ) -> torch.Tensor
nemo_automodel.components.models.glm4_moe.model.Glm4MoeModel.init_weights( buffer_device: torch.device | None = None ) -> None
nemo_automodel.components.models.glm4_moe.model.ModelClass = Glm4MoeForCausalLM