nemo_automodel.components.models.step3p7.model#

Module Contents#

Classes#

Step3p7CausalLMOutput

Step3.7 CausalLM output with optional per-depth MTP logits.

Step3p7Model

Step3.7 VLM wrapper using the native Step3.5 MoE language backbone.

Step3p7ForConditionalGeneration

Native Step3.7 VLM implementation for MedPix fine-tuning with expert parallelism (EP) and pipeline parallelism (PP).

Functions#

Data#

API#

nemo_automodel.components.models.step3p7.model.logger#

'getLogger(...)'

class nemo_automodel.components.models.step3p7.model.Step3p7CausalLMOutput#

Step3.7 CausalLM output with optional per-depth MTP logits.

logits: torch.Tensor#

None

mtp_per_depth_logits: list[torch.Tensor] | None#

None

mtp_loss_scaling_factor: float | None#

None

nemo_automodel.components.models.step3p7.model._debug_vision_enabled() -> bool#
nemo_automodel.components.models.step3p7.model._debug_vision_log(message: str, *args: Any) -> None#
nemo_automodel.components.models.step3p7.model._rank() -> int#
class nemo_automodel.components.models.step3p7.model.Step3p7Model(
config: nemo_automodel.components.models.step3p7.configuration_step3p7.Step3p7Config,
backend: nemo_automodel.components.models.common.BackendConfig,
*,
moe_config: nemo_automodel.components.moe.config.MoEConfig | None = None,
moe_overrides: dict | None = None,
)#

Bases: torch.nn.Module

Step3.7 VLM wrapper using the native Step3.5 MoE language backbone.

Initialization

property layers#
property embed_tokens#
property norm#
get_input_embeddings()#
set_input_embeddings(value)#
set_decoder(decoder)#
get_decoder()#
_vision_dtype_device() -> tuple[torch.dtype, torch.device]#
_process_image_features(image_features: torch.Tensor) -> torch.Tensor#
_process_image_input(
pixel_values: torch.Tensor,
*,
patch_pixel_values: torch.Tensor | None = None,
num_patches: torch.Tensor | list[int] | tuple[int, ...] | None = None,
) -> list[torch.Tensor]#
get_multimodal_embeddings(
*,
pixel_values: torch.Tensor | None = None,
patch_pixel_values: torch.Tensor | None = None,
num_patches: torch.Tensor | list[int] | tuple[int, ...] | None = None,
image_embeds: torch.Tensor | None = None,
**_: Any,
) -> list[torch.Tensor] | torch.Tensor | None#
prepare_inputs_embeds(
input_ids: torch.Tensor,
multimodal_embeddings: list[torch.Tensor] | torch.Tensor | None = None,
) -> torch.Tensor#
forward(
input_ids: torch.Tensor | None = None,
*,
attention_mask: torch.Tensor | None = None,
position_ids: torch.Tensor | None = None,
inputs_embeds: torch.Tensor | None = None,
pixel_values: torch.Tensor | None = None,
patch_pixel_values: torch.Tensor | None = None,
num_patches: torch.Tensor | list[int] | tuple[int, ...] | None = None,
image_embeds: torch.Tensor | None = None,
**kwargs: Any,
) -> torch.Tensor#
class nemo_automodel.components.models.step3p7.model.Step3p7ForConditionalGeneration(
config: nemo_automodel.components.models.step3p7.configuration_step3p7.Step3p7Config,
moe_config: nemo_automodel.components.moe.config.MoEConfig | None = None,
backend: nemo_automodel.components.models.common.BackendConfig | None = None,
**kwargs: Any,
)#

Bases: nemo_automodel.components.models.common.hf_checkpointing_mixin.HFCheckpointingMixin, torch.nn.Module, nemo_automodel.components.moe.fsdp_mixin.MoEFSDPSyncMixin

Native Step3.7 VLM implementation for MedPix fine-tuning with expert parallelism (EP) and pipeline parallelism (PP).

Initialization

_keep_in_fp32_modules#

['rotary_emb']

_pp_keep_self_forward: bool#

True

mtp_outputs_are_logits#

True

classmethod from_config(
config: nemo_automodel.components.models.step3p7.configuration_step3p7.Step3p7Config,
moe_config: nemo_automodel.components.moe.config.MoEConfig | None = None,
backend: nemo_automodel.components.models.common.BackendConfig | None = None,
**kwargs: Any,
)#
classmethod from_pretrained(
pretrained_model_name_or_path: str,
*model_args: Any,
**kwargs: Any,
)#
property language_model#
property visual#
get_input_embeddings()#
set_input_embeddings(value)#
get_output_embeddings()#
set_output_embeddings(new_embeddings)#
set_decoder(decoder)#
get_decoder()#
customize_pipeline_stage_modules(
module_names_per_stage: list[list[str]],
*,
layers_prefix: str,
text_model: torch.nn.Module | None = None,
) -> list[list[str]]#
get_pipeline_stage_metas(
*,
is_first: bool,
microbatch_size: int,
seq_len: int,
dtype: torch.dtype,
) -> tuple[tuple[torch.Tensor, ...], tuple[torch.Tensor, ...]]#
_is_pipeline_parallel_stage() -> bool#
_build_mtp_embed_inputs_from_embeds(
inputs_embeds: torch.Tensor,
) -> tuple[torch.Tensor, ...]#
_make_position_ids(
hidden: torch.Tensor,
position_ids: torch.Tensor | None,
) -> torch.Tensor#
prepare_model_inputs_for_cp(
input_ids: torch.Tensor,
*,
pixel_values: torch.Tensor | None = None,
patch_pixel_values: torch.Tensor | None = None,
num_patches: torch.Tensor | list[int] | tuple[int, ...] | None = None,
image_embeds: torch.Tensor | None = None,
**_: Any,
) -> dict[str, torch.Tensor]#

Merge vision features into token embeddings before context-parallel (CP) sequence sharding.

forward(
input_ids: torch.Tensor | None = None,
*mtp_embed_inputs: torch.Tensor,
position_ids: torch.Tensor | None = None,
attention_mask: torch.Tensor | None = None,
padding_mask: torch.Tensor | None = None,
inputs_embeds: torch.Tensor | None = None,
cache_position: torch.Tensor | None = None,
**kwargs: Any,
) -> torch.Tensor#
initialize_weights(
buffer_device: torch.device | None = None,
dtype: torch.dtype = torch.bfloat16,
) -> None#
nemo_automodel.components.models.step3p7.model.ModelClass#

None