nemo_automodel.components.models.step3p7.model

View as Markdown

Module Contents

Classes

| Name | Description |
| --- | --- |
| Step3p7CausalLMOutput | CausalLMOutputWithPast plus optional per-depth MTP logits. |
| Step3p7ForConditionalGeneration | Native Step3.7 VLM implementation for MedPix fine-tuning with EP and PP. |
| Step3p7Model | Step3.7 VLM wrapper using the native Step3.5 MoE language backbone. |

Functions

Data

ModelClass

logger

API

class nemo_automodel.components.models.step3p7.model.Step3p7CausalLMOutput(
mtp_per_depth_logits: list[torch.Tensor] | None = None,
mtp_loss_scaling_factor: float | None = None
)
Dataclass

Bases: CausalLMOutputWithPast

CausalLMOutputWithPast plus optional per-depth MTP logits.

Subclassing the HF `ModelOutput` gives this output the standard `logits`/`hidden_states` fields, so `"hidden_states" in out` and `getattr(out, "hidden_states")` behave as they do for every other model, and the fused-CE path can read the final hidden states. The MTP fields stay declared dataclass fields so that they survive output-restructuring layers such as FSDP2's mixed-precision output cast, which rebuild `ModelOutput` instances from declared fields only.

mtp_loss_scaling_factor
float | None = None
mtp_per_depth_logits
list[Tensor] | None = None
class nemo_automodel.components.models.step3p7.model.Step3p7ForConditionalGeneration(
config: nemo_automodel.components.models.step3p7.configuration_step3p7.Step3p7Config,
moe_config: nemo_automodel.components.moe.config.MoEConfig | None = None,
backend: nemo_automodel.components.models.common.BackendConfig | None = None,
kwargs: typing.Any = {}
)

Bases: HFCheckpointingMixin, Module, MoEFSDPSyncMixin

Native Step3.7 VLM implementation for MedPix fine-tuning with EP and PP.

_keep_in_fp32_modules
= ['rotary_emb']
_pp_keep_self_forward
bool = True
backend
= backend or BackendConfig()
lm_head
model
mtp
mtp_config
pad_token_id
= getattr(config.text_config, 'pad_token_id', None)
state_dict_adapter
vocab_size
= config.text_config.vocab_size
nemo_automodel.components.models.step3p7.model.Step3p7ForConditionalGeneration._build_mtp_embed_inputs_from_embeds(
inputs_embeds: torch.Tensor
) -> tuple[torch.Tensor, ...]
nemo_automodel.components.models.step3p7.model.Step3p7ForConditionalGeneration._is_pipeline_parallel_stage() -> bool
nemo_automodel.components.models.step3p7.model.Step3p7ForConditionalGeneration._make_position_ids(
hidden: torch.Tensor,
position_ids: torch.Tensor | None
) -> torch.Tensor
nemo_automodel.components.models.step3p7.model.Step3p7ForConditionalGeneration.customize_pipeline_stage_modules(
module_names_per_stage: list[list[str]],
layers_prefix: str,
text_model: torch.nn.Module | None = None
) -> list[list[str]]
nemo_automodel.components.models.step3p7.model.Step3p7ForConditionalGeneration.forward(
input_ids: torch.Tensor | None = None,
mtp_embed_inputs: torch.Tensor = (),
position_ids: torch.Tensor | None = None,
attention_mask: torch.Tensor | None = None,
padding_mask: torch.Tensor | None = None,
inputs_embeds: torch.Tensor | None = None,
cache_position: torch.Tensor | None = None,
logits_to_keep: typing.Union[int, torch.Tensor] = 0,
output_hidden_states: typing.Optional[bool] = None,
kwargs: typing.Any = {}
) -> torch.Tensor | nemo_automodel.components.models.step3p7.model.Step3p7CausalLMOutput
nemo_automodel.components.models.step3p7.model.Step3p7ForConditionalGeneration.from_config(
config: nemo_automodel.components.models.step3p7.configuration_step3p7.Step3p7Config,
moe_config: nemo_automodel.components.moe.config.MoEConfig | None = None,
backend: nemo_automodel.components.models.common.BackendConfig | None = None,
kwargs: typing.Any = {}
)
classmethod
nemo_automodel.components.models.step3p7.model.Step3p7ForConditionalGeneration.from_pretrained(
pretrained_model_name_or_path: str,
model_args: typing.Any = (),
kwargs: typing.Any = {}
)
classmethod
nemo_automodel.components.models.step3p7.model.Step3p7ForConditionalGeneration.get_decoder()
nemo_automodel.components.models.step3p7.model.Step3p7ForConditionalGeneration.get_input_embeddings()
nemo_automodel.components.models.step3p7.model.Step3p7ForConditionalGeneration.get_output_embeddings()
nemo_automodel.components.models.step3p7.model.Step3p7ForConditionalGeneration.get_pipeline_stage_metas(
is_first: bool,
microbatch_size: int,
seq_len: int,
dtype: torch.dtype
) -> tuple[tuple[torch.Tensor, ...], tuple[torch.Tensor, ...]]
nemo_automodel.components.models.step3p7.model.Step3p7ForConditionalGeneration.initialize_weights(
buffer_device: torch.device | None = None,
dtype: torch.dtype = torch.bfloat16
) -> None
nemo_automodel.components.models.step3p7.model.Step3p7ForConditionalGeneration.prepare_model_inputs_for_cp(
input_ids: torch.Tensor,
pixel_values: torch.Tensor | None = None,
patch_pixel_values: torch.Tensor | None = None,
num_patches: torch.Tensor | list[int] | tuple[int, ...] | None = None,
image_embeds: torch.Tensor | None = None,
_: typing.Any = {}
) -> dict[str, torch.Tensor]

Merge vision features into token embeddings before CP sequence sharding.

nemo_automodel.components.models.step3p7.model.Step3p7ForConditionalGeneration.set_decoder(
decoder
)
nemo_automodel.components.models.step3p7.model.Step3p7ForConditionalGeneration.set_input_embeddings(
value
)
nemo_automodel.components.models.step3p7.model.Step3p7ForConditionalGeneration.set_output_embeddings(
new_embeddings
)
class nemo_automodel.components.models.step3p7.model.Step3p7Model(
config: nemo_automodel.components.models.step3p7.configuration_step3p7.Step3p7Config,
backend: nemo_automodel.components.models.common.BackendConfig,
moe_config: nemo_automodel.components.moe.config.MoEConfig | None = None,
moe_overrides: dict | None = None
)

Bases: Module

Step3.7 VLM wrapper using the native Step3.5 MoE language backbone.

image_placeholder_token_id
= config.image_token_id
language_model
moe_config
= self.language_model.moe_config
vision_model
= StepRoboticsVisionEncoder(config.vision_config)
vit_large_projector
vocab_size
= config.text_config.vocab_size
nemo_automodel.components.models.step3p7.model.Step3p7Model._process_image_features(
image_features: torch.Tensor
) -> torch.Tensor
nemo_automodel.components.models.step3p7.model.Step3p7Model._process_image_input(
pixel_values: torch.Tensor,
patch_pixel_values: torch.Tensor | None = None,
num_patches: torch.Tensor | list[int] | tuple[int, ...] | None = None
) -> list[torch.Tensor]
nemo_automodel.components.models.step3p7.model.Step3p7Model._vision_dtype_device() -> tuple[torch.dtype, torch.device]
nemo_automodel.components.models.step3p7.model.Step3p7Model.forward(
input_ids: torch.Tensor | None = None,
attention_mask: torch.Tensor | None = None,
position_ids: torch.Tensor | None = None,
inputs_embeds: torch.Tensor | None = None,
pixel_values: torch.Tensor | None = None,
patch_pixel_values: torch.Tensor | None = None,
num_patches: torch.Tensor | list[int] | tuple[int, ...] | None = None,
image_embeds: torch.Tensor | None = None,
kwargs: typing.Any = {}
) -> torch.Tensor
nemo_automodel.components.models.step3p7.model.Step3p7Model.get_decoder()
nemo_automodel.components.models.step3p7.model.Step3p7Model.get_input_embeddings()
nemo_automodel.components.models.step3p7.model.Step3p7Model.get_multimodal_embeddings(
pixel_values: torch.Tensor | None = None,
patch_pixel_values: torch.Tensor | None = None,
num_patches: torch.Tensor | list[int] | tuple[int, ...] | None = None,
image_embeds: torch.Tensor | None = None,
_: typing.Any = {}
) -> list[torch.Tensor] | torch.Tensor | None
nemo_automodel.components.models.step3p7.model.Step3p7Model.prepare_inputs_embeds(
input_ids: torch.Tensor,
multimodal_embeddings: list[torch.Tensor] | torch.Tensor | None = None
) -> torch.Tensor
nemo_automodel.components.models.step3p7.model.Step3p7Model.set_decoder(
decoder
)
nemo_automodel.components.models.step3p7.model.Step3p7Model.set_input_embeddings(
value
)
nemo_automodel.components.models.step3p7.model._debug_vision_enabled() -> bool
nemo_automodel.components.models.step3p7.model._debug_vision_log(
message: str,
args: typing.Any = ()
) -> None
nemo_automodel.components.models.step3p7.model._rank() -> int
nemo_automodel.components.models.step3p7.model.ModelClass = Step3p7ForConditionalGeneration
nemo_automodel.components.models.step3p7.model.logger = logging.getLogger(__name__)