nemo_automodel.components.models.step3p7.configuration_step3p7

Module Contents

Classes

Name	Description
`Step3p5VConfig`	Compatibility config for original Step VLM checkpoints using `step3p5v`.
`Step3p7Config`	Top-level configuration for Step3.7 vision-language checkpoints.
`Step3p7TextConfig`	Configuration for the Step3.7 language backbone.
`StepRoboticsVisionEncoderConfig`	Configuration for the Step robotics vision encoder.

Functions

Name	Description
`_json_safe_value`	Convert config values that are valid in-memory but not JSON serializable.
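`_normalize_per_layer_values`	Undocumented helper; takes an optional sequence `values` and `num_hidden_layers`, and returns an optional list.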
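`_slice_mtp_per_layer_values`	Undocumented helper; takes an optional sequence `values`, `num_hidden_layers`, `num_nextn_predict_layers` and a `default`, and returns a list.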

API

class nemo_automodel.components.models.step3p7.configuration_step3p7.Step3p5VConfig()

Bases: Step3p7Config

Compatibility config for original Step VLM checkpoints using `step3p5v`.

model_type

= 'step3p5v'

class nemo_automodel.components.models.step3p7.configuration_step3p7.Step3p7Config(
    vision_config: typing.Optional[typing.Union[dict, nemo_automodel.components.models.step3p7.configuration_step3p7.StepRoboticsVisionEncoderConfig]] = None,
    text_config: typing.Optional[typing.Union[dict, nemo_automodel.components.models.step3p7.configuration_step3p7.Step3p7TextConfig]] = None,
    understand_projector_stride: int = 2,
    projector_bias: bool = False,
    image_token_id: int = 151679,
    kwargs = {}
)

Bases: PretrainedConfig

Top-level configuration for Step3.7 vision-language checkpoints.

hidden_size

= text_config.hidden_size

max_position_embeddings

= text_config.max_position_embeddings

model_type

= 'step3p7'

nemo_automodel.components.models.step3p7.configuration_step3p7.Step3p7Config.to_dict()

class nemo_automodel.components.models.step3p7.configuration_step3p7.Step3p7TextConfig(
    hidden_size: int = 4096,
    intermediate_size: int = 11264,
    num_attention_heads: int = 64,
    num_attention_groups: int = 8,
    num_hidden_layers: int = 45,
    num_nextn_predict_layers: int = 0,
    mtp_base_layer_idx: typing.Optional[int] = None,
    max_seq_len: int = 128000,
    vocab_size: int = 128815,
    rms_norm_eps: float = 1e-05,
    moe_intermediate_size: int = 1280,
    moe_num_experts: int = 288,
    moe_top_k: int = 8,
    rope_theta: float = 10000,
    rope_scaling: typing.Optional[dict[str, typing.Any]] = None,
    max_position_embeddings: int = 128000,
    share_expert_dims: int = 1280,
    share_expert_dim: typing.Optional[int] = None,
    head_dim: int = 128,
    norm_expert_weight: bool = True,
    layer_types: list[str] = None,
    sliding_window: typing.Optional[int] = None,
    pad_token_id: int = 1,
    attention_dropout: float = 0.0,
    use_head_wise_attn_gate: bool = False,
    use_moe_router_bias: bool = False,
    moe_router_activation: str = 'softmax',
    moe_router_scaling_factor: float = 1.0,
    need_fp32_gate: bool = False,
    attention_other_setting: typing.Optional[dict[str, typing.Any]] = None,
    swiglu_limits: typing.Optional[list[typing.Optional[float]]] = None,
    swiglu_limits_shared: typing.Optional[list[typing.Optional[float]]] = None,
    use_rope_layers: typing.Optional[list[bool]] = None,
    yarn_only_types: typing.Optional[list[str]] = None,
    moe_layers_enum: tuple[int] = (3, 4, 5, 6, 7, 8, 9, 10, 1...,
    kwargs = {}
)

Bases: PretrainedConfig

Configuration for the Step3.7 language backbone.

architectures

= ['Step3p5ForCausalLM']

model_type

= 'step3p5'

mtp_base_layer_idx

= int(mtp_base_layer_idx)

mtp_layer_types

mtp_partial_rotary_factors

mtp_rope_theta

mtp_swiglu_limits

mtp_swiglu_limits_shared

mtp_use_rope_layers

nemo_automodel.components.models.step3p7.configuration_step3p7.Step3p7TextConfig.to_dict()

class nemo_automodel.components.models.step3p7.configuration_step3p7.StepRoboticsVisionEncoderConfig(
    width = 1536,
    layers = 47,
    heads = 16,
    num_channels = 3,
    image_size = 728,
    mlp_ratio = 8960 / 1536,
    patch_size = 14,
    hidden_act = 'quick_gelu',
    layer_norm_eps = 1e-05,
    ues_cls_token = False,
    use_cls_token: typing.Optional[bool] = None,
    use_ln_pre = True,
    use_ln_post = False,
    use_abs_posemb = True,
    use_rope2d = True,
    ls_init_value = 0.1,
    kwargs = {}
)

Bases: PretrainedConfig

Configuration for the Step robotics vision encoder.

model_type

= 'perception_encoder'

nemo_automodel.components.models.step3p7.configuration_step3p7._json_safe_value(
    value: typing.Any
) -> typing.Any

Convert config values that are valid in-memory but not JSON serializable.

nemo_automodel.components.models.step3p7.configuration_step3p7._normalize_per_layer_values(
    values: typing.Optional[typing.Sequence[typing.Any]],
    num_hidden_layers: int
) -> typing.Optional[list[typing.Any]]

nemo_automodel.components.models.step3p7.configuration_step3p7._slice_mtp_per_layer_values(
    values: typing.Optional[typing.Sequence[typing.Any]],
    num_hidden_layers: int,
    num_nextn_predict_layers: int,
    default: typing.Any
) -> list[typing.Any]