nemo_automodel.components.models.kimivl.model#

Module Contents#

Classes#

MoonViTConfig

Configuration for MoonVit vision encoder.

KimiVLConfig

Configuration for KimiVL model.

Learnable2DInterpPosEmb

Learnable 2D interpolatable position embedding.

Rope2DPosEmb

2D rotary position embedding for vision tokens.

MoonVitMLP

MLP for MoonVit.

MoonVitEncoderLayer

Single encoder layer for MoonVit.

MoonVitEncoder

MoonVit encoder.

MoonVisionPatchEmbed

Patch embedding for MoonVit.

MoonVitPretrainedModel

MoonVit vision encoder.

KimiVLMultiModalProjector

Projects vision features to language model dimension.

DeepSeekV3RotaryEmbeddingAdapter

Callable adapter that wraps DeepseekV3’s freqs_cis-based RoPE.

KimiVLLanguageModelBackend

Backend-aware language model wrapper using DeepseekV3 architecture.

KimiVLModel

KimiVL multimodal backbone with a DeepseekV3 text decoder.

KimiVLForConditionalGeneration

KimiVL model with backend-aware DeepseekV3 language model.

KimiVLStateDictAdapter

State dict adapter for KimiVL checkpoints.

Functions#

_apply_rope_vision

Apply rotary position embedding for vision.

vision_attention_flash

Flash attention for vision.

vision_attention_sdpa

SDPA attention for vision.

patch_merger

Merge patches.

_register_kimi_vl_with_transformers

Register KimiVLConfig and model with transformers Auto classes.

Data#

API#

nemo_automodel.components.models.kimivl.model.LOGGER#

‘getLogger(…)’

class nemo_automodel.components.models.kimivl.model.MoonViTConfig(
patch_size: int = 14,
init_pos_emb_height: int = 64,
init_pos_emb_width: int = 64,
num_attention_heads: int = 16,
num_hidden_layers: int = 27,
hidden_size: int = 1152,
intermediate_size: int = 4304,
merge_kernel_size: Tuple[int, int] = (2, 2),
**kwargs,
)#

Bases: transformers.configuration_utils.PretrainedConfig

Configuration for MoonVit vision encoder.

Initialization

model_type#

‘moonvit’

class nemo_automodel.components.models.kimivl.model.KimiVLConfig(
vision_config: Optional[Union[Dict, nemo_automodel.components.models.kimivl.model.MoonViTConfig]] = None,
text_config: Optional[Union[Dict, transformers.models.deepseek_v3.configuration_deepseek_v3.DeepseekV3Config]] = None,
ignore_index: int = -100,
media_placeholder_token_id: int = 163605,
pad_token_id: int = 0,
architectures: Optional[List[str]] = None,
**kwargs,
)#

Bases: transformers.configuration_utils.PretrainedConfig

Configuration for KimiVL model.

Initialization

model_type#

‘kimi_vl’

to_dict() Dict[str, Any]#
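A minimal usage sketch of the two configuration classes above, using only the parameters shown in their signatures; the values are illustrative rather than checkpoint defaults.

```python
from nemo_automodel.components.models.kimivl.model import KimiVLConfig, MoonViTConfig

# Illustrative values; they mirror the signature defaults above rather than
# any particular checkpoint.
vision_config = MoonViTConfig(
    patch_size=14,
    num_hidden_layers=27,
    hidden_size=1152,
    intermediate_size=4304,
    merge_kernel_size=(2, 2),
)

# text_config may be a DeepseekV3Config or a plain dict; None presumably
# falls back to defaults.
config = KimiVLConfig(
    vision_config=vision_config,
    media_placeholder_token_id=163605,
)
print(config.model_type)         # 'kimi_vl'
print(sorted(config.to_dict()))  # serialized view, including vision_config/text_config
```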
nemo_automodel.components.models.kimivl.model._apply_rope_vision(
xq: torch.Tensor,
xk: torch.Tensor,
freqs_cis: torch.Tensor,
) Tuple[torch.Tensor, torch.Tensor]#

Apply rotary position embedding for vision.

nemo_automodel.components.models.kimivl.model.vision_attention_flash(q, k, v, q_cu_seqlens, k_cu_seqlens)#

Flash attention for vision.

nemo_automodel.components.models.kimivl.model.vision_attention_sdpa(q, k, v, q_cu_seqlens, k_cu_seqlens)#

SDPA attention for vision.
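Both attention helpers take q_cu_seqlens/k_cu_seqlens, the cumulative sequence lengths used by packed (varlen) attention to mark image boundaries in the packed token stream. A hedged sketch of how such offsets are typically derived from grid_hws, assuming each image contributes h * w patch tokens:

```python
import torch

def cu_seqlens_from_grid(grid_hws: torch.Tensor) -> torch.Tensor:
    """Cumulative sequence lengths for packed vision tokens (illustrative helper)."""
    seqlens = grid_hws[:, 0] * grid_hws[:, 1]    # patch tokens per image
    cu = torch.zeros(seqlens.numel() + 1, dtype=torch.int32)
    cu[1:] = torch.cumsum(seqlens, dim=0)
    return cu

grid_hws = torch.tensor([[32, 32], [16, 24]])    # two images' patch grids
print(cu_seqlens_from_grid(grid_hws))            # tensor([0, 1024, 1408], dtype=torch.int32)
```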

class nemo_automodel.components.models.kimivl.model.Learnable2DInterpPosEmb(
height: int,
width: int,
dim: int,
interpolation_mode: str = 'bicubic',
)#

Bases: torch.nn.Module

Learnable 2D interpolatable position embedding.

Initialization

forward(x: torch.Tensor, grid_hws: torch.Tensor) torch.Tensor#
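The core idea of an interpolatable learnable position embedding is that a fixed learned grid (64 x 64 positions by default above) is resized to each image's patch grid, typically with bicubic interpolation. A hedged sketch of that resize step, with assumed shapes and independent of the actual implementation:

```python
import torch
import torch.nn.functional as F

pos_emb = torch.randn(64, 64, 1152)            # learned grid: (height, width, dim)
target_h, target_w = 32, 48                    # one image's patch grid

resized = F.interpolate(
    pos_emb.permute(2, 0, 1).unsqueeze(0),     # -> (1, dim, H, W) for interpolate
    size=(target_h, target_w),
    mode="bicubic",
    align_corners=False,
).squeeze(0).permute(1, 2, 0)                  # -> (target_h, target_w, dim)

print(resized.shape)                           # torch.Size([32, 48, 1152])
```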
class nemo_automodel.components.models.kimivl.model.Rope2DPosEmb(
dim: int,
max_height: int,
max_width: int,
theta_base: float = 10000,
)#

Bases: torch.nn.Module

2D rotary position embedding for vision tokens.

Initialization

_precompute_freqs_cis(device: torch.device) torch.Tensor#
get_freqs_cis(grid_hws: torch.Tensor) torch.Tensor#
class nemo_automodel.components.models.kimivl.model.MoonVitMLP(dims: List[int], activation, bias: bool = True)#

Bases: torch.nn.Module

MLP for MoonVit.

Initialization

forward(x: torch.Tensor) torch.Tensor#
class nemo_automodel.components.models.kimivl.model.MoonVitEncoderLayer(
num_heads: int,
hidden_dim: int,
mlp_dim: int,
*,
activation=F.gelu,
attn_bias: bool = False,
attn_implementation: str = 'flash_attention_2',
)#

Bases: torch.nn.Module

Single encoder layer for MoonVit.

Initialization

forward(
hidden_states: torch.Tensor,
cu_seqlens: torch.Tensor,
rope_freqs_cis: torch.Tensor,
) torch.Tensor#
class nemo_automodel.components.models.kimivl.model.MoonVitEncoder(hidden_dim: int, num_layers: int, block_cfg: dict)#

Bases: torch.nn.Module

MoonVit encoder.

Initialization

forward(
hidden_states: torch.Tensor,
grid_hws: torch.Tensor,
) torch.Tensor#
class nemo_automodel.components.models.kimivl.model.MoonVisionPatchEmbed(
out_dim: int,
in_dim: int = 3,
patch_size: int = 14,
pos_emb_height: int = 64,
pos_emb_width: int = 64,
)#

Bases: torch.nn.Module

Patch embedding for MoonVit.

Initialization

forward(x: torch.Tensor, grid_hws: torch.Tensor) torch.Tensor#
nemo_automodel.components.models.kimivl.model.patch_merger(
x: torch.Tensor,
grid_hws: torch.Tensor,
merge_kernel_size: List[int] = [2, 2],
) List[torch.Tensor]#

Merge patches.
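A hedged usage sketch of patch_merger: x holds packed patch features for all images and grid_hws their (h, w) patch grids; a (2, 2) merge kernel combines each 2x2 neighborhood, so every image's token count drops by a factor of 4. The feature dimension and input layout below are assumptions.

```python
import torch
from nemo_automodel.components.models.kimivl.model import patch_merger

grid_hws = torch.tensor([[32, 32], [16, 24]])              # two images' patch grids
num_tokens = int((grid_hws[:, 0] * grid_hws[:, 1]).sum())  # 1024 + 384 packed tokens
x = torch.randn(num_tokens, 1152)                          # assumed (tokens, hidden) layout

merged = patch_merger(x, grid_hws, merge_kernel_size=[2, 2])
# One tensor per image; how each 2x2 neighborhood is combined (e.g. concatenated)
# determines the exact feature dimension of the outputs.
print(len(merged), [t.shape[0] for t in merged])           # expected 2 [256, 96] with a 4x reduction
```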

class nemo_automodel.components.models.kimivl.model.MoonVitPretrainedModel(config)#

Bases: torch.nn.Module

MoonVit vision encoder.

Initialization

property dtype#
forward(
pixel_values: torch.Tensor,
grid_hws: torch.Tensor,
) List[torch.Tensor]#
class nemo_automodel.components.models.kimivl.model.KimiVLMultiModalProjector(config)#

Bases: torch.nn.Module

Projects vision features to language model dimension.

Initialization

forward(image_features: List[torch.Tensor]) torch.Tensor#
class nemo_automodel.components.models.kimivl.model.DeepSeekV3RotaryEmbeddingAdapter(
parent_module: torch.nn.Module,
rope_fusion: bool = False,
)#

Callable adapter that wraps DeepseekV3’s freqs_cis-based RoPE.

This is deliberately NOT an nn.Module, so it is not pruned during the pipeline-parallel (PP) split. It holds a reference to the parent module’s freqs_cis buffer and computes position embeddings on demand.

The parent module (KimiVLLanguageModelBackend) owns the freqs_cis buffer, and this adapter accesses it via the reference.

Initialization

property freqs_cis#

Access freqs_cis from the parent module.

__call__(
hidden_states: torch.Tensor,
position_ids: torch.Tensor,
) torch.Tensor#

Compute position embeddings from pre-computed freqs_cis.

Parameters:
  • hidden_states – Input tensor (used only for device/dtype inference)

  • position_ids – Position indices tensor

Returns:

Position embeddings tensor compatible with DeepseekV3 Block layers
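The key design point is the "plain callable holding a parent reference" pattern: because the adapter is not an nn.Module, pipeline-parallel splitting does not treat it as a submodule, and the freqs_cis buffer stays owned by the parent. A hedged sketch of that pattern (names other than freqs_cis are illustrative, and this is not the actual implementation):

```python
import torch
import torch.nn as nn

class _RotaryAdapter:  # deliberately NOT an nn.Module
    def __init__(self, parent_module: nn.Module):
        self._parent = parent_module          # reference only; no parameters registered here

    @property
    def freqs_cis(self) -> torch.Tensor:
        return self._parent.freqs_cis         # buffer lives on the parent

    def __call__(self, hidden_states: torch.Tensor, position_ids: torch.Tensor) -> torch.Tensor:
        # Gather the precomputed frequencies for the requested positions.
        return self.freqs_cis[position_ids].to(device=hidden_states.device)

class Parent(nn.Module):
    def __init__(self, head_dim: int = 64, max_len: int = 128):
        super().__init__()
        self.register_buffer("freqs_cis", torch.randn(max_len, head_dim // 2, dtype=torch.complex64))
        self.rotary = _RotaryAdapter(self)    # plain attribute, not a registered submodule

p = Parent()
out = p.rotary(torch.zeros(1, 4, 8), torch.arange(4).unsqueeze(0))
print(out.shape)  # torch.Size([1, 4, 32])
```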

class nemo_automodel.components.models.kimivl.model.KimiVLLanguageModelBackend(
config,
backend: nemo_automodel.components.models.common.BackendConfig,
*,
moe_config: nemo_automodel.components.moe.config.MoEConfig | None = None,
)#

Bases: torch.nn.Module

Backend-aware language model wrapper using DeepseekV3 architecture.

Note: lm_head is NOT included here; it lives at the top level of KimiVLForConditionalGeneration to match the HF checkpoint structure.

Initialization

get_input_embeddings()#
set_input_embeddings(value)#
forward(
input_ids=None,
*,
inputs_embeds=None,
attention_mask=None,
position_ids=None,
padding_mask=None,
**kwargs,
)#
init_weights(buffer_device=None)#
property embed_tokens#
property layers#
property norm#
class nemo_automodel.components.models.kimivl.model.KimiVLModel(
config,
moe_config: nemo_automodel.components.moe.config.MoEConfig | None = None,
backend: nemo_automodel.components.models.common.BackendConfig | None = None,
)#

Bases: torch.nn.Module

KimiVL multimodal backbone with a DeepseekV3 text decoder.

Initialization

property layers#
property embed_tokens#
property norm#
_merge_with_image_features(inputs_embeds, input_ids, image_features)#

Merge image features into input embeddings (a hedged sketch follows this class entry).

_extract_image_features(pixel_values, image_grid_hws)#

Extract and project image features.

forward(
input_ids=None,
attention_mask=None,
position_ids=None,
inputs_embeds=None,
pixel_values=None,
image_grid_hws=None,
padding_mask=None,
**kwargs,
)#
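The _merge_with_image_features step above follows the common pattern of scattering projected image features into the token embedding sequence at the positions of the media placeholder token (163605 by default). A hedged sketch of that pattern; the standalone function below is illustrative, not the actual method:

```python
import torch

def merge_with_image_features(inputs_embeds, input_ids, image_features, placeholder_id=163605):
    # image_features: (num_image_tokens, hidden), one row per placeholder token
    mask = input_ids == placeholder_id                  # (batch, seq) boolean mask
    merged = inputs_embeds.clone()
    merged[mask] = image_features.to(merged.dtype)      # scatter features at placeholder positions
    return merged

input_ids = torch.tensor([[1, 163605, 163605, 2]])
inputs_embeds = torch.zeros(1, 4, 8)
image_features = torch.ones(2, 8)
print(merge_with_image_features(inputs_embeds, input_ids, image_features)[0, :, 0])
# tensor([0., 1., 1., 0.])
```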
class nemo_automodel.components.models.kimivl.model.KimiVLForConditionalGeneration(
config,
moe_config: nemo_automodel.components.moe.config.MoEConfig | None = None,
backend: nemo_automodel.components.models.common.BackendConfig | None = None,
**kwargs,
)#

Bases: nemo_automodel.components.models.common.hf_checkpointing_mixin.HFCheckpointingMixin, torch.nn.Module, nemo_automodel.components.moe.fsdp_mixin.MoEFSDPSyncMixin

KimiVL model with backend-aware DeepseekV3 language model.

Initialization

classmethod from_config(
config,
moe_config: nemo_automodel.components.moe.config.MoEConfig | None = None,
backend: nemo_automodel.components.models.common.BackendConfig | None = None,
**kwargs,
)#
classmethod from_pretrained(
pretrained_model_name_or_path: str,
*model_args,
**kwargs,
)#
property dtype#
get_input_embeddings()#
set_input_embeddings(value)#
get_output_embeddings()#
set_output_embeddings(new_embeddings)#
property lm_head#

Convenience property to access lm_head from the top level.

forward(
input_ids=None,
attention_mask=None,
position_ids=None,
past_key_values=None,
inputs_embeds=None,
labels=None,
use_cache=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
pixel_values=None,
image_grid_hws=None,
padding_mask=None,
**kwargs,
)#
initialize_weights(buffer_device=None, dtype=torch.bfloat16)#
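A hedged end-to-end sketch of constructing the model via from_pretrained; the checkpoint id and the text-only inputs are illustrative assumptions, and real multimodal calls would additionally pass pixel_values and image_grid_hws as prepared by the processor.

```python
import torch
from nemo_automodel.components.models.kimivl.model import KimiVLForConditionalGeneration

# Checkpoint id is illustrative.
model = KimiVLForConditionalGeneration.from_pretrained("moonshotai/Kimi-VL-A3B-Instruct")

# Text-only smoke test; multimodal inputs would also include pixel_values/image_grid_hws.
with torch.no_grad():
    out = model(input_ids=torch.tensor([[1, 2, 3]]))
```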
class nemo_automodel.components.models.kimivl.model.KimiVLStateDictAdapter(
config,
moe_config: nemo_automodel.components.moe.config.MoEConfig,
backend: nemo_automodel.components.models.common.BackendConfig,
dtype: torch.dtype = torch.float32,
)#

State dict adapter for KimiVL checkpoints.

Initialization

to_hf(state_dict: dict, **kwargs) dict#
from_hf(state_dict: dict, **kwargs) dict#
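A hedged sketch of converting between the native and Hugging Face state-dict layouts with the adapter; model is assumed to be a KimiVLForConditionalGeneration instance, and the model.config / model.moe_config / model.backend attribute names are assumptions about how the required constructor arguments are obtained.

```python
import torch
from nemo_automodel.components.models.kimivl.model import KimiVLStateDictAdapter

# Attribute names on `model` are assumptions; the adapter only needs the
# corresponding config, MoE config, and backend objects.
adapter = KimiVLStateDictAdapter(model.config, model.moe_config, model.backend, dtype=torch.bfloat16)

hf_state = adapter.to_hf(model.state_dict())   # native layout -> HF checkpoint layout
native_state = adapter.from_hf(hf_state)       # HF checkpoint layout -> native layout
```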
nemo_automodel.components.models.kimivl.model.ModelClass#

None

nemo_automodel.components.models.kimivl.model._register_kimi_vl_with_transformers()#

Register KimiVLConfig and model with transformers Auto classes.

This uses the official transformers registration API. Once registered, AutoModelForImageTextToText.from_pretrained uses this local implementation directly, bypassing the trust_remote_code mechanism entirely.
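A hedged sketch of the intended flow: register once, then load through the transformers Auto API without trust_remote_code. The checkpoint id is illustrative.

```python
from transformers import AutoModelForImageTextToText

from nemo_automodel.components.models.kimivl.model import _register_kimi_vl_with_transformers

_register_kimi_vl_with_transformers()  # registers KimiVLConfig and the model with the Auto classes

# Resolves to the local implementation registered above; checkpoint id is illustrative.
model = AutoModelForImageTextToText.from_pretrained("moonshotai/Kimi-VL-A3B-Instruct")
```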