nemo_automodel.components.models.llava_onevision.model#

LLaVA-OneVision-1.5 model implementation.

Module Contents#

Classes#

RiceConfig

Configuration for Rice ViT encoder.

LlavaOneVisionConfig

Configuration for LLaVA-OneVision-1.5 model.

RiceTransformer

Rice ViT transformer with 2D RoPE and patch merging.

LlavaOneVisionModel

Base LLaVA-OneVision model without LM head.

LlavaOneVisionForConditionalGeneration

LLaVA-OneVision-1.5 for conditional generation with Rice ViT + Qwen3.

Data#

LOGGER

API#

nemo_automodel.components.models.llava_onevision.model.LOGGER#

‘getLogger(…)’

class nemo_automodel.components.models.llava_onevision.model.RiceConfig(
depth: int = 24,
hidden_size: int = 1024,
intermediate_size: int = 4096,
num_heads: int = 16,
in_channels: int = 3,
patch_size: int = 14,
spatial_merge_size: int = 2,
temporal_patch_size: int = 1,
hidden_act: str = 'gelu',
layer_norm_eps: float = 1e-05,
text_hidden_size: int = 2560,
**kwargs,
)#

Bases: transformers.configuration_utils.PretrainedConfig

Configuration for Rice ViT encoder.

Initialization

model_type#

‘rice_vit’
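
A minimal construction sketch using only the keyword arguments listed in the signature above; values shown are the documented defaults.

```python
from nemo_automodel.components.models.llava_onevision.model import RiceConfig

# Defaults mirror the signature above; text_hidden_size is the dimension the
# merged patch embeddings are projected to (see RiceTransformer.forward below).
vision_cfg = RiceConfig(
    depth=24,
    hidden_size=1024,
    intermediate_size=4096,
    num_heads=16,
    patch_size=14,
    spatial_merge_size=2,
    text_hidden_size=2560,
)
print(vision_cfg.model_type)  # 'rice_vit'
```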

class nemo_automodel.components.models.llava_onevision.model.LlavaOneVisionConfig(
vision_config: Optional[Union[Dict, nemo_automodel.components.models.llava_onevision.model.RiceConfig]] = None,
text_config: Optional[Union[Dict, transformers.configuration_utils.PretrainedConfig]] = None,
ignore_index: int = -100,
image_token_id: int = 151655,
video_token_id: int = 151656,
vision_start_token_id: int = 151652,
vision_end_token_id: int = 151653,
pad_token_id: int = 0,
architectures: Optional[List[str]] = None,
**kwargs,
)#

Bases: transformers.configuration_utils.PretrainedConfig

Configuration for LLaVA-OneVision-1.5 model.

Initialization

model_type#

‘llava_onevision’

to_dict() → Dict[str, Any]#
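
A sketch of building the composite config. The `text_config` keys below are illustrative assumptions (any `Dict` or `PretrainedConfig` is accepted); the token ids are the documented defaults.

```python
from nemo_automodel.components.models.llava_onevision.model import (
    LlavaOneVisionConfig,
    RiceConfig,
)

config = LlavaOneVisionConfig(
    vision_config=RiceConfig(text_hidden_size=2560),
    text_config={"hidden_size": 2560, "num_hidden_layers": 36},  # illustrative Qwen3-style keys
    image_token_id=151655,
    video_token_id=151656,
)
print(config.model_type)    # 'llava_onevision'
serialized = config.to_dict()  # serializable dict representation of the config
```
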
class nemo_automodel.components.models.llava_onevision.model.RiceTransformer(
config: nemo_automodel.components.models.llava_onevision.model.RiceConfig,
)#

Bases: torch.nn.Module

Rice ViT transformer with 2D RoPE and patch merging.

Initialization

rot_pos_emb(grid_thw: torch.Tensor) → torch.Tensor#

Compute 2D rotary position embeddings for variable-size grids.

forward(
pixel_values: torch.Tensor,
grid_thw: torch.Tensor,
) → torch.Tensor#

Forward pass for Rice ViT.

Parameters:
  • pixel_values – Flattened pixel values [num_patches, C * P * P] (channels × patch_size × patch_size)

  • grid_thw – Grid dimensions [num_images, 3] as (T, H, W)

Returns:

Image embeddings [total_merged_patches, text_hidden_size]
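
A shape sketch for the forward contract above. The per-patch feature layout (channels × temporal_patch_size × patch_size²) and the availability of the config fields as attributes are assumptions; the grid values are arbitrary.

```python
import torch

from nemo_automodel.components.models.llava_onevision.model import RiceConfig, RiceTransformer

cfg = RiceConfig()
vit = RiceTransformer(cfg)

# One image whose patch grid is T=1, H=16, W=16 (H and W divisible by spatial_merge_size).
grid_thw = torch.tensor([[1, 16, 16]])
num_patches = int(grid_thw.prod(dim=-1).sum())  # 256

# Assumed flattened layout: channels * temporal_patch_size * patch_size**2 features per patch.
feat_dim = cfg.in_channels * cfg.temporal_patch_size * cfg.patch_size**2
pixel_values = torch.randn(num_patches, feat_dim)

embeds = vit(pixel_values, grid_thw)
# Per the Returns note: [total_merged_patches, text_hidden_size],
# i.e. roughly [256 / spatial_merge_size**2, 2560] here.
print(embeds.shape)
```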

class nemo_automodel.components.models.llava_onevision.model.LlavaOneVisionModel(
config: nemo_automodel.components.models.llava_onevision.model.LlavaOneVisionConfig,
)#

Bases: torch.nn.Module

Base LLaVA-OneVision model without LM head.

Initialization

_build_language_model(text_config)#

Build the Qwen3 language model.

get_input_embeddings()#
forward(
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
pixel_values: Optional[torch.FloatTensor] = None,
pixel_values_videos: Optional[torch.FloatTensor] = None,
image_grid_thw: Optional[torch.LongTensor] = None,
video_grid_thw: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) → Union[Tuple, object]#
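
A minimal text-only call sketch; `config` is assumed to be a fully populated LlavaOneVisionConfig (including a real text_config), and the token ids are placeholders.

```python
import torch

from nemo_automodel.components.models.llava_onevision.model import LlavaOneVisionModel

model = LlavaOneVisionModel(config)  # config: LlavaOneVisionConfig built as above

input_ids = torch.randint(0, 1000, (1, 32))  # [batch, seq_len]
attention_mask = torch.ones_like(input_ids)

hidden = model(input_ids=input_ids, attention_mask=attention_mask)

# For image inputs, additionally pass pixel_values (flattened patches) and
# image_grid_thw, with image_token_id placeholders in input_ids marking where
# the vision embeddings are inserted.
```
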
class nemo_automodel.components.models.llava_onevision.model.LlavaOneVisionForConditionalGeneration(
config: nemo_automodel.components.models.llava_onevision.model.LlavaOneVisionConfig,
**kwargs,
)#

Bases: nemo_automodel.components.models.common.hf_checkpointing_mixin.HFCheckpointingMixin, torch.nn.Module

LLaVA-OneVision-1.5 for conditional generation with Rice ViT + Qwen3.

Initialization

classmethod from_config(
config: nemo_automodel.components.models.llava_onevision.model.LlavaOneVisionConfig,
**kwargs,
)#
classmethod from_pretrained(
pretrained_model_name_or_path: str,
*model_args,
**kwargs,
)#
property dtype#
get_input_embeddings()#
set_input_embeddings(value)#
get_output_embeddings()#
set_output_embeddings(new_embeddings)#
property lm_head#
forward(
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
pixel_values: Optional[torch.FloatTensor] = None,
pixel_values_videos: Optional[torch.FloatTensor] = None,
image_grid_thw: Optional[torch.LongTensor] = None,
video_grid_thw: Optional[torch.LongTensor] = None,
**kwargs,
) → Union[Tuple, transformers.modeling_outputs.CausalLMOutputWithPast]#
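
An end-to-end sketch for loss computation. The checkpoint path is a placeholder, and the assumption that passing `labels` yields a populated `loss` follows the usual Hugging Face CausalLMOutputWithPast convention rather than anything stated above.

```python
import torch

from nemo_automodel.components.models.llava_onevision.model import (
    LlavaOneVisionForConditionalGeneration,
)

model = LlavaOneVisionForConditionalGeneration.from_pretrained("<path-or-repo-id>")

input_ids = torch.randint(0, 1000, (1, 32))
labels = input_ids.clone()
labels[:, :8] = -100  # ignore_index: exclude prompt tokens from the loss

out = model(input_ids=input_ids, labels=labels, return_dict=True)
print(out.loss, out.logits.shape)  # CausalLMOutputWithPast fields
```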