Source code for nemo_automodel.datasets.vlm.utils

# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch


# Common special tokens across VLM models
QWEN_TOKENS = [
    "<|im_start|>",
    "<|im_end|>",
    "<|vision_start|>",
    "<|vision_end|>",
    "<|vision_pad|>",
    "<|image_pad|>",
    "<|video_pad|>",
]
LLAVA_TOKENS = ["<image>", "<pad>"]
LLAMA_TOKENS = [
    "<|begin_of_text|>",
    "<|end_of_text|>",
    "<|finetune_right_pad_id|>",
    "<|step_id|>",
    "<|start_header_id|>",
    "<|end_header_id|>",
    "<|eom_id|>",
    "<|eot_id|>",
    "<|python_tag|>",
    "<|image|>",
]
GEMMA_TOKENS = ["<image_soft_token>"]

PAD_TOKENS = set(QWEN_TOKENS + LLAVA_TOKENS + LLAMA_TOKENS + GEMMA_TOKENS)


def extract_skipped_token_ids(processor):
    """
    Return a tensor of token ids that should be masked in the labels.

    Extracted from NeMo's HFAutoModelForImageTextToText.extract_skipped_token_ids.
    """
    tokenizer = getattr(processor, "tokenizer", processor)
    skipped_token_ids = []
    for key, val in tokenizer.added_tokens_decoder.items():
        if str(val) in PAD_TOKENS:
            skipped_token_ids.append(key)
    return torch.IntTensor(list(set(skipped_token_ids)))
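
# Illustrative usage (a sketch, not part of the module): given a Hugging Face
# processor or tokenizer, the returned int tensor can be used to mask the VLM
# special tokens out of the training labels. The checkpoint name below is only
# an example, and `batch` is assumed to come from process_text_batch further
# down in this module.
#
#     from transformers import AutoProcessor
#
#     processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
#     skipped = extract_skipped_token_ids(processor)
#     labels = batch["input_ids"].clone()
#     labels[torch.isin(labels, skipped.to(labels.dtype))] = -100  # ignored by the loss
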
def json2token(obj, sort_json_key: bool = True):
    """
    Convert an ordered JSON object into a token sequence.

    From NeMo's automodel_datasets.py.
    """
    if type(obj) is dict:
        if len(obj) == 1 and "text_sequence" in obj:
            return obj["text_sequence"]
        output = ""
        keys = sorted(obj.keys(), reverse=True) if sort_json_key else obj.keys()
        for k in keys:
            output += rf"<s_{k}>" + json2token(obj[k], sort_json_key) + rf"</s_{k}>"
        return output
    if type(obj) is list:
        return r"<sep/>".join([json2token(item, sort_json_key) for item in obj])
    return str(obj)
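
# Example (the input dict is made up for illustration): nested dicts become
# <s_key>...</s_key> spans, list items are joined with <sep/>, and with
# sort_json_key=True keys are emitted in reverse-sorted order.
#
#     json2token({"menu": [{"name": "latte", "price": "4.50"}]})
#     # -> '<s_menu><s_price>4.50</s_price><s_name>latte</s_name></s_menu>'
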
def process_text_batch(
    processor,
    texts: list[str],
    images: list | None = None,
) -> dict[str, torch.Tensor]:
    """
    Process a batch of texts and optionally images.

    Args:
        processor: The processor to use for tokenization and image processing.
        texts: List of text strings to process.
        images: Optional list of images to process.

    Returns:
        Dict containing the processed batch data.
    """
    if images is not None:
        batch = processor(
            text=texts,
            images=images,
            padding=True,
            return_tensors="pt",
        )
        if "pixel_values" in batch:
            batch["pixel_values"] = batch["pixel_values"].to(torch.bfloat16)
    else:
        batch = processor(
            text=texts,
            padding=True,
            return_tensors="pt",
        )
    return batch
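
# Illustrative usage (a sketch; the checkpoint name and image path are
# assumptions, not requirements of this function): texts and images are padded
# into one batch and pixel_values are cast to bfloat16 when present. In
# practice the text is usually produced with processor.apply_chat_template and
# contains the model's image placeholder tokens.
#
#     from PIL import Image
#     from transformers import AutoProcessor
#
#     processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
#     texts = ["Describe the image."]
#     images = [Image.open("example.jpg").convert("RGB")]
#     batch = process_text_batch(processor, texts, images)
#     # batch typically contains input_ids, attention_mask, pixel_values, ...
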