nemo_automodel.components.models.step3p7.processing_step3

Module Contents

Classes

Name	Description
`GPUToTensor`	-
`ImagePatcher`	-
`Step3VLImageEmbeddingInputs`	-
`Step3VLImagePixelInputs`	-
`Step3VLProcessor`	Processor that expands Step3.7 image inputs into image-token placeholders.
`Step3VisionProcessor`	-

Data

ImageWithPatches

MAX_IMAGE_SIZE

__all__

API

class nemo_automodel.components.models.step3p7.processing_step3.GPUToTensor()

Bases: Module

nemo_automodel.components.models.step3p7.processing_step3.GPUToTensor.forward(
    raw_image: typing.Union[numpy.ndarray, PIL.Image.Image]
) -> torch.Tensor

class nemo_automodel.components.models.step3p7.processing_step3.ImagePatcher()

nemo_automodel.components.models.step3p7.processing_step3.ImagePatcher.__call__(
    img: PIL.Image.Image
) -> tuple[PIL.Image.Image, list[PIL.Image.Image], list[bool] | None]

nemo_automodel.components.models.step3p7.processing_step3.ImagePatcher.determine_window_size(
    long: int,
    short: int
) -> int

nemo_automodel.components.models.step3p7.processing_step3.ImagePatcher.get_image_size_for_crop(
    img_width: int,
    img_height: int,
    window_size: int
)

nemo_automodel.components.models.step3p7.processing_step3.ImagePatcher.get_image_size_for_padding(
    img_width: int,
    img_height: int
) -> tuple[int, int]

nemo_automodel.components.models.step3p7.processing_step3.ImagePatcher.get_image_size_for_preprocess(
    img_width: int,
    img_height: int
) -> tuple[int, int]

nemo_automodel.components.models.step3p7.processing_step3.ImagePatcher.get_num_patches(
    img_width: int,
    img_height: int
) -> tuple[int, int]

nemo_automodel.components.models.step3p7.processing_step3.ImagePatcher.patch_crop(
    img: PIL.Image.Image,
    i: int,
    j: int,
    th: int,
    tw: int
)

nemo_automodel.components.models.step3p7.processing_step3.ImagePatcher.slide_window(
    width: int,
    height: int,
    sizes: list[tuple[int, int]],
    steps: list[tuple[int, int]],
    img_rate_thr: float = 0.6
) -> tuple[list[tuple[int, int, int, int]], tuple[int, int]]

nemo_automodel.components.models.step3p7.processing_step3.ImagePatcher.square_pad(
    img: PIL.Image.Image
) -> PIL.Image.Image

class nemo_automodel.components.models.step3p7.processing_step3.Step3VLImageEmbeddingInputs

Bases: typing.TypedDict

image_embeds

Tensor

type

Literal['image_embeds']

class nemo_automodel.components.models.step3p7.processing_step3.Step3VLImagePixelInputs

Bases: typing.TypedDict

num_patches

list[int]

patch_pixel_values

Optional[Tensor]

pixel_values

Tensor

type

Literal['pixel_values']

class nemo_automodel.components.models.step3p7.processing_step3.Step3VLProcessor(
    tokenizer = None,
    chat_template = None,
    kwargs = {}
)

Bases: ProcessorMixin

Processor that expands Step3.7 image inputs into image-token placeholders.

attributes

= ['tokenizer']

image_feature_placeholder

= self.image_token * self.num_image_feature_size

image_preprocessor

image_size

= 728

image_token

= '<im_patch>'

image_token_id

int

num_image_feature_size

= 169

num_patch_feature_size

= 81

patch_feature_placeholder

= self.image_token * self.num_patch_feature_size

patch_size

= 504

patcher

= ImagePatcher()

tokenizer_class

= 'AutoTokenizer'

nemo_automodel.components.models.step3p7.processing_step3.Step3VLProcessor.__call__(
    text: typing.Optional[typing.Union[str, list[str]]] = None,
    images: transformers.image_utils.ImageInput | None = None,
    return_tensors: typing.Optional[typing.Union[str, transformers.feature_extraction_utils.TensorType]] = None,
    kwargs = {}
) -> transformers.feature_extraction_utils.BatchFeature

nemo_automodel.components.models.step3p7.processing_step3.Step3VLProcessor._convert_images_to_pixel_values(
    images: list[PIL.Image.Image],
    is_patch: bool = False
) -> list[torch.Tensor]

nemo_automodel.components.models.step3p7.processing_step3.Step3VLProcessor._get_image_repl(
    num_images: int
) -> tuple[str, list[int]]

nemo_automodel.components.models.step3p7.processing_step3.Step3VLProcessor._get_image_repl_features(
    num_images: int,
    num_patches: int,
    patch_new_line_idx: typing.Optional[list[bool]]
) -> tuple[str, list[int]]

nemo_automodel.components.models.step3p7.processing_step3.Step3VLProcessor._get_patch_repl(
    num_patches: int,
    patch_newline_mask: list[bool] | None
) -> tuple[str, list[int]]

nemo_automodel.components.models.step3p7.processing_step3.Step3VLProcessor._normalize_batched_images(
    images,
    batch_size: int
) -> list[list[PIL.Image.Image]]

nemo_automodel.components.models.step3p7.processing_step3.Step3VLProcessor._split_images(
    images: list[PIL.Image.Image]
) -> list[nemo_automodel.components.models.step3p7.processing_step3.ImageWithPatches]

nemo_automodel.components.models.step3p7.processing_step3.Step3VLProcessor.batch_decode(
    args = (),
    kwargs = {}
)

This method forwards all its arguments to the underlying tokenizer’s `PreTrainedTokenizer.batch_decode`. Please refer to the docstring of that method for more information.

nemo_automodel.components.models.step3p7.processing_step3.Step3VLProcessor.decode(
    args = (),
    kwargs = {}
)

This method forwards all its arguments to the underlying tokenizer’s `PreTrainedTokenizer.decode`. Please refer to the docstring of that method for more information.

nemo_automodel.components.models.step3p7.processing_step3.Step3VLProcessor.get_num_image_tokens(
    img_width: int,
    img_height: int
) -> int

nemo_automodel.components.models.step3p7.processing_step3.Step3VLProcessor.replace_placeholder(
    text: str,
    placeholder: str,
    repls: list[str]
) -> str

class nemo_automodel.components.models.step3p7.processing_step3.Step3VisionProcessor(
    size,
    interpolation_mode = 'bicubic',
    patch_size = None
)

Bases: BaseImageProcessor

patch_transform

transform

nemo_automodel.components.models.step3p7.processing_step3.Step3VisionProcessor.__call__(
    image,
    is_patch = False
)

nemo_automodel.components.models.step3p7.processing_step3.ImageWithPatches = tuple[Image.Image, list[Image.Image], list[int] | None]

nemo_automodel.components.models.step3p7.processing_step3.MAX_IMAGE_SIZE: int = 3024

nemo_automodel.components.models.step3p7.processing_step3.__all__ = ['Step3VLProcessor']