`nemo_automodel.components.models.step3p7.processing_step3`#

Module Contents#

Classes#

`Step3VLImagePixelInputs`
`Step3VLImageEmbeddingInputs`
`GPUToTensor`
`Step3VisionProcessor`
`ImagePatcher`
`Step3VLProcessor` — Processor that expands Step3.7 image inputs into image-token placeholders.

Data#

`MAX_IMAGE_SIZE`
`ImageWithPatches`
`__all__`

API#

nemo_automodel.components.models.step3p7.processing_step3.MAX_IMAGE_SIZE: int = 3024

class nemo_automodel.components.models.step3p7.processing_step3.Step3VLImagePixelInputs#

Bases: typing.TypedDict

type: Literal["pixel_values"]#: None

pixel_values: torch.Tensor#: None

patch_pixel_values: Optional[torch.Tensor]#: None

num_patches: list[int]#: None

class nemo_automodel.components.models.step3p7.processing_step3.Step3VLImageEmbeddingInputs#

Bases: typing.TypedDict

type: Literal["image_embeds"]#: None

image_embeds: torch.Tensor#: None

nemo_automodel.components.models.step3p7.processing_step3.ImageWithPatches#: None

class nemo_automodel.components.models.step3p7.processing_step3.GPUToTensor#

Bases: torch.nn.Module

forward( raw_image: Union[numpy.ndarray, PIL.Image.Image], ) → torch.Tensor#

class nemo_automodel.components.models.step3p7.processing_step3.Step3VisionProcessor( size, interpolation_mode='bicubic', patch_size=None, )#

Bases: transformers.BaseImageProcessor

Initialization

__call__(image, is_patch=False)#

class nemo_automodel.components.models.step3p7.processing_step3.ImagePatcher#

determine_window_size(long: int, short: int) → int#

slide_window( width: int, height: int, sizes: list[tuple[int, int]], steps: list[tuple[int, int]], img_rate_thr: float = 0.6, ) → tuple[list[tuple[int, int, int, int]], tuple[int, int]]#

square_pad(img: PIL.Image.Image) → PIL.Image.Image#

get_image_size_for_padding( img_width: int, img_height: int, ) → tuple[int, int]#

get_image_size_for_preprocess( img_width: int, img_height: int, ) → tuple[int, int]#

get_image_size_for_crop( img_width: int, img_height: int, window_size: int, )#

patch_crop(img: PIL.Image.Image, i: int, j: int, th: int, tw: int)#

get_num_patches(img_width: int, img_height: int) → tuple[int, int]#

__call__( img: PIL.Image.Image, ) → tuple[PIL.Image.Image, list[PIL.Image.Image], list[bool] | None]#

class nemo_automodel.components.models.step3p7.processing_step3.Step3VLProcessor(tokenizer=None, chat_template=None, **kwargs)#

Bases: transformers.processing_utils.ProcessorMixin

Processor that expands Step3.7 image inputs into image-token placeholders.