nemo_automodel.components.models.step3p7.processing_step3#

Module Contents#

Classes#

Step3VLImagePixelInputs

Step3VLImageEmbeddingInputs

GPUToTensor

Step3VisionProcessor

ImagePatcher

Step3VLProcessor

Processor that expands Step3.7 image inputs into image-token placeholders.

Data#

API#

nemo_automodel.components.models.step3p7.processing_step3.MAX_IMAGE_SIZE: int#

3024

class nemo_automodel.components.models.step3p7.processing_step3.Step3VLImagePixelInputs#

Bases: typing.TypedDict

type: Literal['pixel_values']#

None

pixel_values: torch.Tensor#

None

patch_pixel_values: Optional[torch.Tensor]#

None

num_patches: list[int]#

None

class nemo_automodel.components.models.step3p7.processing_step3.Step3VLImageEmbeddingInputs#

Bases: typing.TypedDict

type: Literal['image_embeds']#

None

image_embeds: torch.Tensor#

None

nemo_automodel.components.models.step3p7.processing_step3.ImageWithPatches#

None

class nemo_automodel.components.models.step3p7.processing_step3.GPUToTensor#

Bases: torch.nn.Module

forward(
raw_image: Union[numpy.ndarray, PIL.Image.Image],
) -> torch.Tensor#
class nemo_automodel.components.models.step3p7.processing_step3.Step3VisionProcessor(
size,
interpolation_mode='bicubic',
patch_size=None,
)#

Bases: transformers.BaseImageProcessor

Initialization

__call__(image, is_patch=False)#
class nemo_automodel.components.models.step3p7.processing_step3.ImagePatcher#
determine_window_size(long: int, short: int) -> int#
slide_window(
width: int,
height: int,
sizes: list[tuple[int, int]],
steps: list[tuple[int, int]],
img_rate_thr: float = 0.6,
) -> tuple[list[tuple[int, int, int, int]], tuple[int, int]]#
square_pad(img: PIL.Image.Image) -> PIL.Image.Image#
get_image_size_for_padding(
img_width: int,
img_height: int,
) -> tuple[int, int]#
get_image_size_for_preprocess(
img_width: int,
img_height: int,
) -> tuple[int, int]#
get_image_size_for_crop(
img_width: int,
img_height: int,
window_size: int,
)#
patch_crop(img: PIL.Image.Image, i: int, j: int, th: int, tw: int)#
get_num_patches(img_width: int, img_height: int) -> tuple[int, int]#
__call__(
img: PIL.Image.Image,
) -> tuple[PIL.Image.Image, list[PIL.Image.Image], list[bool] | None]#
class nemo_automodel.components.models.step3p7.processing_step3.Step3VLProcessor(tokenizer=None, chat_template=None, **kwargs)#

Bases: transformers.processing_utils.ProcessorMixin

Processor that expands Step3.7 image inputs into image-token placeholders.

Initialization

attributes#

['tokenizer']

tokenizer_class#

'AutoTokenizer'

property image_token_id: int#
get_num_image_tokens(img_width: int, img_height: int) -> int#
_split_images(
images: list[PIL.Image.Image],
) -> list[nemo_automodel.components.models.step3p7.processing_step3.ImageWithPatches]#
_convert_images_to_pixel_values(
images: list[PIL.Image.Image],
is_patch: bool = False,
) -> list[torch.Tensor]#
_get_patch_repl(
num_patches: int,
patch_newline_mask: list[bool] | None,
) -> tuple[str, list[int]]#
_get_image_repl(num_images: int) -> tuple[str, list[int]]#
_get_image_repl_features(
num_images: int,
num_patches: int,
patch_new_line_idx: Optional[list[bool]],
) -> tuple[str, list[int]]#
replace_placeholder(
text: str,
placeholder: str,
repls: list[str],
) -> str#
_normalize_batched_images(
images,
batch_size: int,
) -> list[list[PIL.Image.Image]]#
__call__(
text: Optional[Union[str, list[str]]] = None,
images: transformers.image_utils.ImageInput | None = None,
return_tensors: Optional[Union[str, transformers.feature_extraction_utils.TensorType]] = None,
**kwargs,
) -> transformers.feature_extraction_utils.BatchFeature#
batch_decode(*args, **kwargs)#

This method forwards all its arguments to the underlying tokenizer’s `PreTrainedTokenizer.batch_decode` method. Please refer to the docstring of that method for more information.

decode(*args, **kwargs)#

This method forwards all its arguments to the underlying tokenizer’s `PreTrainedTokenizer.decode` method. Please refer to the docstring of that method for more information.

nemo_automodel.components.models.step3p7.processing_step3.__all__#

['Step3VLProcessor']