nemo_automodel.components.models.step3p7.processing_step3

View as Markdown

Module Contents

Classes

| Name | Description |
| --- | --- |
| GPUToTensor | - |
| ImagePatcher | - |
| Step3VLImageEmbeddingInputs | - |
| Step3VLImagePixelInputs | - |
| Step3VLProcessor | Processor that expands Step3.7 image inputs into image-token placeholders. |
| Step3VisionProcessor | - |

Data

ImageWithPatches

MAX_IMAGE_SIZE

__all__

API

class nemo_automodel.components.models.step3p7.processing_step3.GPUToTensor()

Bases: Module

nemo_automodel.components.models.step3p7.processing_step3.GPUToTensor.forward(
raw_image: typing.Union[numpy.ndarray, PIL.Image.Image]
) -> torch.Tensor
class nemo_automodel.components.models.step3p7.processing_step3.ImagePatcher()
nemo_automodel.components.models.step3p7.processing_step3.ImagePatcher.__call__(
img: PIL.Image.Image
) -> tuple[PIL.Image.Image, list[PIL.Image.Image], list[bool] | None]
nemo_automodel.components.models.step3p7.processing_step3.ImagePatcher.determine_window_size(
long: int,
short: int
) -> int
nemo_automodel.components.models.step3p7.processing_step3.ImagePatcher.get_image_size_for_crop(
img_width: int,
img_height: int,
window_size: int
)
nemo_automodel.components.models.step3p7.processing_step3.ImagePatcher.get_image_size_for_padding(
img_width: int,
img_height: int
) -> tuple[int, int]
nemo_automodel.components.models.step3p7.processing_step3.ImagePatcher.get_image_size_for_preprocess(
img_width: int,
img_height: int
) -> tuple[int, int]
nemo_automodel.components.models.step3p7.processing_step3.ImagePatcher.get_num_patches(
img_width: int,
img_height: int
) -> tuple[int, int]
nemo_automodel.components.models.step3p7.processing_step3.ImagePatcher.patch_crop(
img: PIL.Image.Image,
i: int,
j: int,
th: int,
tw: int
)
nemo_automodel.components.models.step3p7.processing_step3.ImagePatcher.slide_window(
width: int,
height: int,
sizes: list[tuple[int, int]],
steps: list[tuple[int, int]],
img_rate_thr: float = 0.6
) -> tuple[list[tuple[int, int, int, int]], tuple[int, int]]
nemo_automodel.components.models.step3p7.processing_step3.ImagePatcher.square_pad(
img: PIL.Image.Image
) -> PIL.Image.Image
class nemo_automodel.components.models.step3p7.processing_step3.Step3VLImageEmbeddingInputs

Bases: typing.TypedDict

image_embeds: Tensor
type: Literal['image_embeds']
class nemo_automodel.components.models.step3p7.processing_step3.Step3VLImagePixelInputs

Bases: typing.TypedDict

num_patches: list[int]
patch_pixel_values: Optional[Tensor]
pixel_values: Tensor
type: Literal['pixel_values']
class nemo_automodel.components.models.step3p7.processing_step3.Step3VLProcessor(
tokenizer = None,
chat_template = None,
kwargs = {}
)

Bases: ProcessorMixin

Processor that expands Step3.7 image inputs into image-token placeholders.

attributes = ['tokenizer']
image_feature_placeholder = self.image_token * self.num_image_feature_size
image_preprocessor
image_size = 728
image_token = '<im_patch>'
image_token_id: int
num_image_feature_size = 169
num_patch_feature_size = 81
patch_feature_placeholder = self.image_token * self.num_patch_feature_size
patch_size = 504
patcher = ImagePatcher()
tokenizer_class = 'AutoTokenizer'
nemo_automodel.components.models.step3p7.processing_step3.Step3VLProcessor.__call__(
text: typing.Optional[typing.Union[str, list[str]]] = None,
images: transformers.image_utils.ImageInput | None = None,
return_tensors: typing.Optional[typing.Union[str, transformers.feature_extraction_utils.TensorType]] = None,
kwargs = {}
) -> transformers.feature_extraction_utils.BatchFeature
nemo_automodel.components.models.step3p7.processing_step3.Step3VLProcessor._convert_images_to_pixel_values(
images: list[PIL.Image.Image],
is_patch: bool = False
) -> list[torch.Tensor]
nemo_automodel.components.models.step3p7.processing_step3.Step3VLProcessor._get_image_repl(
num_images: int
) -> tuple[str, list[int]]
nemo_automodel.components.models.step3p7.processing_step3.Step3VLProcessor._get_image_repl_features(
num_images: int,
num_patches: int,
patch_new_line_idx: typing.Optional[list[bool]]
) -> tuple[str, list[int]]
nemo_automodel.components.models.step3p7.processing_step3.Step3VLProcessor._get_patch_repl(
num_patches: int,
patch_newline_mask: list[bool] | None
) -> tuple[str, list[int]]
nemo_automodel.components.models.step3p7.processing_step3.Step3VLProcessor._normalize_batched_images(
images,
batch_size: int
) -> list[list[PIL.Image.Image]]
nemo_automodel.components.models.step3p7.processing_step3.Step3VLProcessor._split_images(
images: list[PIL.Image.Image]
) -> list[nemo_automodel.components.models.step3p7.processing_step3.ImageWithPatches]
nemo_automodel.components.models.step3p7.processing_step3.Step3VLProcessor.batch_decode(
args = (),
kwargs = {}
)

This method forwards all its arguments to the `batch_decode` method of the processor's tokenizer (`PreTrainedTokenizer.batch_decode`). Please refer to the docstring of that method for more information.

nemo_automodel.components.models.step3p7.processing_step3.Step3VLProcessor.decode(
args = (),
kwargs = {}
)

This method forwards all its arguments to the `decode` method of the processor's tokenizer (`PreTrainedTokenizer.decode`). Please refer to the docstring of that method for more information.

nemo_automodel.components.models.step3p7.processing_step3.Step3VLProcessor.get_num_image_tokens(
img_width: int,
img_height: int
) -> int
nemo_automodel.components.models.step3p7.processing_step3.Step3VLProcessor.replace_placeholder(
text: str,
placeholder: str,
repls: list[str]
) -> str
class nemo_automodel.components.models.step3p7.processing_step3.Step3VisionProcessor(
size,
interpolation_mode = 'bicubic',
patch_size = None
)

Bases: BaseImageProcessor

patch_transform
transform
nemo_automodel.components.models.step3p7.processing_step3.Step3VisionProcessor.__call__(
image,
is_patch = False
)
nemo_automodel.components.models.step3p7.processing_step3.ImageWithPatches = tuple[Image.Image, list[Image.Image], list[int] | None]
nemo_automodel.components.models.step3p7.processing_step3.MAX_IMAGE_SIZE: int = 3024
nemo_automodel.components.models.step3p7.processing_step3.__all__ = ['Step3VLProcessor']