Source code for nv_ingest_api.interface.mutate

# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import logging
from typing import Union, Dict

import pandas as pd

from nv_ingest_api.internal.mutate.deduplicate import deduplicate_images_internal
from nv_ingest_api.internal.mutate.filter import filter_images_internal
from nv_ingest_api.internal.schemas.mutate.mutate_image_dedup_schema import ImageDedupSchema
from nv_ingest_api.internal.schemas.transform.transform_image_filter_schema import ImageFilterSchema
from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler

logger = logging.getLogger(__name__)


@unified_exception_handler
def filter_images(
    *,
    df_ledger: pd.DataFrame,
    min_size: int = 128,
    max_aspect_ratio: Union[float, int] = 5.0,
    min_aspect_ratio: Union[float, int] = 2.0,
) -> pd.DataFrame:
    """
    Apply an image filter to the ledger DataFrame based on size and aspect ratio criteria.

    This function builds a set of task parameters and then delegates the filtering work to
    `filter_images_internal`. If an exception occurs during filtering, the error is logged
    and re-raised with additional context.

    Parameters
    ----------
    df_ledger : pd.DataFrame
        DataFrame containing image metadata. It must include the columns 'document_type'
        and 'metadata'.
    min_size : int, optional
        Minimum average image size threshold. Images with an average size less than or
        equal to this value are considered for filtering. Default is 128.
    max_aspect_ratio : float or int, optional
        Maximum allowed image aspect ratio. Images with an aspect ratio greater than or
        equal to this value are considered for filtering. Default is 5.0.
    min_aspect_ratio : float or int, optional
        Minimum allowed image aspect ratio. Images with an aspect ratio less than or
        equal to this value are considered for filtering. Default is 2.0.

    Returns
    -------
    pd.DataFrame
        The DataFrame after applying the image filter.

    Raises
    ------
    Exception
        If an error occurs during the filtering process.
    """
    task_params: Dict[str, Union[int, float, bool]] = {
        "min_size": min_size,
        "max_aspect_ratio": max_aspect_ratio,
        "min_aspect_ratio": min_aspect_ratio,
        "filter": True,
    }
    mutate_config = ImageFilterSchema()

    result = filter_images_internal(df_ledger, task_params, mutate_config=mutate_config, execution_trace_log=None)

    return result
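
# Example (illustrative sketch, not part of the module): the ledger layout and the
# placeholder values below are assumptions made for demonstration only; adapt them
# to the metadata schema your pipeline actually produces.
#
# >>> import pandas as pd
# >>> ledger = pd.DataFrame({
# ...     "document_type": ["png"],
# ...     "metadata": [{
# ...         "content": "<base64-encoded-image>",
# ...         "source_metadata": {"source_id": "image1.png", "source_name": "image1.png", "source_type": "png"},
# ...         "content_metadata": {"type": "image"},
# ...         "image_metadata": {},
# ...     }],
# ... })
# >>> # Images whose average size is <= min_size, or whose aspect ratio falls outside
# >>> # the (min_aspect_ratio, max_aspect_ratio) bounds, are considered for filtering.
# >>> filtered = filter_images(df_ledger=ledger, min_size=128, max_aspect_ratio=5.0, min_aspect_ratio=2.0)
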
@unified_exception_handler
def deduplicate_images(
    *,
    df_ledger: pd.DataFrame,
    hash_algorithm: str = "md5",
) -> pd.DataFrame:
    """
    Deduplicate images in the DataFrame based on content hashes.

    This function constructs a task configuration using the specified hashing algorithm and
    delegates the deduplication process to the internal function
    ``deduplicate_images_internal``. The deduplication is performed by computing content
    hashes for each image in the DataFrame and then removing duplicate images.

    Parameters
    ----------
    df_ledger : pd.DataFrame
        A pandas DataFrame containing image metadata. The DataFrame must include at least
        the columns:
            - ``document_type``: A string representing the document type (e.g., "png").
            - ``metadata``: A dictionary that contains image-related metadata. For example,
              it should include keys such as ``content`` (base64-encoded image data),
              ``source_metadata``, and ``content_metadata``.
    hash_algorithm : str, optional
        The hashing algorithm to use for deduplication. Valid algorithms are those supported
        by Python's ``hashlib.new()`` function (e.g., "md5", "sha1", "sha256").
        Default is "md5".

    Returns
    -------
    pd.DataFrame
        A deduplicated DataFrame in which duplicate images have been removed. The structure
        of the returned DataFrame is the same as the input, with duplicate rows eliminated.

    Raises
    ------
    Exception
        Propagates any exceptions encountered during the deduplication process.

    Examples
    --------
    >>> import pandas as pd
    >>> # Example DataFrame with image metadata.
    >>> df = pd.DataFrame({
    ...     "source_name": ["image1.png", "image2.png"],
    ...     "source_id": ["image1.png", "image2.png"],
    ...     "content": ["<base64-encoded-image-1>", "<base64-encoded-image-2>"],
    ...     "document_type": ["png", "png"],
    ...     "metadata": [{
    ...         "content": "<base64-encoded-image-1>",
    ...         "source_metadata": {"source_id": "image1.png", "source_name": "image1.png", "source_type": "png"},
    ...         "content_metadata": {"type": "image"},
    ...         "audio_metadata": None,
    ...         "text_metadata": None,
    ...         "image_metadata": {},
    ...         "raise_on_failure": False,
    ...     },
    ...     {
    ...         "content": "<base64-encoded-image-2>",
    ...         "source_metadata": {"source_id": "image2.png", "source_name": "image2.png", "source_type": "png"},
    ...         "content_metadata": {"type": "image"},
    ...         "audio_metadata": None,
    ...         "text_metadata": None,
    ...         "image_metadata": {},
    ...         "raise_on_failure": False,
    ...     }]
    ... })
    >>> dedup_df = deduplicate_images(df_ledger=df, hash_algorithm="md5")
    >>> dedup_df
    """
    task_config: Dict[str, Union[int, float, bool, str]] = {
        "hash_algorithm": hash_algorithm,
    }
    mutate_config: ImageDedupSchema = ImageDedupSchema()

    result = deduplicate_images_internal(
        df_ledger=df_ledger,
        task_config=task_config,
        mutate_config=mutate_config,
        execution_trace_log=None,
    )

    return result
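
# Example (illustrative sketch, not part of the module): unlike the docstring example
# above, the two rows here share the same base64 content, so one of them should be
# removed; the placeholder strings and trimmed metadata are assumptions made for
# demonstration only.
#
# >>> import pandas as pd
# >>> ledger = pd.DataFrame({
# ...     "document_type": ["png", "png"],
# ...     "metadata": [
# ...         {"content": "<base64-encoded-image-1>", "content_metadata": {"type": "image"}},
# ...         {"content": "<base64-encoded-image-1>", "content_metadata": {"type": "image"}},
# ...     ],
# ... })
# >>> deduplicated = deduplicate_images(df_ledger=ledger, hash_algorithm="sha256")
# >>> len(deduplicated)  # expected to be 1 once the duplicate row is dropped
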