# Source code for nv_ingest_client.primitives.tasks.dedup

# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0


# pylint: disable=too-few-public-methods
# pylint: disable=too-many-arguments

import logging
from typing import Dict
from typing import Literal

from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskDedupSchema

from .task_base import Task

logger = logging.getLogger(__name__)


class DedupTask(Task):
    """
    Object for document dedup task

    The constructor validates its arguments through ``IngestTaskDedupSchema``
    and keeps the validated values on the instance; ``__str__`` renders them
    as a readable summary and ``to_dict`` as the task payload.
    """

    _TypeContentType = Literal["image"]

    def __init__(
        self,
        content_type: _TypeContentType = "image",
        filter: bool = False,  # shadows the builtin; name kept because it is part of the public signature
        enable_bbox_dedup: bool = True,
        iou_threshold: float = 0.45,
        prefer_structured: bool = True,
    ) -> None:
        """
        Setup Dedup Task Config

        Parameters
        ----------
        content_type : str
            Content type to deduplicate (currently only "image" supported).
        filter : bool
            Legacy filter parameter.
        enable_bbox_dedup : bool
            Enable bounding box overlap deduplication. When True, images that
            substantially overlap with structured elements (tables/charts) on
            the same page are removed.
        iou_threshold : float
            IoU (Intersection over Union) threshold for bbox dedup (0.0-1.0).
            Elements with IoU >= threshold are considered duplicates.
        prefer_structured : bool
            When True, keep tables/charts and drop overlapping images.
            When False, keep images and drop overlapping structured elements.

        Raises
        ------
        ValueError
            If ``iou_threshold`` lies outside the closed range [0.0, 1.0].

        Notes
        -----
        ``IngestTaskDedupSchema`` may raise its own validation error for other
        bad inputs; its behaviour is not visible from this module.
        """
        super().__init__()

        # Fail fast on an out-of-range threshold before building the schema.
        if not 0.0 <= iou_threshold <= 1.0:
            raise ValueError("iou_threshold must be between 0.0 and 1.0")

        # Use the API schema for validation. Note that the schema names the
        # preference flag "bbox_dedup_prefer_structured", not "prefer_structured".
        validated_data = IngestTaskDedupSchema(
            content_type=content_type,
            params={
                "filter": filter,
                "enable_bbox_dedup": enable_bbox_dedup,
                "iou_threshold": iou_threshold,
                "bbox_dedup_prefer_structured": prefer_structured,
            },
        )

        # Store the schema-validated values rather than the raw arguments.
        params = validated_data.params
        self._content_type = validated_data.content_type
        self._filter = params.filter
        self._enable_bbox_dedup = params.enable_bbox_dedup
        self._iou_threshold = params.iou_threshold
        self._prefer_structured = params.bbox_dedup_prefer_structured

    def __str__(self) -> str:
        """
        Returns a string with the object's config and run time state
        """
        parts = [
            "Dedup Task:\n",
            f" content_type: {self._content_type.value}\n",
            f" filter: {self._filter}\n",
            f" enable_bbox_dedup: {self._enable_bbox_dedup}\n",
        ]
        # NOTE(review): indentation was lost in the extracted source. Both
        # bbox-specific lines are assumed to sit under this guard, and the
        # leading whitespace of each entry is reproduced as it appears there
        # -- confirm against the upstream file.
        if self._enable_bbox_dedup:
            parts.append(f" iou_threshold: {self._iou_threshold}\n")
            parts.append(f" prefer_structured: {self._prefer_structured}\n")
        return "".join(parts)

    def to_dict(self) -> Dict:
        """
        Convert to a dict for submission to redis

        Returns
        -------
        Dict
            ``{"type": "dedup", "task_properties": {...}}`` where the task
            properties hold the content type's ``.value`` and the dedup params.
        """
        dedup_params = {
            "filter": self._filter,
            "enable_bbox_dedup": self._enable_bbox_dedup,
            "iou_threshold": self._iou_threshold,
            "bbox_dedup_prefer_structured": self._prefer_structured,
        }

        task_properties = {
            "content_type": self._content_type.value,
            "params": dedup_params,
        }

        return {"type": "dedup", "task_properties": task_properties}