# Source code for nv_ingest_api.util.image_processing.processing

# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) 2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import numpy as np
from typing import List, Tuple, Optional

from nv_ingest_api.internal.primitives.nim.default_values import (
    YOLOX_MAX_BATCH_SIZE,
    YOLOX_NUM_CLASSES,
    YOLOX_CONF_THRESHOLD,
    YOLOX_IOU_THRESHOLD,
    YOLOX_MIN_SCORE,
    YOLOX_FINAL_SCORE,
)
from nv_ingest_api.internal.primitives.nim.model_interface.yolox import YoloxPageElementsModelInterface
from nv_ingest_api.util.image_processing.transforms import crop_image, numpy_to_base64
from nv_ingest_api.util.metadata.aggregators import CroppedImageWithContent
from nv_ingest_api.util.nim import create_inference_client

logger = logging.getLogger(__name__)



[docs]
def extract_tables_and_charts_from_image(annotation_dict, original_image, page_idx, tables_and_charts):
    """
    Crop every detected table and chart out of an image and append the results to a list.

    Parameters
    ----------
    annotation_dict : dict
        Detection output keyed by label. Only the "table" and "chart" keys are read, and either may be
        absent. Each value is an iterable of five-element detections: four bounding-box coordinates
        followed by one trailing value that is ignored here (presumably a confidence score; verify
        against the detector). A falsy value (``None`` or an empty dict) means there is nothing to extract.
    original_image : np.ndarray
        The image the detections refer to. Its first two ``shape`` entries are recorded on every result.
    page_idx : int
        The index of the page being processed; stored alongside each extracted element.
    tables_and_charts : list of tuple
        Output list, mutated in place. One ``(page_idx, CroppedImageWithContent)`` tuple is appended per
        detection, all "table" detections first and then all "chart" detections.

    Returns
    -------
    None
        Results are delivered through ``tables_and_charts``.

    Raises
    ------
    ValueError
        If a detection does not contain exactly five elements.

    Notes
    -----
    For each detection this function:
      - Truncates the four coordinates to ``int``.
      - Crops the original image with the coordinates in detection order.
      - Converts the crop to a base64 encoded string.
      - Wraps the encoded crop, an empty ``content`` string, the label, the bounding box (with each
        coordinate pair swapped relative to detection order) and the image dimensions in a
        ``CroppedImageWithContent``.

    Examples
    --------
    >>> annotation_dict = {"table": [[10, 20, 200, 300, 0.9]], "chart": []}
    >>> original_image = np.random.rand(1536, 1536, 3)
    >>> tables_and_charts = []
    >>> extract_tables_and_charts_from_image(annotation_dict, original_image, 0, tables_and_charts)
    """

    # NOTE(review): ndarray.shape lists axis 0 first, which for a conventional image array is the row
    # count, so `width` receives shape[0] here. The h/w names used for the coordinates below follow the
    # same pattern. Behaviour is kept exactly as-is; confirm with downstream consumers whether the
    # naming is intentional before changing it.
    width, height, *_ = original_image.shape

    # No detections at all for this image.
    if not annotation_dict:
        return

    for label in ("table", "chart"):
        # A detector may report only one of the two labels; a missing label simply yields no elements.
        objects = annotation_dict.get(label)
        if objects is None:
            continue

        for detection in objects:
            # Four coordinates plus one trailing value that is not used here.
            h1, w1, h2, w2, _ = detection
            h1, w1, h2, w2 = int(h1), int(w1), int(h2), int(w2)

            cropped = crop_image(original_image, (h1, w1, h2, w2))
            base64_img = numpy_to_base64(cropped)

            element_data = CroppedImageWithContent(
                content="",
                image=base64_img,
                bbox=(w1, h1, w2, h2),
                max_width=width,
                max_height=height,
                type_string=label,
            )
            tables_and_charts.append((page_idx, element_data))




[docs]
def extract_tables_and_charts_yolox(
    pages: List[Tuple[int, np.ndarray]],
    config: dict,
    trace_info: Optional[List] = None,
) -> List[Tuple[int, object]]:
    """
    Run YOLOX page-element detection over a batch of page images and collect the
    cropped tables and charts found on them.

    Parameters
    ----------
    pages : List[Tuple[int, np.ndarray]]
        A list of tuples containing the page index and the corresponding image.
        An empty list returns an empty result without contacting the inference service.
    config : dict
        A dictionary that must contain the following keys, all of which are passed to
        ``create_inference_client``:
            - 'yolox_endpoints'
            - 'auth_token'
            - 'yolox_infer_protocol'
    trace_info : Optional[List], optional
        Optional tracing information, forwarded unchanged to the client's ``infer`` call.

    Returns
    -------
    List[Tuple[int, object]]
        One ``(page_index, element)`` tuple per detected table or chart, where ``element`` is the
        object built by ``extract_tables_and_charts_from_image``. A page with several detections
        contributes several tuples; a page with none contributes nothing.

    Raises
    ------
    TimeoutError
        Re-raised after logging when client creation, inference or post-processing times out.
    Exception
        Any other error (including a missing ``config`` key) is logged with its traceback and re-raised.

    Notes
    -----
    The inference client is always closed before this function returns or raises.
    """
    # Nothing to infer on: skip building a client and the round trip to the inference service.
    if not pages:
        return []

    tables_and_charts = []
    yolox_client = None

    try:
        model_interface = YoloxPageElementsModelInterface()
        yolox_client = create_inference_client(
            config["yolox_endpoints"],
            model_interface,
            config["auth_token"],
            config["yolox_infer_protocol"],
        )

        # Split the pairs into two parallel lists so results can be matched back to pages by position.
        image_page_indices = [page[0] for page in pages]
        original_images = [page[1] for page in pages]

        # All images are submitted in a single infer() call.
        inference_results = yolox_client.infer(
            {"images": original_images},
            model_name="yolox",
            max_batch_size=YOLOX_MAX_BATCH_SIZE,
            num_classes=YOLOX_NUM_CLASSES,
            conf_thresh=YOLOX_CONF_THRESHOLD,
            iou_thresh=YOLOX_IOU_THRESHOLD,
            min_score=YOLOX_MIN_SCORE,
            final_thresh=YOLOX_FINAL_SCORE,
            trace_info=trace_info,
            stage_name="pdf_extraction",
        )

        # Results are paired with their page positionally; each call appends to tables_and_charts.
        for annotation_dict, page_index, original_image in zip(inference_results, image_page_indices, original_images):
            extract_tables_and_charts_from_image(
                annotation_dict,
                original_image,
                page_index,
                tables_and_charts,
            )

    except TimeoutError:
        logger.error("Timeout error during table/chart extraction.")
        raise

    except Exception as e:
        logger.exception("Error during table/chart extraction: %s", e)
        raise

    finally:
        # Compare against None rather than truthiness so a live client is never left unclosed.
        if yolox_client is not None:
            yolox_client.close()

    logger.debug("Extracted %d tables and charts.", len(tables_and_charts))
    return tables_and_charts