# Source code for nv_ingest_api.util.converters.formats

# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# pylint: skip-file

import json



# [docs]
def ingest_json_results_to_blob(result_content):
    """
    Parse JSON ingest results, sort the entries, and flatten them into one text blob.

    The decoded payload must be an object whose ``"data"`` key holds a list of
    entries. Entries are ordered by page number; within a page, ``"structured"``
    entries come first (ordered by the first two values of their table location)
    followed by every other entry in its original relative order.

    Parameters:
        result_content: The JSON document, given as a ``str``, ``bytes`` or
            ``bytearray``, or as a file-like object such as ``io.BytesIO`` or
            ``io.StringIO`` whose contents are the JSON document.

    Returns:
        str: The generated blob string. Each ``"structured"``, ``"text"`` and
        ``"image"`` entry contributes one newline-terminated line; entries of any
        other type are skipped. An empty string is returned if the content cannot
        be parsed or does not have the expected layout.
    """
    try:
        # json.loads() accepts str/bytes/bytearray but not stream objects, so
        # pull the payload out of file-like inputs first. getvalue() is
        # preferred because it ignores the current stream position.
        if hasattr(result_content, "getvalue"):
            raw = result_content.getvalue()
        elif hasattr(result_content, "read"):
            raw = result_content.read()
        else:
            raw = result_content

        data = json.loads(raw)["data"]

        # Smarter sorting: by page, then structured objects by x0, y0
        def sorting_key(entry):
            page = entry["metadata"]["content_metadata"]["page_number"]
            # Use the same tolerant lookup as the assembly loop below so one
            # entry without a document type does not abort the whole conversion.
            if entry.get("document_type", "") == "structured":
                # Use table location's x0 and y0 as secondary keys
                location = entry["metadata"]["table_metadata"]["table_location"]
                x0, y0 = location[0], location[1]
            else:
                # Non-structured objects are sorted after structured ones
                x0 = y0 = float("inf")
            return page, x0, y0

        data.sort(key=sorting_key)

        # Collect the pieces and join once at the end
        blob = []

        for entry in data:
            document_type = entry.get("document_type", "")

            if document_type == "structured":
                # Add table content to the blob
                blob.append(entry["metadata"]["table_metadata"]["table_content"])
                blob.append("\n")

            elif document_type == "text":
                # Add content to the blob
                blob.append(entry["metadata"]["content"])
                blob.append("\n")

            elif document_type == "image":
                # Add image caption to the blob
                caption = entry["metadata"]["image_metadata"].get("caption", "")
                blob.append(f"image_caption:[{caption}]")
                blob.append("\n")

        # Join all parts of the blob into a single string
        return "".join(blob)

    except Exception as e:
        # Deliberate best-effort: report the problem and hand back an empty blob
        # instead of propagating the failure to the caller.
        print(f"[ERROR] An error occurred while processing JSON content: {e}")
        return ""