# Source code for nv_ingest_api.util.converters.formats
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# pylint: skip-file
import json
[docs]
def ingest_json_results_to_blob(result_content):
    """
    Parse JSON results, sort the entries, and flatten them into one blob string.

    The parsed document must be an object whose ``"data"`` key holds a list of
    entries. Entries are ordered by page number; within a page, ``structured``
    entries come first (ordered by the first two values of their table
    location) and all other entries follow in their original relative order.

    Each entry contributes one line to the blob depending on its
    ``document_type``:

    * ``structured`` -> the table content
    * ``text``       -> the text content
    * ``image``      -> ``image_caption:[<caption>]``

    Entries with any other (or missing) ``document_type`` are skipped.

    Args:
        result_content: The JSON payload, given as a ``str``, ``bytes`` /
            ``bytearray``, or a file-like object exposing ``read()``
            (e.g. ``io.BytesIO``).

    Returns:
        str: The generated blob string. An empty string is returned when the
        content cannot be parsed or processed; the error is printed rather
        than raised.
    """
    try:
        # json.loads() accepts str/bytes/bytearray but not file-like objects,
        # so drain BytesIO-style inputs before parsing.
        if hasattr(result_content, "read"):
            result_content = result_content.read()

        data = json.loads(result_content)["data"]

        # Smarter sorting: by page, then structured objects by x0, y0
        def sorting_key(entry):
            metadata = entry["metadata"]
            page = metadata["content_metadata"]["page_number"]

            # Use .get() so an entry without a document_type is tolerated here
            # exactly as it is in the rendering loop below.
            if entry.get("document_type", "") == "structured":
                # Use table location's x0 and y0 as secondary keys
                table_location = metadata["table_metadata"]["table_location"]
                return page, table_location[0], table_location[1]

            # Non-structured objects are sorted after structured ones
            return page, float("inf"), float("inf")

        data.sort(key=sorting_key)

        # Collect the pieces in a list and join once at the end
        blob = []
        for entry in data:
            document_type = entry.get("document_type", "")

            if document_type == "structured":
                # Add table content to the blob
                blob.append(entry["metadata"]["table_metadata"]["table_content"])
                blob.append("\n")
            elif document_type == "text":
                # Add content to the blob
                blob.append(entry["metadata"]["content"])
                blob.append("\n")
            elif document_type == "image":
                # Add image caption to the blob
                caption = entry["metadata"]["image_metadata"].get("caption", "")
                blob.append(f"image_caption:[{caption}]")
                blob.append("\n")

        # Join all parts of the blob into a single string
        return "".join(blob)
    except Exception as e:
        # Deliberate best-effort: report the problem and hand back an empty blob
        print(f"[ERROR] An error occurred while processing JSON content: {e}")
        return ""