Source code for nv_ingest_client.util.process_json_files
import json
import click
[docs]
def ingest_json_results_to_blob(result_content):
    """
    Parse JSON results, sort the entries, and render them as a single blob string.

    Entries are ordered by page number. Within a page, structured (table) entries
    come first, ordered by the first two values of their table location; all other
    entries follow in their original relative order.

    Args:
        result_content: JSON text (``str``, ``bytes`` or ``bytearray``), a file-like
            object exposing ``read()`` (e.g. ``io.BytesIO``), or an already-parsed
            list of entry dicts. A list passed in is not modified.

    Returns:
        str: The generated blob string, or an empty string if the content could
        not be processed (the error is printed).
    """

    def sorting_key(entry):
        # A missing or null page number sorts before every real page.
        page = entry["metadata"]["content_metadata"].get("page_number")
        if page is None:
            page = -1
        if entry.get("document_type") == "structured":
            # Use table location's x0 and y0 as secondary keys
            location = entry["metadata"]["table_metadata"]["table_location"]
            x0, y0 = location[0], location[1]
        else:
            # Non-structured objects are sorted after structured ones
            x0 = y0 = float("inf")
        return page, x0, y0

    try:
        # File-like objects (e.g. BytesIO) are drained first so they can be parsed
        # the same way as raw text.
        if hasattr(result_content, "read"):
            result_content = result_content.read()

        # json.loads accepts str, bytes and bytearray; anything else is assumed to
        # be already-parsed entries.
        if isinstance(result_content, (str, bytes, bytearray)):
            data = json.loads(result_content)
        else:
            data = result_content

        # sorted() builds a new list, so a caller-supplied list keeps its order.
        ordered_entries = sorted(data, key=sorting_key)

        blob = []
        for entry in ordered_entries:
            document_type = entry.get("document_type", "")
            metadata = entry["metadata"]
            if document_type == "structured":
                # Add table content to the blob
                blob.append(metadata["table_metadata"]["table_content"])
                blob.append("\n")
            elif document_type == "text":
                # Add content to the blob
                blob.append(metadata["content"])
                blob.append("\n")
            elif document_type == "image":
                # Add image caption to the blob
                caption = metadata["image_metadata"].get("caption", "")
                blob.append(f"image_caption:[{caption}]")
                blob.append("\n")
            elif document_type == "audio":
                # Add the audio transcript to the blob
                blob.append(metadata["audio_metadata"]["audio_transcript"])
                blob.append("\n")
            # Entries of any other document type contribute nothing to the blob.

        # Join all parts of the blob into a single string
        return "".join(blob)
    except Exception as e:
        # Best-effort contract: callers treat an empty string as "nothing processed".
        print(f"[ERROR] An error occurred while processing JSON content: {e}")
        return ""
@click.command()
@click.argument("json_files", type=click.Path(exists=True), nargs=-1, required=True)
@click.option(
    "--output-file",
    type=click.Path(dir_okay=False, writable=True, resolve_path=True),
    required=True,
    help="Path to save the combined blob output file.",
)
def main(json_files, output_file):
    """
    Process multiple JSON files, combine and sort entries, and generate a single blob file.

    JSON_FILES: One or more JSON files to process.
    """
    # Errors are reported on the console rather than raised; the output file is
    # only written when the generated blob is non-empty.
    click.echo(f"Processing {len(json_files)} JSON files...")
    all_entries = []
    try:
        # Read and collect entries from all files
        for json_file in json_files:
            click.echo(f"Reading file: {json_file}")
            # Decode explicitly as UTF-8 instead of relying on the platform's
            # locale default, so non-ASCII content reads the same everywhere.
            with open(json_file, "r", encoding="utf-8") as file:
                all_entries.extend(json.load(file))

        # Generate the blob string. The parsed entries are passed straight
        # through; re-serialising them to JSON first would only be undone again.
        blob_string = ingest_json_results_to_blob(all_entries)
        if blob_string:
            # Write the blob to the output file (write-only, explicit encoding)
            with open(output_file, "w", encoding="utf-8") as file:
                file.write(blob_string)
            click.echo(f"Blob string has been generated and saved to: {output_file}")
        else:
            click.echo("No valid data processed. Blob file not created.")
    except Exception as e:
        click.echo(f"[ERROR] An error occurred: {e}")
# Script entry point: invoke the click command only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()