Source code for nv_ingest_api.interface.utility

# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import base64
import os
from io import BytesIO

import pandas as pd
from datetime import datetime
from typing import List, Union

from nv_ingest_api.internal.enums.common import ContentTypeEnum, DocumentTypeEnum

# ------------------------------------------------------------------------------
# Mapping from DocumentTypeEnum to ContentTypeEnum
# ------------------------------------------------------------------------------
DOCUMENT_TO_CONTENT_MAPPING = {
    DocumentTypeEnum.BMP: ContentTypeEnum.IMAGE,
    DocumentTypeEnum.DOCX: ContentTypeEnum.STRUCTURED,
    DocumentTypeEnum.HTML: ContentTypeEnum.TEXT,
    DocumentTypeEnum.JPEG: ContentTypeEnum.IMAGE,
    DocumentTypeEnum.PDF: ContentTypeEnum.STRUCTURED,
    DocumentTypeEnum.PNG: ContentTypeEnum.IMAGE,
    DocumentTypeEnum.PPTX: ContentTypeEnum.STRUCTURED,
    DocumentTypeEnum.SVG: ContentTypeEnum.IMAGE,
    DocumentTypeEnum.TIFF: ContentTypeEnum.IMAGE,
    DocumentTypeEnum.TXT: ContentTypeEnum.TEXT,
    DocumentTypeEnum.MD: ContentTypeEnum.TEXT,
    DocumentTypeEnum.MP3: ContentTypeEnum.AUDIO,
    DocumentTypeEnum.WAV: ContentTypeEnum.AUDIO,
    DocumentTypeEnum.UNKNOWN: ContentTypeEnum.UNKNOWN,
}


# ------------------------------------------------------------------------------
# Helper function to get the document type from a file extension.
# ------------------------------------------------------------------------------
def get_document_type_from_extension(file_path: str) -> str:
    """
    Returns the DocumentTypeEnum value inferred from the file extension of file_path.

    Only a small set of image extensions is mapped; anything else falls back to
    DocumentTypeEnum.UNKNOWN.
    """
    ext = os.path.splitext(file_path)[1].lower()
    mapping = {
        ".png": DocumentTypeEnum.PNG,
        ".jpg": DocumentTypeEnum.JPEG,
        ".jpeg": DocumentTypeEnum.JPEG,
        ".tiff": DocumentTypeEnum.TIFF,
        ".svg": DocumentTypeEnum.SVG,
    }
    return mapping.get(ext, DocumentTypeEnum.UNKNOWN)
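
# ------------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): the extension lookup is
# case-insensitive and, as written, only covers a small set of image formats, so
# anything else (for example .pdf) falls back to UNKNOWN. The file names below
# are hypothetical.
# ------------------------------------------------------------------------------
def _example_extension_lookup() -> None:
    assert get_document_type_from_extension("scan.TIFF") == DocumentTypeEnum.TIFF
    assert get_document_type_from_extension("report.pdf") == DocumentTypeEnum.UNKNOWN
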
# ------------------------------------------------------------------------------
# Helper function to read a file and return its base64-encoded string.
# ------------------------------------------------------------------------------
def read_file_as_base64(file_path: str) -> str:
    """
    Reads the file at file_path in binary mode and returns its base64-encoded string.
    """
    with open(file_path, "rb") as f:
        file_bytes = f.read()
    return base64.b64encode(file_bytes).decode("utf-8")
# ------------------------------------------------------------------------------
# Helper function to read a BytesIO object and return its base64-encoded string.
# ------------------------------------------------------------------------------
def read_bytesio_as_base64(file_io: BytesIO) -> str:
    """
    Reads a BytesIO object and returns its base64-encoded string.

    Parameters:
        file_io (BytesIO): A file-like object containing binary data.

    Returns:
        str: The base64-encoded string representation of the file's contents.
    """
    file_bytes = file_io.getvalue()
    return base64.b64encode(file_bytes).decode("utf-8")
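
# ------------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): the two helpers above
# produce identical output for the same bytes, whether the data lives on disk or
# in memory. The temporary file and payload below exist only for demonstration.
# ------------------------------------------------------------------------------
def _example_base64_roundtrip() -> None:
    import tempfile

    payload = b"hello, nv-ingest"

    # In-memory path: wrap the bytes in a BytesIO and encode them.
    encoded_from_io = read_bytesio_as_base64(BytesIO(payload))

    # On-disk path: write the same bytes to a temporary file and encode that.
    with tempfile.NamedTemporaryFile(suffix=".txt", delete=False) as tmp:
        tmp.write(payload)
        tmp_path = tmp.name
    try:
        encoded_from_file = read_file_as_base64(tmp_path)
    finally:
        os.remove(tmp_path)

    # Both helpers produce the same encoding, and decoding recovers the bytes.
    assert encoded_from_io == encoded_from_file
    assert base64.b64decode(encoded_from_io) == payload
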
# ------------------------------------------------------------------------------
# Helper function to create source metadata.
# ------------------------------------------------------------------------------
def create_source_metadata(source_name: str, source_id: str, document_type: str) -> dict:
    """
    Creates a source metadata dictionary for a file.

    The source_type is set to the provided document_type. The date_created and
    last_modified fields are set to the current ISO timestamp.
    """
    now_iso = datetime.now().isoformat()
    return {
        "source_name": source_name,
        "source_id": source_id,
        "source_location": "",
        "source_type": document_type,  # e.g., "pdf", "png", etc.
        "collection_id": "",
        "date_created": now_iso,
        "last_modified": now_iso,
        "summary": "",
        "partition_id": -1,
        "access_level": "unknown",  # You may wish to adjust this if needed.
    }
# ------------------------------------------------------------------------------
# Helper function to create content metadata.
# ------------------------------------------------------------------------------
def create_content_metadata(document_type: str) -> dict:
    """
    Creates a content metadata dictionary for a file based on its document type.

    It maps the document type to the corresponding content type.
    """
    # Use the mapping; if document_type is not found, fall back to "unknown".
    content_type = DOCUMENT_TO_CONTENT_MAPPING.get(document_type, ContentTypeEnum.UNKNOWN)
    return {
        "type": content_type,
        "description": "",
        "page_number": -1,
        "hierarchy": {
            "page_count": -1,
            "page": -1,
            "block": -1,
            "line": -1,
            "span": -1,
            "nearby_objects": {
                "text": {"content": [], "bbox": [], "type": []},
                "images": {"content": [], "bbox": [], "type": []},
                "structured": {"content": [], "bbox": [], "type": []},
            },
        },
        "subtype": "",
    }
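
# ------------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): building the two
# metadata blocks for a hypothetical PNG source. The source name and ID below
# are made up for the example.
# ------------------------------------------------------------------------------
def _example_metadata_for_png() -> dict:
    document_type = DocumentTypeEnum.PNG

    source_meta = create_source_metadata(
        source_name="photo.png",  # hypothetical file name
        source_id="photo-0001",   # hypothetical identifier
        document_type=document_type,
    )
    content_meta = create_content_metadata(document_type)

    # PNG maps to the IMAGE content type via DOCUMENT_TO_CONTENT_MAPPING.
    assert content_meta["type"] == ContentTypeEnum.IMAGE
    return {"source_metadata": source_meta, "content_metadata": content_meta}
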
# ------------------------------------------------------------------------------
# Main helper function to build a DataFrame from lists of files.
# ------------------------------------------------------------------------------
def build_dataframe_from_files(
    file_paths: List[Union[str, BytesIO]],
    source_names: List[str],
    source_ids: List[str],
    document_types: List[str],
) -> pd.DataFrame:
    """
    Given lists of file paths (or BytesIO objects), source names, source IDs, and
    document types, reads each file (base64-encoding its contents) and constructs
    a DataFrame. For image content, 'image_metadata' is initialized as an empty
    dict so it can later be updated.
    """
    rows = []

    # Validate that all lists have the same length.
    if not (len(file_paths) == len(source_names) == len(source_ids) == len(document_types)):
        raise ValueError("All input lists must have the same length.")

    for fp, sname, sid, d_type in zip(file_paths, source_names, source_ids, document_types):
        # Determine if fp is a file path (str) or a file-like object (e.g., BytesIO).
        if isinstance(fp, str):
            encoded_content = read_file_as_base64(fp)
        elif hasattr(fp, "read"):
            encoded_content = read_bytesio_as_base64(fp)
        else:
            raise ValueError("Each element in file_paths must be a string or a file-like object.")

        # Build metadata components.
        source_meta = create_source_metadata(sname, sid, d_type)
        content_meta = create_content_metadata(d_type)

        # If the content type is image, initialize image_metadata as {}.
        image_metadata = {} if content_meta.get("type") == ContentTypeEnum.IMAGE else None

        # Assemble the complete metadata dictionary.
        metadata = {
            "content": encoded_content,
            "content_url": "",
            "embedding": None,
            "source_metadata": source_meta,
            "content_metadata": content_meta,
            "audio_metadata": None,
            "text_metadata": None,
            "image_metadata": image_metadata,
            "table_metadata": None,
            "chart_metadata": None,
            "error_metadata": None,
            "info_message_metadata": None,
            "debug_metadata": None,
            "raise_on_failure": False,
        }

        # Build the row dictionary.
        row = {
            "source_name": sname,
            "source_id": sid,
            "content": encoded_content,
            "document_type": d_type,
            "metadata": metadata,
        }
        rows.append(row)

    # Create and return the DataFrame.
    return pd.DataFrame(rows)
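
# ------------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): an end-to-end call to
# build_dataframe_from_files using in-memory BytesIO payloads, so no files need
# to exist on disk. The payload bytes, names, and IDs below are made up.
# ------------------------------------------------------------------------------
if __name__ == "__main__":
    files = [BytesIO(b"fake png bytes"), BytesIO(b"plain text content")]
    names = ["diagram.png", "notes.txt"]
    ids = ["diagram-0001", "notes-0001"]
    types = [DocumentTypeEnum.PNG, DocumentTypeEnum.TXT]

    df = build_dataframe_from_files(files, names, ids, types)

    # One row per input; the image row gets an empty image_metadata dict,
    # while the text row gets None.
    print(df[["source_name", "document_type"]])
    print(df.loc[0, "metadata"]["image_metadata"])  # {} for the PNG row
    print(df.loc[1, "metadata"]["image_metadata"])  # None for the TXT row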