Source code for nv_ingest_api.util.metadata.aggregators
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import base64
import io
import uuid
from dataclasses import dataclass
from datetime import datetime
from typing import Any
from typing import Dict
from typing import List
from typing import Optional
from typing import Tuple
import pandas as pd
import pypdfium2 as pdfium
from PIL import Image
from pypdfium2 import PdfImage
from nv_ingest_api.internal.enums.common import ContentDescriptionEnum, DocumentTypeEnum
from nv_ingest_api.internal.enums.common import ContentTypeEnum
from nv_ingest_api.internal.schemas.meta.metadata_schema import NearbyObjectsSchema
from nv_ingest_api.internal.enums.common import TableFormatEnum
from nv_ingest_api.internal.schemas.meta.metadata_schema import validate_metadata
from nv_ingest_api.util.converters import datetools
from nv_ingest_api.util.detectors.language import detect_language
from nv_ingest_api.util.exception_handlers.pdf import pdfium_exception_handler
[docs]
@dataclass
class CroppedImageWithContent:
    """
    A cropped page-element image together with the content extracted from it.

    Instances are consumed by ``construct_page_element_metadata``, which
    dispatches on ``type_string`` ("table", "chart" or "infographic").
    """

    # Content extracted from the crop; stored as ``table_content`` downstream.
    content: str
    # Image payload; used as the unified ``content`` value downstream
    # (presumably base64-encoded -- confirm against the producer).
    image: str
    # Bounding box of the crop; stored as ``table_location`` downstream.
    bbox: Tuple[int, int, int, int]
    # Paired as (max_width, max_height) for ``table_location_max_dimensions``.
    max_width: int
    max_height: int
    # Element kind used for dispatch.
    type_string: str
    # Format label for ``content``; empty string when not specified.
    content_format: str = ""
[docs]
@dataclass
class LatexTable:
    """
    A table extracted from a page, with its location on that page.
    """

    # Table payload. NOTE(review): the field is named ``latex`` but is annotated
    # as a pandas DataFrame -- confirm which representation producers store.
    latex: pd.DataFrame
    # Bounding box of the table.
    bbox: Tuple[int, int, int, int]
    # Maximum dimensions associated with ``bbox`` (presumably the page
    # extent -- confirm against the producer).
    max_width: int
    max_height: int
[docs]
@dataclass
class Base64Image:
    """
    An extracted image plus its geometry.

    ``construct_image_metadata_from_pdf_image`` reads every field of this
    class when building image metadata.
    """

    # Image payload; used verbatim as the unified ``content`` value
    # (presumably a base64-encoded string, as the class name suggests).
    image: str
    # Bounding box of the image; stored as ``image_location``.
    bbox: Tuple[int, int, int, int]
    # Dimensions of the image itself.
    width: int
    height: int
    # Maximum dimensions; stored (clamped to >= 0) as
    # ``image_location_max_dimensions``.
    max_width: int
    max_height: int
[docs]
@dataclass
class PDFMetadata:
    """
    Document-level metadata gathered from a PDF by ``extract_pdf_metadata``.
    """

    # Number of pages in the document.
    page_count: int
    # Source identifier of the document (the caller-supplied ``source_id``).
    filename: str
    # Modification timestamp; falls back to the extraction time when the PDF
    # does not provide one.
    last_modified: str
    # Creation timestamp; same fallback as ``last_modified``.
    date_created: str
    # Value of the PDF "Keywords" entry.
    # NOTE(review): the PDF library may return a plain string here rather
    # than a list -- confirm.
    keywords: List[str]
    # Source format label.
    source_type: str = "PDF"
[docs]
def extract_pdf_metadata(doc: pdfium.PdfDocument, source_id: str) -> PDFMetadata:
    """
    Collect document-level metadata from an already-opened PDF.

    Parameters
    ----------
    doc : pdfium.PdfDocument
        The opened PDF document to inspect.
    source_id : str
        Identifier of the source document; stored as ``filename``.

    Returns
    -------
    PDFMetadata
        - ``page_count``: ``len(doc)``.
        - ``filename``: ``source_id`` unchanged.
        - ``last_modified``: the "ModDate" entry converted with
          ``datetools.datetimefrompdfmeta``, or the current time when the
          entry is missing or empty.
        - ``date_created``: the "CreationDate" entry, handled the same way.
        - ``keywords``: the "Keywords" entry, or ``[]`` when the key is absent.
        - ``source_type``: left at the dataclass default.

    Notes
    -----
    No exceptions are caught here; errors from the PDF library or the date
    conversion propagate to the caller.
    """

    def _timestamp_or_now(raw_value):
        # A missing/empty entry falls back to "now" (timezone removed, ISO
        # formatted); anything else goes through the PDF date converter.
        if raw_value in (None, ""):
            return datetools.remove_tz(datetime.now()).isoformat()
        return datetools.datetimefrompdfmeta(raw_value)

    total_pages: int = len(doc)
    info = doc.get_metadata_dict()

    modified_on = _timestamp_or_now(info.get("ModDate"))
    created_on = _timestamp_or_now(info.get("CreationDate"))

    return PDFMetadata(
        page_count=total_pages,
        filename=source_id,
        last_modified=modified_on,
        date_created=created_on,
        keywords=info.get("Keywords", []),
    )
[docs]
def construct_text_metadata(
    accumulated_text,
    keywords,
    page_idx,
    block_idx,
    line_idx,
    span_idx,
    page_count,
    text_depth,
    source_metadata,
    base_unified_metadata,
    delimiter=" ",
    bbox_max_dimensions: Tuple[int, int] = (-1, -1),
    nearby_objects: Optional[Dict[str, Any]] = None,
):
    """
    Build and validate the unified metadata entry for a run of extracted text.

    Parameters
    ----------
    accumulated_text
        Iterable of text fragments; joined with ``delimiter`` to form the content.
    keywords
        Stored as-is under ``text_metadata["keywords"]``.
    page_idx
        Page index; recorded as both ``page_number`` and ``hierarchy["page"]``.
    block_idx, line_idx, span_idx
        Accepted for interface compatibility but currently unused: the
        hierarchy always records ``-1`` for block, line and span.
    page_count
        Total page count recorded in the hierarchy.
    text_depth
        Stored as ``text_metadata["text_type"]``.
    source_metadata
        Stored as-is under ``source_metadata``.
    base_unified_metadata
        Base mapping; it is copied, not mutated.
    delimiter : str
        Separator used when joining ``accumulated_text``.
    bbox_max_dimensions : Tuple[int, int]
        Stored as ``text_location_max_dimensions``.
    nearby_objects : Optional[Dict[str, Any]]
        Stored in the hierarchy; a fresh ``NearbyObjectsSchema()`` is used
        when this is ``None`` or otherwise falsy.

    Returns
    -------
    list
        ``[ContentTypeEnum.TEXT, <validated metadata dict>, <uuid4 string>]``.
    """
    joined_text = delimiter.join(accumulated_text)

    hierarchy = {
        "page_count": page_count,
        "page": page_idx,
        "block": -1,
        "line": -1,
        "span": -1,
        "nearby_objects": nearby_objects or NearbyObjectsSchema(),
    }
    content_section = {
        "type": ContentTypeEnum.TEXT,
        "description": ContentDescriptionEnum.PDF_TEXT,
        "page_number": page_idx,
        "hierarchy": hierarchy,
    }

    detected_language = detect_language(joined_text)

    # TODO(Devin) - Implement bounding box logic for text
    placeholder_bbox = (-1, -1, -1, -1)

    text_section = {
        "text_type": text_depth,
        "summary": "",
        "keywords": keywords,
        "language": detected_language,
        "text_location": placeholder_bbox,
        "text_location_max_dimensions": bbox_max_dimensions,
    }

    # Work on a copy so the caller's base mapping is left untouched.
    unified = base_unified_metadata.copy()
    unified.update(
        content=joined_text,
        source_metadata=source_metadata,
        content_metadata=content_section,
        text_metadata=text_section,
    )

    validated = validate_metadata(unified)
    return [ContentTypeEnum.TEXT, validated.model_dump(), str(uuid.uuid4())]
[docs]
def construct_image_metadata_from_base64(
    base64_image: str,
    page_idx: int,
    page_count: int,
    source_metadata: Dict[str, Any],
    base_unified_metadata: Dict[str, Any],
) -> List[Any]:
    """
    Decode a base64-encoded image to read its dimensions and build validated
    unified metadata for it.

    Parameters
    ----------
    base64_image : str
        A base64-encoded string representing the image. It is stored unchanged
        as the metadata ``content``.
    page_idx : int
        The index of the current page being processed.
    page_count : int
        The total number of pages in the PDF document.
    source_metadata : Dict[str, Any]
        Metadata related to the source of the PDF document.
    base_unified_metadata : Dict[str, Any]
        The base unified metadata structure; it is copied, not mutated.

    Returns
    -------
    List[Any]
        A list containing the content type, validated metadata dictionary, and a UUID string.

    Raises
    ------
    ValueError
        If the image cannot be decoded from the base64 string. The underlying
        error is attached as ``__cause__``.

    Notes
    -----
    The bounding box is assumed to cover the whole image, and ``image_type``
    is always recorded as PNG regardless of the decoded image's format.
    """
    # Decode only far enough to learn the dimensions, then release the handle.
    try:
        image_data = base64.b64decode(base64_image)
        with Image.open(io.BytesIO(image_data)) as image:
            width, height = image.size
    except Exception as e:
        raise ValueError(f"Failed to decode image from base64: {e}") from e

    # Assuming the full image as the bounding box
    bbox = (0, 0, width, height)

    # Construct content metadata
    content_metadata: Dict[str, Any] = {
        "type": ContentTypeEnum.IMAGE,
        "description": ContentDescriptionEnum.PDF_IMAGE,
        "page_number": page_idx,
        "hierarchy": {
            "page_count": page_count,
            "page": page_idx,
            "block": -1,
            "line": -1,
            "span": -1,
        },
    }

    # Construct image metadata. Both dimensions are recorded, matching
    # construct_image_metadata_from_pdf_image.
    image_metadata: Dict[str, Any] = {
        "image_type": DocumentTypeEnum.PNG,
        "structured_image_type": ContentTypeEnum.UNKNOWN,
        "caption": "",
        "text": "",
        "image_location": bbox,
        "image_location_max_dimensions": (width, height),
        "height": height,
        "width": width,
    }

    # Update a copy of the unified metadata with the extracted image information
    unified_metadata: Dict[str, Any] = base_unified_metadata.copy()
    unified_metadata.update(
        {
            "content": base64_image,
            "source_metadata": source_metadata,
            "content_metadata": content_metadata,
            "image_metadata": image_metadata,
        }
    )

    # Validate and return the unified metadata
    validated_unified_metadata = validate_metadata(unified_metadata)
    return [ContentTypeEnum.IMAGE, validated_unified_metadata.model_dump(), str(uuid.uuid4())]
[docs]
def construct_image_metadata_from_pdf_image(
    pdf_image: "Base64Image",
    page_idx: int,
    page_count: int,
    source_metadata: Dict[str, Any],
    base_unified_metadata: Dict[str, Any],
) -> List[Any]:
    """
    Build validated unified metadata for an image that has already been
    extracted from a PDF page.

    Parameters
    ----------
    pdf_image : Base64Image
        The extracted image. The attributes read here (``image``, ``bbox``,
        ``width``, ``height``, ``max_width``, ``max_height``) are the fields
        of ``Base64Image``.
    page_idx : int
        The index of the current page being processed.
    page_count : int
        The total number of pages in the PDF document.
    source_metadata : Dict[str, Any]
        Metadata related to the source of the PDF document.
    base_unified_metadata : Dict[str, Any]
        The base unified metadata structure; it is copied, not mutated.

    Returns
    -------
    List[Any]
        A list containing the content type, validated metadata dictionary, and a UUID string.

    Notes
    -----
    ``image_type`` is always recorded as PNG. Negative ``max_width`` /
    ``max_height`` values are clamped to 0 before being stored.
    """
    # Construct content metadata
    content_metadata: Dict[str, Any] = {
        "type": ContentTypeEnum.IMAGE,
        "description": ContentDescriptionEnum.PDF_IMAGE,
        "page_number": page_idx,
        "hierarchy": {
            "page_count": page_count,
            "page": page_idx,
            "block": -1,
            "line": -1,
            "span": -1,
        },
    }

    # Construct image metadata
    image_metadata: Dict[str, Any] = {
        "image_type": DocumentTypeEnum.PNG,
        "structured_image_type": ContentTypeEnum.UNKNOWN,
        "caption": "",
        "text": "",
        "image_location": pdf_image.bbox,
        # Clamp so that negative maxima are never stored as dimensions.
        "image_location_max_dimensions": (max(pdf_image.max_width, 0), max(pdf_image.max_height, 0)),
        "height": pdf_image.height,
        "width": pdf_image.width,
    }

    # Update a copy of the unified metadata with the extracted image information
    unified_metadata: Dict[str, Any] = base_unified_metadata.copy()
    unified_metadata.update(
        {
            "content": pdf_image.image,
            "source_metadata": source_metadata,
            "content_metadata": content_metadata,
            "image_metadata": image_metadata,
        }
    )

    # Validate and return the unified metadata
    validated_unified_metadata = validate_metadata(unified_metadata)
    return [ContentTypeEnum.IMAGE, validated_unified_metadata.model_dump(), str(uuid.uuid4())]
# TODO(Devin): Disambiguate tables and charts, create two distinct processing methods
@pdfium_exception_handler(descriptor="pdfium")
def construct_page_element_metadata(
    structured_image: CroppedImageWithContent,
    page_idx: int,
    page_count: int,
    source_metadata: Dict,
    base_unified_metadata: Dict,
):
    """
    Build validated unified metadata for a table, chart or infographic crop.

    The element kind is taken from ``structured_image.type_string``; any value
    other than "table", "chart" or "infographic" raises ``ValueError`` inside
    this function. All three kinds are currently stored under the
    ``table_metadata`` key. Returns
    ``[ContentTypeEnum.STRUCTURED, <validated metadata dict>, <uuid4 string>]``.

    +--------------------------------+--------------------------+------------+---+
    | Table/Chart Metadata           |                          | Extracted  | Y |
    | (tables within documents)      |                          |            |   |
    +--------------------------------+--------------------------+------------+---+
    | Table format                   | Structured (dataframe /  | Extracted  |   |
    |                                | lists of rows and        |            |   |
    |                                | columns), or serialized  |            |   |
    |                                | as markdown, html,       |            |   |
    |                                | latex, simple (cells     |            |   |
    |                                | separated just as spaces)|            |   |
    +--------------------------------+--------------------------+------------+---+
    | Table content                  | Extracted text content   |            |   |
    |                                |                          |            |   |
    |                                | Important: Tables should |            |   |
    |                                | not be chunked           |            |   |
    +--------------------------------+--------------------------+------------+---+
    | Table location                 | Bounding box of the table|            |   |
    +--------------------------------+--------------------------+------------+---+
    | Caption                        | Detected captions for    |            |   |
    |                                | the table/chart          |            |   |
    +--------------------------------+--------------------------+------------+---+
    | uploaded_image_uri             | Mirrors                  |            |   |
    |                                | source_metadata.         |            |   |
    |                                | source_location          |            |   |
    +--------------------------------+--------------------------+------------+---+
    """
    # Only the subtype and description differ between the supported kinds.
    element_kind = structured_image.type_string
    if element_kind == "table":
        subtype = ContentTypeEnum.TABLE
        description = ContentDescriptionEnum.PDF_TABLE
    elif element_kind == "chart":
        subtype = ContentTypeEnum.CHART
        description = ContentDescriptionEnum.PDF_CHART
    elif element_kind == "infographic":
        subtype = ContentTypeEnum.INFOGRAPHIC
        description = ContentDescriptionEnum.PDF_INFOGRAPHIC
    else:
        raise ValueError(f"Unknown table/chart/infographic type: {structured_image.type_string}")

    # TODO(Devin) swap this to chart_metadata after we confirm metadata schema changes.
    meta_name = "table_metadata"

    page_hierarchy = {
        "page_count": page_count,
        "page": page_idx,
        "line": -1,
        "span": -1,
    }
    content_section = {
        "type": ContentTypeEnum.STRUCTURED,
        "description": description,
        "page_number": page_idx,
        "hierarchy": page_hierarchy,
        "subtype": subtype,
    }

    element_section = {
        "caption": "",
        "table_format": TableFormatEnum.IMAGE,
        "table_content": structured_image.content,
        "table_content_format": structured_image.content_format,
        "table_location": structured_image.bbox,
        "table_location_max_dimensions": (structured_image.max_width, structured_image.max_height),
    }

    # Work on a copy so the caller's base mapping is left untouched.
    unified = base_unified_metadata.copy()
    unified.update(
        {
            "content": structured_image.image,
            "source_metadata": source_metadata,
            "content_metadata": content_section,
            meta_name: element_section,
        }
    )

    validated = validate_metadata(unified)
    return [ContentTypeEnum.STRUCTURED, validated.model_dump(), str(uuid.uuid4())]
# Alias that keeps the name ``construct_table_and_chart_metadata`` bound to
# ``construct_page_element_metadata`` (the same function object).
# TODO: remove this alias
construct_table_and_chart_metadata = construct_page_element_metadata