Source code for nv_ingest.schemas.metadata_schema
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import logging
from datetime import datetime
from enum import Enum
from typing import Any
from typing import Dict
from typing import List
from typing import Optional
from typing import Union
from pydantic import field_validator, model_validator
from nv_ingest.schemas.base_model_noext import BaseModelNoExt
from nv_ingest.util.converters import datetools
logger = logging.getLogger(__name__)
# Do we want types and similar items to be enums or just strings?
[docs]
class SourceTypeEnum(str, Enum):
PDF = "pdf"
DOCX = "docx"
PPTX = "pptx"
source_type_1 = "source_type_1"
source_type_2 = "source_type_2"
[docs]
class ContentTypeEnum(str, Enum):
AUDIO = "audio"
EMBEDDING = "embedding"
IMAGE = "image"
INFO_MSG = "info_message"
STRUCTURED = "structured"
TEXT = "text"
UNSTRUCTURED = "unstructured"
VIDEO = "video"
[docs]
class StdContentDescEnum(str, Enum):
DOCX_IMAGE = "Image extracted from DOCX document."
DOCX_TABLE = "Structured table extracted from DOCX document."
DOCX_TEXT = "Unstructured text from DOCX document."
PDF_CHART = "Structured chart extracted from PDF document."
PDF_IMAGE = "Image extracted from PDF document."
PDF_INFOGRAPHIC = "Structured infographic extracted from PDF document."
PDF_TABLE = "Structured table extracted from PDF document."
PDF_TEXT = "Unstructured text from PDF document."
PPTX_IMAGE = "Image extracted from PPTX presentation."
PPTX_TABLE = "Structured table extracted from PPTX presentation."
PPTX_TEXT = "Unstructured text from PPTX presentation."
[docs]
class TextTypeEnum(str, Enum):
BLOCK = "block"
BODY = "body"
DOCUMENT = "document"
HEADER = "header"
LINE = "line"
NEARBY_BLOCK = "nearby_block"
OTHER = "other"
PAGE = "page"
SPAN = "span"
[docs]
class LanguageEnum(str, Enum):
AF = "af"
AR = "ar"
BG = "bg"
BN = "bn"
CA = "ca"
CS = "cs"
CY = "cy"
DA = "da"
DE = "de"
EL = "el"
EN = "en"
ES = "es"
ET = "et"
FA = "fa"
FI = "fi"
FR = "fr"
GU = "gu"
HE = "he"
HI = "hi"
HR = "hr"
HU = "hu"
ID = "id"
IT = "it"
JA = "ja"
KN = "kn"
KO = "ko"
LT = "lt"
LV = "lv"
MK = "mk"
ML = "ml"
MR = "mr"
NE = "ne"
NL = "nl"
NO = "no"
PA = "pa"
PL = "pl"
PT = "pt"
RO = "ro"
RU = "ru"
SK = "sk"
SL = "sl"
SO = "so"
SQ = "sq"
SV = "sv"
SW = "sw"
TA = "ta"
TE = "te"
TH = "th"
TL = "tl"
TR = "tr"
UK = "uk"
UR = "ur"
VI = "vi"
ZH_CN = "zh-cn"
ZH_TW = "zh-tw"
UNKNOWN = "unknown"
[docs]
class ImageTypeEnum(str, Enum):
BMP = "bmp"
GIF = "gif"
JPEG = "jpeg"
PNG = "png"
TIFF = "tiff"
image_type_1 = "image_type_1" # until classifier developed
image_type_2 = "image_type_2" # until classifier developed
[docs]
class TableFormatEnum(str, Enum):
HTML = "html"
IMAGE = "image"
LATEX = "latex"
MARKDOWN = "markdown"
PSEUDO_MARKDOWN = "pseudo_markdown"
SIMPLE = "simple"
[docs]
class TaskTypeEnum(str, Enum):
CAPTION = "caption"
EMBED = "embed"
EXTRACT = "extract"
FILTER = "filter"
SPLIT = "split"
TRANSFORM = "transform"
[docs]
class ContentSubtypeEnum(str, Enum):
TABLE = "table"
CHART = "chart"
INFOGRAPHIC = "infographic"
# Sub schemas
[docs]
class SourceMetadataSchema(BaseModelNoExt):
"""
Schema for the knowledge base file from which content
and metadata is extracted.
"""
source_name: str
source_id: str
source_location: str = ""
source_type: Union[SourceTypeEnum, str]
collection_id: str = ""
date_created: str = datetime.now().isoformat()
last_modified: str = datetime.now().isoformat()
summary: str = ""
partition_id: int = -1
access_level: Union[AccessLevelEnum, int] = -1
[docs]
@field_validator("date_created", "last_modified")
@classmethod
def validate_fields(cls, field_value):
datetools.validate_iso8601(field_value)
return field_value
[docs]
class NearbyObjectsSubSchema(BaseModelNoExt):
"""
Schema to hold related extracted object
"""
content: List[str] = []
bbox: List[tuple] = []
type: List[str] = []
[docs]
class NearbyObjectsSchema(BaseModelNoExt):
"""
Schema to hold types of related extracted objects.
"""
text: NearbyObjectsSubSchema = NearbyObjectsSubSchema()
images: NearbyObjectsSubSchema = NearbyObjectsSubSchema()
structured: NearbyObjectsSubSchema = NearbyObjectsSubSchema()
[docs]
class ContentHierarchySchema(BaseModelNoExt):
"""
Schema for the extracted content hierarchy.
"""
page_count: int = -1
page: int = -1
block: int = -1
line: int = -1
span: int = -1
nearby_objects: NearbyObjectsSchema = NearbyObjectsSchema()
[docs]
class ContentMetadataSchema(BaseModelNoExt):
"""
Data extracted from a source; generally Text or Image.
"""
type: ContentTypeEnum
description: str = ""
page_number: int = -1
hierarchy: ContentHierarchySchema = ContentHierarchySchema()
subtype: Union[ContentSubtypeEnum, str] = ""
[docs]
class TextMetadataSchema(BaseModelNoExt):
text_type: TextTypeEnum
summary: str = ""
keywords: Union[str, List[str], Dict] = ""
language: LanguageEnum = "en" # default to Unknown? Maybe do some kind of heuristic check
text_location: tuple = (0, 0, 0, 0)
text_location_max_dimensions: tuple = (0, 0, 0, 0)
[docs]
class ImageMetadataSchema(BaseModelNoExt):
image_type: Union[ImageTypeEnum, str]
structured_image_type: ImageTypeEnum = ImageTypeEnum.image_type_1
caption: str = ""
text: str = ""
image_location: tuple = (0, 0, 0, 0)
image_location_max_dimensions: tuple = (0, 0)
uploaded_image_url: str = ""
width: int = 0
height: int = 0
[docs]
@field_validator("image_type")
def validate_image_type(cls, v):
if not isinstance(v, (ImageTypeEnum, str)):
raise ValueError("image_type must be a string or ImageTypeEnum")
return v
[docs]
@field_validator("width", "height")
def clamp_non_negative(cls, v, field):
if v < 0:
logger.warning(f"{field.field_name} is negative; clamping to 0. Original value: {v}")
return 0
return v
[docs]
class TableMetadataSchema(BaseModelNoExt):
caption: str = ""
table_format: TableFormatEnum
table_content: str = ""
table_content_format: Union[TableFormatEnum, str] = ""
table_location: tuple = (0, 0, 0, 0)
table_location_max_dimensions: tuple = (0, 0)
uploaded_image_uri: str = ""
[docs]
class ChartMetadataSchema(BaseModelNoExt):
caption: str = ""
table_format: TableFormatEnum
table_content: str = ""
table_content_format: Union[TableFormatEnum, str] = ""
table_location: tuple = (0, 0, 0, 0)
table_location_max_dimensions: tuple = (0, 0)
uploaded_image_uri: str = ""
# TODO consider deprecating this in favor of info msg...
[docs]
class ErrorMetadataSchema(BaseModelNoExt):
task: TaskTypeEnum
status: StatusEnum
source_id: str = ""
error_msg: str
[docs]
class InfoMessageMetadataSchema(BaseModelNoExt):
task: TaskTypeEnum
status: StatusEnum
message: str
filter: bool
# Main metadata schema
[docs]
class MetadataSchema(BaseModelNoExt):
content: str = ""
content_url: str = ""
embedding: Optional[List[float]] = None
source_metadata: Optional[SourceMetadataSchema] = None
content_metadata: Optional[ContentMetadataSchema] = None
audio_metadata: Optional[AudioMetadataSchema] = None
text_metadata: Optional[TextMetadataSchema] = None
image_metadata: Optional[ImageMetadataSchema] = None
table_metadata: Optional[TableMetadataSchema] = None
chart_metadata: Optional[ChartMetadataSchema] = None
error_metadata: Optional[ErrorMetadataSchema] = None
info_message_metadata: Optional[InfoMessageMetadataSchema] = None
debug_metadata: Optional[Dict[str, Any]] = None
raise_on_failure: bool = False
[docs]
@model_validator(mode="before")
@classmethod
def check_metadata_type(cls, values):
content_type = values.get("content_metadata", {}).get("type", None)
if content_type != ContentTypeEnum.AUDIO:
values["audio_metadata"] = None
if content_type != ContentTypeEnum.IMAGE:
values["image_metadata"] = None
if content_type != ContentTypeEnum.TEXT:
values["text_metadata"] = None
if content_type != ContentTypeEnum.STRUCTURED:
values["table_metadata"] = None
return values
[docs]
def validate_metadata(metadata: Dict[str, Any]) -> MetadataSchema:
"""
Validates the given metadata dictionary against the MetadataSchema.
Parameters:
- metadata: A dictionary representing metadata to be validated.
Returns:
- An instance of MetadataSchema if validation is successful.
Raises:
- ValidationError: If the metadata does not conform to the schema.
"""
return MetadataSchema(**metadata)