# Source code for nv_ingest_api.internal.schemas.meta.metadata_schema
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import logging
from datetime import datetime
from typing import Any
from typing import Dict
from typing import List
from typing import Optional
from typing import Union
from pydantic import field_validator, model_validator, Field
from nv_ingest_api.internal.enums.common import (
AccessLevelEnum,
ContentTypeEnum,
TextTypeEnum,
LanguageEnum,
TableFormatEnum,
StatusEnum,
DocumentTypeEnum,
TaskTypeEnum,
)
from nv_ingest_api.internal.schemas.meta.base_model_noext import BaseModelNoExt
from nv_ingest_api.util.converters import datetools
logger = logging.getLogger(__name__)
# Sub schemas
[docs]
class SourceMetadataSchema(BaseModelNoExt):
    """
    Schema for the knowledge base file from which content
    and metadata is extracted.

    ``date_created`` and ``last_modified`` are ISO 8601 strings. When they are
    not supplied they default to the current time at the moment the instance
    is created.
    """

    source_name: str
    source_id: str
    source_location: str = ""
    source_type: Union[DocumentTypeEnum, str]
    collection_id: str = ""
    # A factory is required here: a plain ``datetime.now().isoformat()``
    # default is evaluated once, when the class body runs at import time, so
    # every instance would silently share that single stale timestamp.
    date_created: str = Field(default_factory=lambda: datetime.now().isoformat())
    last_modified: str = Field(default_factory=lambda: datetime.now().isoformat())
    summary: str = ""
    partition_id: int = -1
    access_level: Union[AccessLevelEnum, int] = AccessLevelEnum.UNKNOWN

    @field_validator("date_created", "last_modified")
    @classmethod
    def validate_fields(cls, field_value):
        """
        Check a timestamp field and return it unchanged.

        The value is handed to ``datetools.validate_iso8601``; that helper is
        expected to raise on a malformed timestamp (confirm against its
        implementation). Its return value is ignored.
        """
        datetools.validate_iso8601(field_value)
        return field_value
[docs]
class NearbyObjectsSubSchema(BaseModelNoExt):
    """
    Container for one category of related extracted objects.

    Holds three lists (``content``, ``bbox`` and ``type``); each one defaults
    to a new empty list for every instance.
    """

    # NOTE(review): the three lists look index-aligned (entry i of each
    # describing the same object) -- confirm against the producers.
    content: List[str] = Field(default_factory=list)
    bbox: List[tuple] = Field(default_factory=list)
    type: List[str] = Field(default_factory=list)
[docs]
class NearbyObjectsSchema(BaseModelNoExt):
    """
    Groups related extracted objects by kind.

    One ``NearbyObjectsSubSchema`` each for ``text``, ``images`` and
    ``structured``; every group defaults to an empty sub-schema.
    """

    text: NearbyObjectsSubSchema = NearbyObjectsSubSchema()
    images: NearbyObjectsSubSchema = NearbyObjectsSubSchema()
    structured: NearbyObjectsSubSchema = NearbyObjectsSubSchema()
[docs]
class ContentHierarchySchema(BaseModelNoExt):
    """
    Position of extracted content within its source's hierarchy.

    Every integer field defaults to ``-1``; ``nearby_objects`` defaults to an
    empty ``NearbyObjectsSchema``.
    """

    # NOTE(review): -1 presumably means "not set" -- confirm with consumers.
    page_count: int = -1
    page: int = -1
    block: int = -1
    line: int = -1
    span: int = -1

    nearby_objects: NearbyObjectsSchema = NearbyObjectsSchema()
[docs]
class ContentMetadataSchema(BaseModelNoExt):
    """
    Describes a piece of data extracted from a source (generally text or an
    image).

    ``type`` is the only required field; all others have defaults.
    """

    type: ContentTypeEnum
    description: str = ""
    page_number: int = -1
    hierarchy: ContentHierarchySchema = ContentHierarchySchema()
    subtype: Union[ContentTypeEnum, str] = ""

    # NOTE(review): units of the time fields are not visible here -- confirm
    # before relying on them.
    start_time: int = -1
    end_time: int = -1
[docs]
class TextMetadataSchema(BaseModelNoExt):
    """
    Metadata attached to extracted text content.

    ``text_type`` is required. ``keywords`` accepts a string, a list of
    strings, or a dict.
    """

    text_type: TextTypeEnum
    summary: str = ""
    keywords: Union[str, List[str], Dict] = ""
    # Open question from the original author: should this default to
    # "unknown" instead, possibly with a heuristic language check?
    # NOTE(review): the default is the plain string "en", not an enum member
    # -- confirm LanguageEnum treats them interchangeably.
    language: LanguageEnum = "en"
    text_location: tuple = (0, 0, 0, 0)
    text_location_max_dimensions: tuple = (0, 0, 0, 0)
[docs]
class ImageMetadataSchema(BaseModelNoExt):
    """
    Metadata attached to extracted image content.

    ``image_type`` is required. ``width`` and ``height`` are clamped to ``0``
    when a negative value is supplied.
    """

    image_type: Union[DocumentTypeEnum, str]
    structured_image_type: ContentTypeEnum = ContentTypeEnum.NONE
    caption: str = ""
    text: str = ""
    image_location: tuple = (0, 0, 0, 0)
    image_location_max_dimensions: tuple = (0, 0)
    uploaded_image_url: str = ""
    width: int = 0
    height: int = 0

    @field_validator("image_type")
    @classmethod
    def validate_image_type(cls, v):
        """
        Reject an ``image_type`` that is neither a ``DocumentTypeEnum`` nor a
        string.

        Raises:
        - ValueError: If the value has any other type.
        """
        if not isinstance(v, (DocumentTypeEnum, str)):
            raise ValueError("image_type must be a string or DocumentTypeEnum")
        return v

    @field_validator("width", "height")
    @classmethod
    def clamp_non_negative(cls, v, field):
        """
        Replace a negative dimension with ``0`` and log a warning.

        Parameters:
        - v: The value being validated.
        - field: Validation info supplied by pydantic; only its ``field_name``
          is read, to name the offending field in the warning.

        Returns:
        - ``0`` when ``v`` is negative, otherwise ``v`` unchanged.
        """
        if v < 0:
            # Lazy %-style arguments: the message is only built if the
            # warning is actually emitted.
            logger.warning("%s is negative; clamping to 0. Original value: %s", field.field_name, v)
            return 0
        return v
[docs]
class TableMetadataSchema(BaseModelNoExt):
    """
    Metadata attached to extracted table content.

    ``table_format`` is required; every other field has a default.
    """

    caption: str = ""
    table_format: TableFormatEnum
    table_content: str = ""
    table_content_format: Union[TableFormatEnum, str] = ""
    table_location: tuple = (0, 0, 0, 0)
    table_location_max_dimensions: tuple = (0, 0)
    uploaded_image_uri: str = ""
[docs]
class ChartMetadataSchema(BaseModelNoExt):
    """
    Metadata attached to extracted chart content.

    The field names and defaults are the same as those of
    ``TableMetadataSchema`` (including the ``table_`` prefixes).
    ``table_format`` is required; every other field has a default.
    """

    caption: str = ""
    table_format: TableFormatEnum
    table_content: str = ""
    table_content_format: Union[TableFormatEnum, str] = ""
    table_location: tuple = (0, 0, 0, 0)
    table_location_max_dimensions: tuple = (0, 0)
    uploaded_image_uri: str = ""
# TODO consider deprecating this in favor of info msg...
[docs]
class ErrorMetadataSchema(BaseModelNoExt):
    """
    Record of an error raised while running a task.

    ``task``, ``status`` and ``error_msg`` are required; ``source_id``
    defaults to an empty string.
    """

    task: TaskTypeEnum
    status: StatusEnum
    source_id: str = ""
    error_msg: str
[docs]
class InfoMessageMetadataSchema(BaseModelNoExt):
    """
    Informational message produced by a task.

    All four fields are required.
    """

    task: TaskTypeEnum
    status: StatusEnum
    message: str
    # NOTE(review): the meaning of this flag is not visible in this module --
    # confirm against the code that reads it.
    filter: bool
# Main metadata schema
[docs]
class MetadataSchema(BaseModelNoExt):
    """
    Top-level metadata record combining content, source information and the
    type-specific sub-schemas.

    Before field validation, ``check_metadata_type`` clears the audio, image,
    text and table sub-metadata that do not match ``content_metadata.type``.
    """

    content: str = ""
    content_url: str = ""
    embedding: Optional[List[float]] = None
    source_metadata: Optional[SourceMetadataSchema] = None
    content_metadata: Optional[ContentMetadataSchema] = None
    # NOTE(review): AudioMetadataSchema is neither defined nor imported in the
    # visible part of this module -- confirm it is in scope.
    audio_metadata: Optional[AudioMetadataSchema] = None
    text_metadata: Optional[TextMetadataSchema] = None
    image_metadata: Optional[ImageMetadataSchema] = None
    table_metadata: Optional[TableMetadataSchema] = None
    chart_metadata: Optional[ChartMetadataSchema] = None
    error_metadata: Optional[ErrorMetadataSchema] = None
    info_message_metadata: Optional[InfoMessageMetadataSchema] = None
    debug_metadata: Optional[Dict[str, Any]] = None
    raise_on_failure: bool = False

    @model_validator(mode="before")
    @classmethod
    def check_metadata_type(cls, values):
        """
        Drop type-specific metadata that does not match the content type.

        ``audio_metadata``, ``image_metadata``, ``text_metadata`` and
        ``table_metadata`` are set to ``None`` unless ``content_metadata.type``
        equals ``ContentTypeEnum.AUDIO``, ``IMAGE``, ``TEXT`` or
        ``STRUCTURED`` respectively. ``chart_metadata`` is not touched.

        Parameters:
        - values: The raw input passed to the model, normally a dict.

        Returns:
        - The same ``values`` object, modified in place when it is a dict.
        """
        # A "before" validator receives the raw input, which is not guaranteed
        # to be a dict; leave anything else for pydantic to validate or reject.
        if not isinstance(values, dict):
            return values

        # ``content_metadata`` may be absent, explicitly None, a plain dict, or
        # an already-built model instance; only the dict form supports ``.get``.
        content_metadata = values.get("content_metadata")
        if isinstance(content_metadata, dict):
            content_type = content_metadata.get("type", None)
        else:
            content_type = getattr(content_metadata, "type", None)

        # NOTE(review): a raw string type such as "audio" only matches here if
        # ContentTypeEnum compares equal to its string value -- confirm.
        if content_type != ContentTypeEnum.AUDIO:
            values["audio_metadata"] = None
        if content_type != ContentTypeEnum.IMAGE:
            values["image_metadata"] = None
        if content_type != ContentTypeEnum.TEXT:
            values["text_metadata"] = None
        if content_type != ContentTypeEnum.STRUCTURED:
            values["table_metadata"] = None
        return values
[docs]
def validate_metadata(metadata: Dict[str, Any]) -> MetadataSchema:
    """
    Build a MetadataSchema from a metadata dictionary.

    The dictionary's entries are passed to the model as keyword arguments, so
    construction doubles as validation.

    Parameters:
    - metadata: A dictionary representing metadata to be validated.

    Returns:
    - The MetadataSchema instance built from ``metadata``.

    Raises:
    - ValidationError: If the metadata does not conform to the schema.
    """
    validated = MetadataSchema(**metadata)
    return validated