Source code for nv_ingest_api.internal.enums.common

# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0


import logging
from enum import Enum
from typing import Type, Any

logger = logging.getLogger(__name__)



[docs]
class AccessLevelEnum(int, Enum):
    """
    Enum for representing different access levels.

    Note
    ----
    This is for future use, and currently has no functional use case.

    Attributes
    ----------
    UNKNOWN : int
        Represents an unknown access level (value -1).
    LEVEL_1 : int
        Represents access level 1.
    LEVEL_2 : int
        Represents access level 2.
    LEVEL_3 : int
        Represents access level 3.
    """

    UNKNOWN: int = -1
    LEVEL_1: int = 1
    LEVEL_2: int = 2
    LEVEL_3: int = 3




[docs]
class ContentDescriptionEnum(str, Enum):
    """
    Standard description strings for content extracted from each source type.

    Every member's value is a complete, human-readable sentence naming the
    kind of element and the source format it was extracted from.

    Attributes
    ----------
    DOCX_IMAGE : str
        Image taken from a DOCX document.
    DOCX_TABLE : str
        Structured table taken from a DOCX document.
    DOCX_TEXT : str
        Unstructured text taken from a DOCX document.
    PDF_CHART : str
        Structured chart taken from a PDF document.
    PDF_IMAGE : str
        Image taken from a PDF document.
    PDF_INFOGRAPHIC : str
        Structured infographic taken from a PDF document.
    PDF_TABLE : str
        Structured table taken from a PDF document.
    PDF_TEXT : str
        Unstructured text taken from a PDF document.
    PPTX_IMAGE : str
        Image taken from a PPTX presentation.
    PPTX_TABLE : str
        Structured table taken from a PPTX presentation.
    PPTX_TEXT : str
        Unstructured text taken from a PPTX presentation.
    """

    # DOCX sources
    DOCX_IMAGE: str = "Image extracted from DOCX document."
    DOCX_TABLE: str = "Structured table extracted from DOCX document."
    DOCX_TEXT: str = "Unstructured text from DOCX document."

    # PDF sources
    PDF_CHART: str = "Structured chart extracted from PDF document."
    PDF_IMAGE: str = "Image extracted from PDF document."
    PDF_INFOGRAPHIC: str = "Structured infographic extracted from PDF document."
    PDF_TABLE: str = "Structured table extracted from PDF document."
    PDF_TEXT: str = "Unstructured text from PDF document."

    # PPTX sources
    PPTX_IMAGE: str = "Image extracted from PPTX presentation."
    PPTX_TABLE: str = "Structured table extracted from PPTX presentation."
    PPTX_TEXT: str = "Unstructured text from PPTX presentation."




[docs]
class ContentTypeEnum(str, Enum):
    """
    Enum for representing various content types.

    Note: Content type declares the broad category of the content, such as text, image, audio, etc.
    This is not equivalent to the Document type, which is a specific file format.

    Attributes
    ----------
    AUDIO : str
        Represents audio content.
    CHART : str
        Represents chart content.
    EMBEDDING : str
        Represents embedding content.
    IMAGE : str
        Represents image content.
    INFOGRAPHIC : str
        Represents infographic content.
    INFO_MSG : str
        Represents an informational message.
    NONE : str
        Placeholder for no content type (value "none").
    STRUCTURED : str
        Represents structured content.
    TABLE : str
        Represents table content.
    TEXT : str
        Represents text content.
    UNKNOWN : str
        Represents an unknown content type.
    VIDEO : str
        Represents video content.
    """

    AUDIO: str = "audio"
    CHART: str = "chart"
    EMBEDDING: str = "embedding"
    IMAGE: str = "image"
    INFOGRAPHIC: str = "infographic"
    INFO_MSG: str = "info_message"
    NONE: str = "none"
    STRUCTURED: str = "structured"
    TABLE: str = "table"
    TEXT: str = "text"
    UNKNOWN: str = "unknown"
    VIDEO: str = "video"




[docs]
class DocumentTypeEnum(str, Enum):
    """
    Enum for representing various document file types.

    Note: Document type refers to the specific file format of the content, such as PDF, DOCX, etc.
    This is not equivalent to the Content type, which is a broad category of the content.

    Attributes
    ----------
    BMP : str
        BMP image format.
    DOCX : str
        Microsoft Word document format.
    HTML : str
        HTML document.
    JPEG : str
        JPEG image format.
    PDF : str
        PDF document format.
    PNG : str
        PNG image format.
    PPTX : str
        PowerPoint presentation format.
    SVG : str
        SVG image format.
    TIFF : str
        TIFF image format.
    TXT : str
        Plain text file.
    MD : str
        Alias of TXT: it is declared with the same value ("text"), so
        ``DocumentTypeEnum.MD is DocumentTypeEnum.TXT`` and it does not appear
        when iterating the enum (presumably Markdown files handled as plain text).
    MP3 : str
        MP3 audio format.
    WAV : str
        WAV audio format.
    UNKNOWN : str
        Represents an unknown document type.
    """

    BMP: str = "bmp"
    DOCX: str = "docx"
    HTML: str = "html"
    JPEG: str = "jpeg"
    PDF: str = "pdf"
    PNG: str = "png"
    PPTX: str = "pptx"
    SVG: str = "svg"
    TIFF: str = "tiff"
    TXT: str = "text"
    # Same value as TXT, so Enum turns MD into an alias of TXT rather than a
    # separate member; kept as-is because callers may rely on MD == "text".
    MD: str = "text"
    MP3: str = "mp3"
    WAV: str = "wav"
    UNKNOWN: str = "unknown"




[docs]
class LanguageEnum(str, Enum):
    """
    Enum for representing various language codes.

    Attributes
    ----------
    AF : str
        Afrikaans language code.
    AR : str
        Arabic language code.
    BG : str
        Bulgarian language code.
    BN : str
        Bengali language code.
    CA : str
        Catalan language code.
    CS : str
        Czech language code.
    CY : str
        Welsh language code.
    DA : str
        Danish language code.
    DE : str
        German language code.
    EL : str
        Greek language code.
    EN : str
        English language code.
    ES : str
        Spanish language code.
    ET : str
        Estonian language code.
    FA : str
        Persian language code.
    FI : str
        Finnish language code.
    FR : str
        French language code.
    GU : str
        Gujarati language code.
    HE : str
        Hebrew language code.
    HI : str
        Hindi language code.
    HR : str
        Croatian language code.
    HU : str
        Hungarian language code.
    ID : str
        Indonesian language code.
    IT : str
        Italian language code.
    JA : str
        Japanese language code.
    KN : str
        Kannada language code.
    KO : str
        Korean language code.
    LT : str
        Lithuanian language code.
    LV : str
        Latvian language code.
    MK : str
        Macedonian language code.
    ML : str
        Malayalam language code.
    MR : str
        Marathi language code.
    NE : str
        Nepali language code.
    NL : str
        Dutch language code.
    NO : str
        Norwegian language code.
    PA : str
        Punjabi language code.
    PL : str
        Polish language code.
    PT : str
        Portuguese language code.
    RO : str
        Romanian language code.
    RU : str
        Russian language code.
    SK : str
        Slovak language code.
    SL : str
        Slovenian language code.
    SO : str
        Somali language code.
    SQ : str
        Albanian language code.
    SV : str
        Swedish language code.
    SW : str
        Swahili language code.
    TA : str
        Tamil language code.
    TE : str
        Telugu language code.
    TH : str
        Thai language code.
    TL : str
        Tagalog language code.
    TR : str
        Turkish language code.
    UK : str
        Ukrainian language code.
    UR : str
        Urdu language code.
    VI : str
        Vietnamese language code.
    ZH_CN : str
        Chinese (Simplified) language code.
    ZH_TW : str
        Chinese (Traditional) language code.
    UNKNOWN : str
        Represents an unknown language.
    """

    AF: str = "af"
    AR: str = "ar"
    BG: str = "bg"
    BN: str = "bn"
    CA: str = "ca"
    CS: str = "cs"
    CY: str = "cy"
    DA: str = "da"
    DE: str = "de"
    EL: str = "el"
    EN: str = "en"
    ES: str = "es"
    ET: str = "et"
    FA: str = "fa"
    FI: str = "fi"
    FR: str = "fr"
    GU: str = "gu"
    HE: str = "he"
    HI: str = "hi"
    HR: str = "hr"
    HU: str = "hu"
    ID: str = "id"
    IT: str = "it"
    JA: str = "ja"
    KN: str = "kn"
    KO: str = "ko"
    LT: str = "lt"
    LV: str = "lv"
    MK: str = "mk"
    ML: str = "ml"
    MR: str = "mr"
    NE: str = "ne"
    NL: str = "nl"
    NO: str = "no"
    PA: str = "pa"
    PL: str = "pl"
    PT: str = "pt"
    RO: str = "ro"
    RU: str = "ru"
    SK: str = "sk"
    SL: str = "sl"
    SO: str = "so"
    SQ: str = "sq"
    SV: str = "sv"
    SW: str = "sw"
    TA: str = "ta"
    TE: str = "te"
    TH: str = "th"
    TL: str = "tl"
    TR: str = "tr"
    UK: str = "uk"
    UR: str = "ur"
    VI: str = "vi"
    ZH_CN: str = "zh-cn"
    ZH_TW: str = "zh-tw"
    UNKNOWN: str = "unknown"

    @classmethod
    def has_value(cls: Type["LanguageEnum"], value: Any) -> bool:
        """
        Check if the enum contains the given value.

        Parameters
        ----------
        value : Any
            The value to check against the enum members. The comparison is
            exact (case-sensitive). Unhashable values such as lists or dicts
            can never match a member and yield False instead of raising.

        Returns
        -------
        bool
            True if the value exists in the enum, False otherwise.
        """
        try:
            return value in cls._value2member_map_
        except TypeError:
            # The value -> member map is a dict, so the membership test hashes
            # `value`; an unhashable input cannot be one of its keys.
            return False





[docs]
class StatusEnum(str, Enum):
    """
    Status values used to report the outcome of an operation.

    Attributes
    ----------
    ERROR : str
        The error status (value "error").
    SUCCESS : str
        The success status (value "success").
    """

    # Failure outcome.
    ERROR: str = "error"
    # Successful outcome.
    SUCCESS: str = "success"




[docs]
class TableFormatEnum(str, Enum):
    """
    Formats in which a table can be represented.

    Attributes
    ----------
    HTML : str
        Table represented as HTML.
    IMAGE : str
        Table represented as an image.
    LATEX : str
        Table represented as LaTeX.
    MARKDOWN : str
        Table represented as Markdown.
    PSEUDO_MARKDOWN : str
        Table represented as pseudo Markdown.
    SIMPLE : str
        Table represented in the simple format.
    """

    # Markup-based representations
    HTML: str = "html"
    IMAGE: str = "image"
    LATEX: str = "latex"

    # Markdown-style and plain representations
    MARKDOWN: str = "markdown"
    PSEUDO_MARKDOWN: str = "pseudo_markdown"
    SIMPLE: str = "simple"




[docs]
class TaskTypeEnum(str, Enum):
    """
    Identifiers for the kinds of task that can be requested.

    Attributes
    ----------
    AUDIO_DATA_EXTRACT : str
        Task that extracts audio data.
    CAPTION : str
        Captioning task.
    CHART_DATA_EXTRACT : str
        Task that extracts chart data.
    DEDUP : str
        Deduplication task.
    EMBED : str
        Embedding task.
    EXTRACT : str
        Extraction task.
    FILTER : str
        Filtering task.
    INFOGRAPHIC_DATA_EXTRACT : str
        Task that extracts infographic data.
    SPLIT : str
        Splitting task.
    STORE_EMBEDDING : str
        Task that stores embeddings.
    STORE : str
        Storing task.
    TABLE_DATA_EXTRACT : str
        Task that extracts table data.
    VDB_UPLOAD : str
        Task that uploads to a vector database.
    """

    # Members are declared in the same order as they are documented above.
    AUDIO_DATA_EXTRACT: str = "audio_data_extract"
    CAPTION: str = "caption"
    CHART_DATA_EXTRACT: str = "chart_data_extract"
    DEDUP: str = "dedup"
    EMBED: str = "embed"
    EXTRACT: str = "extract"
    FILTER: str = "filter"
    INFOGRAPHIC_DATA_EXTRACT: str = "infographic_data_extract"
    SPLIT: str = "split"
    STORE_EMBEDDING: str = "store_embedding"
    STORE: str = "store"
    TABLE_DATA_EXTRACT: str = "table_data_extract"
    VDB_UPLOAD: str = "vdb_upload"




[docs]
class TextTypeEnum(str, Enum):
    """
    Kinds of text segment that a piece of text can be classified as.

    Attributes
    ----------
    BLOCK : str
        A block of text.
    BODY : str
        Body text.
    DOCUMENT : str
        An entire document.
    HEADER : str
        Header text.
    LINE : str
        A single line of text.
    NEARBY_BLOCK : str
        A block of text in close proximity to another.
    OTHER : str
        Any other, unspecified text type.
    PAGE : str
        A page of text.
    SPAN : str
        An inline text span.
    """

    # Members are kept in alphabetical order.
    BLOCK: str = "block"
    BODY: str = "body"
    DOCUMENT: str = "document"
    HEADER: str = "header"
    LINE: str = "line"
    NEARBY_BLOCK: str = "nearby_block"
    OTHER: str = "other"
    PAGE: str = "page"
    SPAN: str = "span"