nv_ingest.schemas package#

Subpackages#

Submodules#

nv_ingest.schemas.associate_nearby_text_schema module#

class nv_ingest.schemas.associate_nearby_text_schema.AssociateNearbyTextSchema(
*,
n_neighbors: int = 5,
raise_on_failure: bool = False,
)[source]#

Bases: BaseModel

model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

n_neighbors: int#
raise_on_failure: bool#
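
A minimal usage sketch (illustrative only; assumes the nv_ingest package is importable and Pydantic v2 semantics):

>>> from nv_ingest.schemas.associate_nearby_text_schema import AssociateNearbyTextSchema
>>> import pydantic
>>> AssociateNearbyTextSchema(n_neighbors=3).n_neighbors
3
>>> try:
...     AssociateNearbyTextSchema(typo_field=1)  # extra='forbid' rejects unknown fields
... except pydantic.ValidationError:
...     print("extra fields are forbidden")
extra fields are forbidden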

nv_ingest.schemas.base_model_noext module#

class nv_ingest.schemas.base_model_noext.BaseModelNoExt[source]#

Bases: BaseModel

model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
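
This base is shared by the ingest job and metadata schemas below, so its subclasses reject unrecognized fields. A small illustrative subclass (Point is hypothetical, not part of the package):

>>> from nv_ingest.schemas.base_model_noext import BaseModelNoExt
>>> import pydantic
>>> class Point(BaseModelNoExt):
...     x: int = 0
>>> Point(x=1).x
1
>>> try:
...     Point(x=1, y=2)
... except pydantic.ValidationError:
...     print("extra fields are forbidden")
extra fields are forbidden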

nv_ingest.schemas.chart_extractor_schema module#

class nv_ingest.schemas.chart_extractor_schema.ChartExtractorConfigSchema(
*,
auth_token: str | None = None,
yolox_endpoints: Tuple[str | None, str | None] = (None, None),
yolox_infer_protocol: str = '',
paddle_endpoints: Tuple[str | None, str | None] = (None, None),
paddle_infer_protocol: str = '',
nim_batch_size: int = 2,
workers_per_progress_engine: int = 5,
)[source]#

Bases: BaseModel

Configuration schema for chart extraction service endpoints and options.

Parameters:
  • auth_token (Optional[str], default=None) – Authentication token required for secure services.

  • yolox_endpoints (Tuple[Optional[str], Optional[str]], default=(None, None)) – A tuple containing the gRPC and HTTP services for the yolox endpoint. Either the gRPC or HTTP service can be empty, but not both.

  • paddle_endpoints (Tuple[Optional[str], Optional[str]], default=(None, None)) – A tuple containing the gRPC and HTTP services for the paddle endpoint. Either the gRPC or HTTP service can be empty, but not both.

validate_endpoints(values)[source]#

Validates that at least one of the gRPC or HTTP services is provided for each endpoint.

Raises:
  • ValueError – If both gRPC and HTTP services are empty for any endpoint.

Config:
  • extra (str) – Pydantic config option to forbid extra fields.

auth_token: str | None#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

nim_batch_size: int#
paddle_endpoints: Tuple[str | None, str | None]#
paddle_infer_protocol: str#
classmethod validate_endpoints(values)[source]#

Validates the gRPC and HTTP services for all endpoints.

Ensures that at least one service (either gRPC or HTTP) is provided for each endpoint in the configuration.

Parameters:

values (dict) – Dictionary containing the values of the attributes for the class.

Returns:

The validated dictionary of values.

Return type:

dict

Raises:

ValueError – If both gRPC and HTTP services are empty for any endpoint.

workers_per_progress_engine: int#
yolox_endpoints: Tuple[str | None, str | None]#
yolox_infer_protocol: str#
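
A sketch of the endpoint validation described above (the endpoint URLs are placeholders, not real services):

>>> from nv_ingest.schemas.chart_extractor_schema import ChartExtractorConfigSchema
>>> import pydantic
>>> cfg = ChartExtractorConfigSchema(
...     yolox_endpoints=(None, "http://yolox:8000/v1/infer"),
...     paddle_endpoints=("paddle:8001", None),
... )
>>> cfg.yolox_endpoints
(None, 'http://yolox:8000/v1/infer')
>>> try:
...     ChartExtractorConfigSchema()  # both gRPC and HTTP left empty
... except pydantic.ValidationError:
...     print("at least one service per endpoint is required")
at least one service per endpoint is required
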
class nv_ingest.schemas.chart_extractor_schema.ChartExtractorSchema(
*,
max_queue_size: int = 1,
n_workers: int = 2,
raise_on_failure: bool = False,
stage_config: ChartExtractorConfigSchema | None = None,
)[source]#

Bases: BaseModel

Configuration schema for chart extraction processing settings.

Parameters:
  • max_queue_size (int, default=1) – The maximum number of items allowed in the processing queue.

  • n_workers (int, default=2) – The number of worker threads to use for processing.

  • raise_on_failure (bool, default=False) – A flag indicating whether to raise an exception if a failure occurs during chart extraction.

  • stage_config (Optional[ChartExtractorConfigSchema], default=None) – Configuration for the chart extraction stage, including yolox and paddle service endpoints.

classmethod check_positive(v, field)[source]#
max_queue_size: int#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

n_workers: int#
raise_on_failure: bool#
stage_config: ChartExtractorConfigSchema | None#
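
A sketch showing how the stage settings wrap the service configuration above (endpoint addresses are placeholders):

>>> from nv_ingest.schemas.chart_extractor_schema import (
...     ChartExtractorConfigSchema,
...     ChartExtractorSchema,
... )
>>> stage = ChartExtractorSchema(
...     n_workers=4,
...     stage_config=ChartExtractorConfigSchema(
...         yolox_endpoints=("yolox:8001", None),
...         paddle_endpoints=("paddle:8001", None),
...     ),
... )
>>> stage.n_workers, stage.max_queue_size
(4, 1)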

nv_ingest.schemas.docx_extractor_schema module#

class nv_ingest.schemas.docx_extractor_schema.DocxConfigSchema(
*,
auth_token: str | None = None,
yolox_endpoints: Tuple[str | None, str | None] = (None, None),
yolox_infer_protocol: str = '',
)[source]#

Bases: BaseModel

Configuration schema for docx extraction endpoints and options.

Parameters:
  • auth_token (Optional[str], default=None) – Authentication token required for secure services.

  • yolox_endpoints (Tuple[Optional[str], Optional[str]], default=(None, None)) – A tuple containing the gRPC and HTTP services for the yolox endpoint. Either the gRPC or HTTP service can be empty, but not both.

validate_endpoints(values)[source]#

Validates that at least one of the gRPC or HTTP services is provided for each endpoint.

Raises:
  • ValueError – If both gRPC and HTTP services are empty for any endpoint.

Config:
  • extra (str) – Pydantic config option to forbid extra fields.

auth_token: str | None#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

classmethod validate_endpoints(values)[source]#

Validates the gRPC and HTTP services for all endpoints.

Parameters:

values (dict) – Dictionary containing the values of the attributes for the class.

Returns:

The validated dictionary of values.

Return type:

dict

Raises:

ValueError – If both gRPC and HTTP services are empty for any endpoint.

yolox_endpoints: Tuple[str | None, str | None]#
yolox_infer_protocol: str#
class nv_ingest.schemas.docx_extractor_schema.DocxExtractorSchema(
*,
max_queue_size: int = 1,
n_workers: int = 16,
raise_on_failure: bool = False,
docx_extraction_config: DocxConfigSchema | None = None,
)[source]#

Bases: BaseModel

Configuration schema for the docx extractor settings.

Parameters:
  • max_queue_size (int, default=1) – The maximum number of items allowed in the processing queue.

  • n_workers (int, default=16) – The number of worker threads to use for processing.

  • raise_on_failure (bool, default=False) – A flag indicating whether to raise an exception on processing failure.

  • docx_extraction_config (Optional[DocxConfigSchema], default=None) – Configuration schema for the docx extraction stage.

docx_extraction_config: DocxConfigSchema | None#
max_queue_size: int#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

n_workers: int#
raise_on_failure: bool#

nv_ingest.schemas.embed_extractions_schema module#

class nv_ingest.schemas.embed_extractions_schema.EmbedExtractionsSchema(
*,
api_key: str = 'api_key',
batch_size: int = 8192,
embedding_model: str = 'nvidia/nv-embedqa-e5-v5',
embedding_nim_endpoint: str = 'http://embedding:8000/v1',
encoding_format: str = 'float',
httpx_log_level: LogLevel = LogLevel.WARNING,
input_type: str = 'passage',
raise_on_failure: bool = False,
truncate: str = 'END',
)[source]#

Bases: BaseModel

api_key: str#
batch_size: int#
embedding_model: str#
embedding_nim_endpoint: str#
encoding_format: str#
httpx_log_level: LogLevel#
input_type: str#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

raise_on_failure: bool#
truncate: str#
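
A sketch of overriding a few embedding defaults (the API key is a placeholder):

>>> from nv_ingest.schemas.embed_extractions_schema import EmbedExtractionsSchema
>>> cfg = EmbedExtractionsSchema(api_key="my-key", batch_size=256)
>>> cfg.embedding_model
'nvidia/nv-embedqa-e5-v5'
>>> cfg.truncate
'END'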

nv_ingest.schemas.embedding_storage_schema module#

class nv_ingest.schemas.embedding_storage_schema.EmbeddingStorageModuleSchema(*, raise_on_failure: bool = False)[source]#

Bases: BaseModel

model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

raise_on_failure: bool#

nv_ingest.schemas.file_source_pipe_schema module#

class nv_ingest.schemas.file_source_pipe_schema.FileSourcePipeSchema(
*,
batch_size: int = 1024,
chunk_overlap: int = 51,
chunk_size: int = 512,
converters_meta: Dict[Any, Any] | None = {},
enable_monitor: bool = False,
extractor_config: Dict[Any, Any] | None = {},
filenames: List[str] = <factory>,
num_threads: int = 1,
vdb_resource_name: str,
watch: bool = False,
watch_interval: float = -5.0,
)[source]#

Bases: BaseModel

batch_size: int#
chunk_overlap: int#
chunk_size: int#
converters_meta: Dict[Any, Any] | None#
enable_monitor: bool#
extractor_config: Dict[Any, Any] | None#
filenames: List[str]#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

num_threads: int#
vdb_resource_name: str#
watch: bool#
watch_interval: float#
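
A sketch of a file source configuration; only vdb_resource_name has no default (file paths and names are placeholders):

>>> from nv_ingest.schemas.file_source_pipe_schema import FileSourcePipeSchema
>>> pipe = FileSourcePipeSchema(
...     filenames=["data/a.pdf", "data/b.pdf"],
...     vdb_resource_name="nv_ingest_collection",
... )
>>> pipe.chunk_size, pipe.chunk_overlap
(512, 51)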

nv_ingest.schemas.image_caption_extraction_schema module#

class nv_ingest.schemas.image_caption_extraction_schema.ImageCaptionExtractionSchema(
*,
api_key: str = 'api_key',
endpoint_url: str = 'https://ai.api.nvidia.com/v1/gr/meta/llama-3.2-11b-vision-instruct/chat/completions',
prompt: str = 'Caption the content of this image:',
model_name: str = 'meta/llama-3.2-11b-vision-instruct',
raise_on_failure: bool = False,
)[source]#

Bases: BaseModel

api_key: str#
endpoint_url: str#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

model_name: str#
prompt: str#
raise_on_failure: bool#

nv_ingest.schemas.image_dedup_schema module#

class nv_ingest.schemas.image_dedup_schema.ImageDedupSchema(
*,
raise_on_failure: Annotated[bool, Strict(strict=True)] = False,
cpu_only: Annotated[bool, Strict(strict=True)] = False,
)[source]#

Bases: BaseModel

cpu_only: Annotated[bool, Strict(strict=True)]#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

raise_on_failure: Annotated[bool, Strict(strict=True)]#
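
Both flags are strict booleans, so strings such as 'true' are not coerced; a sketch:

>>> from nv_ingest.schemas.image_dedup_schema import ImageDedupSchema
>>> import pydantic
>>> ImageDedupSchema(raise_on_failure=True).raise_on_failure
True
>>> try:
...     ImageDedupSchema(raise_on_failure="true")
... except pydantic.ValidationError:
...     print("strict bool: no string coercion")
strict bool: no string coercion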

nv_ingest.schemas.image_extractor_schema module#

class nv_ingest.schemas.image_extractor_schema.ImageConfigSchema(
*,
auth_token: str | None = None,
yolox_endpoints: Tuple[str | None, str | None] = (None, None),
yolox_infer_protocol: str = '',
)[source]#

Bases: BaseModel

Configuration schema for image extraction endpoints and options.

Parameters:
  • auth_token (Optional[str], default=None) – Authentication token required for secure services.

  • yolox_endpoints (Tuple[Optional[str], Optional[str]], default=(None, None)) – A tuple containing the gRPC and HTTP services for the yolox endpoint. Either the gRPC or HTTP service can be empty, but not both.

validate_endpoints(values)[source]#

Validates that at least one of the gRPC or HTTP services is provided for each endpoint.

Raises:
  • ValueError – If both gRPC and HTTP services are empty for any endpoint.

Config:
  • extra (str) – Pydantic config option to forbid extra fields.

auth_token: str | None#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

classmethod validate_endpoints(values)[source]#

Validates the gRPC and HTTP services for all endpoints.

Parameters:

values (dict) – Dictionary containing the values of the attributes for the class.

Returns:

The validated dictionary of values.

Return type:

dict

Raises:

ValueError – If both gRPC and HTTP services are empty for any endpoint.

yolox_endpoints: Tuple[str | None, str | None]#
yolox_infer_protocol: str#
class nv_ingest.schemas.image_extractor_schema.ImageExtractorSchema(
*,
max_queue_size: int = 1,
n_workers: int = 16,
raise_on_failure: bool = False,
image_extraction_config: ImageConfigSchema | None = None,
)[source]#

Bases: BaseModel

Configuration schema for the image extractor settings.

Parameters:
  • max_queue_size (int, default=1) – The maximum number of items allowed in the processing queue.

  • n_workers (int, default=16) – The number of worker threads to use for processing.

  • raise_on_failure (bool, default=False) – A flag indicating whether to raise an exception on processing failure.

  • image_extraction_config (Optional[ImageConfigSchema], default=None) – Configuration schema for the image extraction stage.

image_extraction_config: ImageConfigSchema | None#
max_queue_size: int#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

n_workers: int#
raise_on_failure: bool#

nv_ingest.schemas.image_filter_schema module#

class nv_ingest.schemas.image_filter_schema.ImageFilterSchema(
*,
raise_on_failure: Annotated[bool, Strict(strict=True)] = False,
cpu_only: Annotated[bool, Strict(strict=True)] = False,
)[source]#

Bases: BaseModel

cpu_only: Annotated[bool, Strict(strict=True)]#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

raise_on_failure: Annotated[bool, Strict(strict=True)]#

nv_ingest.schemas.image_storage_schema module#

class nv_ingest.schemas.image_storage_schema.ImageStorageModuleSchema(
*,
structured: bool = True,
images: bool = True,
raise_on_failure: bool = False,
)[source]#

Bases: BaseModel

images: bool#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

raise_on_failure: bool#
structured: bool#

nv_ingest.schemas.infographic_extractor_schema module#

class nv_ingest.schemas.infographic_extractor_schema.InfographicExtractorConfigSchema(
*,
auth_token: str | None = None,
paddle_endpoints: Tuple[str | None, str | None] = (None, None),
paddle_infer_protocol: str = '',
nim_batch_size: int = 2,
workers_per_progress_engine: int = 5,
)[source]#

Bases: BaseModel

Configuration schema for infographic extraction service endpoints and options.

Parameters:
  • auth_token (Optional[str], default=None) – Authentication token required for secure services.

  • paddle_endpoints (Tuple[Optional[str], Optional[str]], default=(None, None)) – A tuple containing the gRPC and HTTP services for the paddle endpoint. Either the gRPC or HTTP service can be empty, but not both.

validate_endpoints(values)[source]#

Validates that at least one of the gRPC or HTTP services is provided for each endpoint.

Raises:
  • ValueError – If both gRPC and HTTP services are empty for any endpoint.

Config:
  • extra (str) – Pydantic config option to forbid extra fields.

auth_token: str | None#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

nim_batch_size: int#
paddle_endpoints: Tuple[str | None, str | None]#
paddle_infer_protocol: str#
classmethod validate_endpoints(values)[source]#

Validates the gRPC and HTTP services for all endpoints.

Ensures that at least one service (either gRPC or HTTP) is provided for each endpoint in the configuration.

Parameters:

values (dict) – Dictionary containing the values of the attributes for the class.

Returns:

The validated dictionary of values.

Return type:

dict

Raises:

ValueError – If both gRPC and HTTP services are empty for any endpoint.

workers_per_progress_engine: int#
class nv_ingest.schemas.infographic_extractor_schema.InfographicExtractorSchema(
*,
max_queue_size: int = 1,
n_workers: int = 2,
raise_on_failure: bool = False,
stage_config: InfographicExtractorConfigSchema | None = None,
)[source]#

Bases: BaseModel

Configuration schema for infographic extraction processing settings.

Parameters:
  • max_queue_size (int, default=1) – The maximum number of items allowed in the processing queue.

  • n_workers (int, default=2) – The number of worker threads to use for processing.

  • raise_on_failure (bool, default=False) – A flag indicating whether to raise an exception if a failure occurs during infographic extraction.

  • stage_config (Optional[InfographicExtractorConfigSchema], default=None) – Configuration for the infographic extraction stage, including paddle service endpoints.

classmethod check_positive(v, field)[source]#
max_queue_size: int#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

n_workers: int#
raise_on_failure: bool#
stage_config: InfographicExtractorConfigSchema | None#

nv_ingest.schemas.ingest_job_schema module#

class nv_ingest.schemas.ingest_job_schema.DocumentTypeEnum(value)[source]#

Bases: str, Enum

An enumeration.

bmp = 'bmp'#
docx = 'docx'#
html = 'html'#
jpeg = 'jpeg'#
mp3 = 'mp3'#
pdf = 'pdf'#
png = 'png'#
pptx = 'pptx'#
svg = 'svg'#
tiff = 'tiff'#
txt = 'text'#
wav = 'wav'#
class nv_ingest.schemas.ingest_job_schema.FilterTypeEnum(value)[source]#

Bases: str, Enum

An enumeration.

image = 'image'#
class nv_ingest.schemas.ingest_job_schema.IngestJobSchema(
*,
job_payload: JobPayloadSchema,
job_id: str | int,
tasks: List[IngestTaskSchema],
tracing_options: TracingOptionsSchema | None = None,
)[source]#

Bases: BaseModelNoExt

job_id: str | int#
job_payload: JobPayloadSchema#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

tasks: List[IngestTaskSchema]#
tracing_options: TracingOptionsSchema | None#
class nv_ingest.schemas.ingest_job_schema.IngestTaskAudioExtraction(
*,
auth_token: str | None = None,
grpc_endpoint: str | None = None,
http_endpoint: str | None = None,
infer_protocol: str | None = None,
function_id: str | None = None,
use_ssl: bool | None = None,
ssl_cert: str | None = None,
)[source]#

Bases: BaseModelNoExt

auth_token: str | None#
function_id: str | None#
grpc_endpoint: str | None#
http_endpoint: str | None#
infer_protocol: str | None#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

ssl_cert: str | None#
use_ssl: bool | None#
class nv_ingest.schemas.ingest_job_schema.IngestTaskCaptionSchema(
*,
api_key: str | None = None,
endpoint_url: str | None = None,
prompt: str | None = None,
model_name: str | None = None,
)[source]#

Bases: BaseModelNoExt

api_key: str | None#
endpoint_url: str | None#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

model_name: str | None#
prompt: str | None#
class nv_ingest.schemas.ingest_job_schema.IngestTaskChartExtraction(*, params: Dict = <factory>)[source]#

Bases: BaseModelNoExt

model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

params: Dict#
class nv_ingest.schemas.ingest_job_schema.IngestTaskDedupParams(*, filter: bool = False)[source]#

Bases: BaseModelNoExt

filter: bool#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

class nv_ingest.schemas.ingest_job_schema.IngestTaskDedupSchema(
*,
content_type: ContentTypeEnum = ContentTypeEnum.IMAGE,
params: IngestTaskDedupParams = IngestTaskDedupParams(filter=False),
)[source]#

Bases: BaseModelNoExt

content_type: ContentTypeEnum#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

params: IngestTaskDedupParams#
class nv_ingest.schemas.ingest_job_schema.IngestTaskEmbedSchema(
*,
endpoint_url: str | None = None,
model_name: str | None = None,
api_key: str | None = None,
filter_errors: bool = False,
)[source]#

Bases: BaseModelNoExt

api_key: str | None#
endpoint_url: str | None#
filter_errors: bool#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

model_name: str | None#
class nv_ingest.schemas.ingest_job_schema.IngestTaskExtractSchema(
*,
document_type: DocumentTypeEnum,
method: str,
params: dict,
)[source]#

Bases: BaseModelNoExt

classmethod case_insensitive_document_type(v)[source]#
document_type: DocumentTypeEnum#
method: str#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

params: dict#
class nv_ingest.schemas.ingest_job_schema.IngestTaskFilterParamsSchema(
*,
min_size: int = 128,
max_aspect_ratio: float | int = 5.0,
min_aspect_ratio: float | int = 0.2,
filter: bool = False,
)[source]#

Bases: BaseModelNoExt

filter: bool#
max_aspect_ratio: float | int#
min_aspect_ratio: float | int#
min_size: int#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

class nv_ingest.schemas.ingest_job_schema.IngestTaskFilterSchema(
*,
content_type: ContentTypeEnum = ContentTypeEnum.IMAGE,
params: IngestTaskFilterParamsSchema = IngestTaskFilterParamsSchema(min_size=128, max_aspect_ratio=5.0, min_aspect_ratio=0.2, filter=False),
)[source]#

Bases: BaseModelNoExt

content_type: ContentTypeEnum#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

params: IngestTaskFilterParamsSchema#
class nv_ingest.schemas.ingest_job_schema.IngestTaskInfographicExtraction(*, params: Dict = <factory>)[source]#

Bases: BaseModelNoExt

model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

params: Dict#
class nv_ingest.schemas.ingest_job_schema.IngestTaskSchema(
*,
type: TaskTypeEnum,
task_properties: IngestTaskSplitSchema | IngestTaskExtractSchema | IngestTaskStoreEmbedSchema | IngestTaskStoreSchema | IngestTaskEmbedSchema | IngestTaskCaptionSchema | IngestTaskDedupSchema | IngestTaskFilterSchema | IngestTaskVdbUploadSchema | IngestTaskAudioExtraction | IngestTaskTableExtraction | IngestTaskChartExtraction | IngestTaskInfographicExtraction,
raise_on_failure: bool = False,
)[source]#

Bases: BaseModelNoExt

classmethod case_insensitive_task_type(v)[source]#
classmethod check_task_properties_type(values)[source]#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

raise_on_failure: bool#
task_properties: IngestTaskSplitSchema | IngestTaskExtractSchema | IngestTaskStoreEmbedSchema | IngestTaskStoreSchema | IngestTaskEmbedSchema | IngestTaskCaptionSchema | IngestTaskDedupSchema | IngestTaskFilterSchema | IngestTaskVdbUploadSchema | IngestTaskAudioExtraction | IngestTaskTableExtraction | IngestTaskChartExtraction | IngestTaskInfographicExtraction#
type: TaskTypeEnum#
class nv_ingest.schemas.ingest_job_schema.IngestTaskSplitSchema(
*,
tokenizer: str | None = None,
chunk_size: Annotated[int, Gt(gt=0)] = 1024,
chunk_overlap: Annotated[int, Ge(ge=0)] = 150,
params: dict,
)[source]#

Bases: BaseModelNoExt

classmethod check_chunk_overlap(v, values, **kwargs)[source]#
chunk_overlap: Annotated[int, Ge(ge=0)]#
chunk_size: Annotated[int, Gt(gt=0)]#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

params: dict#
tokenizer: str | None#
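
A sketch of the split task; check_chunk_overlap compares chunk_overlap against chunk_size (assumed here to require an overlap smaller than the chunk size; the tokenizer name is a placeholder):

>>> from nv_ingest.schemas.ingest_job_schema import IngestTaskSplitSchema
>>> import pydantic
>>> split = IngestTaskSplitSchema(tokenizer="my-tokenizer", chunk_size=512, chunk_overlap=64, params={})
>>> split.chunk_overlap
64
>>> try:
...     IngestTaskSplitSchema(chunk_size=100, chunk_overlap=200, params={})
... except pydantic.ValidationError:
...     print("overlap must be smaller than chunk_size")
overlap must be smaller than chunk_size
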
class nv_ingest.schemas.ingest_job_schema.IngestTaskStoreEmbedSchema(*, params: dict)[source]#

Bases: BaseModelNoExt

model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

params: dict#
class nv_ingest.schemas.ingest_job_schema.IngestTaskStoreSchema(
*,
structured: bool = True,
images: bool = False,
method: str,
params: dict,
)[source]#

Bases: BaseModelNoExt

images: bool#
method: str#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

params: dict#
structured: bool#
class nv_ingest.schemas.ingest_job_schema.IngestTaskTableExtraction(*, params: Dict = <factory>)[source]#

Bases: BaseModelNoExt

model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

params: Dict#
class nv_ingest.schemas.ingest_job_schema.IngestTaskVdbUploadSchema(
*,
bulk_ingest: bool = False,
bulk_ingest_path: str = None,
params: dict = None,
filter_errors: bool = True,
)[source]#

Bases: BaseModelNoExt

bulk_ingest: bool#
bulk_ingest_path: str#
filter_errors: bool#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

params: dict#
class nv_ingest.schemas.ingest_job_schema.JobPayloadSchema(
*,
content: List[str | bytes],
source_name: List[str],
source_id: List[str | int],
document_type: List[str],
)[source]#

Bases: BaseModelNoExt

content: List[str | bytes]#
document_type: List[str]#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

source_id: List[str | int]#
source_name: List[str]#
class nv_ingest.schemas.ingest_job_schema.TaskTypeEnum(value)[source]#

Bases: str, Enum

An enumeration.

audio_data_extract = 'audio_data_extract'#
caption = 'caption'#
chart_data_extract = 'chart_data_extract'#
dedup = 'dedup'#
embed = 'embed'#
extract = 'extract'#
filter = 'filter'#
infographic_data_extract = 'infographic_data_extract'#
split = 'split'#
store = 'store'#
store_embedding = 'store_embedding'#
table_data_extract = 'table_data_extract'#
vdb_upload = 'vdb_upload'#
class nv_ingest.schemas.ingest_job_schema.TracingOptionsSchema(
*,
trace: bool = False,
ts_send: int,
trace_id: str | None = None,
)[source]#

Bases: BaseModelNoExt

model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

trace: bool#
trace_id: str | None#
ts_send: int#
nv_ingest.schemas.ingest_job_schema.validate_ingest_job(
job_data: Dict[str, Any],
) → IngestJobSchema[source]#

Validates a dictionary representing an ingest_job using the IngestJobSchema.

Parameters:
  • job_data (Dict[str, Any]) – Dictionary representing an ingest job.

Returns:

The validated ingest job.

Return type:

IngestJobSchema

Raises:

ValidationError – If the input data does not conform to the IngestJobSchema.
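
A minimal end-to-end sketch (the payload contents and the 'pdfium' method name are illustrative):

>>> from nv_ingest.schemas.ingest_job_schema import validate_ingest_job
>>> job = {
...     "job_payload": {
...         "content": ["...base64 document bytes..."],
...         "source_name": ["report.pdf"],
...         "source_id": ["report.pdf"],
...         "document_type": ["pdf"],
...     },
...     "job_id": "job-0",
...     "tasks": [
...         {
...             "type": "extract",
...             "task_properties": {
...                 "document_type": "pdf",
...                 "method": "pdfium",
...                 "params": {},
...             },
...         }
...     ],
... }
>>> validate_ingest_job(job).job_id
'job-0'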

nv_ingest.schemas.ingest_pipeline_config_schema module#

class nv_ingest.schemas.ingest_pipeline_config_schema.PipelineConfigSchema(
*,
audio_extractor_schema: AudioExtractorSchema = AudioExtractorSchema(max_queue_size=1, n_workers=16, raise_on_failure=False, audio_extraction_config=None),
chart_extractor_module: ChartExtractorSchema = ChartExtractorSchema(max_queue_size=1, n_workers=2, raise_on_failure=False, stage_config=None),
text_splitter_module: TextSplitterSchema = TextSplitterSchema(tokenizer=None, chunk_size=1024, chunk_overlap=150, raise_on_failure=False),
embedding_storage_module: EmbeddingStorageModuleSchema = EmbeddingStorageModuleSchema(raise_on_failure=False),
embed_extractions_module: EmbedExtractionsSchema = EmbedExtractionsSchema(api_key='api_key', batch_size=8192, embedding_model='nvidia/nv-embedqa-e5-v5', embedding_nim_endpoint='http://embedding:8000/v1', encoding_format='float', httpx_log_level=LogLevel.WARNING, input_type='passage', raise_on_failure=False, truncate='END'),
image_caption_extraction_module: ImageCaptionExtractionSchema = ImageCaptionExtractionSchema(api_key='api_key', endpoint_url='https://ai.api.nvidia.com/v1/gr/meta/llama-3.2-11b-vision-instruct/chat/completions', prompt='Caption the content of this image:', model_name='meta/llama-3.2-11b-vision-instruct', raise_on_failure=False),
image_dedup_module: ImageDedupSchema = ImageDedupSchema(raise_on_failure=False, cpu_only=False),
image_filter_module: ImageFilterSchema = ImageFilterSchema(raise_on_failure=False, cpu_only=False),
image_storage_module: ImageStorageModuleSchema = ImageStorageModuleSchema(structured=True, images=True, raise_on_failure=False),
infographic_extractor_module: InfographicExtractorSchema = InfographicExtractorSchema(max_queue_size=1, n_workers=2, raise_on_failure=False, stage_config=None),
job_counter_module: JobCounterSchema = JobCounterSchema(name='job_counter', raise_on_failure=False),
metadata_injection_module: MetadataInjectorSchema = MetadataInjectorSchema(raise_on_failure=False),
otel_meter_module: OpenTelemetryMeterSchema = OpenTelemetryMeterSchema(broker_client=MessageBrokerClientSchema(host='redis', port=6379, client_type='redis', broker_params={}, connection_timeout=300, max_backoff=300, max_retries=0), otel_endpoint='localhost:4317', raise_on_failure=False),
otel_tracer_module: OpenTelemetryTracerSchema = OpenTelemetryTracerSchema(otel_endpoint='localhost:4317', raise_on_failure=False),
pdf_extractor_module: PDFExtractorSchema = PDFExtractorSchema(max_queue_size=1, n_workers=16, raise_on_failure=False, pdfium_config=None, nemoretriever_parse_config=None),
pptx_extractor_module: PPTXExtractorSchema = PPTXExtractorSchema(max_queue_size=1, n_workers=16, raise_on_failure=False, pptx_extraction_config=None),
redis_task_sink: MessageBrokerTaskSinkSchema = MessageBrokerTaskSinkSchema(broker_client=MessageBrokerClientSchema(host='redis', port=6379, client_type='redis', broker_params={}, connection_timeout=300, max_backoff=300, max_retries=0), raise_on_failure=False, progress_engines=6),
redis_task_source: MessageBrokerTaskSourceSchema = MessageBrokerTaskSourceSchema(broker_client=MessageBrokerClientSchema(host='redis', port=6379, client_type='redis', broker_params={}, connection_timeout=300, max_backoff=300, max_retries=0), task_queue='morpheus_task_queue', raise_on_failure=False, progress_engines=6),
table_extractor_module: TableExtractorSchema = TableExtractorSchema(max_queue_size=1, n_workers=2, raise_on_failure=False, stage_config=None),
vdb_task_sink: VdbTaskSinkSchema = VdbTaskSinkSchema(recreate=False, service='milvus', is_service_serialized=False, default_resource_name='nv_ingest_collection', resource_schemas={'nv_ingest_collection': {'index_conf': {'field_name': 'vector', 'metric_type': 'L2', 'index_type': 'GPU_CAGRA', 'params': {'intermediate_graph_degree': 128, 'graph_degree': 64, 'build_algo': 'NN_DESCENT'}}, 'schema_conf': {'enable_dynamic_field': True, 'schema_fields': [{'name': 'pk', 'description': 'Primary key for the collection', 'type': DataType.INT64, 'is_primary': True, 'auto_id': True}, {'name': 'text', 'description': 'Extracted content', 'type': DataType.VARCHAR, 'params': {'max_length': 65535}}, {'name': 'vector', 'description': 'Embedding vectors', 'type': DataType.FLOAT_VECTOR, 'params': {'dim': 1024}}, {'name': 'source', 'description': 'Source document and raw data extracted content', 'type': DataType.JSON}, {'name': 'content_metadata', 'description': 'Content metadata', 'type': DataType.JSON}], 'description': 'NV-INGEST collection schema'}}}, resource_kwargs={}, service_kwargs={}, batch_size=5120, write_time_interval=1.0, retry_interval=60.0, raise_on_failure=False, progress_engines=1),
)[source]#

Bases: BaseModel

audio_extractor_schema: AudioExtractorSchema#
chart_extractor_module: ChartExtractorSchema#
embed_extractions_module: EmbedExtractionsSchema#
embedding_storage_module: EmbeddingStorageModuleSchema#
image_caption_extraction_module: ImageCaptionExtractionSchema#
image_dedup_module: ImageDedupSchema#
image_filter_module: ImageFilterSchema#
image_storage_module: ImageStorageModuleSchema#
infographic_extractor_module: InfographicExtractorSchema#
job_counter_module: JobCounterSchema#
metadata_injection_module: MetadataInjectorSchema#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

otel_meter_module: OpenTelemetryMeterSchema#
otel_tracer_module: OpenTelemetryTracerSchema#
pdf_extractor_module: PDFExtractorSchema#
pptx_extractor_module: PPTXExtractorSchema#
redis_task_sink: MessageBrokerTaskSinkSchema#
redis_task_source: MessageBrokerTaskSourceSchema#
table_extractor_module: TableExtractorSchema#
text_splitter_module: TextSplitterSchema#
vdb_task_sink: VdbTaskSinkSchema#
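
Every stage has a default, so an empty construction yields a fully populated pipeline configuration; a sketch:

>>> from nv_ingest.schemas.ingest_pipeline_config_schema import PipelineConfigSchema
>>> pipeline = PipelineConfigSchema()
>>> pipeline.redis_task_source.task_queue
'morpheus_task_queue'
>>> pipeline.pdf_extractor_module.n_workers
16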

nv_ingest.schemas.job_counter_schema module#

class nv_ingest.schemas.job_counter_schema.JobCounterSchema(
*,
name: str = 'job_counter',
raise_on_failure: bool = False,
)[source]#

Bases: BaseModel

model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

name: str#
raise_on_failure: bool#

nv_ingest.schemas.message_broker_client_schema module#

class nv_ingest.schemas.message_broker_client_schema.MessageBrokerClientSchema(
*,
host: str = 'redis',
port: Annotated[int, Gt(gt=0), Lt(lt=65536)] = 6379,
client_type: Literal['redis', 'simple'] = 'redis',
broker_params: dict | None = {},
connection_timeout: Annotated[int, Ge(ge=0)] | None = 300,
max_backoff: Annotated[int, Ge(ge=0)] | None = 300,
max_retries: Annotated[int, Ge(ge=0)] | None = 0,
)[source]#

Bases: BaseModel

broker_params: dict | None#
client_type: Literal['redis', 'simple']#
connection_timeout: Annotated[int, Ge(ge=0)] | None#
host: str#
max_backoff: Annotated[int, Ge(ge=0)] | None#
max_retries: Annotated[int, Ge(ge=0)] | None#
model_config: ClassVar[ConfigDict] = {}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

port: Annotated[int, Gt(gt=0), Lt(lt=65536)]#
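
A sketch of the port bounds check (the host value is a placeholder):

>>> from nv_ingest.schemas.message_broker_client_schema import MessageBrokerClientSchema
>>> import pydantic
>>> MessageBrokerClientSchema(host="localhost").port
6379
>>> try:
...     MessageBrokerClientSchema(port=70000)
... except pydantic.ValidationError:
...     print("port must be between 1 and 65535")
port must be between 1 and 65535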

nv_ingest.schemas.message_broker_sink_schema module#

class nv_ingest.schemas.message_broker_sink_schema.MessageBrokerTaskSinkSchema(
*,
broker_client: MessageBrokerClientSchema = MessageBrokerClientSchema(host='redis', port=6379, client_type='redis', broker_params={}, connection_timeout=300, max_backoff=300, max_retries=0),
raise_on_failure: bool = False,
progress_engines: Annotated[int, Ge(ge=1)] = 6,
)[source]#

Bases: BaseModel

broker_client: MessageBrokerClientSchema#
model_config: ClassVar[ConfigDict] = {}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

progress_engines: Annotated[int, Ge(ge=1)]#
raise_on_failure: bool#

nv_ingest.schemas.message_broker_source_schema module#

class nv_ingest.schemas.message_broker_source_schema.MessageBrokerTaskSourceSchema(
*,
broker_client: MessageBrokerClientSchema = MessageBrokerClientSchema(host='redis', port=6379, client_type='redis', broker_params={}, connection_timeout=300, max_backoff=300, max_retries=0),
task_queue: str = 'morpheus_task_queue',
raise_on_failure: bool = False,
progress_engines: Annotated[int, Ge(ge=1)] = 6,
)[source]#

Bases: BaseModel

broker_client: MessageBrokerClientSchema#
model_config: ClassVar[ConfigDict] = {}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

progress_engines: Annotated[int, Ge(ge=1)]#
raise_on_failure: bool#
task_queue: str#

nv_ingest.schemas.message_wrapper_schema module#

class nv_ingest.schemas.message_wrapper_schema.MessageWrapper(*, payload: str)[source]#

Bases: BaseModel

model_config: ClassVar[ConfigDict] = {}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

payload: str#

nv_ingest.schemas.metadata_injector_schema module#

class nv_ingest.schemas.metadata_injector_schema.MetadataInjectorSchema(*, raise_on_failure: bool = False)[source]#

Bases: BaseModel

model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

raise_on_failure: bool#

nv_ingest.schemas.metadata_schema module#

class nv_ingest.schemas.metadata_schema.AccessLevelEnum(value)[source]#

Bases: int, Enum

An enumeration.

LEVEL_1 = 1#
LEVEL_2 = 2#
LEVEL_3 = 3#
class nv_ingest.schemas.metadata_schema.AudioMetadataSchema(
*,
audio_transcript: str = '',
audio_type: str = '',
)[source]#

Bases: BaseModelNoExt

audio_transcript: str#
audio_type: str#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

class nv_ingest.schemas.metadata_schema.ChartMetadataSchema(
*,
caption: str = '',
table_format: TableFormatEnum,
table_content: str = '',
table_content_format: TableFormatEnum | str = '',
table_location: tuple = (0, 0, 0, 0),
table_location_max_dimensions: tuple = (0, 0),
uploaded_image_uri: str = '',
)[source]#

Bases: BaseModelNoExt

caption: str#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

table_content: str#
table_content_format: TableFormatEnum | str#
table_format: TableFormatEnum#
table_location: tuple#
table_location_max_dimensions: tuple#
uploaded_image_uri: str#
class nv_ingest.schemas.metadata_schema.ContentHierarchySchema(
*,
page_count: int = -1,
page: int = -1,
block: int = -1,
line: int = -1,
span: int = -1,
nearby_objects: NearbyObjectsSchema = NearbyObjectsSchema(text=NearbyObjectsSubSchema(content=[], bbox=[], type=[]), images=NearbyObjectsSubSchema(content=[], bbox=[], type=[]), structured=NearbyObjectsSubSchema(content=[], bbox=[], type=[])),
)[source]#

Bases: BaseModelNoExt

Schema for the extracted content hierarchy.

block: int#
line: int#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

nearby_objects: NearbyObjectsSchema#
page: int#
page_count: int#
span: int#
class nv_ingest.schemas.metadata_schema.ContentMetadataSchema(
*,
type: ContentTypeEnum,
description: str = '',
page_number: int = -1,
hierarchy: ContentHierarchySchema = ContentHierarchySchema(page_count=-1, page=-1, block=-1, line=-1, span=-1, nearby_objects=NearbyObjectsSchema(text=NearbyObjectsSubSchema(content=[], bbox=[], type=[]), images=NearbyObjectsSubSchema(content=[], bbox=[], type=[]), structured=NearbyObjectsSubSchema(content=[], bbox=[], type=[]))),
subtype: ContentSubtypeEnum | str = '',
)[source]#

Bases: BaseModelNoExt

Data extracted from a source; generally Text or Image.

description: str#
hierarchy: ContentHierarchySchema#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

page_number: int#
subtype: ContentSubtypeEnum | str#
type: ContentTypeEnum#
class nv_ingest.schemas.metadata_schema.ContentSubtypeEnum(value)[source]#

Bases: str, Enum

An enumeration.

CHART = 'chart'#
INFOGRAPHIC = 'infographic'#
TABLE = 'table'#
class nv_ingest.schemas.metadata_schema.ContentTypeEnum(value)[source]#

Bases: str, Enum

An enumeration.

AUDIO = 'audio'#
EMBEDDING = 'embedding'#
IMAGE = 'image'#
INFO_MSG = 'info_message'#
STRUCTURED = 'structured'#
TEXT = 'text'#
UNSTRUCTURED = 'unstructured'#
VIDEO = 'video'#
class nv_ingest.schemas.metadata_schema.ErrorMetadataSchema(
*,
task: TaskTypeEnum,
status: StatusEnum,
source_id: str = '',
error_msg: str,
)[source]#

Bases: BaseModelNoExt

error_msg: str#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

source_id: str#
status: StatusEnum#
task: TaskTypeEnum#
class nv_ingest.schemas.metadata_schema.ImageMetadataSchema(
*,
image_type: ImageTypeEnum | str,
structured_image_type: ImageTypeEnum = ImageTypeEnum.image_type_1,
caption: str = '',
text: str = '',
image_location: tuple = (0, 0, 0, 0),
image_location_max_dimensions: tuple = (0, 0),
uploaded_image_url: str = '',
width: int = 0,
height: int = 0,
)[source]#

Bases: BaseModelNoExt

caption: str#
classmethod clamp_non_negative(v, field)[source]#
height: int#
image_location: tuple#
image_location_max_dimensions: tuple#
image_type: ImageTypeEnum | str#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

structured_image_type: ImageTypeEnum#
text: str#
uploaded_image_url: str#
classmethod validate_image_type(v)[source]#
width: int#
class nv_ingest.schemas.metadata_schema.ImageTypeEnum(value)[source]#

Bases: str, Enum

An enumeration.

BMP = 'bmp'#
GIF = 'gif'#
JPEG = 'jpeg'#
PNG = 'png'#
TIFF = 'tiff'#
classmethod has_value(value)[source]#
image_type_1 = 'image_type_1'#
image_type_2 = 'image_type_2'#
class nv_ingest.schemas.metadata_schema.InfoMessageMetadataSchema(
*,
task: TaskTypeEnum,
status: StatusEnum,
message: str,
filter: bool,
)[source]#

Bases: BaseModelNoExt

filter: bool#
message: str#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

status: StatusEnum#
task: TaskTypeEnum#
class nv_ingest.schemas.metadata_schema.LanguageEnum(value)[source]#

Bases: str, Enum

An enumeration.

AF = 'af'#
AR = 'ar'#
BG = 'bg'#
BN = 'bn'#
CA = 'ca'#
CS = 'cs'#
CY = 'cy'#
DA = 'da'#
DE = 'de'#
EL = 'el'#
EN = 'en'#
ES = 'es'#
ET = 'et'#
FA = 'fa'#
FI = 'fi'#
FR = 'fr'#
GU = 'gu'#
HE = 'he'#
HI = 'hi'#
HR = 'hr'#
HU = 'hu'#
ID = 'id'#
IT = 'it'#
JA = 'ja'#
KN = 'kn'#
KO = 'ko'#
LT = 'lt'#
LV = 'lv'#
MK = 'mk'#
ML = 'ml'#
MR = 'mr'#
NE = 'ne'#
NL = 'nl'#
NO = 'no'#
PA = 'pa'#
PL = 'pl'#
PT = 'pt'#
RO = 'ro'#
RU = 'ru'#
SK = 'sk'#
SL = 'sl'#
SO = 'so'#
SQ = 'sq'#
SV = 'sv'#
SW = 'sw'#
TA = 'ta'#
TE = 'te'#
TH = 'th'#
TL = 'tl'#
TR = 'tr'#
UK = 'uk'#
UNKNOWN = 'unknown'#
UR = 'ur'#
VI = 'vi'#
ZH_CN = 'zh-cn'#
ZH_TW = 'zh-tw'#
classmethod has_value(value)[source]#
class nv_ingest.schemas.metadata_schema.MetadataSchema(
*,
content: str = '',
content_url: str = '',
embedding: List[float] | None = None,
source_metadata: SourceMetadataSchema | None = None,
content_metadata: ContentMetadataSchema | None = None,
audio_metadata: AudioMetadataSchema | None = None,
text_metadata: TextMetadataSchema | None = None,
image_metadata: ImageMetadataSchema | None = None,
table_metadata: TableMetadataSchema | None = None,
chart_metadata: ChartMetadataSchema | None = None,
error_metadata: ErrorMetadataSchema | None = None,
info_message_metadata: InfoMessageMetadataSchema | None = None,
debug_metadata: Dict[str, Any] | None = None,
raise_on_failure: bool = False,
)[source]#

Bases: BaseModelNoExt

audio_metadata: AudioMetadataSchema | None#
chart_metadata: ChartMetadataSchema | None#
classmethod check_metadata_type(values)[source]#
content: str#
content_metadata: ContentMetadataSchema | None#
content_url: str#
debug_metadata: Dict[str, Any] | None#
embedding: List[float] | None#
error_metadata: ErrorMetadataSchema | None#
image_metadata: ImageMetadataSchema | None#
info_message_metadata: InfoMessageMetadataSchema | None#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

raise_on_failure: bool#
source_metadata: SourceMetadataSchema | None#
table_metadata: TableMetadataSchema | None#
text_metadata: TextMetadataSchema | None#
class nv_ingest.schemas.metadata_schema.NearbyObjectsSchema(
*,
text: NearbyObjectsSubSchema = NearbyObjectsSubSchema(content=[], bbox=[], type=[]),
images: NearbyObjectsSubSchema = NearbyObjectsSubSchema(content=[], bbox=[], type=[]),
structured: NearbyObjectsSubSchema = NearbyObjectsSubSchema(content=[], bbox=[], type=[]),
)[source]#

Bases: BaseModelNoExt

Schema to hold types of related extracted objects.

images: NearbyObjectsSubSchema#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

structured: NearbyObjectsSubSchema#
text: NearbyObjectsSubSchema#
class nv_ingest.schemas.metadata_schema.NearbyObjectsSubSchema(
*,
content: List[str] = [],
bbox: List[tuple] = [],
type: List[str] = [],
)[source]#

Bases: BaseModelNoExt

Schema to hold related extracted objects.

bbox: List[tuple]#
content: List[str]#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

type: List[str]#
class nv_ingest.schemas.metadata_schema.SourceMetadataSchema(
*,
source_name: str,
source_id: str,
source_location: str = '',
source_type: SourceTypeEnum | str,
collection_id: str = '',
date_created: str = '2025-06-30T17:04:05.709444',
last_modified: str = '2025-06-30T17:04:05.709461',
summary: str = '',
partition_id: int = -1,
access_level: AccessLevelEnum | int = -1,
)[source]#

Bases: BaseModelNoExt

Schema for the knowledge base file from which content and metadata are extracted.

access_level: AccessLevelEnum | int#
collection_id: str#
date_created: str#
last_modified: str#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

partition_id: int#
source_id: str#
source_location: str#
source_name: str#
source_type: SourceTypeEnum | str#
summary: str#
classmethod validate_fields(field_value)[source]#
class nv_ingest.schemas.metadata_schema.SourceTypeEnum(value)[source]#

Bases: str, Enum

An enumeration.

DOCX = 'docx'#
PDF = 'pdf'#
PPTX = 'pptx'#
source_type_1 = 'source_type_1'#
source_type_2 = 'source_type_2'#
class nv_ingest.schemas.metadata_schema.StatusEnum(value)[source]#

Bases: str, Enum

An enumeration.

ERROR: str = 'error'#
SUCCESS: str = 'success'#
class nv_ingest.schemas.metadata_schema.StdContentDescEnum(value)[source]#

Bases: str, Enum

An enumeration.

DOCX_IMAGE = 'Image extracted from DOCX document.'#
DOCX_TABLE = 'Structured table extracted from DOCX document.'#
DOCX_TEXT = 'Unstructured text from DOCX document.'#
PDF_CHART = 'Structured chart extracted from PDF document.'#
PDF_IMAGE = 'Image extracted from PDF document.'#
PDF_INFOGRAPHIC = 'Structured infographic extracted from PDF document.'#
PDF_TABLE = 'Structured table extracted from PDF document.'#
PDF_TEXT = 'Unstructured text from PDF document.'#
PPTX_IMAGE = 'Image extracted from PPTX presentation.'#
PPTX_TABLE = 'Structured table extracted from PPTX presentation.'#
PPTX_TEXT = 'Unstructured text from PPTX presentation.'#
class nv_ingest.schemas.metadata_schema.TableFormatEnum(value)[source]#

Bases: str, Enum

An enumeration.

HTML = 'html'#
IMAGE = 'image'#
LATEX = 'latex'#
MARKDOWN = 'markdown'#
PSEUDO_MARKDOWN = 'pseudo_markdown'#
SIMPLE = 'simple'#
class nv_ingest.schemas.metadata_schema.TableMetadataSchema(
*,
caption: str = '',
table_format: TableFormatEnum,
table_content: str = '',
table_content_format: TableFormatEnum | str = '',
table_location: tuple = (0, 0, 0, 0),
table_location_max_dimensions: tuple = (0, 0),
uploaded_image_uri: str = '',
)[source]#

Bases: BaseModelNoExt

caption: str#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

table_content: str#
table_content_format: TableFormatEnum | str#
table_format: TableFormatEnum#
table_location: tuple#
table_location_max_dimensions: tuple#
uploaded_image_uri: str#
class nv_ingest.schemas.metadata_schema.TaskTypeEnum(value)[source]#

Bases: str, Enum

An enumeration.

CAPTION = 'caption'#
EMBED = 'embed'#
EXTRACT = 'extract'#
FILTER = 'filter'#
SPLIT = 'split'#
TRANSFORM = 'transform'#
class nv_ingest.schemas.metadata_schema.TextMetadataSchema(
*,
text_type: TextTypeEnum,
summary: str = '',
keywords: str | List[str] | Dict = '',
language: LanguageEnum = 'en',
text_location: tuple = (0, 0, 0, 0),
text_location_max_dimensions: tuple = (0, 0, 0, 0),
)[source]#

Bases: BaseModelNoExt

keywords: str | List[str] | Dict#
language: LanguageEnum#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

summary: str#
text_location: tuple#
text_location_max_dimensions: tuple#
text_type: TextTypeEnum#
class nv_ingest.schemas.metadata_schema.TextTypeEnum(value)[source]#

Bases: str, Enum

An enumeration.

BLOCK = 'block'#
BODY = 'body'#
DOCUMENT = 'document'#
HEADER = 'header'#
LINE = 'line'#
NEARBY_BLOCK = 'nearby_block'#
OTHER = 'other'#
PAGE = 'page'#
SPAN = 'span'#
nv_ingest.schemas.metadata_schema.validate_metadata(
metadata: Dict[str, Any],
) → MetadataSchema[source]#

Validates the given metadata dictionary against the MetadataSchema.

Parameters:
  • metadata (Dict[str, Any]) – A dictionary representing metadata to be validated.

Returns:

An instance of MetadataSchema if validation is successful.

Return type:

MetadataSchema

Raises:

ValidationError – If the metadata does not conform to the schema.
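
A minimal sketch; a bare content payload validates because every other field is optional:

>>> from nv_ingest.schemas.metadata_schema import validate_metadata
>>> meta = validate_metadata({"content": "Hello world"})
>>> meta.content
'Hello world'
>>> meta.embedding is None
True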

nv_ingest.schemas.otel_meter_schema module#

class nv_ingest.schemas.otel_meter_schema.OpenTelemetryMeterSchema(
*,
broker_client: MessageBrokerClientSchema = MessageBrokerClientSchema(host='redis', port=6379, client_type='redis', broker_params={}, connection_timeout=300, max_backoff=300, max_retries=0),
otel_endpoint: str = 'localhost:4317',
raise_on_failure: bool = False,
)[source]#

Bases: BaseModel

broker_client: MessageBrokerClientSchema#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

otel_endpoint: str#
raise_on_failure: bool#

nv_ingest.schemas.otel_tracer_schema module#

class nv_ingest.schemas.otel_tracer_schema.OpenTelemetryTracerSchema(
*,
otel_endpoint: str = 'localhost:4317',
raise_on_failure: bool = False,
)[source]#

Bases: BaseModel

model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

otel_endpoint: str#
raise_on_failure: bool#

nv_ingest.schemas.pdf_extractor_schema module#

class nv_ingest.schemas.pdf_extractor_schema.NemoRetrieverParseConfigSchema(
*,
auth_token: str | None = None,
nemoretriever_parse_endpoints: Tuple[str | None, str | None] = (None, None),
nemoretriever_parse_infer_protocol: str = '',
model_name: str = 'nvidia/nemoretriever-parse',
timeout: float = 300.0,
workers_per_progress_engine: int = 5,
)[source]#

Bases: BaseModel

Configuration schema for NemoRetrieverParse endpoints and options.

Parameters:
  • auth_token (Optional[str], default=None) – Authentication token required for secure services.

  • nemoretriever_parse_endpoints (Tuple[Optional[str], Optional[str]], default=(None, None)) – A tuple containing the gRPC and HTTP services for the nemoretriever_parse endpoint. Either the gRPC or HTTP service can be empty, but not both.

validate_endpoints(values)[source]#

Validates that at least one of the gRPC or HTTP services is provided for each endpoint.

Raises:
  • ValueError – If both gRPC and HTTP services are empty for any endpoint.

Config:
  • extra (str) – Pydantic config option to forbid extra fields.

auth_token: str | None#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

model_name: str#
nemoretriever_parse_endpoints: Tuple[str | None, str | None]#
nemoretriever_parse_infer_protocol: str#
timeout: float#
classmethod validate_endpoints(values)[source]#

Validates the gRPC and HTTP services for all endpoints.

Parameters:

values (dict) – Dictionary containing the values of the attributes for the class.

Returns:

The validated dictionary of values.

Return type:

dict

Raises:

ValueError – If both gRPC and HTTP services are empty for any endpoint.

workers_per_progress_engine: int#
class nv_ingest.schemas.pdf_extractor_schema.PDFExtractorSchema(
*,
max_queue_size: int = 1,
n_workers: int = 16,
raise_on_failure: bool = False,
pdfium_config: PDFiumConfigSchema | None = None,
nemoretriever_parse_config: NemoRetrieverParseConfigSchema | None = None,
)[source]#

Bases: BaseModel

Configuration schema for the PDF extractor settings.

Parameters:
  • max_queue_size (int, default=1) – The maximum number of items allowed in the processing queue.

  • n_workers (int, default=16) – The number of worker threads to use for processing.

  • raise_on_failure (bool, default=False) – A flag indicating whether to raise an exception on processing failure.

  • pdfium_config (Optional[PDFiumConfigSchema], default=None) – Configuration for the PDFium service endpoints.

max_queue_size: int#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

n_workers: int#
nemoretriever_parse_config: NemoRetrieverParseConfigSchema | None#
pdfium_config: PDFiumConfigSchema | None#
raise_on_failure: bool#
class nv_ingest.schemas.pdf_extractor_schema.PDFiumConfigSchema(
*,
auth_token: str | None = None,
yolox_endpoints: Tuple[str | None, str | None] = (None, None),
yolox_infer_protocol: str = '',
nim_batch_size: int = 4,
workers_per_progress_engine: int = 5,
)[source]#

Bases: BaseModel

Configuration schema for PDFium endpoints and options.

Parameters:
  • auth_token (Optional[str], default=None) – Authentication token required for secure services.

  • yolox_endpoints (Tuple[Optional[str], Optional[str]], default=(None, None)) – A tuple containing the gRPC and HTTP services for the yolox endpoint. Either the gRPC or HTTP service can be empty, but not both.

validate_endpoints(values)[source]#

Validates that at least one of the gRPC or HTTP services is provided for each endpoint.

Raises:
  • ValueError – If both gRPC and HTTP services are empty for any endpoint.

Config:
  • extra (str) – Pydantic config option to forbid extra fields.

auth_token: str | None#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

nim_batch_size: int#
classmethod validate_endpoints(values)[source]#

Validates the gRPC and HTTP services for all endpoints.

Parameters:

values (dict) – Dictionary containing the values of the attributes for the class.

Returns:

The validated dictionary of values.

Return type:

dict

Raises:

ValueError – If both gRPC and HTTP services are empty for any endpoint.

workers_per_progress_engine: int#
yolox_endpoints: Tuple[str | None, str | None]#
yolox_infer_protocol: str#

nv_ingest.schemas.pptx_extractor_schema module#

class nv_ingest.schemas.pptx_extractor_schema.PPTXConfigSchema(
*,
auth_token: str | None = None,
yolox_endpoints: Tuple[str | None, str | None] = (None, None),
yolox_infer_protocol: str = '',
)[source]#

Bases: BaseModel

Configuration schema for PPTX extraction endpoints and options.

Parameters:
  • auth_token (Optional[str], default=None) – Authentication token required for secure services.

  • yolox_endpoints (Tuple[Optional[str], Optional[str]], default=(None, None)) – A tuple containing the gRPC and HTTP services for the yolox endpoint. Either the gRPC or HTTP service can be empty, but not both.

validate_endpoints(values)[source]#

Validates that at least one of the gRPC or HTTP services is provided for each endpoint.

Raises:

ValueError – If both gRPC and HTTP services are empty for any endpoint.

Config:

extra : str – Pydantic config option to forbid extra fields.

auth_token: str | None#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

classmethod validate_endpoints(values)[source]#

Validates the gRPC and HTTP services for all endpoints.

Parameters:

values (dict) – Dictionary containing the values of the attributes for the class.

Returns:

The validated dictionary of values.

Return type:

dict

Raises:

ValueError – If both gRPC and HTTP services are empty for any endpoint.

yolox_endpoints: Tuple[str | None, str | None]#
yolox_infer_protocol: str#
class nv_ingest.schemas.pptx_extractor_schema.PPTXExtractorSchema(
*,
max_queue_size: int = 1,
n_workers: int = 16,
raise_on_failure: bool = False,
pptx_extraction_config: PPTXConfigSchema | None = None,
)[source]#

Bases: BaseModel

Configuration schema for the PPTX extractor settings.

Parameters:
  • max_queue_size (int, default=1) – The maximum number of items allowed in the processing queue.

  • n_workers (int, default=16) – The number of worker threads to use for processing.

  • raise_on_failure (bool, default=False) – A flag indicating whether to raise an exception on processing failure.

  • pptx_extraction_config (Optional[PPTXConfigSchema], default=None) – Configuration for the PPTX extraction stage, including yolox service endpoints.

max_queue_size: int#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

n_workers: int#
pptx_extraction_config: PPTXConfigSchema | None#
raise_on_failure: bool#
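
Example (a minimal sketch; the HTTP address is illustrative):

>>> from nv_ingest.schemas.pptx_extractor_schema import PPTXConfigSchema, PPTXExtractorSchema
>>> pptx_cfg = PPTXConfigSchema(
...     yolox_endpoints=(None, "http://yolox:8000/v1/infer"),
...     yolox_infer_protocol="http",
... )
>>> schema = PPTXExtractorSchema(n_workers=8, pptx_extraction_config=pptx_cfg)
>>> schema.n_workers
8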

nv_ingest.schemas.processing_job_schema module#

class nv_ingest.schemas.processing_job_schema.ConversionStatus(value)[source]#

Bases: str, Enum

An enumeration of conversion job statuses.

FAILED = 'failed'#
IN_PROGRESS = 'in_progress'#
SUCCESS = 'success'#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#
class nv_ingest.schemas.processing_job_schema.ProcessingJob(
*,
submitted_job_id: str,
filename: str,
raw_result: str = '',
content: str = '',
status: ConversionStatus,
error: str | None = None,
)[source]#

Bases: BaseModel

content: str#
error: str | None#
filename: str#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

raw_result: str#
status: ConversionStatus#
submitted_job_id: str#
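
Example (a minimal sketch; the job id and filename are illustrative):

>>> from nv_ingest.schemas.processing_job_schema import ConversionStatus, ProcessingJob
>>> job = ProcessingJob(
...     submitted_job_id="job-0001",
...     filename="report.pdf",
...     status=ConversionStatus.IN_PROGRESS,
... )
>>> job.status.value
'in_progress'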

nv_ingest.schemas.table_extractor_schema module#

class nv_ingest.schemas.table_extractor_schema.TableExtractorConfigSchema(
*,
auth_token: str | None = None,
yolox_endpoints: Tuple[str | None, str | None] = (None, None),
yolox_infer_protocol: str = '',
paddle_endpoints: Tuple[str | None, str | None] = (None, None),
paddle_infer_protocol: str = '',
nim_batch_size: int = 2,
workers_per_progress_engine: int = 5,
)[source]#

Bases: BaseModel

Configuration schema for the table extraction stage settings.

Parameters:
  • auth_token (Optional[str], default=None) – Authentication token required for secure services.

  • yolox_endpoints (Tuple[Optional[str], Optional[str]], default=(None, None)) – A tuple containing the gRPC and HTTP services for the yolox endpoint. Either the gRPC or HTTP service can be empty, but not both.

  • paddle_endpoints (Tuple[Optional[str], Optional[str]], default=(None, None)) – A tuple containing the gRPC and HTTP services for the paddle endpoint. Either the gRPC or HTTP service can be empty, but not both.

validate_endpoints(values)[source]#

Validates that at least one of the gRPC or HTTP services is provided for the yolox endpoint.

Raises:

ValueError – If both gRPC and HTTP services are empty for the yolox endpoint.

Config:

extra : str – Pydantic config option to forbid extra fields.

auth_token: str | None#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

nim_batch_size: int#
paddle_endpoints: Tuple[str | None, str | None]#
paddle_infer_protocol: str#
classmethod validate_endpoints(values)[source]#

Validates the gRPC and HTTP services for the yolox endpoint.

Parameters:

values (dict) – Dictionary containing the values of the attributes for the class.

Returns:

The validated dictionary of values.

Return type:

dict

Raises:

ValueError – If both gRPC and HTTP services are empty for the yolox endpoint.

workers_per_progress_engine: int#
yolox_endpoints: Tuple[str | None, str | None]#
yolox_infer_protocol: str#
class nv_ingest.schemas.table_extractor_schema.TableExtractorSchema(
*,
max_queue_size: int = 1,
n_workers: int = 2,
raise_on_failure: bool = False,
stage_config: TableExtractorConfigSchema | None = None,
)[source]#

Bases: BaseModel

Configuration schema for the table extraction processing settings.

Parameters:
  • max_queue_size (int, default=1) – The maximum number of items allowed in the processing queue.

  • n_workers (int, default=2) – The number of worker threads to use for processing.

  • raise_on_failure (bool, default=False) – A flag indicating whether to raise an exception if a failure occurs during table extraction.

  • stage_config (Optional[TableExtractorConfigSchema], default=None) – Configuration for the table extraction stage, including yolox service endpoints.

classmethod check_positive(v, field)[source]#
max_queue_size: int#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

n_workers: int#
raise_on_failure: bool#
stage_config: TableExtractorConfigSchema | None#
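
Example (a minimal sketch; both endpoint addresses are illustrative):

>>> from nv_ingest.schemas.table_extractor_schema import (
...     TableExtractorConfigSchema,
...     TableExtractorSchema,
... )
>>> stage_cfg = TableExtractorConfigSchema(
...     yolox_endpoints=("yolox:8001", None),
...     paddle_endpoints=(None, "http://paddle:8000/v1/infer"),
... )
>>> schema = TableExtractorSchema(n_workers=4, stage_config=stage_cfg)
>>> schema.stage_config.nim_batch_size
2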

nv_ingest.schemas.task_injection_schema module#

class nv_ingest.schemas.task_injection_schema.TaskInjectionSchema(*, raise_on_failure: bool = False)[source]#

Bases: BaseModel

model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

raise_on_failure: bool#

nv_ingest.schemas.text_splitter_schema module#

class nv_ingest.schemas.text_splitter_schema.TextSplitterSchema(
*,
tokenizer: str | None = None,
chunk_size: Annotated[int, Gt(gt=0)] = 1024,
chunk_overlap: Annotated[int, Ge(ge=0)] = 150,
raise_on_failure: bool = False,
)[source]#

Bases: BaseModel

classmethod check_chunk_overlap(v, values, **kwargs)[source]#
chunk_overlap: Annotated[int, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=0)])]#
chunk_size: Annotated[int, FieldInfo(annotation=NoneType, required=True, metadata=[Gt(gt=0)])]#
model_config: ClassVar[ConfigDict] = {}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

raise_on_failure: bool#
tokenizer: str | None#
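
Example (a minimal sketch; the tokenizer name is illustrative, and it is assumed, not confirmed by this page, that check_chunk_overlap requires the overlap to be smaller than chunk_size):

>>> from nv_ingest.schemas.text_splitter_schema import TextSplitterSchema
>>> splitter = TextSplitterSchema(
...     tokenizer="bert-base-uncased",  # illustrative tokenizer identifier
...     chunk_size=512,                 # must be > 0
...     chunk_overlap=64,               # must be >= 0 (and assumed < chunk_size)
... )
>>> splitter.chunk_overlap
64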

nv_ingest.schemas.vdb_task_sink_schema module#

class nv_ingest.schemas.vdb_task_sink_schema.VdbTaskSinkSchema(
*,
recreate: bool = False,
service: str = 'milvus',
is_service_serialized: bool = False,
default_resource_name: str = 'nv_ingest_collection',
resource_schemas: dict = {'nv_ingest_collection': {'index_conf': {'field_name': 'vector',
'index_type': 'GPU_CAGRA',
'metric_type': 'L2',
'params': {'build_algo': 'NN_DESCENT',
'graph_degree': 64,
'intermediate_graph_degree': 128}},
'schema_conf': {'description': 'NV-INGEST collection schema',
'enable_dynamic_field': True,
'schema_fields': [{'auto_id': True,
'description': 'Primary key for the collection',
'is_primary': True,
'name': 'pk',
'type': DataType.INT64},
{'description': 'Extracted content',
'name': 'text',
'params': {'max_length': 65535},
'type': DataType.VARCHAR},
{'description': 'Embedding vectors',
'name': 'vector',
'params': {'dim': 1024},
'type': DataType.FLOAT_VECTOR},
{'description': 'Source document and raw data extracted content',
'name': 'source',
'type': DataType.JSON},
{'description': 'Content metadata',
'name': 'content_metadata',
'type': DataType.JSON}]}}},
resource_kwargs: dict = <factory>,
service_kwargs: dict = {},
batch_size: int = 5120,
write_time_interval: float = 1.0,
retry_interval: float = 60.0,
raise_on_failure: bool = False,
progress_engines: ~typing.Annotated[int, ~annotated_types.Ge(ge=1)] = 1,
)[source]#

Bases: BaseModel

batch_size: int#
default_resource_name: str#
is_service_serialized: bool#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

progress_engines: Annotated[int, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=1)])]#
raise_on_failure: bool#
recreate: bool#
resource_kwargs: dict#
resource_schemas: dict#
retry_interval: float#
service: str#
service_kwargs: dict#
classmethod validate_resource_name(to_validate)[source]#
classmethod validate_service(to_validate)[source]#
write_time_interval: float#
nv_ingest.schemas.vdb_task_sink_schema.build_default_milvus_config(
embedding_size: int = 1024,
) Dict[str, Any][source]#

Builds the configuration for Milvus.

This function creates a dictionary configuration for a Milvus collection. It includes the index configuration and the schema configuration, with fields such as pk, text, vector, source, and content_metadata.

Parameters:

embedding_size (int) – The size of the embedding vector.

Returns:

A dictionary containing the configuration settings for Milvus.

Return type:

Dict[str, Any]
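
Example (a sketch; it assumes the returned mapping mirrors the index_conf/schema_conf structure shown in VdbTaskSinkSchema's default resource_schemas):

>>> from nv_ingest.schemas.vdb_task_sink_schema import build_default_milvus_config
>>> conf = build_default_milvus_config(embedding_size=768)
>>> # The 'vector' field's dimension should now follow embedding_size (768 instead of 1024).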

Module contents#

class nv_ingest.schemas.ImageCaptionExtractionSchema(
*,
api_key: str = 'api_key',
endpoint_url: str = 'https://ai.api.nvidia.com/v1/gr/meta/llama-3.2-11b-vision-instruct/chat/completions',
prompt: str = 'Caption the content of this image:',
model_name: str = 'meta/llama-3.2-11b-vision-instruct',
raise_on_failure: bool = False,
)[source]#

Bases: BaseModel

api_key: str#
endpoint_url: str#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

model_name: str#
prompt: str#
raise_on_failure: bool#
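
Example (a minimal sketch; the API key is a placeholder):

>>> from nv_ingest.schemas import ImageCaptionExtractionSchema
>>> captioning = ImageCaptionExtractionSchema(api_key="my-api-key")
>>> captioning.model_name
'meta/llama-3.2-11b-vision-instruct'
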
class nv_ingest.schemas.ImageStorageModuleSchema(
*,
structured: bool = True,
images: bool = True,
raise_on_failure: bool = False,
)[source]#

Bases: BaseModel

images: bool#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

raise_on_failure: bool#
structured: bool#
class nv_ingest.schemas.IngestJobSchema(
*,
job_payload: JobPayloadSchema,
job_id: str | int,
tasks: List[IngestTaskSchema],
tracing_options: TracingOptionsSchema | None = None,
)[source]#

Bases: BaseModelNoExt

job_id: str | int#
job_payload: JobPayloadSchema#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

tasks: List[IngestTaskSchema]#
tracing_options: TracingOptionsSchema | None#
class nv_ingest.schemas.MessageBrokerClientSchema(
*,
host: str = 'redis',
port: Annotated[int, Gt(gt=0), Lt(lt=65536)] = 6379,
client_type: Literal['redis', 'simple'] = 'redis',
broker_params: dict | None = {},
connection_timeout: Annotated[int, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=0)])] | None = 300,
max_backoff: Annotated[int, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=0)])] | None = 300,
max_retries: Annotated[int, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=0)])] | None = 0,
)[source]#

Bases: BaseModel

broker_params: dict | None#
client_type: Literal['redis', 'simple']#
connection_timeout: Annotated[int, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=0)])] | None#
host: str#
max_backoff: Annotated[int, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=0)])] | None#
max_retries: Annotated[int, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=0)])] | None#
model_config: ClassVar[ConfigDict] = {}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

port: Annotated[int, FieldInfo(annotation=NoneType, required=True, metadata=[Gt(gt=0), Lt(lt=65536)])]#
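
Example (a minimal sketch; host and port are illustrative):

>>> from nv_ingest.schemas import MessageBrokerClientSchema
>>> client = MessageBrokerClientSchema(host="localhost", port=6380, client_type="redis")
>>> client.connection_timeout
300
>>> # port must satisfy 0 < port < 65536; client_type accepts only 'redis' or 'simple'.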
class nv_ingest.schemas.MessageBrokerTaskSinkSchema(
*,
broker_client: MessageBrokerClientSchema = MessageBrokerClientSchema(host='redis', port=6379, client_type='redis', broker_params={}, connection_timeout=300, max_backoff=300, max_retries=0),
raise_on_failure: bool = False,
progress_engines: Annotated[int, Ge(ge=1)] = 6,
)[source]#

Bases: BaseModel

broker_client: MessageBrokerClientSchema#
model_config: ClassVar[ConfigDict] = {}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

progress_engines: Annotated[int, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=1)])]#
raise_on_failure: bool#
class nv_ingest.schemas.MessageBrokerTaskSourceSchema(
*,
broker_client: MessageBrokerClientSchema = MessageBrokerClientSchema(host='redis', port=6379, client_type='redis', broker_params={}, connection_timeout=300, max_backoff=300, max_retries=0),
task_queue: str = 'morpheus_task_queue',
raise_on_failure: bool = False,
progress_engines: Annotated[int, Ge(ge=1)] = 6,
)[source]#

Bases: BaseModel

broker_client: MessageBrokerClientSchema#
model_config: ClassVar[ConfigDict] = {}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

progress_engines: Annotated[int, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=1)])]#
raise_on_failure: bool#
task_queue: str#
class nv_ingest.schemas.MetadataInjectorSchema(*, raise_on_failure: bool = False)[source]#

Bases: BaseModel

model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

raise_on_failure: bool#
class nv_ingest.schemas.PDFExtractorSchema(
*,
max_queue_size: int = 1,
n_workers: int = 16,
raise_on_failure: bool = False,
pdfium_config: PDFiumConfigSchema | None = None,
nemoretriever_parse_config: NemoRetrieverParseConfigSchema | None = None,
)[source]#

Bases: BaseModel

Configuration schema for the PDF extractor settings.

Parameters:
  • max_queue_size (int, default=1) – The maximum number of items allowed in the processing queue.

  • n_workers (int, default=16) – The number of worker threads to use for processing.

  • raise_on_failure (bool, default=False) – A flag indicating whether to raise an exception on processing failure.

  • pdfium_config (Optional[PDFiumConfigSchema], default=None) – Configuration for the PDFium service endpoints.

  • nemoretriever_parse_config (Optional[NemoRetrieverParseConfigSchema], default=None) – Configuration for the nemoretriever-parse service endpoints.

max_queue_size: int#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

n_workers: int#
nemoretriever_parse_config: NemoRetrieverParseConfigSchema | None#
pdfium_config: PDFiumConfigSchema | None#
raise_on_failure: bool#
class nv_ingest.schemas.PipelineConfigSchema(
*,
audio_extractor_schema: ~nv_ingest.schemas.audio_extractor_schema.AudioExtractorSchema = AudioExtractorSchema(max_queue_size=1, n_workers=16, raise_on_failure=False, audio_extraction_config=None),
chart_extractor_module: ~nv_ingest.schemas.chart_extractor_schema.ChartExtractorSchema = ChartExtractorSchema(max_queue_size=1, n_workers=2, raise_on_failure=False, stage_config=None),
text_splitter_module: ~nv_ingest.schemas.text_splitter_schema.TextSplitterSchema = TextSplitterSchema(tokenizer=None, chunk_size=1024, chunk_overlap=150, raise_on_failure=False),
embedding_storage_module: ~nv_ingest.schemas.embedding_storage_schema.EmbeddingStorageModuleSchema = EmbeddingStorageModuleSchema(raise_on_failure=False),
embed_extractions_module: ~nv_ingest.schemas.embed_extractions_schema.EmbedExtractionsSchema = EmbedExtractionsSchema(api_key='api_key', batch_size=8192, embedding_model='nvidia/nv-embedqa-e5-v5', embedding_nim_endpoint='http://embedding:8000/v1', encoding_format='float', httpx_log_level=<LogLevel.WARNING: 'WARNING'>, input_type='passage', raise_on_failure=False, truncate='END'),
image_caption_extraction_module: ~nv_ingest.schemas.image_caption_extraction_schema.ImageCaptionExtractionSchema = ImageCaptionExtractionSchema(api_key='api_key', endpoint_url='https://ai.api.nvidia.com/v1/gr/meta/llama-3.2-11b-vision-instruct/chat/completions', prompt='Caption the content of this image:', model_name='meta/llama-3.2-11b-vision-instruct', raise_on_failure=False),
image_dedup_module: ~nv_ingest.schemas.image_dedup_schema.ImageDedupSchema = ImageDedupSchema(raise_on_failure=False, cpu_only=False),
image_filter_module: ~nv_ingest.schemas.image_filter_schema.ImageFilterSchema = ImageFilterSchema(raise_on_failure=False, cpu_only=False),
image_storage_module: ~nv_ingest.schemas.image_storage_schema.ImageStorageModuleSchema = ImageStorageModuleSchema(structured=True, images=True, raise_on_failure=False),
infographic_extractor_module: ~nv_ingest.schemas.infographic_extractor_schema.InfographicExtractorSchema = InfographicExtractorSchema(max_queue_size=1, n_workers=2, raise_on_failure=False, stage_config=None),
job_counter_module: ~nv_ingest.schemas.job_counter_schema.JobCounterSchema = JobCounterSchema(name='job_counter', raise_on_failure=False),
metadata_injection_module: ~nv_ingest.schemas.metadata_injector_schema.MetadataInjectorSchema = MetadataInjectorSchema(raise_on_failure=False),
otel_meter_module: ~nv_ingest.schemas.otel_meter_schema.OpenTelemetryMeterSchema = OpenTelemetryMeterSchema(broker_client=MessageBrokerClientSchema(host='redis', port=6379, client_type='redis', broker_params={}, connection_timeout=300, max_backoff=300, max_retries=0), otel_endpoint='localhost:4317', raise_on_failure=False),
otel_tracer_module: ~nv_ingest.schemas.otel_tracer_schema.OpenTelemetryTracerSchema = OpenTelemetryTracerSchema(otel_endpoint='localhost:4317', raise_on_failure=False),
pdf_extractor_module: ~nv_ingest.schemas.pdf_extractor_schema.PDFExtractorSchema = PDFExtractorSchema(max_queue_size=1, n_workers=16, raise_on_failure=False, pdfium_config=None, nemoretriever_parse_config=None),
pptx_extractor_module: ~nv_ingest.schemas.pptx_extractor_schema.PPTXExtractorSchema = PPTXExtractorSchema(max_queue_size=1, n_workers=16, raise_on_failure=False, pptx_extraction_config=None),
redis_task_sink: ~nv_ingest.schemas.message_broker_sink_schema.MessageBrokerTaskSinkSchema = MessageBrokerTaskSinkSchema(broker_client=MessageBrokerClientSchema(host='redis', port=6379, client_type='redis', broker_params={}, connection_timeout=300, max_backoff=300, max_retries=0), raise_on_failure=False, progress_engines=6),
redis_task_source: ~nv_ingest.schemas.message_broker_source_schema.MessageBrokerTaskSourceSchema = MessageBrokerTaskSourceSchema(broker_client=MessageBrokerClientSchema(host='redis', port=6379, client_type='redis', broker_params={}, connection_timeout=300, max_backoff=300, max_retries=0), task_queue='morpheus_task_queue', raise_on_failure=False, progress_engines=6),
table_extractor_module: ~nv_ingest.schemas.table_extractor_schema.TableExtractorSchema = TableExtractorSchema(max_queue_size=1, n_workers=2, raise_on_failure=False, stage_config=None),
vdb_task_sink: ~nv_ingest.schemas.vdb_task_sink_schema.VdbTaskSinkSchema = VdbTaskSinkSchema(recreate=False, service='milvus', is_service_serialized=False, default_resource_name='nv_ingest_collection', resource_schemas={'nv_ingest_collection': {'index_conf': {'field_name': 'vector', 'metric_type': 'L2', 'index_type': 'GPU_CAGRA', 'params': {'intermediate_graph_degree': 128, 'graph_degree': 64, 'build_algo': 'NN_DESCENT'}}, 'schema_conf': {'enable_dynamic_field': True, 'schema_fields': [{'name': 'pk', 'description': 'Primary key for the collection', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': True}, {'name': 'text', 'description': 'Extracted content', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 65535}}, {'name': 'vector', 'description': 'Embedding vectors', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 1024}}, {'name': 'source', 'description': 'Source document and raw data extracted content', 'type': <DataType.JSON: 23>}, {'name': 'content_metadata', 'description': 'Content metadata', 'type': <DataType.JSON: 23>}], 'description': 'NV-INGEST collection schema'}}}, resource_kwargs={}, service_kwargs={}, batch_size=5120, write_time_interval=1.0, retry_interval=60.0, raise_on_failure=False, progress_engines=1),
)[source]#

Bases: BaseModel

audio_extractor_schema: AudioExtractorSchema#
chart_extractor_module: ChartExtractorSchema#
embed_extractions_module: EmbedExtractionsSchema#
embedding_storage_module: EmbeddingStorageModuleSchema#
image_caption_extraction_module: ImageCaptionExtractionSchema#
image_dedup_module: ImageDedupSchema#
image_filter_module: ImageFilterSchema#
image_storage_module: ImageStorageModuleSchema#
infographic_extractor_module: InfographicExtractorSchema#
job_counter_module: JobCounterSchema#
metadata_injection_module: MetadataInjectorSchema#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

otel_meter_module: OpenTelemetryMeterSchema#
otel_tracer_module: OpenTelemetryTracerSchema#
pdf_extractor_module: PDFExtractorSchema#
pptx_extractor_module: PPTXExtractorSchema#
redis_task_sink: MessageBrokerTaskSinkSchema#
redis_task_source: MessageBrokerTaskSourceSchema#
table_extractor_module: TableExtractorSchema#
text_splitter_module: TextSplitterSchema#
vdb_task_sink: VdbTaskSinkSchema#
class nv_ingest.schemas.TaskInjectionSchema(*, raise_on_failure: bool = False)[source]#

Bases: BaseModel

model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

raise_on_failure: bool#
class nv_ingest.schemas.TextSplitterSchema(
*,
tokenizer: str | None = None,
chunk_size: Annotated[int, Gt(gt=0)] = 1024,
chunk_overlap: Annotated[int, Ge(ge=0)] = 150,
raise_on_failure: bool = False,
)[source]#

Bases: BaseModel

classmethod check_chunk_overlap(v, values, **kwargs)[source]#
chunk_overlap: Annotated[int, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=0)])]#
chunk_size: Annotated[int, FieldInfo(annotation=NoneType, required=True, metadata=[Gt(gt=0)])]#
model_config: ClassVar[ConfigDict] = {}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

raise_on_failure: bool#
tokenizer: str | None#
class nv_ingest.schemas.VdbTaskSinkSchema(
*,
recreate: bool = False,
service: str = 'milvus',
is_service_serialized: bool = False,
default_resource_name: str = 'nv_ingest_collection',
resource_schemas: dict = {'nv_ingest_collection': {'index_conf': {'field_name': 'vector',
'index_type': 'GPU_CAGRA',
'metric_type': 'L2',
'params': {'build_algo': 'NN_DESCENT',
'graph_degree': 64,
'intermediate_graph_degree': 128}},
'schema_conf': {'description': 'NV-INGEST collection schema',
'enable_dynamic_field': True,
'schema_fields': [{'auto_id': True,
'description': 'Primary key for the collection',
'is_primary': True,
'name': 'pk',
'type': DataType.INT64},
{'description': 'Extracted content',
'name': 'text',
'params': {'max_length': 65535},
'type': DataType.VARCHAR},
{'description': 'Embedding vectors',
'name': 'vector',
'params': {'dim': 1024},
'type': DataType.FLOAT_VECTOR},
{'description': 'Source document and raw data extracted content',
'name': 'source',
'type': DataType.JSON},
{'description': 'Content metadata',
'name': 'content_metadata',
'type': DataType.JSON}]}}},
resource_kwargs: dict = <factory>,
service_kwargs: dict = {},
batch_size: int = 5120,
write_time_interval: float = 1.0,
retry_interval: float = 60.0,
raise_on_failure: bool = False,
progress_engines: ~typing.Annotated[int, ~annotated_types.Ge(ge=1)] = 1,
)[source]#

Bases: BaseModel

batch_size: int#
default_resource_name: str#
is_service_serialized: bool#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

progress_engines: Annotated[int, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=1)])]#
raise_on_failure: bool#
recreate: bool#
resource_kwargs: dict#
resource_schemas: dict#
retry_interval: float#
service: str#
service_kwargs: dict#
classmethod validate_resource_name(to_validate)[source]#
classmethod validate_service(to_validate)[source]#
write_time_interval: float#
nv_ingest.schemas.validate_ingest_job(
job_data: Dict[str, Any],
) IngestJobSchema[source]#

Validates a dictionary representing an ingest job using the IngestJobSchema.

Parameters:

job_data (Dict[str, Any]) – Dictionary representing an ingest job.

Returns:

The validated ingest job.

Return type:

IngestJobSchema

Raises:

ValidationError – If the input data does not conform to the IngestJobSchema.
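
Example (a sketch of the failure path; the dictionary is deliberately incomplete, so required fields such as job_payload and tasks are reported by pydantic):

>>> from pydantic import ValidationError
>>> from nv_ingest.schemas import validate_ingest_job
>>> try:
...     validate_ingest_job({"job_id": "demo"})
... except ValidationError as exc:
...     print(type(exc).__name__)
ValidationError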

nv_ingest.schemas.validate_metadata(
metadata: Dict[str, Any],
) MetadataSchema[source]#

Validates the given metadata dictionary against the MetadataSchema.

Parameters:

metadata (Dict[str, Any]) – A dictionary representing metadata to be validated.

Returns:

An instance of MetadataSchema if validation is successful.

Return type:

MetadataSchema

Raises:

ValidationError – If the metadata does not conform to the schema.
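
Example (a sketch of a defensive wrapper; whether a given dictionary passes depends on MetadataSchema, which is documented elsewhere in this package):

>>> from pydantic import ValidationError
>>> from nv_ingest.schemas import validate_metadata
>>> def validate_or_none(metadata: dict):
...     """Return the validated MetadataSchema instance, or None if validation fails."""
...     try:
...         return validate_metadata(metadata)
...     except ValidationError as exc:
...         print(f"invalid metadata: {exc.error_count()} error(s)")
...         return None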