nv_ingest.schemas package#

Subpackages#

Submodules#

nv_ingest.schemas.associate_nearby_text_schema module#

class nv_ingest.schemas.associate_nearby_text_schema.AssociateNearbyTextSchema(
*,
n_neighbors: int = 5,
raise_on_failure: bool = False,
)[source]#

Bases: BaseModel

model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

n_neighbors: int#
raise_on_failure: bool#
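
A minimal usage sketch (illustrative only; assumes the nv_ingest package is importable and Pydantic v2 semantics):

>>> from nv_ingest.schemas.associate_nearby_text_schema import AssociateNearbyTextSchema
>>> import pydantic
>>> AssociateNearbyTextSchema(n_neighbors=3).n_neighbors
3
>>> try:
...     AssociateNearbyTextSchema(typo_field=1)  # extra='forbid' rejects unknown fields
... except pydantic.ValidationError:
...     print("extra fields are forbidden")
extra fields are forbidden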

nv_ingest.schemas.base_model_noext module#

class nv_ingest.schemas.base_model_noext.BaseModelNoExt[source]#

Bases: BaseModel

model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
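
This base is shared by the ingest job and metadata schemas below, so its subclasses reject unrecognized fields. A small illustrative subclass (Point is hypothetical, not part of the package):

>>> from nv_ingest.schemas.base_model_noext import BaseModelNoExt
>>> import pydantic
>>> class Point(BaseModelNoExt):
...     x: int = 0
>>> Point(x=1).x
1
>>> try:
...     Point(x=1, y=2)
... except pydantic.ValidationError:
...     print("extra fields are forbidden")
extra fields are forbidden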

nv_ingest.schemas.chart_extractor_schema module#

class nv_ingest.schemas.chart_extractor_schema.ChartExtractorConfigSchema(
*,
auth_token: str | None = None,
yolox_endpoints: Tuple[str | None, str | None] = (None, None),
yolox_infer_protocol: str = '',
paddle_endpoints: Tuple[str | None, str | None] = (None, None),
paddle_infer_protocol: str = '',
nim_batch_size: int = 2,
workers_per_progress_engine: int = 5,
)[source]#

Bases: BaseModel

Configuration schema for chart extraction service endpoints and options.

Parameters:
  • auth_token (Optional[str], default=None) – Authentication token required for secure services.

  • yolox_endpoints (Tuple[Optional[str], Optional[str]], default=(None, None)) – A tuple containing the gRPC and HTTP services for the yolox endpoint. Either the gRPC or HTTP service can be empty, but not both.

  • paddle_endpoints (Tuple[Optional[str], Optional[str]], default=(None, None)) – A tuple containing the gRPC and HTTP services for the paddle endpoint. Either the gRPC or HTTP service can be empty, but not both.

validate_endpoints(values)[source]#

Validates that at least one of the gRPC or HTTP services is provided for each endpoint.

Raises:
  • ValueError – If both gRPC and HTTP services are empty for any endpoint.

Config:
  • extra (str) – Pydantic config option to forbid extra fields.

auth_token: str | None#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

nim_batch_size: int#
paddle_endpoints: Tuple[str | None, str | None]#
paddle_infer_protocol: str#
classmethod validate_endpoints(values)[source]#

Validates the gRPC and HTTP services for all endpoints.

Ensures that at least one service (either gRPC or HTTP) is provided for each endpoint in the configuration.

Parameters:

values (dict) – Dictionary containing the values of the attributes for the class.

Returns:

The validated dictionary of values.

Return type:

dict

Raises:

ValueError – If both gRPC and HTTP services are empty for any endpoint.

workers_per_progress_engine: int#
yolox_endpoints: Tuple[str | None, str | None]#
yolox_infer_protocol: str#
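
A sketch of the endpoint validation described above (the endpoint URLs are placeholders, not real services):

>>> from nv_ingest.schemas.chart_extractor_schema import ChartExtractorConfigSchema
>>> import pydantic
>>> cfg = ChartExtractorConfigSchema(
...     yolox_endpoints=(None, "http://yolox:8000/v1/infer"),
...     paddle_endpoints=("paddle:8001", None),
... )
>>> cfg.yolox_endpoints
(None, 'http://yolox:8000/v1/infer')
>>> try:
...     ChartExtractorConfigSchema()  # both gRPC and HTTP left empty
... except pydantic.ValidationError:
...     print("at least one service per endpoint is required")
at least one service per endpoint is required
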
class nv_ingest.schemas.chart_extractor_schema.ChartExtractorSchema(
*,
max_queue_size: int = 1,
n_workers: int = 2,
raise_on_failure: bool = False,
stage_config: ChartExtractorConfigSchema | None = None,
)[source]#

Bases: BaseModel

Configuration schema for chart extraction processing settings.

Parameters:
  • max_queue_size (int, default=1) – The maximum number of items allowed in the processing queue.

  • n_workers (int, default=2) – The number of worker threads to use for processing.

  • raise_on_failure (bool, default=False) – A flag indicating whether to raise an exception if a failure occurs during chart extraction.

  • stage_config (Optional[ChartExtractorConfigSchema], default=None) – Configuration for the chart extraction stage, including yolox and paddle service endpoints.

classmethod check_positive(v, field)[source]#
max_queue_size: int#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

n_workers: int#
raise_on_failure: bool#
stage_config: ChartExtractorConfigSchema | None#
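
A sketch showing how the stage settings wrap the service configuration above (endpoint addresses are placeholders):

>>> from nv_ingest.schemas.chart_extractor_schema import (
...     ChartExtractorConfigSchema,
...     ChartExtractorSchema,
... )
>>> stage = ChartExtractorSchema(
...     n_workers=4,
...     stage_config=ChartExtractorConfigSchema(
...         yolox_endpoints=("yolox:8001", None),
...         paddle_endpoints=("paddle:8001", None),
...     ),
... )
>>> stage.n_workers, stage.max_queue_size
(4, 1)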

nv_ingest.schemas.docx_extractor_schema module#

class nv_ingest.schemas.docx_extractor_schema.DocxConfigSchema(
*,
auth_token: str | None = None,
yolox_endpoints: Tuple[str | None, str | None] = (None, None),
yolox_infer_protocol: str = '',
)[source]#

Bases: BaseModel

Configuration schema for docx extraction endpoints and options.

Parameters:
  • auth_token (Optional[str], default=None) – Authentication token required for secure services.

  • yolox_endpoints (Tuple[Optional[str], Optional[str]], default=(None, None)) – A tuple containing the gRPC and HTTP services for the yolox endpoint. Either the gRPC or HTTP service can be empty, but not both.

validate_endpoints(values)[source]#

Validates that at least one of the gRPC or HTTP services is provided for each endpoint.

Raises:
  • ValueError – If both gRPC and HTTP services are empty for any endpoint.

Config:
  • extra (str) – Pydantic config option to forbid extra fields.

auth_token: str | None#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

classmethod validate_endpoints(values)[source]#

Validates the gRPC and HTTP services for all endpoints.

Parameters:

values (dict) – Dictionary containing the values of the attributes for the class.

Returns:

The validated dictionary of values.

Return type:

dict

Raises:

ValueError – If both gRPC and HTTP services are empty for any endpoint.

yolox_endpoints: Tuple[str | None, str | None]#
yolox_infer_protocol: str#
class nv_ingest.schemas.docx_extractor_schema.DocxExtractorSchema(
*,
max_queue_size: int = 1,
n_workers: int = 16,
raise_on_failure: bool = False,
docx_extraction_config: DocxConfigSchema | None = None,
)[source]#

Bases: BaseModel

Configuration schema for the docx extractor settings.

Parameters:
  • max_queue_size (int, default=1) – The maximum number of items allowed in the processing queue.

  • n_workers (int, default=16) – The number of worker threads to use for processing.

  • raise_on_failure (bool, default=False) – A flag indicating whether to raise an exception on processing failure.

  • docx_extraction_config (Optional[DocxConfigSchema], default=None) – Configuration schema for the docx extraction stage.

docx_extraction_config: DocxConfigSchema | None#
max_queue_size: int#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

n_workers: int#
raise_on_failure: bool#

nv_ingest.schemas.embed_extractions_schema module#

class nv_ingest.schemas.embed_extractions_schema.EmbedExtractionsSchema(
*,
api_key: str = 'api_key',
batch_size: int = 8192,
embedding_model: str = 'nvidia/nv-embedqa-e5-v5',
embedding_nim_endpoint: str = 'http://embedding:8000/v1',
encoding_format: str = 'float',
httpx_log_level: LogLevel = LogLevel.WARNING,
input_type: str = 'passage',
raise_on_failure: bool = False,
truncate: str = 'END',
)[source]#

Bases: BaseModel

api_key: str#
batch_size: int#
embedding_model: str#
embedding_nim_endpoint: str#
encoding_format: str#
httpx_log_level: LogLevel#
input_type: str#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

raise_on_failure: bool#
truncate: str#
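
A sketch of overriding a few embedding defaults (the API key is a placeholder):

>>> from nv_ingest.schemas.embed_extractions_schema import EmbedExtractionsSchema
>>> cfg = EmbedExtractionsSchema(api_key="my-key", batch_size=256)
>>> cfg.embedding_model
'nvidia/nv-embedqa-e5-v5'
>>> cfg.truncate
'END'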

nv_ingest.schemas.embedding_storage_schema module#

class nv_ingest.schemas.embedding_storage_schema.EmbeddingStorageModuleSchema(*, raise_on_failure: bool = False)[source]#

Bases: BaseModel

model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

raise_on_failure: bool#

nv_ingest.schemas.file_source_pipe_schema module#

class nv_ingest.schemas.file_source_pipe_schema.FileSourcePipeSchema(
*,
batch_size: int = 1024,
chunk_overlap: int = 51,
chunk_size: int = 512,
converters_meta: Dict[Any, Any] | None = {},
enable_monitor: bool = False,
extractor_config: Dict[Any, Any] | None = {},
filenames: List[str] = <factory>,
num_threads: int = 1,
vdb_resource_name: str,
watch: bool = False,
watch_interval: float = -5.0,
)[source]#

Bases: BaseModel

batch_size: int#
chunk_overlap: int#
chunk_size: int#
converters_meta: Dict[Any, Any] | None#
enable_monitor: bool#
extractor_config: Dict[Any, Any] | None#
filenames: List[str]#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

num_threads: int#
vdb_resource_name: str#
watch: bool#
watch_interval: float#
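
A sketch of a file source configuration; only vdb_resource_name has no default (file paths and names are placeholders):

>>> from nv_ingest.schemas.file_source_pipe_schema import FileSourcePipeSchema
>>> pipe = FileSourcePipeSchema(
...     filenames=["data/a.pdf", "data/b.pdf"],
...     vdb_resource_name="nv_ingest_collection",
... )
>>> pipe.chunk_size, pipe.chunk_overlap
(512, 51)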

nv_ingest.schemas.image_caption_extraction_schema module#

class nv_ingest.schemas.image_caption_extraction_schema.ImageCaptionExtractionSchema(
*,
api_key: str = 'api_key',
endpoint_url: str = 'https://ai.api.nvidia.com/v1/gr/meta/llama-3.2-11b-vision-instruct/chat/completions',
prompt: str = 'Caption the content of this image:',
model_name: str = 'meta/llama-3.2-11b-vision-instruct',
raise_on_failure: bool = False,
)[source]#

Bases: BaseModel

api_key: str#
endpoint_url: str#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

model_name: str#
prompt: str#
raise_on_failure: bool#

nv_ingest.schemas.image_dedup_schema module#

class nv_ingest.schemas.image_dedup_schema.ImageDedupSchema(
*,
raise_on_failure: Annotated[bool, Strict(strict=True)] = False,
cpu_only: Annotated[bool, Strict(strict=True)] = False,
)[source]#

Bases: BaseModel

cpu_only: Annotated[bool, Strict(strict=True)]#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

raise_on_failure: Annotated[bool, Strict(strict=True)]#
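
Both flags are strict booleans, so strings such as 'true' are not coerced; a sketch:

>>> from nv_ingest.schemas.image_dedup_schema import ImageDedupSchema
>>> import pydantic
>>> ImageDedupSchema(raise_on_failure=True).raise_on_failure
True
>>> try:
...     ImageDedupSchema(raise_on_failure="true")
... except pydantic.ValidationError:
...     print("strict bool: no string coercion")
strict bool: no string coercion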

nv_ingest.schemas.image_extractor_schema module#

class nv_ingest.schemas.image_extractor_schema.ImageConfigSchema(
*,
auth_token: str | None = None,
yolox_endpoints: Tuple[str | None, str | None] = (None, None),
yolox_infer_protocol: str = '',
)[source]#

Bases: BaseModel

Configuration schema for image extraction endpoints and options.

Parameters:
  • auth_token (Optional[str], default=None) – Authentication token required for secure services.

  • yolox_endpoints (Tuple[Optional[str], Optional[str]], default=(None, None)) – A tuple containing the gRPC and HTTP services for the yolox endpoint. Either the gRPC or HTTP service can be empty, but not both.

validate_endpoints(values)[source]#

Validates that at least one of the gRPC or HTTP services is provided for each endpoint.

Raises:
  • ValueError – If both gRPC and HTTP services are empty for any endpoint.

Config:
  • extra (str) – Pydantic config option to forbid extra fields.

auth_token: str | None#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

classmethod validate_endpoints(values)[source]#

Validates the gRPC and HTTP services for all endpoints.

Parameters:

values (dict) – Dictionary containing the values of the attributes for the class.

Returns:

The validated dictionary of values.

Return type:

dict

Raises:

ValueError – If both gRPC and HTTP services are empty for any endpoint.

yolox_endpoints: Tuple[str | None, str | None]#
yolox_infer_protocol: str#
class nv_ingest.schemas.image_extractor_schema.ImageExtractorSchema(
*,
max_queue_size: int = 1,
n_workers: int = 16,
raise_on_failure: bool = False,
image_extraction_config: ImageConfigSchema | None = None,
)[source]#

Bases: BaseModel

Configuration schema for the image extractor settings.

Parameters:
  • max_queue_size (int, default=1) – The maximum number of items allowed in the processing queue.

  • n_workers (int, default=16) – The number of worker threads to use for processing.

  • raise_on_failure (bool, default=False) – A flag indicating whether to raise an exception on processing failure.

  • image_extraction_config (Optional[ImageConfigSchema], default=None) – Configuration schema for the image extraction stage.

image_extraction_config: ImageConfigSchema | None#
max_queue_size: int#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

n_workers: int#
raise_on_failure: bool#

nv_ingest.schemas.image_filter_schema module#

class nv_ingest.schemas.image_filter_schema.ImageFilterSchema(
*,
raise_on_failure: Annotated[bool, Strict(strict=True)] = False,
cpu_only: Annotated[bool, Strict(strict=True)] = False,
)[source]#

Bases: BaseModel

cpu_only: Annotated[bool, Strict(strict=True)]#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

raise_on_failure: Annotated[bool, Strict(strict=True)]#

nv_ingest.schemas.image_storage_schema module#

class nv_ingest.schemas.image_storage_schema.ImageStorageModuleSchema(
*,
structured: bool = True,
images: bool = True,
raise_on_failure: bool = False,
)[source]#

Bases: BaseModel

images: bool#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

raise_on_failure: bool#
structured: bool#

nv_ingest.schemas.infographic_extractor_schema module#

class nv_ingest.schemas.infographic_extractor_schema.InfographicExtractorConfigSchema(
*,
auth_token: str | None = None,
paddle_endpoints: Tuple[str | None, str | None] = (None, None),
paddle_infer_protocol: str = '',
nim_batch_size: int = 2,
workers_per_progress_engine: int = 5,
)[source]#

Bases: BaseModel

Configuration schema for infographic extraction service endpoints and options.

Parameters:
  • auth_token (Optional[str], default=None) – Authentication token required for secure services.

  • paddle_endpoints (Tuple[Optional[str], Optional[str]], default=(None, None)) – A tuple containing the gRPC and HTTP services for the paddle endpoint. Either the gRPC or HTTP service can be empty, but not both.

validate_endpoints(values)[source]#

Validates that at least one of the gRPC or HTTP services is provided for each endpoint.

Raises:
  • ValueError – If both gRPC and HTTP services are empty for any endpoint.

Config:
  • extra (str) – Pydantic config option to forbid extra fields.

auth_token: str | None#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

nim_batch_size: int#
paddle_endpoints: Tuple[str | None, str | None]#
paddle_infer_protocol: str#
classmethod validate_endpoints(values)[source]#

Validates the gRPC and HTTP services for all endpoints.

Ensures that at least one service (either gRPC or HTTP) is provided for each endpoint in the configuration.

Parameters:

values (dict) – Dictionary containing the values of the attributes for the class.

Returns:

The validated dictionary of values.

Return type:

dict

Raises:

ValueError – If both gRPC and HTTP services are empty for any endpoint.

workers_per_progress_engine: int#
class nv_ingest.schemas.infographic_extractor_schema.InfographicExtractorSchema(
*,
max_queue_size: int = 1,
n_workers: int = 2,
raise_on_failure: bool = False,
stage_config: InfographicExtractorConfigSchema | None = None,
)[source]#

Bases: BaseModel

Configuration schema for infographic extraction processing settings.

Parameters:
  • max_queue_size (int, default=1) – The maximum number of items allowed in the processing queue.

  • n_workers (int, default=2) – The number of worker threads to use for processing.

  • raise_on_failure (bool, default=False) – A flag indicating whether to raise an exception if a failure occurs during infographic extraction.

  • stage_config (Optional[InfographicExtractorConfigSchema], default=None) – Configuration for the infographic extraction stage, including paddle service endpoints.

classmethod check_positive(v, field)[source]#
max_queue_size: int#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

n_workers: int#
raise_on_failure: bool#
stage_config: InfographicExtractorConfigSchema | None#

nv_ingest.schemas.ingest_job_schema module#

class nv_ingest.schemas.ingest_job_schema.DocumentTypeEnum(value)[source]#

Bases: str, Enum

An enumeration.

bmp = 'bmp'#
docx = 'docx'#
html = 'html'#
jpeg = 'jpeg'#
mp3 = 'mp3'#
pdf = 'pdf'#
png = 'png'#
pptx = 'pptx'#
svg = 'svg'#
tiff = 'tiff'#
txt = 'text'#
wav = 'wav'#
class nv_ingest.schemas.ingest_job_schema.FilterTypeEnum(value)[source]#

Bases: str, Enum

An enumeration.

image = 'image'#
class nv_ingest.schemas.ingest_job_schema.IngestJobSchema(
*,
job_payload: JobPayloadSchema,
job_id: str | int,
tasks: List[IngestTaskSchema],
tracing_options: TracingOptionsSchema | None = None,
)[source]#

Bases: BaseModelNoExt

job_id: str | int#
job_payload: JobPayloadSchema#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

tasks: List[IngestTaskSchema]#
tracing_options: TracingOptionsSchema | None#
class nv_ingest.schemas.ingest_job_schema.IngestTaskAudioExtraction(
*,
auth_token: str | None = None,
grpc_endpoint: str | None = None,
http_endpoint: str | None = None,
infer_protocol: str | None = None,
function_id: str | None = None,
use_ssl: bool | None = None,
ssl_cert: str | None = None,
)[source]#

Bases: BaseModelNoExt

auth_token: str | None#
function_id: str | None#
grpc_endpoint: str | None#
http_endpoint: str | None#
infer_protocol: str | None#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

ssl_cert: str | None#
use_ssl: bool | None#
class nv_ingest.schemas.ingest_job_schema.IngestTaskCaptionSchema(
*,
api_key: str | None = None,
endpoint_url: str | None = None,
prompt: str | None = None,
model_name: str | None = None,
)[source]#

Bases: BaseModelNoExt

api_key: str | None#
endpoint_url: str | None#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

model_name: str | None#
prompt: str | None#
class nv_ingest.schemas.ingest_job_schema.IngestTaskChartExtraction(*, params: Dict = <factory>)[source]#

Bases: BaseModelNoExt

model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

params: Dict#
class nv_ingest.schemas.ingest_job_schema.IngestTaskDedupParams(*, filter: bool = False)[source]#

Bases: BaseModelNoExt

filter: bool#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

class nv_ingest.schemas.ingest_job_schema.IngestTaskDedupSchema(
*,
content_type: ContentTypeEnum = ContentTypeEnum.IMAGE,
params: IngestTaskDedupParams = IngestTaskDedupParams(filter=False),
)[source]#

Bases: BaseModelNoExt

content_type: ContentTypeEnum#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

params: IngestTaskDedupParams#
class nv_ingest.schemas.ingest_job_schema.IngestTaskEmbedSchema(
*,
endpoint_url: str | None = None,
model_name: str | None = None,
api_key: str | None = None,
filter_errors: bool = False,
)[source]#

Bases: BaseModelNoExt

api_key: str | None#
endpoint_url: str | None#
filter_errors: bool#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

model_name: str | None#
class nv_ingest.schemas.ingest_job_schema.IngestTaskExtractSchema(
*,
document_type: DocumentTypeEnum,
method: str,
params: dict,
)[source]#

Bases: BaseModelNoExt

classmethod case_insensitive_document_type(v)[source]#
document_type: DocumentTypeEnum#
method: str#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

params: dict#
class nv_ingest.schemas.ingest_job_schema.IngestTaskFilterParamsSchema(
*,
min_size: int = 128,
max_aspect_ratio: float | int = 5.0,
min_aspect_ratio: float | int = 0.2,
filter: bool = False,
)[source]#

Bases: BaseModelNoExt

filter: bool#
max_aspect_ratio: float | int#
min_aspect_ratio: float | int#
min_size: int#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

class nv_ingest.schemas.ingest_job_schema.IngestTaskFilterSchema(
*,
content_type: ContentTypeEnum = ContentTypeEnum.IMAGE,
params: IngestTaskFilterParamsSchema = IngestTaskFilterParamsSchema(min_size=128, max_aspect_ratio=5.0, min_aspect_ratio=0.2, filter=False),
)[source]#

Bases: BaseModelNoExt

content_type: ContentTypeEnum#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

params: IngestTaskFilterParamsSchema#
class nv_ingest.schemas.ingest_job_schema.IngestTaskInfographicExtraction(*, params: Dict = <factory>)[source]#

Bases: BaseModelNoExt

model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

params: Dict#
class nv_ingest.schemas.ingest_job_schema.IngestTaskSchema(
*,
type: TaskTypeEnum,
task_properties: IngestTaskSplitSchema | IngestTaskExtractSchema | IngestTaskStoreEmbedSchema | IngestTaskStoreSchema | IngestTaskEmbedSchema | IngestTaskCaptionSchema | IngestTaskDedupSchema | IngestTaskFilterSchema | IngestTaskVdbUploadSchema | IngestTaskAudioExtraction | IngestTaskTableExtraction | IngestTaskChartExtraction | IngestTaskInfographicExtraction,
raise_on_failure: bool = False,
)[source]#

Bases: BaseModelNoExt

classmethod case_insensitive_task_type(v)[source]#
classmethod check_task_properties_type(values)[source]#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

raise_on_failure: bool#
task_properties: IngestTaskSplitSchema | IngestTaskExtractSchema | IngestTaskStoreEmbedSchema | IngestTaskStoreSchema | IngestTaskEmbedSchema | IngestTaskCaptionSchema | IngestTaskDedupSchema | IngestTaskFilterSchema | IngestTaskVdbUploadSchema | IngestTaskAudioExtraction | IngestTaskTableExtraction | IngestTaskChartExtraction | IngestTaskInfographicExtraction#
type: TaskTypeEnum#
class nv_ingest.schemas.ingest_job_schema.IngestTaskSplitSchema(
*,
tokenizer: str | None = None,
chunk_size: Annotated[int, Gt(gt=0)] = 1024,
chunk_overlap: Annotated[int, Ge(ge=0)] = 150,
params: dict,
)[source]#

Bases: BaseModelNoExt

classmethod check_chunk_overlap(v, values, **kwargs)[source]#
chunk_overlap: Annotated[int, Ge(ge=0)]#
chunk_size: Annotated[int, Gt(gt=0)]#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

params: dict#
tokenizer: str | None#
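
A sketch of the split task; check_chunk_overlap compares chunk_overlap against chunk_size (assumed here to require an overlap smaller than the chunk size; the tokenizer name is a placeholder):

>>> from nv_ingest.schemas.ingest_job_schema import IngestTaskSplitSchema
>>> import pydantic
>>> split = IngestTaskSplitSchema(tokenizer="my-tokenizer", chunk_size=512, chunk_overlap=64, params={})
>>> split.chunk_overlap
64
>>> try:
...     IngestTaskSplitSchema(chunk_size=100, chunk_overlap=200, params={})
... except pydantic.ValidationError:
...     print("overlap must be smaller than chunk_size")
overlap must be smaller than chunk_size
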
class nv_ingest.schemas.ingest_job_schema.IngestTaskStoreEmbedSchema(*, params: dict)[source]#

Bases: BaseModelNoExt

model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

params: dict#
class nv_ingest.schemas.ingest_job_schema.IngestTaskStoreSchema(
*,
structured: bool = True,
images: bool = False,
method: str,
params: dict,
)[source]#

Bases: BaseModelNoExt

images: bool#
method: str#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

params: dict#
structured: bool#
class nv_ingest.schemas.ingest_job_schema.IngestTaskTableExtraction(*, params: Dict = <factory>)[source]#

Bases: BaseModelNoExt

model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

params: Dict#
class nv_ingest.schemas.ingest_job_schema.IngestTaskVdbUploadSchema(
*,
bulk_ingest: bool = False,
bulk_ingest_path: str = None,
params: dict = None,
filter_errors: bool = True,
)[source]#

Bases: BaseModelNoExt

bulk_ingest: bool#
bulk_ingest_path: str#
filter_errors: bool#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

params: dict#
class nv_ingest.schemas.ingest_job_schema.JobPayloadSchema(
*,
content: List[str | bytes],
source_name: List[str],
source_id: List[str | int],
document_type: List[str],
)[source]#

Bases: BaseModelNoExt

content: List[str | bytes]#
document_type: List[str]#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

source_id: List[str | int]#
source_name: List[str]#
class nv_ingest.schemas.ingest_job_schema.TaskTypeEnum(value)[source]#

Bases: str, Enum

An enumeration.

audio_data_extract = 'audio_data_extract'#
caption = 'caption'#
chart_data_extract = 'chart_data_extract'#
dedup = 'dedup'#
embed = 'embed'#
extract = 'extract'#
filter = 'filter'#
infographic_data_extract = 'infographic_data_extract'#
split = 'split'#
store = 'store'#
store_embedding = 'store_embedding'#
table_data_extract = 'table_data_extract'#
vdb_upload = 'vdb_upload'#
class nv_ingest.schemas.ingest_job_schema.TracingOptionsSchema(
*,
trace: bool = False,
ts_send: int,
trace_id: str | None = None,
)[source]#

Bases: BaseModelNoExt

model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

trace: bool#
trace_id: str | None#
ts_send: int#
nv_ingest.schemas.ingest_job_schema.validate_ingest_job(
job_data: Dict[str, Any],
) → IngestJobSchema[source]#

Validates a dictionary representing an ingest_job using the IngestJobSchema.

Parameters:
  • job_data (Dict[str, Any]) – Dictionary representing an ingest job.

Returns:

The validated ingest job.

Return type:

IngestJobSchema

Raises:

ValidationError – If the input data does not conform to the IngestJobSchema.
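
A minimal end-to-end sketch (the payload contents and the 'pdfium' method name are illustrative):

>>> from nv_ingest.schemas.ingest_job_schema import validate_ingest_job
>>> job = {
...     "job_payload": {
...         "content": ["...base64 document bytes..."],
...         "source_name": ["report.pdf"],
...         "source_id": ["report.pdf"],
...         "document_type": ["pdf"],
...     },
...     "job_id": "job-0",
...     "tasks": [
...         {
...             "type": "extract",
...             "task_properties": {
...                 "document_type": "pdf",
...                 "method": "pdfium",
...                 "params": {},
...             },
...         }
...     ],
... }
>>> validate_ingest_job(job).job_id
'job-0'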

nv_ingest.schemas.ingest_pipeline_config_schema module#

class nv_ingest.schemas.ingest_pipeline_config_schema.PipelineConfigSchema(
*,
audio_extractor_schema: AudioExtractorSchema = AudioExtractorSchema(max_queue_size=1, n_workers=16, raise_on_failure=False, audio_extraction_config=None),
chart_extractor_module: ChartExtractorSchema = ChartExtractorSchema(max_queue_size=1, n_workers=2, raise_on_failure=False, stage_config=None),
text_splitter_module: TextSplitterSchema = TextSplitterSchema(tokenizer=None, chunk_size=1024, chunk_overlap=150, raise_on_failure=False),
embedding_storage_module: EmbeddingStorageModuleSchema = EmbeddingStorageModuleSchema(raise_on_failure=False),
embed_extractions_module: EmbedExtractionsSchema = EmbedExtractionsSchema(api_key='api_key', batch_size=8192, embedding_model='nvidia/nv-embedqa-e5-v5', embedding_nim_endpoint='http://embedding:8000/v1', encoding_format='float', httpx_log_level=LogLevel.WARNING, input_type='passage', raise_on_failure=False, truncate='END'),
image_caption_extraction_module: ImageCaptionExtractionSchema = ImageCaptionExtractionSchema(api_key='api_key', endpoint_url='https://ai.api.nvidia.com/v1/gr/meta/llama-3.2-11b-vision-instruct/chat/completions', prompt='Caption the content of this image:', model_name='meta/llama-3.2-11b-vision-instruct', raise_on_failure=False),
image_dedup_module: ImageDedupSchema = ImageDedupSchema(raise_on_failure=False, cpu_only=False),
image_filter_module: ImageFilterSchema = ImageFilterSchema(raise_on_failure=False, cpu_only=False),
image_storage_module: ImageStorageModuleSchema = ImageStorageModuleSchema(structured=True, images=True, raise_on_failure=False),
infographic_extractor_module: InfographicExtractorSchema = InfographicExtractorSchema(max_queue_size=1, n_workers=2, raise_on_failure=False, stage_config=None),
job_counter_module: JobCounterSchema = JobCounterSchema(name='job_counter', raise_on_failure=False),
metadata_injection_module: MetadataInjectorSchema = MetadataInjectorSchema(raise_on_failure=False),
otel_meter_module: OpenTelemetryMeterSchema = OpenTelemetryMeterSchema(broker_client=MessageBrokerClientSchema(host='redis', port=6379, client_type='redis', broker_params={}, connection_timeout=300, max_backoff=300, max_retries=0), otel_endpoint='localhost:4317', raise_on_failure=False),
otel_tracer_module: OpenTelemetryTracerSchema = OpenTelemetryTracerSchema(otel_endpoint='localhost:4317', raise_on_failure=False),
pdf_extractor_module: PDFExtractorSchema = PDFExtractorSchema(max_queue_size=1, n_workers=16, raise_on_failure=False, pdfium_config=None, nemoretriever_parse_config=None),
pptx_extractor_module: PPTXExtractorSchema = PPTXExtractorSchema(max_queue_size=1, n_workers=16, raise_on_failure=False, pptx_extraction_config=None),
redis_task_sink: MessageBrokerTaskSinkSchema = MessageBrokerTaskSinkSchema(broker_client=MessageBrokerClientSchema(host='redis', port=6379, client_type='redis', broker_params={}, connection_timeout=300, max_backoff=300, max_retries=0), raise_on_failure=False, progress_engines=6),
redis_task_source: MessageBrokerTaskSourceSchema = MessageBrokerTaskSourceSchema(broker_client=MessageBrokerClientSchema(host='redis', port=6379, client_type='redis', broker_params={}, connection_timeout=300, max_backoff=300, max_retries=0), task_queue='morpheus_task_queue', raise_on_failure=False, progress_engines=6),
table_extractor_module: TableExtractorSchema = TableExtractorSchema(max_queue_size=1, n_workers=2, raise_on_failure=False, stage_config=None),
vdb_task_sink: VdbTaskSinkSchema = VdbTaskSinkSchema(recreate=False, service='milvus', is_service_serialized=False, default_resource_name='nv_ingest_collection', resource_schemas={'nv_ingest_collection': {'index_conf': {'field_name': 'vector', 'metric_type': 'L2', 'index_type': 'GPU_CAGRA', 'params': {'intermediate_graph_degree': 128, 'graph_degree': 64, 'build_algo': 'NN_DESCENT'}}, 'schema_conf': {'enable_dynamic_field': True, 'schema_fields': [{'name': 'pk', 'description': 'Primary key for the collection', 'type': DataType.INT64, 'is_primary': True, 'auto_id': True}, {'name': 'text', 'description': 'Extracted content', 'type': DataType.VARCHAR, 'params': {'max_length': 65535}}, {'name': 'vector', 'description': 'Embedding vectors', 'type': DataType.FLOAT_VECTOR, 'params': {'dim': 1024}}, {'name': 'source', 'description': 'Source document and raw data extracted content', 'type': DataType.JSON}, {'name': 'content_metadata', 'description': 'Content metadata', 'type': DataType.JSON}], 'description': 'NV-INGEST collection schema'}}}, resource_kwargs={}, service_kwargs={}, batch_size=5120, write_time_interval=1.0, retry_interval=60.0, raise_on_failure=False, progress_engines=1),
)[source]#

Bases: BaseModel

audio_extractor_schema: AudioExtractorSchema#
chart_extractor_module: ChartExtractorSchema#
embed_extractions_module: EmbedExtractionsSchema#
embedding_storage_module: EmbeddingStorageModuleSchema#
image_caption_extraction_module: ImageCaptionExtractionSchema#
image_dedup_module: ImageDedupSchema#
image_filter_module: ImageFilterSchema#
image_storage_module: ImageStorageModuleSchema#
infographic_extractor_module: InfographicExtractorSchema#
job_counter_module: JobCounterSchema#
metadata_injection_module: MetadataInjectorSchema#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

otel_meter_module: OpenTelemetryMeterSchema#
otel_tracer_module: OpenTelemetryTracerSchema#
pdf_extractor_module: PDFExtractorSchema#
pptx_extractor_module: PPTXExtractorSchema#
redis_task_sink: MessageBrokerTaskSinkSchema#
redis_task_source: MessageBrokerTaskSourceSchema#
table_extractor_module: TableExtractorSchema#
text_splitter_module: TextSplitterSchema#
vdb_task_sink: VdbTaskSinkSchema#
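
Every stage has a default, so an empty construction yields a fully populated pipeline configuration; a sketch:

>>> from nv_ingest.schemas.ingest_pipeline_config_schema import PipelineConfigSchema
>>> pipeline = PipelineConfigSchema()
>>> pipeline.redis_task_source.task_queue
'morpheus_task_queue'
>>> pipeline.pdf_extractor_module.n_workers
16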

nv_ingest.schemas.job_counter_schema module#

class nv_ingest.schemas.job_counter_schema.JobCounterSchema(
*,
name: str = 'job_counter',
raise_on_failure: bool = False,
)[source]#

Bases: BaseModel

model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

name: str#
raise_on_failure: bool#

nv_ingest.schemas.message_broker_client_schema module#

class nv_ingest.schemas.message_broker_client_schema.MessageBrokerClientSchema(
*,
host: str = 'redis',
port: Annotated[int, Gt(gt=0), Lt(lt=65536)] = 6379,
client_type: Literal['redis', 'simple'] = 'redis',
broker_params: dict | None = {},
connection_timeout: Annotated[int, Ge(ge=0)] | None = 300,
max_backoff: Annotated[int, Ge(ge=0)] | None = 300,
max_retries: Annotated[int, Ge(ge=0)] | None = 0,
)[source]#

Bases: BaseModel

broker_params: dict | None#
client_type: Literal['redis', 'simple']#
connection_timeout: Annotated[int, Ge(ge=0)] | None#
host: str#
max_backoff: Annotated[int, Ge(ge=0)] | None#
max_retries: Annotated[int, Ge(ge=0)] | None#
model_config: ClassVar[ConfigDict] = {}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

port: Annotated[int, Gt(gt=0), Lt(lt=65536)]#
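
A sketch of the port bounds check (the host value is a placeholder):

>>> from nv_ingest.schemas.message_broker_client_schema import MessageBrokerClientSchema
>>> import pydantic
>>> MessageBrokerClientSchema(host="localhost").port
6379
>>> try:
...     MessageBrokerClientSchema(port=70000)
... except pydantic.ValidationError:
...     print("port must be between 1 and 65535")
port must be between 1 and 65535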

nv_ingest.schemas.message_broker_sink_schema module#

class nv_ingest.schemas.message_broker_sink_schema.MessageBrokerTaskSinkSchema(
*,
broker_client: MessageBrokerClientSchema = MessageBrokerClientSchema(host='redis', port=6379, client_type='redis', broker_params={}, connection_timeout=300, max_backoff=300, max_retries=0),
raise_on_failure: bool = False,
progress_engines: Annotated[int, Ge(ge=1)] = 6,
)[source]#

Bases: BaseModel

broker_client: MessageBrokerClientSchema#
model_config: ClassVar[ConfigDict] = {}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

progress_engines: Annotated[int, Ge(ge=1)]#
raise_on_failure: bool#

nv_ingest.schemas.message_broker_source_schema module#

class nv_ingest.schemas.message_broker_source_schema.MessageBrokerTaskSourceSchema(
*,
broker_client: MessageBrokerClientSchema = MessageBrokerClientSchema(host='redis', port=6379, client_type='redis', broker_params={}, connection_timeout=300, max_backoff=300, max_retries=0),
task_queue: str = 'morpheus_task_queue',
raise_on_failure: bool = False,
progress_engines: Annotated[int, Ge(ge=1)] = 6,
)[source]#

Bases: BaseModel

broker_client: MessageBrokerClientSchema#
model_config: ClassVar[ConfigDict] = {}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

progress_engines: Annotated[int, Ge(ge=1)]#
raise_on_failure: bool#
task_queue: str#

nv_ingest.schemas.message_wrapper_schema module#

class nv_ingest.schemas.message_wrapper_schema.MessageWrapper(*, payload: str)[source]#

Bases: BaseModel

model_config: ClassVar[ConfigDict] = {}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

payload: str#

nv_ingest.schemas.metadata_injector_schema module#

class nv_ingest.schemas.metadata_injector_schema.MetadataInjectorSchema(*, raise_on_failure: bool = False)[source]#

Bases: BaseModel

model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

raise_on_failure: bool#

nv_ingest.schemas.metadata_schema module#

class nv_ingest.schemas.metadata_schema.AccessLevelEnum(value)[source]#

Bases: int, Enum

An enumeration.

LEVEL_1 = 1#
LEVEL_2 = 2#
LEVEL_3 = 3#
class nv_ingest.schemas.metadata_schema.AudioMetadataSchema(
*,
audio_transcript: str = '',
audio_type: str = '',
)[source]#

Bases: BaseModelNoExt

audio_transcript: str#
audio_type: str#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

class nv_ingest.schemas.metadata_schema.ChartMetadataSchema(
*,
caption: str = '',
table_format: TableFormatEnum,
table_content: str = '',
table_content_format: TableFormatEnum | str = '',
table_location: tuple = (0, 0, 0, 0),
table_location_max_dimensions: tuple = (0, 0),
uploaded_image_uri: str = '',
)[source]#

Bases: BaseModelNoExt

caption: str#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

table_content: str#
table_content_format: TableFormatEnum | str#
table_format: TableFormatEnum#
table_location: tuple#
table_location_max_dimensions: tuple#
uploaded_image_uri: str#
class nv_ingest.schemas.metadata_schema.ContentHierarchySchema(
*,
page_count: int = -1,
page: int = -1,
block: int = -1,
line: int = -1,
span: int = -1,
nearby_objects: NearbyObjectsSchema = NearbyObjectsSchema(text=NearbyObjectsSubSchema(content=[], bbox=[], type=[]), images=NearbyObjectsSubSchema(content=[], bbox=[], type=[]), structured=NearbyObjectsSubSchema(content=[], bbox=[], type=[])),
)[source]#

Bases: BaseModelNoExt

Schema for the extracted content hierarchy.

block: int#
line: int#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

nearby_objects: NearbyObjectsSchema#
page: int#
page_count: int#
span: int#
class nv_ingest.schemas.metadata_schema.ContentMetadataSchema(
*,
type: ContentTypeEnum,
description: str = '',
page_number: int = -1,
hierarchy: ContentHierarchySchema = ContentHierarchySchema(page_count=-1, page=-1, block=-1, line=-1, span=-1, nearby_objects=NearbyObjectsSchema(text=NearbyObjectsSubSchema(content=[], bbox=[], type=[]), images=NearbyObjectsSubSchema(content=[], bbox=[], type=[]), structured=NearbyObjectsSubSchema(content=[], bbox=[], type=[]))),
subtype: ContentSubtypeEnum | str = '',
)[source]#

Bases: BaseModelNoExt

Data extracted from a source; generally Text or Image.

description: str#
hierarchy: ContentHierarchySchema#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

page_number: int#
subtype: ContentSubtypeEnum | str#
type: ContentTypeEnum#
class nv_ingest.schemas.metadata_schema.ContentSubtypeEnum(value)[source]#

Bases: str, Enum

An enumeration.

CHART = 'chart'#
INFOGRAPHIC = 'infographic'#
TABLE = 'table'#
class nv_ingest.schemas.metadata_schema.ContentTypeEnum(value)[source]#

Bases: str, Enum

An enumeration.

AUDIO = 'audio'#
EMBEDDING = 'embedding'#
IMAGE = 'image'#
INFO_MSG = 'info_message'#
STRUCTURED = 'structured'#
TEXT = 'text'#
UNSTRUCTURED = 'unstructured'#
VIDEO = 'video'#
class nv_ingest.schemas.metadata_schema.ErrorMetadataSchema(
*,
task: TaskTypeEnum,
status: StatusEnum,
source_id: str = '',
error_msg: str,
)[source]#

Bases: BaseModelNoExt

error_msg: str#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

source_id: str#
status: StatusEnum#
task: TaskTypeEnum#
class nv_ingest.schemas.metadata_schema.ImageMetadataSchema(
*,
image_type: ImageTypeEnum | str,
structured_image_type: ImageTypeEnum = ImageTypeEnum.image_type_1,
caption: str = '',
text: str = '',
image_location: tuple = (0, 0, 0, 0),
image_location_max_dimensions: tuple = (0, 0),
uploaded_image_url: str = '',
width: int = 0,
height: int = 0,
)[source]#

Bases: BaseModelNoExt

caption: str#
classmethod clamp_non_negative(v, field)[source]#
height: int#
image_location: tuple#
image_location_max_dimensions: tuple#
image_type: ImageTypeEnum | str#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

structured_image_type: ImageTypeEnum#
text: str#
uploaded_image_url: str#
classmethod validate_image_type(v)[source]#
width: int#
class nv_ingest.schemas.metadata_schema.ImageTypeEnum(value)[source]#

Bases: str, Enum

An enumeration.

BMP = 'bmp'#
GIF = 'gif'#
JPEG = 'jpeg'#
PNG = 'png'#
TIFF = 'tiff'#
classmethod has_value(value)[source]#
image_type_1 = 'image_type_1'#
image_type_2 = 'image_type_2'#
class nv_ingest.schemas.metadata_schema.InfoMessageMetadataSchema(
*,
task: TaskTypeEnum,
status: StatusEnum,
message: str,
filter: bool,
)[source]#

Bases: BaseModelNoExt

filter: bool#
message: str#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

status: StatusEnum#
task: TaskTypeEnum#
class nv_ingest.schemas.metadata_schema.LanguageEnum(value)[source]#

Bases: str, Enum

An enumeration.

AF = 'af'#
AR = 'ar'#
BG = 'bg'#
BN = 'bn'#
CA = 'ca'#
CS = 'cs'#
CY = 'cy'#
DA = 'da'#
DE = 'de'#
EL = 'el'#
EN = 'en'#
ES = 'es'#
ET = 'et'#
FA = 'fa'#
FI = 'fi'#
FR = 'fr'#
GU = 'gu'#
HE = 'he'#
HI = 'hi'#
HR = 'hr'#
HU = 'hu'#
ID = 'id'#
IT = 'it'#
JA = 'ja'#
KN = 'kn'#
KO = 'ko'#
LT = 'lt'#
LV = 'lv'#
MK = 'mk'#
ML = 'ml'#
MR = 'mr'#
NE = 'ne'#
NL = 'nl'#
NO = 'no'#
PA = 'pa'#
PL = 'pl'#
PT = 'pt'#
RO = 'ro'#
RU = 'ru'#
SK = 'sk'#
SL = 'sl'#
SO = 'so'#
SQ = 'sq'#
SV = 'sv'#
SW = 'sw'#
TA = 'ta'#
TE = 'te'#
TH = 'th'#
TL = 'tl'#
TR = 'tr'#
UK = 'uk'#
UNKNOWN = 'unknown'#
UR = 'ur'#
VI = 'vi'#
ZH_CN = 'zh-cn'#
ZH_TW = 'zh-tw'#
classmethod has_value(value)[source]#
class nv_ingest.schemas.metadata_schema.MetadataSchema(
*,
content: str = '',
content_url: str = '',
embedding: List[float] | None = None,
source_metadata: SourceMetadataSchema | None = None,
content_metadata: ContentMetadataSchema | None = None,
audio_metadata: AudioMetadataSchema | None = None,
text_metadata: TextMetadataSchema | None = None,
image_metadata: ImageMetadataSchema | None = None,
table_metadata: TableMetadataSchema | None = None,
chart_metadata: ChartMetadataSchema | None = None,
error_metadata: ErrorMetadataSchema | None = None,
info_message_metadata: InfoMessageMetadataSchema | None = None,
debug_metadata: Dict[str, Any] | None = None,
raise_on_failure: bool = False,
)[source]#

Bases: BaseModelNoExt

audio_metadata: AudioMetadataSchema | None#
chart_metadata: ChartMetadataSchema | None#
classmethod check_metadata_type(values)[source]#
content: str#
content_metadata: ContentMetadataSchema | None#
content_url: str#
debug_metadata: Dict[str, Any] | None#
embedding: List[float] | None#
error_metadata: ErrorMetadataSchema | None#
image_metadata: ImageMetadataSchema | None#
info_message_metadata: InfoMessageMetadataSchema | None#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

raise_on_failure: bool#
source_metadata: SourceMetadataSchema | None#
table_metadata: TableMetadataSchema | None#
text_metadata: TextMetadataSchema | None#
class nv_ingest.schemas.metadata_schema.NearbyObjectsSchema(
*,
text: NearbyObjectsSubSchema = NearbyObjectsSubSchema(content=[], bbox=[], type=[]),
images: NearbyObjectsSubSchema = NearbyObjectsSubSchema(content=[], bbox=[], type=[]),
structured: NearbyObjectsSubSchema = NearbyObjectsSubSchema(content=[], bbox=[], type=[]),
)[source]#

Bases: BaseModelNoExt

Schema to hold types of related extracted objects.

images: NearbyObjectsSubSchema#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

structured: NearbyObjectsSubSchema#
text: NearbyObjectsSubSchema#
class nv_ingest.schemas.metadata_schema.NearbyObjectsSubSchema(
*,
content: List[str] = [],
bbox: List[tuple] = [],
type: List[str] = [],
)[source]#

Bases: BaseModelNoExt

Schema to hold related extracted objects.

bbox: List[tuple]#
content: List[str]#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

type: List[str]#
class nv_ingest.schemas.metadata_schema.SourceMetadataSchema(
*,
source_name: str,
source_id: str,
source_location: str = '',
source_type: SourceTypeEnum | str,
collection_id: str = '',
date_created: str = '2025-06-30T17:04:05.709444',
last_modified: str = '2025-06-30T17:04:05.709461',
summary: str = '',
partition_id: int = -1,
access_level: AccessLevelEnum | int = -1,
)[source]#

Bases: BaseModelNoExt

Schema for the knowledge base file from which content and metadata are extracted.

access_level: AccessLevelEnum | int#
collection_id: str#
date_created: str#
last_modified: str#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

partition_id: int#
source_id: str#
source_location: str#
source_name: str#
source_type: SourceTypeEnum | str#
summary: str#
classmethod validate_fields(field_value)[source]#
class nv_ingest.schemas.metadata_schema.SourceTypeEnum(value)[source]#

Bases: str, Enum

An enumeration.

DOCX = 'docx'#
PDF = 'pdf'#
PPTX = 'pptx'#
source_type_1 = 'source_type_1'#
source_type_2 = 'source_type_2'#
class nv_ingest.schemas.metadata_schema.StatusEnum(value)[source]#

Bases: str, Enum

An enumeration.

ERROR: str = 'error'#
SUCCESS: str = 'success'#
class nv_ingest.schemas.metadata_schema.StdContentDescEnum(value)[source]#

Bases: str, Enum

An enumeration.

DOCX_IMAGE = 'Image extracted from DOCX document.'#
DOCX_TABLE = 'Structured table extracted from DOCX document.'#
DOCX_TEXT = 'Unstructured text from DOCX document.'#
PDF_CHART = 'Structured chart extracted from PDF document.'#
PDF_IMAGE = 'Image extracted from PDF document.'#
PDF_INFOGRAPHIC = 'Structured infographic extracted from PDF document.'#
PDF_TABLE = 'Structured table extracted from PDF document.'#
PDF_TEXT = 'Unstructured text from PDF document.'#
PPTX_IMAGE = 'Image extracted from PPTX presentation.'#
PPTX_TABLE = 'Structured table extracted from PPTX presentation.'#
PPTX_TEXT = 'Unstructured text from PPTX presentation.'#
class nv_ingest.schemas.metadata_schema.TableFormatEnum(value)[source]#

Bases: str, Enum

An enumeration.

HTML = 'html'#
IMAGE = 'image'#
LATEX = 'latex'#
MARKDOWN = 'markdown'#
PSEUDO_MARKDOWN = 'pseudo_markdown'#
SIMPLE = 'simple'#
class nv_ingest.schemas.metadata_schema.TableMetadataSchema(
*,
caption: str = '',
table_format: TableFormatEnum,
table_content: str = '',
table_content_format: TableFormatEnum | str = '',
table_location: tuple = (0, 0, 0, 0),
table_location_max_dimensions: tuple = (0, 0),
uploaded_image_uri: str = '',
)[source]#

Bases: BaseModelNoExt

caption: str#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

table_content: str#
table_content_format: TableFormatEnum | str#
table_format: TableFormatEnum#
table_location: tuple#
table_location_max_dimensions: tuple#
uploaded_image_uri: str#
class nv_ingest.schemas.metadata_schema.TaskTypeEnum(value)[source]#

Bases: str, Enum

An enumeration.

CAPTION = 'caption'#
EMBED = 'embed'#
EXTRACT = 'extract'#
FILTER = 'filter'#
SPLIT = 'split'#
TRANSFORM = 'transform'#
class nv_ingest.schemas.metadata_schema.TextMetadataSchema(
*,
text_type: TextTypeEnum,
summary: str = '',
keywords: str | List[str] | Dict = '',
language: LanguageEnum = 'en',
text_location: tuple = (0, 0, 0, 0),
text_location_max_dimensions: tuple = (0, 0, 0, 0),
)[source]#

Bases: BaseModelNoExt

keywords: str | List[str] | Dict#
language: LanguageEnum#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

summary: str#
text_location: tuple#
text_location_max_dimensions: tuple#
text_type: TextTypeEnum#
class nv_ingest.schemas.metadata_schema.TextTypeEnum(value)[source]#

Bases: str, Enum

An enumeration.

BLOCK = 'block'#
BODY = 'body'#
DOCUMENT = 'document'#
HEADER = 'header'#
LINE = 'line'#
NEARBY_BLOCK = 'nearby_block'#
OTHER = 'other'#
PAGE = 'page'#
SPAN = 'span'#
nv_ingest.schemas.metadata_schema.validate_metadata(
metadata: Dict[str, Any],
) → MetadataSchema[source]#

Validates the given metadata dictionary against the MetadataSchema.

Parameters:
  • metadata (Dict[str, Any]) – A dictionary representing metadata to be validated.

Returns:

An instance of MetadataSchema if validation is successful.

Return type:

MetadataSchema

Raises:

ValidationError – If the metadata does not conform to the schema.
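
A minimal sketch; a bare content payload validates because every other field is optional:

>>> from nv_ingest.schemas.metadata_schema import validate_metadata
>>> meta = validate_metadata({"content": "Hello world"})
>>> meta.content
'Hello world'
>>> meta.embedding is None
True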

nv_ingest.schemas.otel_meter_schema module#

class nv_ingest.schemas.otel_meter_schema.OpenTelemetryMeterSchema(
*,
broker_client: MessageBrokerClientSchema = MessageBrokerClientSchema(host='redis', port=6379, client_type='redis', broker_params={}, connection_timeout=300, max_backoff=300, max_retries=0),
otel_endpoint: str = 'localhost:4317',
raise_on_failure: bool = False,
)[source]#

Bases: BaseModel

broker_client: MessageBrokerClientSchema#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

otel_endpoint: str#
raise_on_failure: bool#

nv_ingest.schemas.otel_tracer_schema module#

class nv_ingest.schemas.otel_tracer_schema.OpenTelemetryTracerSchema(
*,
otel_endpoint: str = 'localhost:4317',
raise_on_failure: bool = False,
)[source]#

Bases: BaseModel

model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

otel_endpoint: str#
raise_on_failure: bool#

nv_ingest.schemas.pdf_extractor_schema module#

class nv_ingest.schemas.pdf_extractor_schema.NemoRetrieverParseConfigSchema(
*,
auth_token: str | None = None,
nemoretriever_parse_endpoints: Tuple[str | None, str | None] = (None, None),
nemoretriever_parse_infer_protocol: str = '',
model_name: str = 'nvidia/nemoretriever-parse',
timeout: float = 300.0,
workers_per_progress_engine: int = 5,
)[source]#

Bases: BaseModel

Configuration schema for NemoRetrieverParse endpoints and options.

Parameters:
  • auth_token (Optional[str], default=None) – Authentication token required for secure services.

  • nemoretriever_parse_endpoints (Tuple[Optional[str], Optional[str]], default=(None, None)) – A tuple containing the gRPC and HTTP services for the nemoretriever_parse endpoint. Either the gRPC or HTTP service can be empty, but not both.

validate_endpoints(values)[source]#

Validates that at least one of the gRPC or HTTP services is provided for each endpoint.

Raises:
  • ValueError – If both gRPC and HTTP services are empty for any endpoint.

Config:
  • extra (str) – Pydantic config option to forbid extra fields.

auth_token: str | None#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

model_name: str#
nemoretriever_parse_endpoints: Tuple[str | None, str | None]#
nemoretriever_parse_infer_protocol: str#
timeout: float#
classmethod validate_endpoints(values)[source]#

Validates the gRPC and HTTP services for all endpoints.

Parameters:

values (dict) – Dictionary containing the values of the attributes for the class.

Returns:

The validated dictionary of values.

Return type:

dict

Raises:

ValueError – If both gRPC and HTTP services are empty for any endpoint.

workers_per_progress_engine: int#
class nv_ingest.schemas.pdf_extractor_schema.PDFExtractorSchema(
*,
max_queue_size: int = 1,
n_workers: int = 16,
raise_on_failure: bool = False,
pdfium_config: PDFiumConfigSchema | None = None,
nemoretriever_parse_config: NemoRetrieverParseConfigSchema | None = None,
)[source]#

Bases: BaseModel

Configuration schema for the PDF extractor settings.

Parameters:
  • max_queue_size (int, default=1) – The maximum number of items allowed in the processing queue.

  • n_workers (int, default=16) – The number of worker threads to use for processing.

  • raise_on_failure (bool, default=False) – A flag indicating whether to raise an exception on processing failure.

  • pdfium_config (Optional[PDFiumConfigSchema], default=None) – Configuration for the PDFium service endpoints.

max_queue_size: int#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

n_workers: int#
nemoretriever_parse_config: NemoRetrieverParseConfigSchema | None#
pdfium_config: PDFiumConfigSchema | None#
raise_on_failure: bool#
class nv_ingest.schemas.pdf_extractor_schema.PDFiumConfigSchema(
*,
auth_token: str | None = None,
yolox_endpoints: Tuple[str | None, str | None] = (None, None),
yolox_infer_protocol: str = '',
nim_batch_size: int = 4,
workers_per_progress_engine: int = 5,
)[source]#

Bases: BaseModel

Configuration schema for PDFium endpoints and options.

Parameters:
  • auth_token (Optional[str], default=None) – Authentication token required for secure services.

  • yolox_endpoints (Tuple[Optional[str], Optional[str]], default=(None, None)) – A tuple containing the gRPC and HTTP services for the yolox endpoint. Either the gRPC or HTTP service can be empty, but not both.

validate_endpoints(values)[source]#

Validates that at least one of the gRPC or HTTP services is provided for each endpoint.

Raises:
  • ValueError – If both gRPC and HTTP services are empty for any endpoint.

Config:
  • extra (str) – Pydantic config option to forbid extra fields.

auth_token: str | None#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

nim_batch_size: int#
classmethod validate_endpoints(values)[source]#

Validates the gRPC and HTTP services for all endpoints.

Parameters:

values (dict) – Dictionary containing the values of the attributes for the class.

Returns:

The validated dictionary of values.

Return type:

dict

Raises:

ValueError – If both gRPC and HTTP services are empty for any endpoint.

workers_per_progress_engine: int#
yolox_endpoints: Tuple[str | None, str | None]#
yolox_infer_protocol: str#

nv_ingest.schemas.pptx_extractor_schema module#

class nv_ingest.schemas.pptx_extractor_schema.PPTXConfigSchema(
*,
auth_token: str | None = None,
yolox_endpoints: Tuple[str | None, str | None] = (None, None),
yolox_infer_protocol: str = '',
)[source]#

Bases: BaseModel

Configuration schema for PPTX extraction endpoints and options.

Parameters:
  • auth_token (Optional[str], default=None) – Authentication token required for secure services.

  • yolox_endpoints (Tuple[Optional[str], Optional[str]], default=(None, None)) – A tuple containing the gRPC and HTTP services for the yolox endpoint. Either the gRPC or HTTP service can be empty, but not both.

validate_endpoints(values)[source]#

Validates that at least one of the gRPC or HTTP services is provided for each endpoint.

Raises:

ValueError – If both gRPC and HTTP services are empty for any endpoint.

Config:

extra : str – Pydantic config option to forbid extra fields.

auth_token: str | None#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

classmethod validate_endpoints(values)[source]#

Validates the gRPC and HTTP services for all endpoints.

Parameters:

values (dict) – Dictionary containing the values of the attributes for the class.

Returns:

The validated dictionary of values.

Return type:

dict

Raises:

ValueError – If both gRPC and HTTP services are empty for any endpoint.

yolox_endpoints: Tuple[str | None, str | None]#
yolox_infer_protocol: str#
class nv_ingest.schemas.pptx_extractor_schema.PPTXExtractorSchema(
*,
max_queue_size: int = 1,
n_workers: int = 16,
raise_on_failure: bool = False,
pptx_extraction_config: PPTXConfigSchema | None = None,
)[source]#

Bases: BaseModel

Configuration schema for the PPTX extractor settings.

Parameters:
  • max_queue_size (int, default=1) – The maximum number of items allowed in the processing queue.

  • n_workers (int, default=16) – The number of worker threads to use for processing.

  • raise_on_failure (bool, default=False) – A flag indicating whether to raise an exception on processing failure.

  • pptx_extraction_config (Optional[PPTXConfigSchema], default=None) – Configuration for the PPTX extraction stage, including yolox service endpoints.

max_queue_size: int#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

n_workers: int#
pptx_extraction_config: PPTXConfigSchema | None#
raise_on_failure: bool#
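
Example (a minimal sketch; the HTTP address is illustrative):

>>> from nv_ingest.schemas.pptx_extractor_schema import PPTXConfigSchema, PPTXExtractorSchema
>>> pptx_cfg = PPTXConfigSchema(
...     yolox_endpoints=(None, "http://yolox:8000/v1/infer"),
...     yolox_infer_protocol="http",
... )
>>> schema = PPTXExtractorSchema(n_workers=8, pptx_extraction_config=pptx_cfg)
>>> schema.n_workers
8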

nv_ingest.schemas.processing_job_schema module#

class nv_ingest.schemas.processing_job_schema.ConversionStatus(value)[source]#

Bases: str, Enum

An enumeration of conversion job statuses.

FAILED = 'failed'#
IN_PROGRESS = 'in_progress'#
SUCCESS = 'success'#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#
class nv_ingest.schemas.processing_job_schema.ProcessingJob(
*,
submitted_job_id: str,
filename: str,
raw_result: str = '',
content: str = '',
status: ConversionStatus,
error: str | None = None,
)[source]#

Bases: BaseModel

content: str#
error: str | None#
filename: str#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

raw_result: str#
status: ConversionStatus#
submitted_job_id: str#
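
Example (a minimal sketch; the job id and filename are illustrative):

>>> from nv_ingest.schemas.processing_job_schema import ConversionStatus, ProcessingJob
>>> job = ProcessingJob(
...     submitted_job_id="job-0001",
...     filename="report.pdf",
...     status=ConversionStatus.IN_PROGRESS,
... )
>>> job.status.value
'in_progress'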

nv_ingest.schemas.table_extractor_schema module#

class nv_ingest.schemas.table_extractor_schema.TableExtractorConfigSchema(
*,
auth_token: str | None = None,
yolox_endpoints: Tuple[str | None, str | None] = (None, None),
yolox_infer_protocol: str = '',
paddle_endpoints: Tuple[str | None, str | None] = (None, None),
paddle_infer_protocol: str = '',
nim_batch_size: int = 2,
workers_per_progress_engine: int = 5,
)[source]#

Bases: BaseModel

Configuration schema for the table extraction stage settings.

Parameters:
  • auth_token (Optional[str], default=None) – Authentication token required for secure services.

  • yolox_endpoints (Tuple[Optional[str], Optional[str]], default=(None, None)) – A tuple containing the gRPC and HTTP services for the yolox endpoint. Either the gRPC or HTTP service can be empty, but not both.

  • paddle_endpoints (Tuple[Optional[str], Optional[str]], default=(None, None)) – A tuple containing the gRPC and HTTP services for the paddle endpoint. Either the gRPC or HTTP service can be empty, but not both.

validate_endpoints(values)[source]#

Validates that at least one of the gRPC or HTTP services is provided for the yolox endpoint.

Raises:

ValueError – If both gRPC and HTTP services are empty for the yolox endpoint.

Config:

extra : str – Pydantic config option to forbid extra fields.

auth_token: str | None#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

nim_batch_size: int#
paddle_endpoints: Tuple[str | None, str | None]#
paddle_infer_protocol: str#
classmethod validate_endpoints(values)[source]#

Validates the gRPC and HTTP services for the yolox endpoint.

Parameters:

values (dict) – Dictionary containing the values of the attributes for the class.

Returns:

The validated dictionary of values.

Return type:

dict

Raises:

ValueError – If both gRPC and HTTP services are empty for the yolox endpoint.

workers_per_progress_engine: int#
yolox_endpoints: Tuple[str | None, str | None]#
yolox_infer_protocol: str#
class nv_ingest.schemas.table_extractor_schema.TableExtractorSchema(
*,
max_queue_size: int = 1,
n_workers: int = 2,
raise_on_failure: bool = False,
stage_config: TableExtractorConfigSchema | None = None,
)[source]#

Bases: BaseModel

Configuration schema for the table extraction processing settings.

Parameters:
  • max_queue_size (int, default=1) – The maximum number of items allowed in the processing queue.

  • n_workers (int, default=2) – The number of worker threads to use for processing.

  • raise_on_failure (bool, default=False) – A flag indicating whether to raise an exception if a failure occurs during table extraction.

  • stage_config (Optional[TableExtractorConfigSchema], default=None) – Configuration for the table extraction stage, including yolox service endpoints.

classmethod check_positive(v, field)[source]#
max_queue_size: int#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

n_workers: int#
raise_on_failure: bool#
stage_config: TableExtractorConfigSchema | None#
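
Example (a minimal sketch; both endpoint addresses are illustrative):

>>> from nv_ingest.schemas.table_extractor_schema import (
...     TableExtractorConfigSchema,
...     TableExtractorSchema,
... )
>>> stage_cfg = TableExtractorConfigSchema(
...     yolox_endpoints=("yolox:8001", None),
...     paddle_endpoints=(None, "http://paddle:8000/v1/infer"),
... )
>>> schema = TableExtractorSchema(n_workers=4, stage_config=stage_cfg)
>>> schema.stage_config.nim_batch_size
2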

nv_ingest.schemas.task_injection_schema module#

class nv_ingest.schemas.task_injection_schema.TaskInjectionSchema(*, raise_on_failure: bool = False)[source]#

Bases: BaseModel

model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

raise_on_failure: bool#

nv_ingest.schemas.text_splitter_schema module#

class nv_ingest.schemas.text_splitter_schema.TextSplitterSchema(
*,
tokenizer: str | None = None,
chunk_size: Annotated[int, Gt(gt=0)] = 1024,
chunk_overlap: Annotated[int, Ge(ge=0)] = 150,
raise_on_failure: bool = False,
)[source]#

Bases: BaseModel

classmethod check_chunk_overlap(v, values, **kwargs)[source]#
chunk_overlap: Annotated[int, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=0)])]#
chunk_size: Annotated[int, FieldInfo(annotation=NoneType, required=True, metadata=[Gt(gt=0)])]#
model_config: ClassVar[ConfigDict] = {}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

raise_on_failure: bool#
tokenizer: str | None#
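
Example (a minimal sketch; the tokenizer name is illustrative, and it is assumed, not confirmed by this page, that check_chunk_overlap requires the overlap to be smaller than chunk_size):

>>> from nv_ingest.schemas.text_splitter_schema import TextSplitterSchema
>>> splitter = TextSplitterSchema(
...     tokenizer="bert-base-uncased",  # illustrative tokenizer identifier
...     chunk_size=512,                 # must be > 0
...     chunk_overlap=64,               # must be >= 0 (and assumed < chunk_size)
... )
>>> splitter.chunk_overlap
64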

nv_ingest.schemas.vdb_task_sink_schema module#

class nv_ingest.schemas.vdb_task_sink_schema.VdbTaskSinkSchema(
*,
recreate: bool = False,
service: str = 'milvus',
is_service_serialized: bool = False,
default_resource_name: str = 'nv_ingest_collection',
resource_schemas: dict = {'nv_ingest_collection': {'index_conf': {'field_name': 'vector',
'index_type': 'GPU_CAGRA',
'metric_type': 'L2',
'params': {'build_algo': 'NN_DESCENT',
'graph_degree': 64,
'intermediate_graph_degree': 128}},
'schema_conf': {'description': 'NV-INGEST collection schema',
'enable_dynamic_field': True,
'schema_fields': [{'auto_id': True,
'description': 'Primary key for the collection',
'is_primary': True,
'name': 'pk',
'type': DataType.INT64},
{'description': 'Extracted content',
'name': 'text',
'params': {'max_length': 65535},
'type': DataType.VARCHAR},
{'description': 'Embedding vectors',
'name': 'vector',
'params': {'dim': 1024},
'type': DataType.FLOAT_VECTOR},
{'description': 'Source document and raw data extracted content',
'name': 'source',
'type': DataType.JSON},
{'description': 'Content metadata',
'name': 'content_metadata',
'type': DataType.JSON}]}}},
resource_kwargs: dict = <factory>,
service_kwargs: dict = {},
batch_size: int = 5120,
write_time_interval: float = 1.0,
retry_interval: float = 60.0,
raise_on_failure: bool = False,
progress_engines: ~typing.Annotated[int, ~annotated_types.Ge(ge=1)] = 1,
)[source]#

Bases: BaseModel

batch_size: int#
default_resource_name: str#
is_service_serialized: bool#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

progress_engines: Annotated[int, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=1)])]#
raise_on_failure: bool#
recreate: bool#
resource_kwargs: dict#
resource_schemas: dict#
retry_interval: float#
service: str#
service_kwargs: dict#
classmethod validate_resource_name(to_validate)[source]#
classmethod validate_service(to_validate)[source]#
write_time_interval: float#
nv_ingest.schemas.vdb_task_sink_schema.build_default_milvus_config(
embedding_size: int = 1024,
) Dict[str, Any][source]#

Builds the configuration for Milvus.

This function creates a dictionary configuration for a Milvus collection. It includes the index configuration and the schema configuration, with fields such as pk, text, vector, source, and content_metadata.

Parameters:

embedding_size (int) – The size of the embedding vector.

Returns:

A dictionary containing the configuration settings for Milvus.

Return type:

Dict[str, Any]
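
Example (a sketch; it assumes the returned mapping mirrors the index_conf/schema_conf structure shown in VdbTaskSinkSchema's default resource_schemas):

>>> from nv_ingest.schemas.vdb_task_sink_schema import build_default_milvus_config
>>> conf = build_default_milvus_config(embedding_size=768)
>>> # The 'vector' field's dimension should now follow embedding_size (768 instead of 1024).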

Module contents#

class nv_ingest.schemas.ImageCaptionExtractionSchema(
*,
api_key: str = 'api_key',
endpoint_url: str = 'https://ai.api.nvidia.com/v1/gr/meta/llama-3.2-11b-vision-instruct/chat/completions',
prompt: str = 'Caption the content of this image:',
model_name: str = 'meta/llama-3.2-11b-vision-instruct',
raise_on_failure: bool = False,
)[source]#

Bases: BaseModel

api_key: str#
endpoint_url: str#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

model_name: str#
prompt: str#
raise_on_failure: bool#
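
Example (a minimal sketch; the API key is a placeholder):

>>> from nv_ingest.schemas import ImageCaptionExtractionSchema
>>> captioning = ImageCaptionExtractionSchema(api_key="my-api-key")
>>> captioning.model_name
'meta/llama-3.2-11b-vision-instruct'
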
class nv_ingest.schemas.ImageStorageModuleSchema(
*,
structured: bool = True,
images: bool = True,
raise_on_failure: bool = False,
)[source]#

Bases: BaseModel

images: bool#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

raise_on_failure: bool#
structured: bool#
class nv_ingest.schemas.IngestJobSchema(
*,
job_payload: JobPayloadSchema,
job_id: str | int,
tasks: List[IngestTaskSchema],
tracing_options: TracingOptionsSchema | None = None,
)[source]#

Bases: BaseModelNoExt

job_id: str | int#
job_payload: JobPayloadSchema#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

tasks: List[IngestTaskSchema]#
tracing_options: TracingOptionsSchema | None#
class nv_ingest.schemas.MessageBrokerClientSchema(
*,
host: str = 'redis',
port: Annotated[int, Gt(gt=0), Lt(lt=65536)] = 6379,
client_type: Literal['redis', 'simple'] = 'redis',
broker_params: dict | None = {},
connection_timeout: Annotated[int, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=0)])] | None = 300,
max_backoff: Annotated[int, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=0)])] | None = 300,
max_retries: Annotated[int, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=0)])] | None = 0,
)[source]#

Bases: BaseModel

broker_params: dict | None#
client_type: Literal['redis', 'simple']#
connection_timeout: Annotated[int, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=0)])] | None#
host: str#
max_backoff: Annotated[int, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=0)])] | None#
max_retries: Annotated[int, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=0)])] | None#
model_config: ClassVar[ConfigDict] = {}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

port: Annotated[int, FieldInfo(annotation=NoneType, required=True, metadata=[Gt(gt=0), Lt(lt=65536)])]#
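
Example (a minimal sketch; host and port are illustrative):

>>> from nv_ingest.schemas import MessageBrokerClientSchema
>>> client = MessageBrokerClientSchema(host="localhost", port=6380, client_type="redis")
>>> client.connection_timeout
300
>>> # port must satisfy 0 < port < 65536; client_type accepts only 'redis' or 'simple'.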
class nv_ingest.schemas.MessageBrokerTaskSinkSchema(
*,
broker_client: MessageBrokerClientSchema = MessageBrokerClientSchema(host='redis', port=6379, client_type='redis', broker_params={}, connection_timeout=300, max_backoff=300, max_retries=0),
raise_on_failure: bool = False,
progress_engines: Annotated[int, Ge(ge=1)] = 6,
)[source]#

Bases: BaseModel

broker_client: MessageBrokerClientSchema#
model_config: ClassVar[ConfigDict] = {}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

progress_engines: Annotated[int, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=1)])]#
raise_on_failure: bool#
class nv_ingest.schemas.MessageBrokerTaskSourceSchema(
*,
broker_client: MessageBrokerClientSchema = MessageBrokerClientSchema(host='redis', port=6379, client_type='redis', broker_params={}, connection_timeout=300, max_backoff=300, max_retries=0),
task_queue: str = 'morpheus_task_queue',
raise_on_failure: bool = False,
progress_engines: Annotated[int, Ge(ge=1)] = 6,
)[source]#

Bases: BaseModel

broker_client: MessageBrokerClientSchema#
model_config: ClassVar[ConfigDict] = {}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

progress_engines: Annotated[int, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=1)])]#
raise_on_failure: bool#
task_queue: str#
class nv_ingest.schemas.MetadataInjectorSchema(*, raise_on_failure: bool = False)[source]#

Bases: BaseModel

model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

raise_on_failure: bool#
class nv_ingest.schemas.PDFExtractorSchema(
*,
max_queue_size: int = 1,
n_workers: int = 16,
raise_on_failure: bool = False,
pdfium_config: PDFiumConfigSchema | None = None,
nemoretriever_parse_config: NemoRetrieverParseConfigSchema | None = None,
)[source]#

Bases: BaseModel

Configuration schema for the PDF extractor settings.

Parameters:
  • max_queue_size (int, default=1) – The maximum number of items allowed in the processing queue.

  • n_workers (int, default=16) – The number of worker threads to use for processing.

  • raise_on_failure (bool, default=False) – A flag indicating whether to raise an exception on processing failure.

  • pdfium_config (Optional[PDFiumConfigSchema], default=None) – Configuration for the PDFium service endpoints.

  • nemoretriever_parse_config (Optional[NemoRetrieverParseConfigSchema], default=None) – Configuration for the nemoretriever-parse service endpoints.

max_queue_size: int#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

n_workers: int#
nemoretriever_parse_config: NemoRetrieverParseConfigSchema | None#
pdfium_config: PDFiumConfigSchema | None#
raise_on_failure: bool#
class nv_ingest.schemas.PipelineConfigSchema(
*,
audio_extractor_schema: ~nv_ingest.schemas.audio_extractor_schema.AudioExtractorSchema = AudioExtractorSchema(max_queue_size=1, n_workers=16, raise_on_failure=False, audio_extraction_config=None),
chart_extractor_module: ~nv_ingest.schemas.chart_extractor_schema.ChartExtractorSchema = ChartExtractorSchema(max_queue_size=1, n_workers=2, raise_on_failure=False, stage_config=None),
text_splitter_module: ~nv_ingest.schemas.text_splitter_schema.TextSplitterSchema = TextSplitterSchema(tokenizer=None, chunk_size=1024, chunk_overlap=150, raise_on_failure=False),
embedding_storage_module: ~nv_ingest.schemas.embedding_storage_schema.EmbeddingStorageModuleSchema = EmbeddingStorageModuleSchema(raise_on_failure=False),
embed_extractions_module: ~nv_ingest.schemas.embed_extractions_schema.EmbedExtractionsSchema = EmbedExtractionsSchema(api_key='api_key', batch_size=8192, embedding_model='nvidia/nv-embedqa-e5-v5', embedding_nim_endpoint='http://embedding:8000/v1', encoding_format='float', httpx_log_level=<LogLevel.WARNING: 'WARNING'>, input_type='passage', raise_on_failure=False, truncate='END'),
image_caption_extraction_module: ~nv_ingest.schemas.image_caption_extraction_schema.ImageCaptionExtractionSchema = ImageCaptionExtractionSchema(api_key='api_key', endpoint_url='https://ai.api.nvidia.com/v1/gr/meta/llama-3.2-11b-vision-instruct/chat/completions', prompt='Caption the content of this image:', model_name='meta/llama-3.2-11b-vision-instruct', raise_on_failure=False),
image_dedup_module: ~nv_ingest.schemas.image_dedup_schema.ImageDedupSchema = ImageDedupSchema(raise_on_failure=False, cpu_only=False),
image_filter_module: ~nv_ingest.schemas.image_filter_schema.ImageFilterSchema = ImageFilterSchema(raise_on_failure=False, cpu_only=False),
image_storage_module: ~nv_ingest.schemas.image_storage_schema.ImageStorageModuleSchema = ImageStorageModuleSchema(structured=True, images=True, raise_on_failure=False),
infographic_extractor_module: ~nv_ingest.schemas.infographic_extractor_schema.InfographicExtractorSchema = InfographicExtractorSchema(max_queue_size=1, n_workers=2, raise_on_failure=False, stage_config=None),
job_counter_module: ~nv_ingest.schemas.job_counter_schema.JobCounterSchema = JobCounterSchema(name='job_counter', raise_on_failure=False),
metadata_injection_module: ~nv_ingest.schemas.metadata_injector_schema.MetadataInjectorSchema = MetadataInjectorSchema(raise_on_failure=False),
otel_meter_module: ~nv_ingest.schemas.otel_meter_schema.OpenTelemetryMeterSchema = OpenTelemetryMeterSchema(broker_client=MessageBrokerClientSchema(host='redis', port=6379, client_type='redis', broker_params={}, connection_timeout=300, max_backoff=300, max_retries=0), otel_endpoint='localhost:4317', raise_on_failure=False),
otel_tracer_module: ~nv_ingest.schemas.otel_tracer_schema.OpenTelemetryTracerSchema = OpenTelemetryTracerSchema(otel_endpoint='localhost:4317', raise_on_failure=False),
pdf_extractor_module: ~nv_ingest.schemas.pdf_extractor_schema.PDFExtractorSchema = PDFExtractorSchema(max_queue_size=1, n_workers=16, raise_on_failure=False, pdfium_config=None, nemoretriever_parse_config=None),
pptx_extractor_module: ~nv_ingest.schemas.pptx_extractor_schema.PPTXExtractorSchema = PPTXExtractorSchema(max_queue_size=1, n_workers=16, raise_on_failure=False, pptx_extraction_config=None),
redis_task_sink: ~nv_ingest.schemas.message_broker_sink_schema.MessageBrokerTaskSinkSchema = MessageBrokerTaskSinkSchema(broker_client=MessageBrokerClientSchema(host='redis', port=6379, client_type='redis', broker_params={}, connection_timeout=300, max_backoff=300, max_retries=0), raise_on_failure=False, progress_engines=6),
redis_task_source: ~nv_ingest.schemas.message_broker_source_schema.MessageBrokerTaskSourceSchema = MessageBrokerTaskSourceSchema(broker_client=MessageBrokerClientSchema(host='redis', port=6379, client_type='redis', broker_params={}, connection_timeout=300, max_backoff=300, max_retries=0), task_queue='morpheus_task_queue', raise_on_failure=False, progress_engines=6),
table_extractor_module: ~nv_ingest.schemas.table_extractor_schema.TableExtractorSchema = TableExtractorSchema(max_queue_size=1, n_workers=2, raise_on_failure=False, stage_config=None),
vdb_task_sink: ~nv_ingest.schemas.vdb_task_sink_schema.VdbTaskSinkSchema = VdbTaskSinkSchema(recreate=False, service='milvus', is_service_serialized=False, default_resource_name='nv_ingest_collection', resource_schemas={'nv_ingest_collection': {'index_conf': {'field_name': 'vector', 'metric_type': 'L2', 'index_type': 'GPU_CAGRA', 'params': {'intermediate_graph_degree': 128, 'graph_degree': 64, 'build_algo': 'NN_DESCENT'}}, 'schema_conf': {'enable_dynamic_field': True, 'schema_fields': [{'name': 'pk', 'description': 'Primary key for the collection', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': True}, {'name': 'text', 'description': 'Extracted content', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 65535}}, {'name': 'vector', 'description': 'Embedding vectors', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 1024}}, {'name': 'source', 'description': 'Source document and raw data extracted content', 'type': <DataType.JSON: 23>}, {'name': 'content_metadata', 'description': 'Content metadata', 'type': <DataType.JSON: 23>}], 'description': 'NV-INGEST collection schema'}}}, resource_kwargs={}, service_kwargs={}, batch_size=5120, write_time_interval=1.0, retry_interval=60.0, raise_on_failure=False, progress_engines=1),
)[source]#

Bases: BaseModel

audio_extractor_schema: AudioExtractorSchema#
chart_extractor_module: ChartExtractorSchema#
embed_extractions_module: EmbedExtractionsSchema#
embedding_storage_module: EmbeddingStorageModuleSchema#
image_caption_extraction_module: ImageCaptionExtractionSchema#
image_dedup_module: ImageDedupSchema#
image_filter_module: ImageFilterSchema#
image_storage_module: ImageStorageModuleSchema#
infographic_extractor_module: InfographicExtractorSchema#
job_counter_module: JobCounterSchema#
metadata_injection_module: MetadataInjectorSchema#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

otel_meter_module: OpenTelemetryMeterSchema#
otel_tracer_module: OpenTelemetryTracerSchema#
pdf_extractor_module: PDFExtractorSchema#
pptx_extractor_module: PPTXExtractorSchema#
redis_task_sink: MessageBrokerTaskSinkSchema#
redis_task_source: MessageBrokerTaskSourceSchema#
table_extractor_module: TableExtractorSchema#
text_splitter_module: TextSplitterSchema#
vdb_task_sink: VdbTaskSinkSchema#
class nv_ingest.schemas.TaskInjectionSchema(*, raise_on_failure: bool = False)[source]#

Bases: BaseModel

model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

raise_on_failure: bool#
class nv_ingest.schemas.TextSplitterSchema(
*,
tokenizer: str | None = None,
chunk_size: Annotated[int, Gt(gt=0)] = 1024,
chunk_overlap: Annotated[int, Ge(ge=0)] = 150,
raise_on_failure: bool = False,
)[source]#

Bases: BaseModel

classmethod check_chunk_overlap(v, values, **kwargs)[source]#
chunk_overlap: Annotated[int, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=0)])]#
chunk_size: Annotated[int, FieldInfo(annotation=NoneType, required=True, metadata=[Gt(gt=0)])]#
model_config: ClassVar[ConfigDict] = {}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

raise_on_failure: bool#
tokenizer: str | None#
class nv_ingest.schemas.VdbTaskSinkSchema(
*,
recreate: bool = False,
service: str = 'milvus',
is_service_serialized: bool = False,
default_resource_name: str = 'nv_ingest_collection',
resource_schemas: dict = {'nv_ingest_collection': {'index_conf': {'field_name': 'vector',
'index_type': 'GPU_CAGRA',
'metric_type': 'L2',
'params': {'build_algo': 'NN_DESCENT',
'graph_degree': 64,
'intermediate_graph_degree': 128}},
'schema_conf': {'description': 'NV-INGEST collection schema',
'enable_dynamic_field': True,
'schema_fields': [{'auto_id': True,
'description': 'Primary key for the collection',
'is_primary': True,
'name': 'pk',
'type': DataType.INT64},
{'description': 'Extracted content',
'name': 'text',
'params': {'max_length': 65535},
'type': DataType.VARCHAR},
{'description': 'Embedding vectors',
'name': 'vector',
'params': {'dim': 1024},
'type': DataType.FLOAT_VECTOR},
{'description': 'Source document and raw data extracted content',
'name': 'source',
'type': DataType.JSON},
{'description': 'Content metadata',
'name': 'content_metadata',
'type': DataType.JSON}]}}},
resource_kwargs: dict = <factory>,
service_kwargs: dict = {},
batch_size: int = 5120,
write_time_interval: float = 1.0,
retry_interval: float = 60.0,
raise_on_failure: bool = False,
progress_engines: ~typing.Annotated[int, ~annotated_types.Ge(ge=1)] = 1,
)[source]#

Bases: BaseModel

batch_size: int#
default_resource_name: str#
is_service_serialized: bool#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

progress_engines: Annotated[int, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=1)])]#
raise_on_failure: bool#
recreate: bool#
resource_kwargs: dict#
resource_schemas: dict#
retry_interval: float#
service: str#
service_kwargs: dict#
classmethod validate_resource_name(to_validate)[source]#
classmethod validate_service(to_validate)[source]#
write_time_interval: float#
nv_ingest.schemas.validate_ingest_job(
job_data: Dict[str, Any],
) IngestJobSchema[source]#

Validates a dictionary representing an ingest job using the IngestJobSchema.

Parameters:

job_data (Dict[str, Any]) – Dictionary representing an ingest job.

Returns:

The validated ingest job.

Return type:

IngestJobSchema

Raises:

ValidationError – If the input data does not conform to the IngestJobSchema.
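
Example (a sketch of the failure path; the dictionary is deliberately incomplete, so required fields such as job_payload and tasks are reported by pydantic):

>>> from pydantic import ValidationError
>>> from nv_ingest.schemas import validate_ingest_job
>>> try:
...     validate_ingest_job({"job_id": "demo"})
... except ValidationError as exc:
...     print(type(exc).__name__)
ValidationError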

nv_ingest.schemas.validate_metadata(
metadata: Dict[str, Any],
) MetadataSchema[source]#

Validates the given metadata dictionary against the MetadataSchema.

Parameters:

metadata (Dict[str, Any]) – A dictionary representing metadata to be validated.

Returns:

An instance of MetadataSchema if validation is successful.

Return type:

MetadataSchema

Raises:

ValidationError – If the metadata does not conform to the schema.
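
Example (a sketch of a defensive wrapper; whether a given dictionary passes depends on MetadataSchema, which is documented elsewhere in this package):

>>> from pydantic import ValidationError
>>> from nv_ingest.schemas import validate_metadata
>>> def validate_or_none(metadata: dict):
...     """Return the validated MetadataSchema instance, or None if validation fails."""
...     try:
...         return validate_metadata(metadata)
...     except ValidationError as exc:
...         print(f"invalid metadata: {exc.error_count()} error(s)")
...         return None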