nv_ingest.framework.schemas package#

Submodules#

nv_ingest.framework.schemas.framework_ingest_config_schema module#

class nv_ingest.framework.schemas.framework_ingest_config_schema.PipelineConfigSchema(*, audio_extractor_schema: ~nv_ingest_api.internal.schemas.extract.extract_audio_schema.AudioExtractorSchema = AudioExtractorSchema(max_queue_size=1, n_workers=16, raise_on_failure=False, audio_extraction_config=None), chart_extractor_module: ~nv_ingest_api.internal.schemas.extract.extract_chart_schema.ChartExtractorSchema = ChartExtractorSchema(max_queue_size=1, n_workers=2, raise_on_failure=False, endpoint_config=None), text_splitter_module: ~nv_ingest_api.internal.schemas.transform.transform_text_splitter_schema.TextSplitterSchema = TextSplitterSchema(tokenizer=None, chunk_size=1024, chunk_overlap=150, raise_on_failure=False), embedding_storage_module: ~nv_ingest_api.internal.schemas.store.store_embedding_schema.EmbeddingStorageSchema = EmbeddingStorageSchema(raise_on_failure=False), embed_extractions_module: ~nv_ingest_api.internal.schemas.transform.transform_text_embedding_schema.TextEmbeddingSchema = TextEmbeddingSchema(api_key='api_key', batch_size=4, embedding_model='nvidia/llama-3.2-nv-embedqa-1b-v2', embedding_nim_endpoint='http://embedding:8000/v1', encoding_format='float', httpx_log_level=<LogLevel.WARNING: 'WARNING'>, input_type='passage', raise_on_failure=False, truncate='END'), image_caption_extraction_module: ~nv_ingest_api.internal.schemas.transform.transform_image_caption_schema.ImageCaptionExtractionSchema = ImageCaptionExtractionSchema(api_key='api_key', endpoint_url='https://ai.api.nvidia.com/v1/gr/meta/llama-3.2-11b-vision-instruct/chat/completions', prompt='Caption the content of this image:', model_name='meta/llama-3.2-11b-vision-instruct', raise_on_failure=False), image_dedup_module: ~nv_ingest_api.internal.schemas.mutate.mutate_image_dedup_schema.ImageDedupSchema = ImageDedupSchema(raise_on_failure=False), image_filter_module: ~nv_ingest_api.internal.schemas.transform.transform_image_filter_schema.ImageFilterSchema = 
ImageFilterSchema(raise_on_failure=False, cpu_only=False), image_storage_module: ~nv_ingest_api.internal.schemas.store.store_image_schema.ImageStorageModuleSchema = ImageStorageModuleSchema(structured=True, images=True, raise_on_failure=False), infographic_extractor_module: ~nv_ingest_api.internal.schemas.extract.extract_infographic_schema.InfographicExtractorSchema = InfographicExtractorSchema(max_queue_size=1, n_workers=2, raise_on_failure=False, endpoint_config=None), job_counter_module: ~nv_ingest.framework.schemas.framework_job_counter_schema.JobCounterSchema = JobCounterSchema(name='job_counter', raise_on_failure=False), metadata_injection_module: ~nv_ingest.framework.schemas.framework_metadata_injector_schema.MetadataInjectorSchema = MetadataInjectorSchema(raise_on_failure=False), otel_meter_module: ~nv_ingest.framework.schemas.framework_otel_meter_schema.OpenTelemetryMeterSchema = OpenTelemetryMeterSchema(broker_client=MessageBrokerClientSchema(host='redis', port=6379, client_type='redis', broker_params={}, connection_timeout=300, max_backoff=300, max_retries=0), otel_endpoint='localhost:4317', raise_on_failure=False), otel_tracer_module: ~nv_ingest.framework.schemas.framework_otel_tracer_schema.OpenTelemetryTracerSchema = OpenTelemetryTracerSchema(otel_endpoint='localhost:4317', raise_on_failure=False), pdf_extractor_module: ~nv_ingest_api.internal.schemas.extract.extract_pdf_schema.PDFExtractorSchema = PDFExtractorSchema(max_queue_size=1, n_workers=16, raise_on_failure=False, pdfium_config=None, nemoretriever_parse_config=None), pptx_extractor_module: ~nv_ingest_api.internal.schemas.extract.extract_pptx_schema.PPTXExtractorSchema = PPTXExtractorSchema(max_queue_size=1, n_workers=16, raise_on_failure=False, pptx_extraction_config=None), redis_task_sink: ~nv_ingest.framework.schemas.framework_message_broker_sink_schema.MessageBrokerTaskSinkSchema = MessageBrokerTaskSinkSchema(broker_client=MessageBrokerClientSchema(host='redis', port=6379, 
client_type='redis', broker_params={}, connection_timeout=300, max_backoff=300, max_retries=0), raise_on_failure=False, progress_engines=6), redis_task_source: ~nv_ingest.framework.schemas.framework_message_broker_source_schema.MessageBrokerTaskSourceSchema = MessageBrokerTaskSourceSchema(broker_client=MessageBrokerClientSchema(host='redis', port=6379, client_type='redis', broker_params={}, connection_timeout=300, max_backoff=300, max_retries=0), task_queue='ingest_task_queue', raise_on_failure=False, progress_engines=6), table_extractor_module: ~nv_ingest_api.internal.schemas.extract.extract_table_schema.TableExtractorSchema = TableExtractorSchema(max_queue_size=1, n_workers=2, raise_on_failure=False, endpoint_config=None), vdb_task_sink: ~nv_ingest.framework.schemas.framework_vdb_task_sink_schema.VdbTaskSinkSchema = VdbTaskSinkSchema(recreate=False, service='milvus', is_service_serialized=False, default_resource_name='nv_ingest_collection', resource_schemas={'nv_ingest_collection': {'index_conf': {'field_name': 'vector', 'metric_type': 'L2', 'index_type': 'GPU_CAGRA', 'params': {'intermediate_graph_degree': 128, 'graph_degree': 64, 'build_algo': 'NN_DESCENT'}}, 'schema_conf': {'enable_dynamic_field': True, 'schema_fields': [{'name': 'pk', 'description': 'Primary key for the collection', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': True}, {'name': 'text', 'description': 'Extracted content', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 65535}}, {'name': 'vector', 'description': 'Embedding vectors', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 1024}}, {'name': 'source', 'description': 'Source document and raw data extracted content', 'type': <DataType.JSON: 23>}, {'name': 'content_metadata', 'description': 'Content metadata', 'type': <DataType.JSON: 23>}], 'description': 'NV-INGEST collection schema'}}}, resource_kwargs={}, service_kwargs={}, batch_size=5120, write_time_interval=1.0, retry_interval=60.0, raise_on_failure=False, 
progress_engines=1))[source]#

Bases: BaseModel

audio_extractor_schema: AudioExtractorSchema#
chart_extractor_module: ChartExtractorSchema#
embed_extractions_module: TextEmbeddingSchema#
embedding_storage_module: EmbeddingStorageSchema#
image_caption_extraction_module: ImageCaptionExtractionSchema#
image_dedup_module: ImageDedupSchema#
image_filter_module: ImageFilterSchema#
image_storage_module: ImageStorageModuleSchema#
infographic_extractor_module: InfographicExtractorSchema#
job_counter_module: JobCounterSchema#
metadata_injection_module: MetadataInjectorSchema#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to pydantic's ConfigDict (pydantic.config.ConfigDict).

otel_meter_module: OpenTelemetryMeterSchema#
otel_tracer_module: OpenTelemetryTracerSchema#
pdf_extractor_module: PDFExtractorSchema#
pptx_extractor_module: PPTXExtractorSchema#
redis_task_sink: MessageBrokerTaskSinkSchema#
redis_task_source: MessageBrokerTaskSourceSchema#
table_extractor_module: TableExtractorSchema#
text_splitter_module: TextSplitterSchema#
vdb_task_sink: VdbTaskSinkSchema#

nv_ingest.framework.schemas.framework_job_counter_schema module#

class nv_ingest.framework.schemas.framework_job_counter_schema.JobCounterSchema(
*,
name: str = 'job_counter',
raise_on_failure: bool = False,
)[source]#

Bases: BaseModel

model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to pydantic's ConfigDict (pydantic.config.ConfigDict).

name: str#
raise_on_failure: bool#

nv_ingest.framework.schemas.framework_message_broker_sink_schema module#

class nv_ingest.framework.schemas.framework_message_broker_sink_schema.MessageBrokerTaskSinkSchema(
*,
broker_client: MessageBrokerClientSchema = MessageBrokerClientSchema(host='redis', port=6379, client_type='redis', broker_params={}, connection_timeout=300, max_backoff=300, max_retries=0),
raise_on_failure: bool = False,
progress_engines: Annotated[int, Ge(ge=1)] = 6,
)[source]#

Bases: BaseModel

broker_client: MessageBrokerClientSchema#
model_config: ClassVar[ConfigDict] = {}#

Configuration for the model, should be a dictionary conforming to pydantic's ConfigDict (pydantic.config.ConfigDict).

progress_engines: Annotated[int, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=1)])]#
raise_on_failure: bool#

nv_ingest.framework.schemas.framework_message_broker_source_schema module#

class nv_ingest.framework.schemas.framework_message_broker_source_schema.MessageBrokerTaskSourceSchema(
*,
broker_client: MessageBrokerClientSchema = MessageBrokerClientSchema(host='redis', port=6379, client_type='redis', broker_params={}, connection_timeout=300, max_backoff=300, max_retries=0),
task_queue: str = 'ingest_task_queue',
raise_on_failure: bool = False,
progress_engines: Annotated[int, Ge(ge=1)] = 6,
)[source]#

Bases: BaseModel

broker_client: MessageBrokerClientSchema#
model_config: ClassVar[ConfigDict] = {}#

Configuration for the model, should be a dictionary conforming to pydantic's ConfigDict (pydantic.config.ConfigDict).

progress_engines: Annotated[int, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=1)])]#
raise_on_failure: bool#
task_queue: str#

nv_ingest.framework.schemas.framework_message_wrapper_schema module#

class nv_ingest.framework.schemas.framework_message_wrapper_schema.MessageWrapper(*, payload: str)[source]#

Bases: BaseModel

model_config: ClassVar[ConfigDict] = {}#

Configuration for the model, should be a dictionary conforming to pydantic's ConfigDict (pydantic.config.ConfigDict).

payload: str#

nv_ingest.framework.schemas.framework_metadata_injector_schema module#

class nv_ingest.framework.schemas.framework_metadata_injector_schema.MetadataInjectorSchema(*, raise_on_failure: bool = False)[source]#

Bases: BaseModel

model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to pydantic's ConfigDict (pydantic.config.ConfigDict).

raise_on_failure: bool#

nv_ingest.framework.schemas.framework_otel_meter_schema module#

class nv_ingest.framework.schemas.framework_otel_meter_schema.OpenTelemetryMeterSchema(
*,
broker_client: MessageBrokerClientSchema = MessageBrokerClientSchema(host='redis', port=6379, client_type='redis', broker_params={}, connection_timeout=300, max_backoff=300, max_retries=0),
otel_endpoint: str = 'localhost:4317',
raise_on_failure: bool = False,
)[source]#

Bases: BaseModel

broker_client: MessageBrokerClientSchema#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to pydantic's ConfigDict (pydantic.config.ConfigDict).

otel_endpoint: str#
raise_on_failure: bool#

nv_ingest.framework.schemas.framework_otel_tracer_schema module#

class nv_ingest.framework.schemas.framework_otel_tracer_schema.OpenTelemetryTracerSchema(
*,
otel_endpoint: str = 'localhost:4317',
raise_on_failure: bool = False,
)[source]#

Bases: BaseModel

model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to pydantic's ConfigDict (pydantic.config.ConfigDict).

otel_endpoint: str#
raise_on_failure: bool#

nv_ingest.framework.schemas.framework_processing_job_schema module#

class nv_ingest.framework.schemas.framework_processing_job_schema.ConversionStatus(value)[source]#

Bases: str, Enum

An enumeration of the possible states of a conversion job: SUCCESS, FAILED, or IN_PROGRESS.

FAILED = 'failed'#
IN_PROGRESS = 'in_progress'#
SUCCESS = 'success'#
model_config = {'extra': 'forbid'}#
class nv_ingest.framework.schemas.framework_processing_job_schema.ProcessingJob(
*,
submitted_job_id: str,
filename: str,
raw_result: str = '',
content: str = '',
status: ConversionStatus,
error: str | None = None,
)[source]#

Bases: BaseModel

content: str#
error: str | None#
filename: str#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to pydantic's ConfigDict (pydantic.config.ConfigDict).

raw_result: str#
status: ConversionStatus#
submitted_job_id: str#

nv_ingest.framework.schemas.framework_task_injection_schema module#

class nv_ingest.framework.schemas.framework_task_injection_schema.TaskInjectionSchema(*, raise_on_failure: bool = False)[source]#

Bases: BaseModel

model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to pydantic's ConfigDict (pydantic.config.ConfigDict).

raise_on_failure: bool#

nv_ingest.framework.schemas.framework_vdb_task_sink_schema module#

class nv_ingest.framework.schemas.framework_vdb_task_sink_schema.VdbTaskSinkSchema(
*,
recreate: bool = False,
service: str = 'milvus',
is_service_serialized: bool = False,
default_resource_name: str = 'nv_ingest_collection',
resource_schemas: dict = {'nv_ingest_collection': {'index_conf': {'field_name': 'vector',
'index_type': 'GPU_CAGRA',
'metric_type': 'L2',
'params': {'build_algo': 'NN_DESCENT',
'graph_degree': 64,
'intermediate_graph_degree': 128}},
'schema_conf': {'description': 'NV-INGEST collection schema',
'enable_dynamic_field': True,
'schema_fields': [{'auto_id': True,
'description': 'Primary key for the collection',
'is_primary': True,
'name': 'pk',
'type': DataType.INT64},
{'description': 'Extracted content',
'name': 'text',
'params': {'max_length': 65535},
'type': DataType.VARCHAR},
{'description': 'Embedding vectors',
'name': 'vector',
'params': {'dim': 1024},
'type': DataType.FLOAT_VECTOR},
{'description': 'Source document and raw data extracted content',
'name': 'source',
'type': DataType.JSON},
{'description': 'Content metadata',
'name': 'content_metadata',
'type': DataType.JSON}]}}},
resource_kwargs: dict = <factory>,
service_kwargs: dict = {},
batch_size: int = 5120,
write_time_interval: float = 1.0,
retry_interval: float = 60.0,
raise_on_failure: bool = False,
progress_engines: ~typing.Annotated[int,
~annotated_types.Ge(ge=1)] = 1,
)[source]#

Bases: BaseModel

batch_size: int#
default_resource_name: str#
is_service_serialized: bool#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to pydantic's ConfigDict (pydantic.config.ConfigDict).

progress_engines: Annotated[int, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=1)])]#
raise_on_failure: bool#
recreate: bool#
resource_kwargs: dict#
resource_schemas: dict#
retry_interval: float#
service: str#
service_kwargs: dict#
classmethod validate_resource_name(to_validate)[source]#
classmethod validate_service(to_validate)[source]#
write_time_interval: float#
nv_ingest.framework.schemas.framework_vdb_task_sink_schema.build_default_milvus_config(
embedding_size: int = 1024,
) Dict[str, Any][source]#

Builds the configuration for Milvus.

This function creates a dictionary configuration for a Milvus collection. It includes the index configuration and the schema configuration, with fields such as pk, text, vector, source, and content_metadata.

Parameters:

embedding_size (int) – The size of the embedding vector.

Returns:

A dictionary containing the configuration settings for Milvus.

Return type:

Dict[str, Any]

Module contents#