nv_ingest.framework.schemas package#

Submodules#

nv_ingest.framework.schemas.framework_ingest_config_schema module#

class nv_ingest.framework.schemas.framework_ingest_config_schema.PipelineConfigSchema(*, audio_extractor_schema: ~nv_ingest_api.internal.schemas.extract.extract_audio_schema.AudioExtractorSchema = AudioExtractorSchema(max_queue_size=1, n_workers=16, raise_on_failure=False, audio_extraction_config=None), chart_extractor_module: ~nv_ingest_api.internal.schemas.extract.extract_chart_schema.ChartExtractorSchema = ChartExtractorSchema(max_queue_size=1, n_workers=2, raise_on_failure=False, endpoint_config=None), text_splitter_module: ~nv_ingest_api.internal.schemas.transform.transform_text_splitter_schema.TextSplitterSchema = TextSplitterSchema(tokenizer=None, chunk_size=1024, chunk_overlap=150, raise_on_failure=False), embedding_storage_module: ~nv_ingest_api.internal.schemas.store.store_embedding_schema.EmbeddingStorageSchema = EmbeddingStorageSchema(raise_on_failure=False), embed_extractions_module: ~nv_ingest_api.internal.schemas.transform.transform_text_embedding_schema.TextEmbeddingSchema = TextEmbeddingSchema(api_key='api_key', batch_size=4, embedding_model='nvidia/llama-3.2-nv-embedqa-1b-v2', embedding_nim_endpoint='http://embedding:8000/v1', encoding_format='float', httpx_log_level=<LogLevel.WARNING: 'WARNING'>, input_type='passage', raise_on_failure=False, truncate='END'), image_caption_extraction_module: ~nv_ingest_api.internal.schemas.transform.transform_image_caption_schema.ImageCaptionExtractionSchema = ImageCaptionExtractionSchema(api_key='api_key', endpoint_url='https://ai.api.nvidia.com/v1/gr/meta/llama-3.2-11b-vision-instruct/chat/completions', prompt='Caption the content of this image:', model_name='meta/llama-3.2-11b-vision-instruct', raise_on_failure=False), image_dedup_module: ~nv_ingest_api.internal.schemas.mutate.mutate_image_dedup_schema.ImageDedupSchema = ImageDedupSchema(raise_on_failure=False), image_filter_module: ~nv_ingest_api.internal.schemas.transform.transform_image_filter_schema.ImageFilterSchema = 
ImageFilterSchema(raise_on_failure=False, cpu_only=False), image_storage_module: ~nv_ingest_api.internal.schemas.store.store_image_schema.ImageStorageModuleSchema = ImageStorageModuleSchema(structured=True, images=True, raise_on_failure=False), infographic_extractor_module: ~nv_ingest_api.internal.schemas.extract.extract_infographic_schema.InfographicExtractorSchema = InfographicExtractorSchema(max_queue_size=1, n_workers=2, raise_on_failure=False, endpoint_config=None), job_counter_module: ~nv_ingest.framework.schemas.framework_job_counter_schema.JobCounterSchema = JobCounterSchema(name='job_counter', raise_on_failure=False), metadata_injection_module: ~nv_ingest.framework.schemas.framework_metadata_injector_schema.MetadataInjectorSchema = MetadataInjectorSchema(raise_on_failure=False), otel_meter_module: ~nv_ingest.framework.schemas.framework_otel_meter_schema.OpenTelemetryMeterSchema = OpenTelemetryMeterSchema(broker_client=MessageBrokerClientSchema(host='redis', port=6379, client_type='redis', broker_params={}, connection_timeout=300, max_backoff=300, max_retries=0), otel_endpoint='localhost:4317', raise_on_failure=False), otel_tracer_module: ~nv_ingest.framework.schemas.framework_otel_tracer_schema.OpenTelemetryTracerSchema = OpenTelemetryTracerSchema(otel_endpoint='localhost:4317', raise_on_failure=False), pdf_extractor_module: ~nv_ingest_api.internal.schemas.extract.extract_pdf_schema.PDFExtractorSchema = PDFExtractorSchema(max_queue_size=1, n_workers=16, raise_on_failure=False, pdfium_config=None, nemoretriever_parse_config=None), pptx_extractor_module: ~nv_ingest_api.internal.schemas.extract.extract_pptx_schema.PPTXExtractorSchema = PPTXExtractorSchema(max_queue_size=1, n_workers=16, raise_on_failure=False, pptx_extraction_config=None), redis_task_sink: ~nv_ingest.framework.schemas.framework_message_broker_sink_schema.MessageBrokerTaskSinkSchema = MessageBrokerTaskSinkSchema(broker_client=MessageBrokerClientSchema(host='redis', port=6379, 
client_type='redis', broker_params={}, connection_timeout=300, max_backoff=300, max_retries=0), raise_on_failure=False, progress_engines=6), redis_task_source: ~nv_ingest.framework.schemas.framework_message_broker_source_schema.MessageBrokerTaskSourceSchema = MessageBrokerTaskSourceSchema(broker_client=MessageBrokerClientSchema(host='redis', port=6379, client_type='redis', broker_params={}, connection_timeout=300, max_backoff=300, max_retries=0), task_queue='ingest_task_queue', raise_on_failure=False, progress_engines=6), table_extractor_module: ~nv_ingest_api.internal.schemas.extract.extract_table_schema.TableExtractorSchema = TableExtractorSchema(max_queue_size=1, n_workers=2, raise_on_failure=False, endpoint_config=None), vdb_task_sink: ~nv_ingest.framework.schemas.framework_vdb_task_sink_schema.VdbTaskSinkSchema = VdbTaskSinkSchema(recreate=False, service='milvus', is_service_serialized=False, default_resource_name='nv_ingest_collection', resource_schemas={'nv_ingest_collection': {'index_conf': {'field_name': 'vector', 'metric_type': 'L2', 'index_type': 'GPU_CAGRA', 'params': {'intermediate_graph_degree': 128, 'graph_degree': 64, 'build_algo': 'NN_DESCENT'}}, 'schema_conf': {'enable_dynamic_field': True, 'schema_fields': [{'name': 'pk', 'description': 'Primary key for the collection', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': True}, {'name': 'text', 'description': 'Extracted content', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 65535}}, {'name': 'vector', 'description': 'Embedding vectors', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 1024}}, {'name': 'source', 'description': 'Source document and raw data extracted content', 'type': <DataType.JSON: 23>}, {'name': 'content_metadata', 'description': 'Content metadata', 'type': <DataType.JSON: 23>}], 'description': 'NV-INGEST collection schema'}}}, resource_kwargs={}, service_kwargs={}, batch_size=5120, write_time_interval=1.0, retry_interval=60.0, raise_on_failure=False, 
progress_engines=1))[source]#

Bases: BaseModel

audio_extractor_schema: AudioExtractorSchema#
chart_extractor_module: ChartExtractorSchema#
embed_extractions_module: TextEmbeddingSchema#
embedding_storage_module: EmbeddingStorageSchema#
image_caption_extraction_module: ImageCaptionExtractionSchema#
image_dedup_module: ImageDedupSchema#
image_filter_module: ImageFilterSchema#
image_storage_module: ImageStorageModuleSchema#
infographic_extractor_module: InfographicExtractorSchema#
job_counter_module: JobCounterSchema#
metadata_injection_module: MetadataInjectorSchema#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to pydantic's ConfigDict (pydantic.config.ConfigDict).

otel_meter_module: OpenTelemetryMeterSchema#
otel_tracer_module: OpenTelemetryTracerSchema#
pdf_extractor_module: PDFExtractorSchema#
pptx_extractor_module: PPTXExtractorSchema#
redis_task_sink: MessageBrokerTaskSinkSchema#
redis_task_source: MessageBrokerTaskSourceSchema#
table_extractor_module: TableExtractorSchema#
text_splitter_module: TextSplitterSchema#
vdb_task_sink: VdbTaskSinkSchema#

nv_ingest.framework.schemas.framework_job_counter_schema module#

class nv_ingest.framework.schemas.framework_job_counter_schema.JobCounterSchema(
*,
name: str = 'job_counter',
raise_on_failure: bool = False,
)[source]#

Bases: BaseModel

model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to pydantic's ConfigDict (pydantic.config.ConfigDict).

name: str#
raise_on_failure: bool#

nv_ingest.framework.schemas.framework_message_broker_sink_schema module#

class nv_ingest.framework.schemas.framework_message_broker_sink_schema.MessageBrokerTaskSinkSchema(
*,
broker_client: MessageBrokerClientSchema = MessageBrokerClientSchema(host='redis', port=6379, client_type='redis', broker_params={}, connection_timeout=300, max_backoff=300, max_retries=0),
raise_on_failure: bool = False,
progress_engines: Annotated[int, Ge(ge=1)] = 6,
)[source]#

Bases: BaseModel

broker_client: MessageBrokerClientSchema#
model_config: ClassVar[ConfigDict] = {}#

Configuration for the model, should be a dictionary conforming to pydantic's ConfigDict (pydantic.config.ConfigDict).

progress_engines: Annotated[int, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=1)])]#
raise_on_failure: bool#

nv_ingest.framework.schemas.framework_message_broker_source_schema module#

class nv_ingest.framework.schemas.framework_message_broker_source_schema.MessageBrokerTaskSourceSchema(
*,
broker_client: MessageBrokerClientSchema = MessageBrokerClientSchema(host='redis', port=6379, client_type='redis', broker_params={}, connection_timeout=300, max_backoff=300, max_retries=0),
task_queue: str = 'ingest_task_queue',
raise_on_failure: bool = False,
progress_engines: Annotated[int, Ge(ge=1)] = 6,
)[source]#

Bases: BaseModel

broker_client: MessageBrokerClientSchema#
model_config: ClassVar[ConfigDict] = {}#

Configuration for the model, should be a dictionary conforming to pydantic's ConfigDict (pydantic.config.ConfigDict).

progress_engines: Annotated[int, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=1)])]#
raise_on_failure: bool#
task_queue: str#

nv_ingest.framework.schemas.framework_message_wrapper_schema module#

class nv_ingest.framework.schemas.framework_message_wrapper_schema.MessageWrapper(*, payload: str)[source]#

Bases: BaseModel

model_config: ClassVar[ConfigDict] = {}#

Configuration for the model, should be a dictionary conforming to pydantic's ConfigDict (pydantic.config.ConfigDict).

payload: str#

nv_ingest.framework.schemas.framework_metadata_injector_schema module#

class nv_ingest.framework.schemas.framework_metadata_injector_schema.MetadataInjectorSchema(*, raise_on_failure: bool = False)[source]#

Bases: BaseModel

model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to pydantic's ConfigDict (pydantic.config.ConfigDict).

raise_on_failure: bool#

nv_ingest.framework.schemas.framework_otel_meter_schema module#

class nv_ingest.framework.schemas.framework_otel_meter_schema.OpenTelemetryMeterSchema(
*,
broker_client: MessageBrokerClientSchema = MessageBrokerClientSchema(host='redis', port=6379, client_type='redis', broker_params={}, connection_timeout=300, max_backoff=300, max_retries=0),
otel_endpoint: str = 'localhost:4317',
raise_on_failure: bool = False,
)[source]#

Bases: BaseModel

broker_client: MessageBrokerClientSchema#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to pydantic's ConfigDict (pydantic.config.ConfigDict).

otel_endpoint: str#
raise_on_failure: bool#

nv_ingest.framework.schemas.framework_otel_tracer_schema module#

class nv_ingest.framework.schemas.framework_otel_tracer_schema.OpenTelemetryTracerSchema(
*,
otel_endpoint: str = 'localhost:4317',
raise_on_failure: bool = False,
)[source]#

Bases: BaseModel

model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to pydantic's ConfigDict (pydantic.config.ConfigDict).

otel_endpoint: str#
raise_on_failure: bool#

nv_ingest.framework.schemas.framework_processing_job_schema module#

class nv_ingest.framework.schemas.framework_processing_job_schema.ConversionStatus(value)[source]#

Bases: str, Enum

An enumeration of the possible states of a conversion job: SUCCESS, FAILED, or IN_PROGRESS.

FAILED = 'failed'#
IN_PROGRESS = 'in_progress'#
SUCCESS = 'success'#
model_config = {'extra': 'forbid'}#
class nv_ingest.framework.schemas.framework_processing_job_schema.ProcessingJob(
*,
submitted_job_id: str,
filename: str,
raw_result: str = '',
content: str = '',
status: ConversionStatus,
error: str | None = None,
)[source]#

Bases: BaseModel

content: str#
error: str | None#
filename: str#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to pydantic's ConfigDict (pydantic.config.ConfigDict).

raw_result: str#
status: ConversionStatus#
submitted_job_id: str#

nv_ingest.framework.schemas.framework_task_injection_schema module#

class nv_ingest.framework.schemas.framework_task_injection_schema.TaskInjectionSchema(*, raise_on_failure: bool = False)[source]#

Bases: BaseModel

model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to pydantic's ConfigDict (pydantic.config.ConfigDict).

raise_on_failure: bool#

nv_ingest.framework.schemas.framework_vdb_task_sink_schema module#

class nv_ingest.framework.schemas.framework_vdb_task_sink_schema.VdbTaskSinkSchema(
*,
recreate: bool = False,
service: str = 'milvus',
is_service_serialized: bool = False,
default_resource_name: str = 'nv_ingest_collection',
resource_schemas: dict = {'nv_ingest_collection': {'index_conf': {'field_name': 'vector',
'index_type': 'GPU_CAGRA',
'metric_type': 'L2',
'params': {'build_algo': 'NN_DESCENT',
'graph_degree': 64,
'intermediate_graph_degree': 128}},
'schema_conf': {'description': 'NV-INGEST collection schema',
'enable_dynamic_field': True,
'schema_fields': [{'auto_id': True,
'description': 'Primary key for the collection',
'is_primary': True,
'name': 'pk',
'type': DataType.INT64},
{'description': 'Extracted content',
'name': 'text',
'params': {'max_length': 65535},
'type': DataType.VARCHAR},
{'description': 'Embedding vectors',
'name': 'vector',
'params': {'dim': 1024},
'type': DataType.FLOAT_VECTOR},
{'description': 'Source document and raw data extracted content',
'name': 'source',
'type': DataType.JSON},
{'description': 'Content metadata',
'name': 'content_metadata',
'type': DataType.JSON}]}}},
resource_kwargs: dict = <factory>,
service_kwargs: dict = {},
batch_size: int = 5120,
write_time_interval: float = 1.0,
retry_interval: float = 60.0,
raise_on_failure: bool = False,
progress_engines: ~typing.Annotated[int,
~annotated_types.Ge(ge=1)] = 1,
)[source]#

Bases: BaseModel

batch_size: int#
default_resource_name: str#
is_service_serialized: bool#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model, should be a dictionary conforming to pydantic's ConfigDict (pydantic.config.ConfigDict).

progress_engines: Annotated[int, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=1)])]#
raise_on_failure: bool#
recreate: bool#
resource_kwargs: dict#
resource_schemas: dict#
retry_interval: float#
service: str#
service_kwargs: dict#
classmethod validate_resource_name(to_validate)[source]#
classmethod validate_service(to_validate)[source]#
write_time_interval: float#
nv_ingest.framework.schemas.framework_vdb_task_sink_schema.build_default_milvus_config(
embedding_size: int = 1024,
) Dict[str, Any][source]#

Builds the configuration for Milvus.

This function creates a dictionary configuration for a Milvus collection. It includes the index configuration and the schema configuration, with fields such as pk, text, vector, source, and content_metadata.

Parameters:

embedding_size (int) – The size of the embedding vector.

Returns:

A dictionary containing the configuration settings for Milvus.

Return type:

Dict[str, Any]

Module contents#