nv_ingest.framework.schemas package#
Submodules#
nv_ingest.framework.schemas.framework_ingest_config_schema module#
- pydantic model nv_ingest.framework.schemas.framework_ingest_config_schema.PipelineConfigSchema[source]#
Bases: BaseModel
Show JSON schema
{ "title": "PipelineConfigSchema", "type": "object", "properties": { "audio_extractor_schema": { "$ref": "#/$defs/AudioExtractorSchema", "default": { "max_queue_size": 1, "n_workers": 16, "raise_on_failure": false, "audio_extraction_config": null } }, "chart_extractor_module": { "$ref": "#/$defs/ChartExtractorSchema", "default": { "max_queue_size": 1, "n_workers": 2, "raise_on_failure": false, "endpoint_config": null } }, "text_splitter_module": { "$ref": "#/$defs/TextSplitterSchema", "default": { "tokenizer": null, "chunk_size": 1024, "chunk_overlap": 150, "raise_on_failure": false } }, "embedding_storage_module": { "$ref": "#/$defs/EmbeddingStorageSchema", "default": { "raise_on_failure": false } }, "embed_extractions_module": { "$ref": "#/$defs/TextEmbeddingSchema", "default": { "api_key": "", "batch_size": 4, "embedding_model": "nvidia/llama-3.2-nv-embedqa-1b-v2", "embedding_nim_endpoint": "http://embedding:8000/v1", "encoding_format": "float", "httpx_log_level": "WARNING", "input_type": "passage", "raise_on_failure": false, "truncate": "END", "text_elements_modality": "text", "image_elements_modality": "text", "structured_elements_modality": "text", "audio_elements_modality": "text", "custom_content_field": null, "result_target_field": null, "dimensions": null } }, "image_caption_extraction_module": { "$ref": "#/$defs/ImageCaptionExtractionSchema", "default": { "api_key": "", "endpoint_url": "https://integrate.api.nvidia.com/v1/chat/completions", "prompt": "Caption the content of this image:", "system_prompt": "/no_think", "model_name": "nvidia/nemotron-nano-12b-v2-vl", "raise_on_failure": false } }, "image_dedup_module": { "$ref": "#/$defs/ImageDedupSchema", "default": { "raise_on_failure": false } }, "image_filter_module": { "$ref": "#/$defs/ImageFilterSchema", "default": { "raise_on_failure": false, "cpu_only": false } }, "image_storage_module": { "$ref": "#/$defs/ImageStorageModuleSchema", "default": { "structured": true, "images": true, "storage_uri": 
"s3://nv-ingest/artifacts/store/images", "storage_options": {}, "public_base_url": null, "raise_on_failure": false } }, "infographic_extractor_module": { "$ref": "#/$defs/InfographicExtractorSchema", "default": { "max_queue_size": 1, "n_workers": 2, "raise_on_failure": false, "endpoint_config": null } }, "job_counter_module": { "$ref": "#/$defs/JobCounterSchema", "default": { "name": "job_counter", "raise_on_failure": false } }, "metadata_injection_module": { "$ref": "#/$defs/MetadataInjectorSchema", "default": { "raise_on_failure": false } }, "otel_meter_module": { "$ref": "#/$defs/OpenTelemetryMeterSchema", "default": { "broker_client": { "broker_params": {}, "client_type": "redis", "connection_timeout": 300, "host": "redis", "max_backoff": 300, "max_retries": 0, "port": 6379 }, "otel_endpoint": "localhost:4317", "raise_on_failure": false } }, "otel_tracer_module": { "$ref": "#/$defs/OpenTelemetryTracerSchema", "default": { "otel_endpoint": "localhost:4317", "raise_on_failure": false } }, "pdf_extractor_module": { "$ref": "#/$defs/PDFExtractorSchema", "default": { "max_queue_size": 1, "n_workers": 16, "raise_on_failure": false, "pdfium_config": null, "nemotron_parse_config": null } }, "pptx_extractor_module": { "$ref": "#/$defs/PPTXExtractorSchema", "default": { "max_queue_size": 1, "n_workers": 16, "raise_on_failure": false, "pptx_extraction_config": null, "pdfium_config": null } }, "redis_task_sink": { "$ref": "#/$defs/MessageBrokerTaskSinkSchema", "default": { "broker_client": { "broker_params": {}, "client_type": "redis", "connection_timeout": 300, "host": "redis", "max_backoff": 300, "max_retries": 0, "port": 6379 }, "raise_on_failure": false, "progress_engines": 6 } }, "redis_task_source": { "$ref": "#/$defs/MessageBrokerTaskSourceSchema", "default": { "broker_client": { "broker_params": {}, "client_type": "redis", "connection_timeout": 300, "host": "redis", "max_backoff": 300, "max_retries": 0, "port": 6379 }, "task_queue": "ingest_task_queue", 
"raise_on_failure": false, "progress_engines": 6 } }, "table_extractor_module": { "$ref": "#/$defs/TableExtractorSchema", "default": { "max_queue_size": 1, "n_workers": 2, "raise_on_failure": false, "endpoint_config": null } }, "vdb_task_sink": { "$ref": "#/$defs/VdbTaskSinkSchema", "default": { "recreate": false, "service": "milvus", "is_service_serialized": false, "default_resource_name": "nv_ingest_collection", "resource_schemas": { "nv_ingest_collection": { "index_conf": { "field_name": "vector", "index_type": "GPU_CAGRA", "metric_type": "L2", "params": { "build_algo": "NN_DESCENT", "graph_degree": 64, "intermediate_graph_degree": 128 } }, "schema_conf": { "description": "NV-INGEST collection schema", "enable_dynamic_field": true, "schema_fields": [ { "auto_id": true, "description": "Primary key for the collection", "is_primary": true, "name": "pk", "type": 5 }, { "description": "Extracted content", "name": "text", "params": { "max_length": 65535 }, "type": 21 }, { "description": "Embedding vectors", "name": "vector", "params": { "dim": 1024 }, "type": 101 }, { "description": "Source document and raw data extracted content", "name": "source", "type": 23 }, { "description": "Content metadata", "name": "content_metadata", "type": 23 } ] } } }, "resource_kwargs": {}, "service_kwargs": {}, "batch_size": 5120, "write_time_interval": 1.0, "retry_interval": 60.0, "raise_on_failure": false, "progress_engines": 1 } } }, "$defs": { "AudioConfigSchema": { "additionalProperties": false, "description": "Configuration schema for audio extraction endpoints and options.\n\nParameters\n----------\nauth_token : Optional[str], default=None\n Authentication token required for secure services.\n\naudio_endpoints : Tuple[str, str]\n A tuple containing the gRPC and HTTP services for the audio_retriever endpoint.\n Either the gRPC or HTTP service can be empty, but not both.\n\nMethods\n-------\nvalidate_endpoints(values)\n Validates that at least one of the gRPC or HTTP services is 
provided for each endpoint.\n\nRaises\n------\nValueError\n If both gRPC and HTTP services are empty for any endpoint.\n\nConfig\n------\nextra : str\n Pydantic config option to forbid extra fields.", "properties": { "auth_token": { "anyOf": [ { "type": "string" }, { "type": "null" } ], "default": null, "title": "Auth Token" }, "audio_endpoints": { "default": [ null, null ], "maxItems": 2, "minItems": 2, "prefixItems": [ { "anyOf": [ { "type": "string" }, { "type": "null" } ] }, { "anyOf": [ { "type": "string" }, { "type": "null" } ] } ], "title": "Audio Endpoints", "type": "array" }, "audio_infer_protocol": { "anyOf": [ { "type": "string" }, { "type": "null" } ], "default": null, "title": "Audio Infer Protocol" }, "function_id": { "anyOf": [ { "type": "string" }, { "type": "null" } ], "default": null, "title": "Function Id" }, "use_ssl": { "anyOf": [ { "type": "boolean" }, { "type": "null" } ], "default": null, "title": "Use Ssl" }, "ssl_cert": { "anyOf": [ { "type": "string" }, { "type": "null" } ], "default": null, "title": "Ssl Cert" }, "segment_audio": { "anyOf": [ { "type": "boolean" }, { "type": "null" } ], "default": null, "title": "Segment Audio" } }, "title": "AudioConfigSchema", "type": "object" }, "AudioExtractorSchema": { "additionalProperties": false, "description": "Configuration schema for the audio extractor settings.\n\nParameters\n----------\nmax_queue_size : int, default=1\n The maximum number of items allowed in the processing queue.\n\nn_workers : int, default=16\n The number of worker threads to use for processing.\n\nraise_on_failure : bool, default=False\n A flag indicating whether to raise an exception on processing failure.\n\naudio_extraction_config: Optional[AudioConfigSchema], default=None\n Configuration schema for the audio extraction stage.", "properties": { "max_queue_size": { "default": 1, "title": "Max Queue Size", "type": "integer" }, "n_workers": { "default": 16, "title": "N Workers", "type": "integer" }, "raise_on_failure": { 
"default": false, "title": "Raise On Failure", "type": "boolean" }, "audio_extraction_config": { "anyOf": [ { "$ref": "#/$defs/AudioConfigSchema" }, { "type": "null" } ], "default": null } }, "title": "AudioExtractorSchema", "type": "object" }, "ChartExtractorConfigSchema": { "additionalProperties": false, "description": "Configuration schema for chart extraction service endpoints and options.\n\nParameters\n----------\nauth_token : Optional[str], default=None\n Authentication token required for secure services.\n\nyolox_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)\n A tuple containing the gRPC and HTTP services for the yolox endpoint.\n Either the gRPC or HTTP service can be empty, but not both.\n\nocr_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)\n A tuple containing the gRPC and HTTP services for the ocr endpoint.\n Either the gRPC or HTTP service can be empty, but not both.\n\nMethods\n-------\nvalidate_endpoints(values)\n Validates that at least one of the gRPC or HTTP services is provided for each endpoint.\n\nRaises\n------\nValueError\n If both gRPC and HTTP services are empty for any endpoint.\n\nConfig\n------\nextra : str\n Pydantic config option to forbid extra fields.", "properties": { "auth_token": { "anyOf": [ { "type": "string" }, { "type": "null" } ], "default": null, "title": "Auth Token" }, "yolox_endpoints": { "default": [ null, null ], "maxItems": 2, "minItems": 2, "prefixItems": [ { "anyOf": [ { "type": "string" }, { "type": "null" } ] }, { "anyOf": [ { "type": "string" }, { "type": "null" } ] } ], "title": "Yolox Endpoints", "type": "array" }, "yolox_infer_protocol": { "default": "", "title": "Yolox Infer Protocol", "type": "string" }, "ocr_endpoints": { "default": [ null, null ], "maxItems": 2, "minItems": 2, "prefixItems": [ { "anyOf": [ { "type": "string" }, { "type": "null" } ] }, { "anyOf": [ { "type": "string" }, { "type": "null" } ] } ], "title": "Ocr Endpoints", "type": "array" }, 
"ocr_infer_protocol": { "default": "", "title": "Ocr Infer Protocol", "type": "string" }, "nim_batch_size": { "default": 2, "title": "Nim Batch Size", "type": "integer" }, "workers_per_progress_engine": { "default": 5, "title": "Workers Per Progress Engine", "type": "integer" } }, "title": "ChartExtractorConfigSchema", "type": "object" }, "ChartExtractorSchema": { "additionalProperties": false, "description": "Configuration schema for chart extraction processing settings.\n\nParameters\n----------\nmax_queue_size : int, default=1\n The maximum number of items allowed in the processing queue.\n\nn_workers : int, default=2\n The number of worker threads to use for processing.\n\nraise_on_failure : bool, default=False\n A flag indicating whether to raise an exception if a failure occurs during chart extraction.\n\nextraction_config: Optional[ChartExtractorConfigSchema], default=None\n Configuration for the chart extraction stage, including yolox and ocr service endpoints.", "properties": { "max_queue_size": { "default": 1, "title": "Max Queue Size", "type": "integer" }, "n_workers": { "default": 2, "title": "N Workers", "type": "integer" }, "raise_on_failure": { "default": false, "title": "Raise On Failure", "type": "boolean" }, "endpoint_config": { "anyOf": [ { "$ref": "#/$defs/ChartExtractorConfigSchema" }, { "type": "null" } ], "default": null } }, "title": "ChartExtractorSchema", "type": "object" }, "EmbeddingStorageSchema": { "additionalProperties": false, "properties": { "raise_on_failure": { "default": false, "title": "Raise On Failure", "type": "boolean" } }, "title": "EmbeddingStorageSchema", "type": "object" }, "ImageCaptionExtractionSchema": { "additionalProperties": false, "properties": { "api_key": { "default": "", "title": "Api Key", "type": "string" }, "endpoint_url": { "default": "https://integrate.api.nvidia.com/v1/chat/completions", "title": "Endpoint Url", "type": "string" }, "prompt": { "default": "Caption the content of this image:", "title": 
"Prompt", "type": "string" }, "system_prompt": { "default": "/no_think", "title": "System Prompt", "type": "string" }, "model_name": { "default": "nvidia/nemotron-nano-12b-v2-vl", "title": "Model Name", "type": "string" }, "raise_on_failure": { "default": false, "title": "Raise On Failure", "type": "boolean" } }, "title": "ImageCaptionExtractionSchema", "type": "object" }, "ImageDedupSchema": { "additionalProperties": false, "properties": { "raise_on_failure": { "default": false, "title": "Raise On Failure", "type": "boolean" } }, "title": "ImageDedupSchema", "type": "object" }, "ImageFilterSchema": { "additionalProperties": false, "properties": { "raise_on_failure": { "default": false, "title": "Raise On Failure", "type": "boolean" }, "cpu_only": { "default": false, "title": "Cpu Only", "type": "boolean" } }, "title": "ImageFilterSchema", "type": "object" }, "ImageStorageModuleSchema": { "additionalProperties": false, "properties": { "structured": { "default": true, "title": "Structured", "type": "boolean" }, "images": { "default": true, "title": "Images", "type": "boolean" }, "storage_uri": { "title": "Storage Uri", "type": "string" }, "storage_options": { "additionalProperties": true, "title": "Storage Options", "type": "object" }, "public_base_url": { "anyOf": [ { "type": "string" }, { "type": "null" } ], "default": null, "title": "Public Base Url" }, "raise_on_failure": { "default": false, "title": "Raise On Failure", "type": "boolean" } }, "title": "ImageStorageModuleSchema", "type": "object" }, "InfographicExtractorConfigSchema": { "additionalProperties": false, "description": "Configuration schema for infographic extraction service endpoints and options.\n\nParameters\n----------\nauth_token : Optional[str], default=None\n Authentication token required for secure services.\n\nocr_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)\n A tuple containing the gRPC and HTTP services for the ocr endpoint.\n Either the gRPC or HTTP service can be 
empty, but not both.\n\nMethods\n-------\nvalidate_endpoints(values)\n Validates that at least one of the gRPC or HTTP services is provided for each endpoint.\n\nRaises\n------\nValueError\n If both gRPC and HTTP services are empty for any endpoint.\n\nConfig\n------\nextra : str\n Pydantic config option to forbid extra fields.", "properties": { "auth_token": { "anyOf": [ { "type": "string" }, { "type": "null" } ], "default": null, "title": "Auth Token" }, "ocr_endpoints": { "default": [ null, null ], "maxItems": 2, "minItems": 2, "prefixItems": [ { "anyOf": [ { "type": "string" }, { "type": "null" } ] }, { "anyOf": [ { "type": "string" }, { "type": "null" } ] } ], "title": "Ocr Endpoints", "type": "array" }, "ocr_infer_protocol": { "default": "", "title": "Ocr Infer Protocol", "type": "string" }, "nim_batch_size": { "default": 2, "title": "Nim Batch Size", "type": "integer" }, "workers_per_progress_engine": { "default": 5, "title": "Workers Per Progress Engine", "type": "integer" } }, "title": "InfographicExtractorConfigSchema", "type": "object" }, "InfographicExtractorSchema": { "additionalProperties": false, "description": "Configuration schema for infographic extraction processing settings.\n\nParameters\n----------\nmax_queue_size : int, default=1\n The maximum number of items allowed in the processing queue.\n\nn_workers : int, default=2\n The number of worker threads to use for processing.\n\nraise_on_failure : bool, default=False\n A flag indicating whether to raise an exception if a failure occurs during infographic extraction.\n\nendpoint_config : Optional[InfographicExtractorConfigSchema], default=None\n Configuration for the infographic extraction stage, including the ocr service endpoints.", "properties": { "max_queue_size": { "default": 1, "title": "Max Queue Size", "type": "integer" }, "n_workers": { "default": 2, "title": "N Workers", "type": "integer" }, "raise_on_failure": { "default": false, "title": "Raise On Failure", "type": "boolean" }, 
"endpoint_config": { "anyOf": [ { "$ref": "#/$defs/InfographicExtractorConfigSchema" }, { "type": "null" } ], "default": null } }, "title": "InfographicExtractorSchema", "type": "object" }, "JobCounterSchema": { "additionalProperties": false, "properties": { "name": { "default": "job_counter", "title": "Name", "type": "string" }, "raise_on_failure": { "default": false, "title": "Raise On Failure", "type": "boolean" } }, "title": "JobCounterSchema", "type": "object" }, "LogLevel": { "enum": [ "DEFAULT", "DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL" ], "title": "LogLevel", "type": "string" }, "MessageBrokerClientSchema": { "description": "Configuration schema for message broker client connections.\nSupports Redis or simple in-memory clients.", "properties": { "host": { "default": "redis", "description": "Hostname of the broker service.", "title": "Host", "type": "string" }, "port": { "default": 6379, "description": "Port to connect to. Must be between 1 and 65535.", "exclusiveMaximum": 65536, "exclusiveMinimum": 0, "title": "Port", "type": "integer" }, "client_type": { "default": "redis", "description": "Type of broker client. Supported values: 'redis', 'simple'.", "enum": [ "redis", "simple" ], "title": "Client Type", "type": "string" }, "broker_params": { "anyOf": [ { "additionalProperties": true, "type": "object" }, { "type": "null" } ], "description": "Optional parameters passed to the broker client.", "title": "Broker Params" }, "connection_timeout": { "default": 300, "description": "Connection timeout in seconds. Must be >= 0.", "minimum": 0, "title": "Connection Timeout", "type": "integer" }, "max_backoff": { "default": 300, "description": "Maximum backoff time in seconds. Must be >= 0.", "minimum": 0, "title": "Max Backoff", "type": "integer" }, "max_retries": { "default": 0, "description": "Maximum number of retries. 
Must be >= 0.", "minimum": 0, "title": "Max Retries", "type": "integer" } }, "title": "MessageBrokerClientSchema", "type": "object" }, "MessageBrokerTaskSinkSchema": { "properties": { "broker_client": { "$ref": "#/$defs/MessageBrokerClientSchema", "default": { "host": "redis", "port": 6379, "client_type": "redis", "broker_params": {}, "connection_timeout": 300, "max_backoff": 300, "max_retries": 0 } }, "raise_on_failure": { "default": false, "title": "Raise On Failure", "type": "boolean" }, "progress_engines": { "default": 6, "minimum": 1, "title": "Progress Engines", "type": "integer" } }, "title": "MessageBrokerTaskSinkSchema", "type": "object" }, "MessageBrokerTaskSourceSchema": { "properties": { "broker_client": { "$ref": "#/$defs/MessageBrokerClientSchema", "default": { "host": "redis", "port": 6379, "client_type": "redis", "broker_params": {}, "connection_timeout": 300, "max_backoff": 300, "max_retries": 0 } }, "task_queue": { "default": "ingest_task_queue", "title": "Task Queue", "type": "string" }, "raise_on_failure": { "default": false, "title": "Raise On Failure", "type": "boolean" }, "progress_engines": { "default": 6, "minimum": 1, "title": "Progress Engines", "type": "integer" } }, "title": "MessageBrokerTaskSourceSchema", "type": "object" }, "MetadataInjectorSchema": { "additionalProperties": false, "properties": { "raise_on_failure": { "default": false, "title": "Raise On Failure", "type": "boolean" } }, "title": "MetadataInjectorSchema", "type": "object" }, "NemotronParseConfigSchema": { "additionalProperties": false, "description": "Configuration schema for Nemotron Parse endpoints and options.\n\nParameters\n----------\nauth_token : Optional[str], default=None\n Authentication token required for secure services.\n\nnemotron_parse_endpoints : Tuple[str, str]\n A tuple containing the gRPC and HTTP services for the nemotron_parse endpoint.\n Either the gRPC or HTTP service can be empty, but not both.\n\nMethods\n-------\nvalidate_endpoints(values)\n 
Validates that at least one of the gRPC or HTTP services is provided for each endpoint.\n\nRaises\n------\nValueError\n If both gRPC and HTTP services are empty for any endpoint.\n\nConfig\n------\nextra : str\n Pydantic config option to forbid extra fields.", "properties": { "auth_token": { "anyOf": [ { "type": "string" }, { "type": "null" } ], "default": null, "title": "Auth Token" }, "yolox_endpoints": { "default": [ null, null ], "maxItems": 2, "minItems": 2, "prefixItems": [ { "anyOf": [ { "type": "string" }, { "type": "null" } ] }, { "anyOf": [ { "type": "string" }, { "type": "null" } ] } ], "title": "Yolox Endpoints", "type": "array" }, "yolox_infer_protocol": { "default": "", "title": "Yolox Infer Protocol", "type": "string" }, "nemotron_parse_endpoints": { "default": [ null, null ], "maxItems": 2, "minItems": 2, "prefixItems": [ { "anyOf": [ { "type": "string" }, { "type": "null" } ] }, { "anyOf": [ { "type": "string" }, { "type": "null" } ] } ], "title": "Nemotron Parse Endpoints", "type": "array" }, "nemotron_parse_infer_protocol": { "default": "", "title": "Nemotron Parse Infer Protocol", "type": "string" }, "nemotron_parse_model_name": { "anyOf": [ { "type": "string" }, { "type": "null" } ], "default": "nvidia/nemotron-parse", "title": "Nemotron Parse Model Name" }, "timeout": { "default": 300.0, "title": "Timeout", "type": "number" }, "workers_per_progress_engine": { "default": 5, "title": "Workers Per Progress Engine", "type": "integer" } }, "title": "NemotronParseConfigSchema", "type": "object" }, "OpenTelemetryMeterSchema": { "additionalProperties": false, "properties": { "broker_client": { "$ref": "#/$defs/MessageBrokerClientSchema", "default": { "host": "redis", "port": 6379, "client_type": "redis", "broker_params": {}, "connection_timeout": 300, "max_backoff": 300, "max_retries": 0 } }, "otel_endpoint": { "default": "localhost:4317", "title": "Otel Endpoint", "type": "string" }, "raise_on_failure": { "default": false, "title": "Raise On 
Failure", "type": "boolean" } }, "title": "OpenTelemetryMeterSchema", "type": "object" }, "OpenTelemetryTracerSchema": { "additionalProperties": false, "properties": { "otel_endpoint": { "default": "localhost:4317", "title": "Otel Endpoint", "type": "string" }, "raise_on_failure": { "default": false, "title": "Raise On Failure", "type": "boolean" } }, "title": "OpenTelemetryTracerSchema", "type": "object" }, "PDFExtractorSchema": { "additionalProperties": false, "description": "Configuration schema for the PDF extractor settings.\n\nParameters\n----------\nmax_queue_size : int, default=1\n The maximum number of items allowed in the processing queue.\n\nn_workers : int, default=16\n The number of worker threads to use for processing.\n\nraise_on_failure : bool, default=False\n A flag indicating whether to raise an exception on processing failure.\n\npdfium_config : Optional[PDFiumConfigSchema], default=None\n Configuration for the PDFium service endpoints.", "properties": { "max_queue_size": { "default": 1, "title": "Max Queue Size", "type": "integer" }, "n_workers": { "default": 16, "title": "N Workers", "type": "integer" }, "raise_on_failure": { "default": false, "title": "Raise On Failure", "type": "boolean" }, "pdfium_config": { "anyOf": [ { "$ref": "#/$defs/PDFiumConfigSchema" }, { "type": "null" } ], "default": null }, "nemotron_parse_config": { "anyOf": [ { "$ref": "#/$defs/NemotronParseConfigSchema" }, { "type": "null" } ], "default": null } }, "title": "PDFExtractorSchema", "type": "object" }, "PDFiumConfigSchema": { "additionalProperties": false, "description": "Configuration schema for PDFium endpoints and options.\n\nParameters\n----------\nauth_token : Optional[str], default=None\n Authentication token required for secure services.\n\nyolox_endpoints : Tuple[str, str]\n A tuple containing the gRPC and HTTP services for the yolox endpoint.\n Either the gRPC or HTTP service can be empty, but not both.\n\nMethods\n-------\nvalidate_endpoints(values)\n 
Validates that at least one of the gRPC or HTTP services is provided for each endpoint.\n\nRaises\n------\nValueError\n If both gRPC and HTTP services are empty for any endpoint.\n\nConfig\n------\nextra : str\n Pydantic config option to forbid extra fields.", "properties": { "auth_token": { "anyOf": [ { "type": "string" }, { "type": "null" } ], "default": null, "title": "Auth Token" }, "yolox_endpoints": { "default": [ null, null ], "maxItems": 2, "minItems": 2, "prefixItems": [ { "anyOf": [ { "type": "string" }, { "type": "null" } ] }, { "anyOf": [ { "type": "string" }, { "type": "null" } ] } ], "title": "Yolox Endpoints", "type": "array" }, "yolox_infer_protocol": { "default": "", "title": "Yolox Infer Protocol", "type": "string" }, "nim_batch_size": { "default": 4, "title": "Nim Batch Size", "type": "integer" }, "workers_per_progress_engine": { "default": 5, "title": "Workers Per Progress Engine", "type": "integer" } }, "title": "PDFiumConfigSchema", "type": "object" }, "PPTXConfigSchema": { "additionalProperties": false, "description": "Configuration schema for PPTX extraction endpoints and options.\n\nParameters\n----------\nauth_token : Optional[str], default=None\n Authentication token required for secure services.\n\nyolox_endpoints : Tuple[str, str]\n A tuple containing the gRPC and HTTP services for the yolox endpoint.\n Either the gRPC or HTTP service can be empty, but not both.\n\nMethods\n-------\nvalidate_endpoints(values)\n Validates that at least one of the gRPC or HTTP services is provided for each endpoint.\n\nRaises\n------\nValueError\n If both gRPC and HTTP services are empty for any endpoint.\n\nConfig\n------\nextra : str\n Pydantic config option to forbid extra fields.", "properties": { "auth_token": { "anyOf": [ { "type": "string" }, { "type": "null" } ], "default": null, "title": "Auth Token" }, "yolox_endpoints": { "default": [ null, null ], "maxItems": 2, "minItems": 2, "prefixItems": [ { "anyOf": [ { "type": "string" }, { "type": 
"null" } ] }, { "anyOf": [ { "type": "string" }, { "type": "null" } ] } ], "title": "Yolox Endpoints", "type": "array" }, "yolox_infer_protocol": { "default": "", "title": "Yolox Infer Protocol", "type": "string" } }, "title": "PPTXConfigSchema", "type": "object" }, "PPTXExtractorSchema": { "additionalProperties": false, "description": "Configuration schema for the PPTX extractor settings.\n\nParameters\n----------\nmax_queue_size : int, default=1\n The maximum number of items allowed in the processing queue.\n\nn_workers : int, default=16\n The number of worker threads to use for processing.\n\nraise_on_failure : bool, default=False\n A flag indicating whether to raise an exception on processing failure.\n\npptx_extraction_config: Optional[PPTXConfigSchema], default=None\n Configuration schema for the PPTX extraction stage.\n\npdfium_config : Optional[PDFiumConfigSchema], default=None\n Configuration for the PDFium service endpoints.", "properties": { "max_queue_size": { "default": 1, "title": "Max Queue Size", "type": "integer" }, "n_workers": { "default": 16, "title": "N Workers", "type": "integer" }, "raise_on_failure": { "default": false, "title": "Raise On Failure", "type": "boolean" }, "pptx_extraction_config": { "anyOf": [ { "$ref": "#/$defs/PPTXConfigSchema" }, { "type": "null" } ], "default": null }, "pdfium_config": { "anyOf": [ { "$ref": "#/$defs/PDFiumConfigSchema" }, { "type": "null" } ], "default": null } }, "title": "PPTXExtractorSchema", "type": "object" }, "TableExtractorConfigSchema": { "additionalProperties": false, "description": "Configuration schema for the table extraction stage settings.\n\nParameters\n----------\nauth_token : Optional[str], default=None\n Authentication token required for secure services.\n\nocr_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)\n A tuple containing the gRPC and HTTP services for the ocr endpoint.\n Either the gRPC or HTTP service can be empty, but not both.\n\nMethods\n-------\nvalidate_endpoints(values)\n Validates that at least one of the gRPC or HTTP services is provided for the yolox 
endpoint.\n\nRaises\n------\nValueError\n If both gRPC and HTTP services are empty for the yolox endpoint.\n\nConfig\n------\nextra : str\n Pydantic config option to forbid extra fields.", "properties": { "auth_token": { "anyOf": [ { "type": "string" }, { "type": "null" } ], "default": null, "title": "Auth Token" }, "yolox_endpoints": { "default": [ null, null ], "maxItems": 2, "minItems": 2, "prefixItems": [ { "anyOf": [ { "type": "string" }, { "type": "null" } ] }, { "anyOf": [ { "type": "string" }, { "type": "null" } ] } ], "title": "Yolox Endpoints", "type": "array" }, "yolox_infer_protocol": { "default": "", "title": "Yolox Infer Protocol", "type": "string" }, "ocr_endpoints": { "default": [ null, null ], "maxItems": 2, "minItems": 2, "prefixItems": [ { "anyOf": [ { "type": "string" }, { "type": "null" } ] }, { "anyOf": [ { "type": "string" }, { "type": "null" } ] } ], "title": "Ocr Endpoints", "type": "array" }, "ocr_infer_protocol": { "default": "", "title": "Ocr Infer Protocol", "type": "string" }, "nim_batch_size": { "default": 2, "title": "Nim Batch Size", "type": "integer" }, "workers_per_progress_engine": { "default": 5, "title": "Workers Per Progress Engine", "type": "integer" } }, "title": "TableExtractorConfigSchema", "type": "object" }, "TableExtractorSchema": { "additionalProperties": false, "description": "Configuration schema for the table extraction processing settings.\n\nParameters\n----------\nmax_queue_size : int, default=1\n The maximum number of items allowed in the processing queue.\n\nn_workers : int, default=2\n The number of worker threads to use for processing.\n\nraise_on_failure : bool, default=False\n A flag indicating whether to raise an exception if a failure occurs during table extraction.\n\nendpoint_config : Optional[TableExtractorConfigSchema], default=None\n Configuration for the table extraction stage, including yolox and ocr service endpoints.", "properties": { "max_queue_size": { "default": 1, "title": "Max Queue Size", "type": 
"integer" }, "n_workers": { "default": 2, "title": "N Workers", "type": "integer" }, "raise_on_failure": { "default": false, "title": "Raise On Failure", "type": "boolean" }, "endpoint_config": { "anyOf": [ { "$ref": "#/$defs/TableExtractorConfigSchema" }, { "type": "null" } ], "default": null } }, "title": "TableExtractorSchema", "type": "object" }, "TextEmbeddingSchema": { "additionalProperties": false, "properties": { "api_key": { "default": "", "title": "Api Key", "type": "string" }, "batch_size": { "default": 4, "title": "Batch Size", "type": "integer" }, "embedding_model": { "default": "nvidia/llama-3.2-nv-embedqa-1b-v2", "title": "Embedding Model", "type": "string" }, "embedding_nim_endpoint": { "default": "http://embedding:8000/v1", "title": "Embedding Nim Endpoint", "type": "string" }, "encoding_format": { "default": "float", "title": "Encoding Format", "type": "string" }, "httpx_log_level": { "$ref": "#/$defs/LogLevel", "default": "WARNING" }, "input_type": { "default": "passage", "title": "Input Type", "type": "string" }, "raise_on_failure": { "default": false, "title": "Raise On Failure", "type": "boolean" }, "truncate": { "default": "END", "title": "Truncate", "type": "string" }, "text_elements_modality": { "default": "text", "title": "Text Elements Modality", "type": "string" }, "image_elements_modality": { "default": "text", "title": "Image Elements Modality", "type": "string" }, "structured_elements_modality": { "default": "text", "title": "Structured Elements Modality", "type": "string" }, "audio_elements_modality": { "default": "text", "title": "Audio Elements Modality", "type": "string" }, "custom_content_field": { "anyOf": [ { "type": "string" }, { "type": "null" } ], "default": null, "title": "Custom Content Field" }, "result_target_field": { "anyOf": [ { "type": "string" }, { "type": "null" } ], "default": null, "title": "Result Target Field" }, "dimensions": { "anyOf": [ { "type": "integer" }, { "type": "null" } ], "default": null, "title": 
"Dimensions" } }, "title": "TextEmbeddingSchema", "type": "object" }, "TextSplitterSchema": { "additionalProperties": false, "properties": { "tokenizer": { "anyOf": [ { "type": "string" }, { "type": "null" } ], "default": null, "title": "Tokenizer" }, "chunk_size": { "default": 1024, "exclusiveMinimum": 0, "title": "Chunk Size", "type": "integer" }, "chunk_overlap": { "default": 150, "minimum": 0, "title": "Chunk Overlap", "type": "integer" }, "raise_on_failure": { "default": false, "title": "Raise On Failure", "type": "boolean" } }, "title": "TextSplitterSchema", "type": "object" }, "VdbTaskSinkSchema": { "additionalProperties": false, "properties": { "recreate": { "default": false, "title": "Recreate", "type": "boolean" }, "service": { "default": "milvus", "title": "Service", "type": "string" }, "is_service_serialized": { "default": false, "title": "Is Service Serialized", "type": "boolean" }, "default_resource_name": { "default": "nv_ingest_collection", "title": "Default Resource Name", "type": "string" }, "resource_schemas": { "additionalProperties": true, "default": { "nv_ingest_collection": { "index_conf": { "field_name": "vector", "index_type": "GPU_CAGRA", "metric_type": "L2", "params": { "build_algo": "NN_DESCENT", "graph_degree": 64, "intermediate_graph_degree": 128 } }, "schema_conf": { "description": "NV-INGEST collection schema", "enable_dynamic_field": true, "schema_fields": [ { "auto_id": true, "description": "Primary key for the collection", "is_primary": true, "name": "pk", "type": 5 }, { "description": "Extracted content", "name": "text", "params": { "max_length": 65535 }, "type": 21 }, { "description": "Embedding vectors", "name": "vector", "params": { "dim": 1024 }, "type": 101 }, { "description": "Source document and raw data extracted content", "name": "source", "type": 23 }, { "description": "Content metadata", "name": "content_metadata", "type": 23 } ] } } }, "title": "Resource Schemas", "type": "object" }, "resource_kwargs": { 
"additionalProperties": true, "title": "Resource Kwargs", "type": "object" }, "service_kwargs": { "additionalProperties": true, "default": {}, "title": "Service Kwargs", "type": "object" }, "batch_size": { "default": 5120, "title": "Batch Size", "type": "integer" }, "write_time_interval": { "default": 1.0, "title": "Write Time Interval", "type": "number" }, "retry_interval": { "default": 60.0, "title": "Retry Interval", "type": "number" }, "raise_on_failure": { "default": false, "title": "Raise On Failure", "type": "boolean" }, "progress_engines": { "default": 1, "minimum": 1, "title": "Progress Engines", "type": "integer" } }, "title": "VdbTaskSinkSchema", "type": "object" } }, "additionalProperties": false }
- Config:
extra: str = forbid
- Fields:
job_counter_module (nv_ingest.framework.schemas.framework_job_counter_schema.JobCounterSchema)
otel_meter_module (nv_ingest.framework.schemas.framework_otel_meter_schema.OpenTelemetryMeterSchema)
pdf_extractor_module (nv_ingest_api.internal.schemas.extract.extract_pdf_schema.PDFExtractorSchema)
vdb_task_sink (nv_ingest.framework.schemas.framework_vdb_task_sink_schema.VdbTaskSinkSchema)
- field audio_extractor_schema: AudioExtractorSchema = AudioExtractorSchema(max_queue_size=1, n_workers=16, raise_on_failure=False, audio_extraction_config=None)#
- field chart_extractor_module: ChartExtractorSchema = ChartExtractorSchema(max_queue_size=1, n_workers=2, raise_on_failure=False, endpoint_config=None)#
- field embed_extractions_module: TextEmbeddingSchema = TextEmbeddingSchema(batch_size=4, embedding_model='nvidia/llama-3.2-nv-embedqa-1b-v2', embedding_nim_endpoint='http://embedding:8000/v1', encoding_format='float', httpx_log_level=<LogLevel.WARNING: 'WARNING'>, input_type='passage', raise_on_failure=False, truncate='END', text_elements_modality='text', image_elements_modality='text', structured_elements_modality='text', audio_elements_modality='text', custom_content_field=None, result_target_field=None, dimensions=None)#
- field embedding_storage_module: EmbeddingStorageSchema = EmbeddingStorageSchema(raise_on_failure=False)#
- field image_caption_extraction_module: ImageCaptionExtractionSchema = ImageCaptionExtractionSchema(endpoint_url='https://integrate.api.nvidia.com/v1/chat/completions', prompt='Caption the content of this image:', system_prompt='/no_think', model_name='nvidia/nemotron-nano-12b-v2-vl', raise_on_failure=False)#
- field image_dedup_module: ImageDedupSchema = ImageDedupSchema(raise_on_failure=False)#
- field image_filter_module: ImageFilterSchema = ImageFilterSchema(raise_on_failure=False, cpu_only=False)#
- field image_storage_module: ImageStorageModuleSchema = ImageStorageModuleSchema(structured=True, images=True, storage_uri='s3://nv-ingest/artifacts/store/images', storage_options={}, public_base_url=None, raise_on_failure=False)#
- field infographic_extractor_module: InfographicExtractorSchema = InfographicExtractorSchema(max_queue_size=1, n_workers=2, raise_on_failure=False, endpoint_config=None)#
- field job_counter_module: JobCounterSchema = JobCounterSchema(name='job_counter', raise_on_failure=False)#
- field metadata_injection_module: MetadataInjectorSchema = MetadataInjectorSchema(raise_on_failure=False)#
- field otel_meter_module: OpenTelemetryMeterSchema = OpenTelemetryMeterSchema(broker_client=MessageBrokerClientSchema(host='redis', port=6379, client_type='redis', broker_params={}, connection_timeout=300, max_backoff=300, max_retries=0), otel_endpoint='localhost:4317', raise_on_failure=False)#
- field otel_tracer_module: OpenTelemetryTracerSchema = OpenTelemetryTracerSchema(otel_endpoint='localhost:4317', raise_on_failure=False)#
- field pdf_extractor_module: PDFExtractorSchema = PDFExtractorSchema(max_queue_size=1, n_workers=16, raise_on_failure=False, pdfium_config=None, nemotron_parse_config=None)#
- field pptx_extractor_module: PPTXExtractorSchema = PPTXExtractorSchema(max_queue_size=1, n_workers=16, raise_on_failure=False, pptx_extraction_config=None, pdfium_config=None)#
- field redis_task_sink: MessageBrokerTaskSinkSchema = MessageBrokerTaskSinkSchema(broker_client=MessageBrokerClientSchema(host='redis', port=6379, client_type='redis', broker_params={}, connection_timeout=300, max_backoff=300, max_retries=0), raise_on_failure=False, progress_engines=6)#
- field redis_task_source: MessageBrokerTaskSourceSchema = MessageBrokerTaskSourceSchema(broker_client=MessageBrokerClientSchema(host='redis', port=6379, client_type='redis', broker_params={}, connection_timeout=300, max_backoff=300, max_retries=0), task_queue='ingest_task_queue', raise_on_failure=False, progress_engines=6)#
- field table_extractor_module: TableExtractorSchema = TableExtractorSchema(max_queue_size=1, n_workers=2, raise_on_failure=False, endpoint_config=None)#
- field text_splitter_module: TextSplitterSchema = TextSplitterSchema(tokenizer=None, chunk_size=1024, chunk_overlap=150, raise_on_failure=False)#
- field vdb_task_sink: VdbTaskSinkSchema = VdbTaskSinkSchema(recreate=False, service='milvus', is_service_serialized=False, default_resource_name='nv_ingest_collection', resource_schemas={'nv_ingest_collection': {'index_conf': {'field_name': 'vector', 'metric_type': 'L2', 'index_type': 'GPU_CAGRA', 'params': {'intermediate_graph_degree': 128, 'graph_degree': 64, 'build_algo': 'NN_DESCENT'}}, 'schema_conf': {'enable_dynamic_field': True, 'schema_fields': [{'name': 'pk', 'description': 'Primary key for the collection', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': True}, {'name': 'text', 'description': 'Extracted content', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 65535}}, {'name': 'vector', 'description': 'Embedding vectors', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 1024}}, {'name': 'source', 'description': 'Source document and raw data extracted content', 'type': <DataType.JSON: 23>}, {'name': 'content_metadata', 'description': 'Content metadata', 'type': <DataType.JSON: 23>}], 'description': 'NV-INGEST collection schema'}}}, resource_kwargs={}, service_kwargs={}, batch_size=5120, write_time_interval=1.0, retry_interval=60.0, raise_on_failure=False, progress_engines=1)#
nv_ingest.framework.schemas.framework_job_counter_schema module#
- pydantic model nv_ingest.framework.schemas.framework_job_counter_schema.JobCounterSchema[source]#
Bases:
BaseModel
Show JSON schema
{ "title": "JobCounterSchema", "type": "object", "properties": { "name": { "default": "job_counter", "title": "Name", "type": "string" }, "raise_on_failure": { "default": false, "title": "Raise On Failure", "type": "boolean" } }, "additionalProperties": false }
- Config:
extra: str = forbid
- Fields:
- field name: str = 'job_counter'#
- field raise_on_failure: bool = False#
nv_ingest.framework.schemas.framework_message_broker_sink_schema module#
- pydantic model nv_ingest.framework.schemas.framework_message_broker_sink_schema.MessageBrokerTaskSinkSchema[source]#
Bases:
BaseModel
Show JSON schema
{ "title": "MessageBrokerTaskSinkSchema", "type": "object", "properties": { "broker_client": { "$ref": "#/$defs/MessageBrokerClientSchema", "default": { "host": "redis", "port": 6379, "client_type": "redis", "broker_params": {}, "connection_timeout": 300, "max_backoff": 300, "max_retries": 0 } }, "raise_on_failure": { "default": false, "title": "Raise On Failure", "type": "boolean" }, "progress_engines": { "default": 6, "minimum": 1, "title": "Progress Engines", "type": "integer" } }, "$defs": { "MessageBrokerClientSchema": { "description": "Configuration schema for message broker client connections.\nSupports Redis or simple in-memory clients.", "properties": { "host": { "default": "redis", "description": "Hostname of the broker service.", "title": "Host", "type": "string" }, "port": { "default": 6379, "description": "Port to connect to. Must be between 1 and 65535.", "exclusiveMaximum": 65536, "exclusiveMinimum": 0, "title": "Port", "type": "integer" }, "client_type": { "default": "redis", "description": "Type of broker client. Supported values: 'redis', 'simple'.", "enum": [ "redis", "simple" ], "title": "Client Type", "type": "string" }, "broker_params": { "anyOf": [ { "additionalProperties": true, "type": "object" }, { "type": "null" } ], "description": "Optional parameters passed to the broker client.", "title": "Broker Params" }, "connection_timeout": { "default": 300, "description": "Connection timeout in seconds. Must be >= 0.", "minimum": 0, "title": "Connection Timeout", "type": "integer" }, "max_backoff": { "default": 300, "description": "Maximum backoff time in seconds. Must be >= 0.", "minimum": 0, "title": "Max Backoff", "type": "integer" }, "max_retries": { "default": 0, "description": "Maximum number of retries. Must be >= 0.", "minimum": 0, "title": "Max Retries", "type": "integer" } }, "title": "MessageBrokerClientSchema", "type": "object" } } }
- Fields:
- field broker_client: MessageBrokerClientSchema = MessageBrokerClientSchema(host='redis', port=6379, client_type='redis', broker_params={}, connection_timeout=300, max_backoff=300, max_retries=0)#
- field progress_engines: Annotated[int, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=1)])] = 6#
- Constraints:
ge = 1
- field raise_on_failure: bool = False#
nv_ingest.framework.schemas.framework_message_broker_source_schema module#
- pydantic model nv_ingest.framework.schemas.framework_message_broker_source_schema.MessageBrokerTaskSourceSchema[source]#
Bases:
BaseModel
Show JSON schema
{ "title": "MessageBrokerTaskSourceSchema", "type": "object", "properties": { "broker_client": { "$ref": "#/$defs/MessageBrokerClientSchema", "default": { "host": "redis", "port": 6379, "client_type": "redis", "broker_params": {}, "connection_timeout": 300, "max_backoff": 300, "max_retries": 0 } }, "task_queue": { "default": "ingest_task_queue", "title": "Task Queue", "type": "string" }, "raise_on_failure": { "default": false, "title": "Raise On Failure", "type": "boolean" }, "progress_engines": { "default": 6, "minimum": 1, "title": "Progress Engines", "type": "integer" } }, "$defs": { "MessageBrokerClientSchema": { "description": "Configuration schema for message broker client connections.\nSupports Redis or simple in-memory clients.", "properties": { "host": { "default": "redis", "description": "Hostname of the broker service.", "title": "Host", "type": "string" }, "port": { "default": 6379, "description": "Port to connect to. Must be between 1 and 65535.", "exclusiveMaximum": 65536, "exclusiveMinimum": 0, "title": "Port", "type": "integer" }, "client_type": { "default": "redis", "description": "Type of broker client. Supported values: 'redis', 'simple'.", "enum": [ "redis", "simple" ], "title": "Client Type", "type": "string" }, "broker_params": { "anyOf": [ { "additionalProperties": true, "type": "object" }, { "type": "null" } ], "description": "Optional parameters passed to the broker client.", "title": "Broker Params" }, "connection_timeout": { "default": 300, "description": "Connection timeout in seconds. Must be >= 0.", "minimum": 0, "title": "Connection Timeout", "type": "integer" }, "max_backoff": { "default": 300, "description": "Maximum backoff time in seconds. Must be >= 0.", "minimum": 0, "title": "Max Backoff", "type": "integer" }, "max_retries": { "default": 0, "description": "Maximum number of retries. Must be >= 0.", "minimum": 0, "title": "Max Retries", "type": "integer" } }, "title": "MessageBrokerClientSchema", "type": "object" } } }
- Fields:
- field broker_client: MessageBrokerClientSchema = MessageBrokerClientSchema(host='redis', port=6379, client_type='redis', broker_params={}, connection_timeout=300, max_backoff=300, max_retries=0)#
- field progress_engines: Annotated[int, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=1)])] = 6#
- Constraints:
ge = 1
- field raise_on_failure: bool = False#
- field task_queue: str = 'ingest_task_queue'#
nv_ingest.framework.schemas.framework_message_wrapper_schema module#
- pydantic model nv_ingest.framework.schemas.framework_message_wrapper_schema.MessageWrapper[source]#
Bases:
BaseModel
Show JSON schema
{ "title": "MessageWrapper", "type": "object", "properties": { "payload": { "title": "Payload", "type": "string" } }, "required": [ "payload" ] }
- Fields:
- field payload: str [Required]#
nv_ingest.framework.schemas.framework_metadata_injector_schema module#
- pydantic model nv_ingest.framework.schemas.framework_metadata_injector_schema.MetadataInjectorSchema[source]#
Bases:
BaseModel
Show JSON schema
{ "title": "MetadataInjectorSchema", "type": "object", "properties": { "raise_on_failure": { "default": false, "title": "Raise On Failure", "type": "boolean" } }, "additionalProperties": false }
- Config:
extra: str = forbid
- Fields:
- field raise_on_failure: bool = False#
nv_ingest.framework.schemas.framework_otel_meter_schema module#
- pydantic model nv_ingest.framework.schemas.framework_otel_meter_schema.OpenTelemetryMeterSchema[source]#
Bases:
BaseModel
Show JSON schema
{ "title": "OpenTelemetryMeterSchema", "type": "object", "properties": { "broker_client": { "$ref": "#/$defs/MessageBrokerClientSchema", "default": { "host": "redis", "port": 6379, "client_type": "redis", "broker_params": {}, "connection_timeout": 300, "max_backoff": 300, "max_retries": 0 } }, "otel_endpoint": { "default": "localhost:4317", "title": "Otel Endpoint", "type": "string" }, "raise_on_failure": { "default": false, "title": "Raise On Failure", "type": "boolean" } }, "$defs": { "MessageBrokerClientSchema": { "description": "Configuration schema for message broker client connections.\nSupports Redis or simple in-memory clients.", "properties": { "host": { "default": "redis", "description": "Hostname of the broker service.", "title": "Host", "type": "string" }, "port": { "default": 6379, "description": "Port to connect to. Must be between 1 and 65535.", "exclusiveMaximum": 65536, "exclusiveMinimum": 0, "title": "Port", "type": "integer" }, "client_type": { "default": "redis", "description": "Type of broker client. Supported values: 'redis', 'simple'.", "enum": [ "redis", "simple" ], "title": "Client Type", "type": "string" }, "broker_params": { "anyOf": [ { "additionalProperties": true, "type": "object" }, { "type": "null" } ], "description": "Optional parameters passed to the broker client.", "title": "Broker Params" }, "connection_timeout": { "default": 300, "description": "Connection timeout in seconds. Must be >= 0.", "minimum": 0, "title": "Connection Timeout", "type": "integer" }, "max_backoff": { "default": 300, "description": "Maximum backoff time in seconds. Must be >= 0.", "minimum": 0, "title": "Max Backoff", "type": "integer" }, "max_retries": { "default": 0, "description": "Maximum number of retries. Must be >= 0.", "minimum": 0, "title": "Max Retries", "type": "integer" } }, "title": "MessageBrokerClientSchema", "type": "object" } }, "additionalProperties": false }
- Config:
extra: str = forbid
- Fields:
- field broker_client: MessageBrokerClientSchema = MessageBrokerClientSchema(host='redis', port=6379, client_type='redis', broker_params={}, connection_timeout=300, max_backoff=300, max_retries=0)#
- field otel_endpoint: str = 'localhost:4317'#
- field raise_on_failure: bool = False#
nv_ingest.framework.schemas.framework_otel_tracer_schema module#
- pydantic model nv_ingest.framework.schemas.framework_otel_tracer_schema.OpenTelemetryTracerSchema[source]#
Bases:
BaseModel
Show JSON schema
{ "title": "OpenTelemetryTracerSchema", "type": "object", "properties": { "otel_endpoint": { "default": "localhost:4317", "title": "Otel Endpoint", "type": "string" }, "raise_on_failure": { "default": false, "title": "Raise On Failure", "type": "boolean" } }, "additionalProperties": false }
- Config:
extra: str = forbid
- Fields:
- field otel_endpoint: str = 'localhost:4317'#
- field raise_on_failure: bool = False#
nv_ingest.framework.schemas.framework_processing_job_schema module#
- class nv_ingest.framework.schemas.framework_processing_job_schema.ConversionStatus(*values)[source]#
Bases:
str, Enum
- FAILED = 'failed'#
- IN_PROGRESS = 'in_progress'#
- SUCCESS = 'success'#
- pydantic model nv_ingest.framework.schemas.framework_processing_job_schema.ProcessingJob[source]#
Bases:
BaseModel
Show JSON schema
{ "title": "ProcessingJob", "type": "object", "properties": { "submitted_job_id": { "title": "Submitted Job Id", "type": "string" }, "filename": { "title": "Filename", "type": "string" }, "raw_result": { "default": "", "title": "Raw Result", "type": "string" }, "content": { "default": "", "title": "Content", "type": "string" }, "status": { "$ref": "#/$defs/ConversionStatus" }, "error": { "anyOf": [ { "type": "string" }, { "type": "null" } ], "default": null, "title": "Error" } }, "$defs": { "ConversionStatus": { "enum": [ "in_progress", "success", "failed", "{'extra': 'forbid'}" ], "title": "ConversionStatus", "type": "string" } }, "additionalProperties": false, "required": [ "submitted_job_id", "filename", "status" ] }
- Config:
extra: str = forbid
- Fields:
- field content: str = ''#
- field error: str | None = None#
- field filename: str [Required]#
- field raw_result: str = ''#
- field status: ConversionStatus [Required]#
- field submitted_job_id: str [Required]#
nv_ingest.framework.schemas.framework_task_injection_schema module#
- pydantic model nv_ingest.framework.schemas.framework_task_injection_schema.TaskInjectionSchema[source]#
Bases:
BaseModel
Show JSON schema
{ "title": "TaskInjectionSchema", "type": "object", "properties": { "raise_on_failure": { "default": false, "title": "Raise On Failure", "type": "boolean" } }, "additionalProperties": false }
- Config:
extra: str = forbid
- Fields:
- field raise_on_failure: bool = False#
nv_ingest.framework.schemas.framework_vdb_task_sink_schema module#
- pydantic model nv_ingest.framework.schemas.framework_vdb_task_sink_schema.VdbTaskSinkSchema[source]#
Bases:
BaseModel
Show JSON schema
{ "title": "VdbTaskSinkSchema", "type": "object", "properties": { "recreate": { "default": false, "title": "Recreate", "type": "boolean" }, "service": { "default": "milvus", "title": "Service", "type": "string" }, "is_service_serialized": { "default": false, "title": "Is Service Serialized", "type": "boolean" }, "default_resource_name": { "default": "nv_ingest_collection", "title": "Default Resource Name", "type": "string" }, "resource_schemas": { "additionalProperties": true, "default": { "nv_ingest_collection": { "index_conf": { "field_name": "vector", "index_type": "GPU_CAGRA", "metric_type": "L2", "params": { "build_algo": "NN_DESCENT", "graph_degree": 64, "intermediate_graph_degree": 128 } }, "schema_conf": { "description": "NV-INGEST collection schema", "enable_dynamic_field": true, "schema_fields": [ { "auto_id": true, "description": "Primary key for the collection", "is_primary": true, "name": "pk", "type": 5 }, { "description": "Extracted content", "name": "text", "params": { "max_length": 65535 }, "type": 21 }, { "description": "Embedding vectors", "name": "vector", "params": { "dim": 1024 }, "type": 101 }, { "description": "Source document and raw data extracted content", "name": "source", "type": 23 }, { "description": "Content metadata", "name": "content_metadata", "type": 23 } ] } } }, "title": "Resource Schemas", "type": "object" }, "resource_kwargs": { "additionalProperties": true, "title": "Resource Kwargs", "type": "object" }, "service_kwargs": { "additionalProperties": true, "default": {}, "title": "Service Kwargs", "type": "object" }, "batch_size": { "default": 5120, "title": "Batch Size", "type": "integer" }, "write_time_interval": { "default": 1.0, "title": "Write Time Interval", "type": "number" }, "retry_interval": { "default": 60.0, "title": "Retry Interval", "type": "number" }, "raise_on_failure": { "default": false, "title": "Raise On Failure", "type": "boolean" }, "progress_engines": { "default": 1, "minimum": 1, "title": "Progress 
Engines", "type": "integer" } }, "additionalProperties": false }
- Config:
extra: str = forbid
- Fields:
- Validators:
- field batch_size: int = 5120#
- field default_resource_name: str = 'nv_ingest_collection'#
- Validated by: validate_resource_name
- field is_service_serialized: bool = False#
- field progress_engines: Annotated[int, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=1)])] = 1#
- Constraints:
ge = 1
- field raise_on_failure: bool = False#
- field recreate: bool = False#
- field resource_kwargs: dict [Optional]#
- field resource_schemas: dict = {'nv_ingest_collection': {'index_conf': {'field_name': 'vector', 'index_type': 'GPU_CAGRA', 'metric_type': 'L2', 'params': {'build_algo': 'NN_DESCENT', 'graph_degree': 64, 'intermediate_graph_degree': 128}}, 'schema_conf': {'description': 'NV-INGEST collection schema', 'enable_dynamic_field': True, 'schema_fields': [{'auto_id': True, 'description': 'Primary key for the collection', 'is_primary': True, 'name': 'pk', 'type': DataType.INT64}, {'description': 'Extracted content', 'name': 'text', 'params': {'max_length': 65535}, 'type': DataType.VARCHAR}, {'description': 'Embedding vectors', 'name': 'vector', 'params': {'dim': 1024}, 'type': DataType.FLOAT_VECTOR}, {'description': 'Source document and raw data extracted content', 'name': 'source', 'type': DataType.JSON}, {'description': 'Content metadata', 'name': 'content_metadata', 'type': DataType.JSON}]}}}#
- field retry_interval: float = 60.0#
- field service: str = 'milvus'#
- Validated by:
- field service_kwargs: dict = {}#
- field write_time_interval: float = 1.0#
- validator validate_resource_name » default_resource_name[source]#
- nv_ingest.framework.schemas.framework_vdb_task_sink_schema.build_default_milvus_config(embedding_size: int = 1024) → Dict[str, Any][source]#
Builds the configuration for Milvus.
This function creates a dictionary configuration for a Milvus collection. It includes the index configuration and the schema configuration, with fields such as pk, text, vector, source, and content_metadata.
- Parameters:
embedding_size (int) – The size of the embedding vector.
- Returns:
A dictionary containing the configuration settings for Milvus.
- Return type:
Dict[str, Any]