nv_ingest.framework.schemas package#

Submodules#

nv_ingest.framework.schemas.framework_ingest_config_schema module#

pydantic model nv_ingest.framework.schemas.framework_ingest_config_schema.PipelineConfigSchema[source]#

Bases: BaseModel

Show JSON schema
{
   "title": "PipelineConfigSchema",
   "type": "object",
   "properties": {
      "audio_extractor_schema": {
         "$ref": "#/$defs/AudioExtractorSchema",
         "default": {
            "max_queue_size": 1,
            "n_workers": 16,
            "raise_on_failure": false,
            "audio_extraction_config": null
         }
      },
      "chart_extractor_module": {
         "$ref": "#/$defs/ChartExtractorSchema",
         "default": {
            "max_queue_size": 1,
            "n_workers": 2,
            "raise_on_failure": false,
            "endpoint_config": null
         }
      },
      "text_splitter_module": {
         "$ref": "#/$defs/TextSplitterSchema",
         "default": {
            "tokenizer": null,
            "chunk_size": 1024,
            "chunk_overlap": 150,
            "raise_on_failure": false
         }
      },
      "embedding_storage_module": {
         "$ref": "#/$defs/EmbeddingStorageSchema",
         "default": {
            "raise_on_failure": false
         }
      },
      "embed_extractions_module": {
         "$ref": "#/$defs/TextEmbeddingSchema",
         "default": {
            "api_key": "",
            "batch_size": 4,
            "embedding_model": "nvidia/llama-3.2-nv-embedqa-1b-v2",
            "embedding_nim_endpoint": "http://embedding:8000/v1",
            "encoding_format": "float",
            "httpx_log_level": "WARNING",
            "input_type": "passage",
            "raise_on_failure": false,
            "truncate": "END",
            "text_elements_modality": "text",
            "image_elements_modality": "text",
            "structured_elements_modality": "text",
            "audio_elements_modality": "text",
            "custom_content_field": null,
            "result_target_field": null,
            "dimensions": null
         }
      },
      "image_caption_extraction_module": {
         "$ref": "#/$defs/ImageCaptionExtractionSchema",
         "default": {
            "api_key": "",
            "endpoint_url": "https://integrate.api.nvidia.com/v1/chat/completions",
            "prompt": "Caption the content of this image:",
            "system_prompt": "/no_think",
            "model_name": "nvidia/nemotron-nano-12b-v2-vl",
            "raise_on_failure": false
         }
      },
      "image_dedup_module": {
         "$ref": "#/$defs/ImageDedupSchema",
         "default": {
            "raise_on_failure": false
         }
      },
      "image_filter_module": {
         "$ref": "#/$defs/ImageFilterSchema",
         "default": {
            "raise_on_failure": false,
            "cpu_only": false
         }
      },
      "image_storage_module": {
         "$ref": "#/$defs/ImageStorageModuleSchema",
         "default": {
            "structured": true,
            "images": true,
            "storage_uri": "s3://nv-ingest/artifacts/store/images",
            "storage_options": {},
            "public_base_url": null,
            "raise_on_failure": false
         }
      },
      "infographic_extractor_module": {
         "$ref": "#/$defs/InfographicExtractorSchema",
         "default": {
            "max_queue_size": 1,
            "n_workers": 2,
            "raise_on_failure": false,
            "endpoint_config": null
         }
      },
      "job_counter_module": {
         "$ref": "#/$defs/JobCounterSchema",
         "default": {
            "name": "job_counter",
            "raise_on_failure": false
         }
      },
      "metadata_injection_module": {
         "$ref": "#/$defs/MetadataInjectorSchema",
         "default": {
            "raise_on_failure": false
         }
      },
      "otel_meter_module": {
         "$ref": "#/$defs/OpenTelemetryMeterSchema",
         "default": {
            "broker_client": {
               "broker_params": {},
               "client_type": "redis",
               "connection_timeout": 300,
               "host": "redis",
               "max_backoff": 300,
               "max_retries": 0,
               "port": 6379
            },
            "otel_endpoint": "localhost:4317",
            "raise_on_failure": false
         }
      },
      "otel_tracer_module": {
         "$ref": "#/$defs/OpenTelemetryTracerSchema",
         "default": {
            "otel_endpoint": "localhost:4317",
            "raise_on_failure": false
         }
      },
      "pdf_extractor_module": {
         "$ref": "#/$defs/PDFExtractorSchema",
         "default": {
            "max_queue_size": 1,
            "n_workers": 16,
            "raise_on_failure": false,
            "pdfium_config": null,
            "nemotron_parse_config": null
         }
      },
      "pptx_extractor_module": {
         "$ref": "#/$defs/PPTXExtractorSchema",
         "default": {
            "max_queue_size": 1,
            "n_workers": 16,
            "raise_on_failure": false,
            "pptx_extraction_config": null,
            "pdfium_config": null
         }
      },
      "redis_task_sink": {
         "$ref": "#/$defs/MessageBrokerTaskSinkSchema",
         "default": {
            "broker_client": {
               "broker_params": {},
               "client_type": "redis",
               "connection_timeout": 300,
               "host": "redis",
               "max_backoff": 300,
               "max_retries": 0,
               "port": 6379
            },
            "raise_on_failure": false,
            "progress_engines": 6
         }
      },
      "redis_task_source": {
         "$ref": "#/$defs/MessageBrokerTaskSourceSchema",
         "default": {
            "broker_client": {
               "broker_params": {},
               "client_type": "redis",
               "connection_timeout": 300,
               "host": "redis",
               "max_backoff": 300,
               "max_retries": 0,
               "port": 6379
            },
            "task_queue": "ingest_task_queue",
            "raise_on_failure": false,
            "progress_engines": 6
         }
      },
      "table_extractor_module": {
         "$ref": "#/$defs/TableExtractorSchema",
         "default": {
            "max_queue_size": 1,
            "n_workers": 2,
            "raise_on_failure": false,
            "endpoint_config": null
         }
      },
      "vdb_task_sink": {
         "$ref": "#/$defs/VdbTaskSinkSchema",
         "default": {
            "recreate": false,
            "service": "milvus",
            "is_service_serialized": false,
            "default_resource_name": "nv_ingest_collection",
            "resource_schemas": {
               "nv_ingest_collection": {
                  "index_conf": {
                     "field_name": "vector",
                     "index_type": "GPU_CAGRA",
                     "metric_type": "L2",
                     "params": {
                        "build_algo": "NN_DESCENT",
                        "graph_degree": 64,
                        "intermediate_graph_degree": 128
                     }
                  },
                  "schema_conf": {
                     "description": "NV-INGEST collection schema",
                     "enable_dynamic_field": true,
                     "schema_fields": [
                        {
                           "auto_id": true,
                           "description": "Primary key for the collection",
                           "is_primary": true,
                           "name": "pk",
                           "type": 5
                        },
                        {
                           "description": "Extracted content",
                           "name": "text",
                           "params": {
                              "max_length": 65535
                           },
                           "type": 21
                        },
                        {
                           "description": "Embedding vectors",
                           "name": "vector",
                           "params": {
                              "dim": 1024
                           },
                           "type": 101
                        },
                        {
                           "description": "Source document and raw data extracted content",
                           "name": "source",
                           "type": 23
                        },
                        {
                           "description": "Content metadata",
                           "name": "content_metadata",
                           "type": 23
                        }
                     ]
                  }
               }
            },
            "resource_kwargs": {},
            "service_kwargs": {},
            "batch_size": 5120,
            "write_time_interval": 1.0,
            "retry_interval": 60.0,
            "raise_on_failure": false,
            "progress_engines": 1
         }
      }
   },
   "$defs": {
      "AudioConfigSchema": {
         "additionalProperties": false,
         "description": "Configuration schema for audio extraction endpoints and options.\n\nParameters\n----------\nauth_token : Optional[str], default=None\n    Authentication token required for secure services.\n\naudio_endpoints : Tuple[str, str]\n    A tuple containing the gRPC and HTTP services for the audio_retriever endpoint.\n    Either the gRPC or HTTP service can be empty, but not both.\n\nMethods\n-------\nvalidate_endpoints(values)\n    Validates that at least one of the gRPC or HTTP services is provided for each endpoint.\n\nRaises\n------\nValueError\n    If both gRPC and HTTP services are empty for any endpoint.\n\nConfig\n------\nextra : str\n    Pydantic config option to forbid extra fields.",
         "properties": {
            "auth_token": {
               "anyOf": [
                  {
                     "type": "string"
                  },
                  {
                     "type": "null"
                  }
               ],
               "default": null,
               "title": "Auth Token"
            },
            "audio_endpoints": {
               "default": [
                  null,
                  null
               ],
               "maxItems": 2,
               "minItems": 2,
               "prefixItems": [
                  {
                     "anyOf": [
                        {
                           "type": "string"
                        },
                        {
                           "type": "null"
                        }
                     ]
                  },
                  {
                     "anyOf": [
                        {
                           "type": "string"
                        },
                        {
                           "type": "null"
                        }
                     ]
                  }
               ],
               "title": "Audio Endpoints",
               "type": "array"
            },
            "audio_infer_protocol": {
               "anyOf": [
                  {
                     "type": "string"
                  },
                  {
                     "type": "null"
                  }
               ],
               "default": null,
               "title": "Audio Infer Protocol"
            },
            "function_id": {
               "anyOf": [
                  {
                     "type": "string"
                  },
                  {
                     "type": "null"
                  }
               ],
               "default": null,
               "title": "Function Id"
            },
            "use_ssl": {
               "anyOf": [
                  {
                     "type": "boolean"
                  },
                  {
                     "type": "null"
                  }
               ],
               "default": null,
               "title": "Use Ssl"
            },
            "ssl_cert": {
               "anyOf": [
                  {
                     "type": "string"
                  },
                  {
                     "type": "null"
                  }
               ],
               "default": null,
               "title": "Ssl Cert"
            },
            "segment_audio": {
               "anyOf": [
                  {
                     "type": "boolean"
                  },
                  {
                     "type": "null"
                  }
               ],
               "default": null,
               "title": "Segment Audio"
            }
         },
         "title": "AudioConfigSchema",
         "type": "object"
      },
      "AudioExtractorSchema": {
         "additionalProperties": false,
         "description": "Configuration schema for the PDF extractor settings.\n\nParameters\n----------\nmax_queue_size : int, default=1\n    The maximum number of items allowed in the processing queue.\n\nn_workers : int, default=16\n    The number of worker threads to use for processing.\n\nraise_on_failure : bool, default=False\n    A flag indicating whether to raise an exception on processing failure.\n\naudio_extraction_config: Optional[AudioConfigSchema], default=None\n    Configuration schema for the audio extraction stage.",
         "properties": {
            "max_queue_size": {
               "default": 1,
               "title": "Max Queue Size",
               "type": "integer"
            },
            "n_workers": {
               "default": 16,
               "title": "N Workers",
               "type": "integer"
            },
            "raise_on_failure": {
               "default": false,
               "title": "Raise On Failure",
               "type": "boolean"
            },
            "audio_extraction_config": {
               "anyOf": [
                  {
                     "$ref": "#/$defs/AudioConfigSchema"
                  },
                  {
                     "type": "null"
                  }
               ],
               "default": null
            }
         },
         "title": "AudioExtractorSchema",
         "type": "object"
      },
      "ChartExtractorConfigSchema": {
         "additionalProperties": false,
         "description": "Configuration schema for chart extraction service endpoints and options.\n\nParameters\n----------\nauth_token : Optional[str], default=None\n    Authentication token required for secure services.\n\nyolox_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)\n    A tuple containing the gRPC and HTTP services for the yolox endpoint.\n    Either the gRPC or HTTP service can be empty, but not both.\n\nocr_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)\n    A tuple containing the gRPC and HTTP services for the ocr endpoint.\n    Either the gRPC or HTTP service can be empty, but not both.\n\nMethods\n-------\nvalidate_endpoints(values)\n    Validates that at least one of the gRPC or HTTP services is provided for each endpoint.\n\nRaises\n------\nValueError\n    If both gRPC and HTTP services are empty for any endpoint.\n\nConfig\n------\nextra : str\n    Pydantic config option to forbid extra fields.",
         "properties": {
            "auth_token": {
               "anyOf": [
                  {
                     "type": "string"
                  },
                  {
                     "type": "null"
                  }
               ],
               "default": null,
               "title": "Auth Token"
            },
            "yolox_endpoints": {
               "default": [
                  null,
                  null
               ],
               "maxItems": 2,
               "minItems": 2,
               "prefixItems": [
                  {
                     "anyOf": [
                        {
                           "type": "string"
                        },
                        {
                           "type": "null"
                        }
                     ]
                  },
                  {
                     "anyOf": [
                        {
                           "type": "string"
                        },
                        {
                           "type": "null"
                        }
                     ]
                  }
               ],
               "title": "Yolox Endpoints",
               "type": "array"
            },
            "yolox_infer_protocol": {
               "default": "",
               "title": "Yolox Infer Protocol",
               "type": "string"
            },
            "ocr_endpoints": {
               "default": [
                  null,
                  null
               ],
               "maxItems": 2,
               "minItems": 2,
               "prefixItems": [
                  {
                     "anyOf": [
                        {
                           "type": "string"
                        },
                        {
                           "type": "null"
                        }
                     ]
                  },
                  {
                     "anyOf": [
                        {
                           "type": "string"
                        },
                        {
                           "type": "null"
                        }
                     ]
                  }
               ],
               "title": "Ocr Endpoints",
               "type": "array"
            },
            "ocr_infer_protocol": {
               "default": "",
               "title": "Ocr Infer Protocol",
               "type": "string"
            },
            "nim_batch_size": {
               "default": 2,
               "title": "Nim Batch Size",
               "type": "integer"
            },
            "workers_per_progress_engine": {
               "default": 5,
               "title": "Workers Per Progress Engine",
               "type": "integer"
            }
         },
         "title": "ChartExtractorConfigSchema",
         "type": "object"
      },
      "ChartExtractorSchema": {
         "additionalProperties": false,
         "description": "Configuration schema for chart extraction processing settings.\n\nParameters\n----------\nmax_queue_size : int, default=1\n    The maximum number of items allowed in the processing queue.\n\nn_workers : int, default=2\n    The number of worker threads to use for processing.\n\nraise_on_failure : bool, default=False\n    A flag indicating whether to raise an exception if a failure occurs during chart extraction.\n\nextraction_config: Optional[ChartExtractorConfigSchema], default=None\n    Configuration for the chart extraction stage, including yolox and ocr service endpoints.",
         "properties": {
            "max_queue_size": {
               "default": 1,
               "title": "Max Queue Size",
               "type": "integer"
            },
            "n_workers": {
               "default": 2,
               "title": "N Workers",
               "type": "integer"
            },
            "raise_on_failure": {
               "default": false,
               "title": "Raise On Failure",
               "type": "boolean"
            },
            "endpoint_config": {
               "anyOf": [
                  {
                     "$ref": "#/$defs/ChartExtractorConfigSchema"
                  },
                  {
                     "type": "null"
                  }
               ],
               "default": null
            }
         },
         "title": "ChartExtractorSchema",
         "type": "object"
      },
      "EmbeddingStorageSchema": {
         "additionalProperties": false,
         "properties": {
            "raise_on_failure": {
               "default": false,
               "title": "Raise On Failure",
               "type": "boolean"
            }
         },
         "title": "EmbeddingStorageSchema",
         "type": "object"
      },
      "ImageCaptionExtractionSchema": {
         "additionalProperties": false,
         "properties": {
            "api_key": {
               "default": "",
               "title": "Api Key",
               "type": "string"
            },
            "endpoint_url": {
               "default": "https://integrate.api.nvidia.com/v1/chat/completions",
               "title": "Endpoint Url",
               "type": "string"
            },
            "prompt": {
               "default": "Caption the content of this image:",
               "title": "Prompt",
               "type": "string"
            },
            "system_prompt": {
               "default": "/no_think",
               "title": "System Prompt",
               "type": "string"
            },
            "model_name": {
               "default": "nvidia/nemotron-nano-12b-v2-vl",
               "title": "Model Name",
               "type": "string"
            },
            "raise_on_failure": {
               "default": false,
               "title": "Raise On Failure",
               "type": "boolean"
            }
         },
         "title": "ImageCaptionExtractionSchema",
         "type": "object"
      },
      "ImageDedupSchema": {
         "additionalProperties": false,
         "properties": {
            "raise_on_failure": {
               "default": false,
               "title": "Raise On Failure",
               "type": "boolean"
            }
         },
         "title": "ImageDedupSchema",
         "type": "object"
      },
      "ImageFilterSchema": {
         "additionalProperties": false,
         "properties": {
            "raise_on_failure": {
               "default": false,
               "title": "Raise On Failure",
               "type": "boolean"
            },
            "cpu_only": {
               "default": false,
               "title": "Cpu Only",
               "type": "boolean"
            }
         },
         "title": "ImageFilterSchema",
         "type": "object"
      },
      "ImageStorageModuleSchema": {
         "additionalProperties": false,
         "properties": {
            "structured": {
               "default": true,
               "title": "Structured",
               "type": "boolean"
            },
            "images": {
               "default": true,
               "title": "Images",
               "type": "boolean"
            },
            "storage_uri": {
               "title": "Storage Uri",
               "type": "string"
            },
            "storage_options": {
               "additionalProperties": true,
               "title": "Storage Options",
               "type": "object"
            },
            "public_base_url": {
               "anyOf": [
                  {
                     "type": "string"
                  },
                  {
                     "type": "null"
                  }
               ],
               "default": null,
               "title": "Public Base Url"
            },
            "raise_on_failure": {
               "default": false,
               "title": "Raise On Failure",
               "type": "boolean"
            }
         },
         "title": "ImageStorageModuleSchema",
         "type": "object"
      },
      "InfographicExtractorConfigSchema": {
         "additionalProperties": false,
         "description": "Configuration schema for infographic extraction service endpoints and options.\n\nParameters\n----------\nauth_token : Optional[str], default=None\n    Authentication token required for secure services.\n\nocr_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)\n    A tuple containing the gRPC and HTTP services for the ocr endpoint.\n    Either the gRPC or HTTP service can be empty, but not both.\n\nMethods\n-------\nvalidate_endpoints(values)\n    Validates that at least one of the gRPC or HTTP services is provided for each endpoint.\n\nRaises\n------\nValueError\n    If both gRPC and HTTP services are empty for any endpoint.\n\nConfig\n------\nextra : str\n    Pydantic config option to forbid extra fields.",
         "properties": {
            "auth_token": {
               "anyOf": [
                  {
                     "type": "string"
                  },
                  {
                     "type": "null"
                  }
               ],
               "default": null,
               "title": "Auth Token"
            },
            "ocr_endpoints": {
               "default": [
                  null,
                  null
               ],
               "maxItems": 2,
               "minItems": 2,
               "prefixItems": [
                  {
                     "anyOf": [
                        {
                           "type": "string"
                        },
                        {
                           "type": "null"
                        }
                     ]
                  },
                  {
                     "anyOf": [
                        {
                           "type": "string"
                        },
                        {
                           "type": "null"
                        }
                     ]
                  }
               ],
               "title": "Ocr Endpoints",
               "type": "array"
            },
            "ocr_infer_protocol": {
               "default": "",
               "title": "Ocr Infer Protocol",
               "type": "string"
            },
            "nim_batch_size": {
               "default": 2,
               "title": "Nim Batch Size",
               "type": "integer"
            },
            "workers_per_progress_engine": {
               "default": 5,
               "title": "Workers Per Progress Engine",
               "type": "integer"
            }
         },
         "title": "InfographicExtractorConfigSchema",
         "type": "object"
      },
      "InfographicExtractorSchema": {
         "additionalProperties": false,
         "description": "Configuration schema for infographic extraction processing settings.\n\nParameters\n----------\nmax_queue_size : int, default=1\n    The maximum number of items allowed in the processing queue.\n\nn_workers : int, default=2\n    The number of worker threads to use for processing.\n\nraise_on_failure : bool, default=False\n    A flag indicating whether to raise an exception if a failure occurs during infographic extraction.\n\nstage_config : Optional[InfographicExtractorConfigSchema], default=None\n    Configuration for the infographic extraction stage, including yolox and ocr service endpoints.",
         "properties": {
            "max_queue_size": {
               "default": 1,
               "title": "Max Queue Size",
               "type": "integer"
            },
            "n_workers": {
               "default": 2,
               "title": "N Workers",
               "type": "integer"
            },
            "raise_on_failure": {
               "default": false,
               "title": "Raise On Failure",
               "type": "boolean"
            },
            "endpoint_config": {
               "anyOf": [
                  {
                     "$ref": "#/$defs/InfographicExtractorConfigSchema"
                  },
                  {
                     "type": "null"
                  }
               ],
               "default": null
            }
         },
         "title": "InfographicExtractorSchema",
         "type": "object"
      },
      "JobCounterSchema": {
         "additionalProperties": false,
         "properties": {
            "name": {
               "default": "job_counter",
               "title": "Name",
               "type": "string"
            },
            "raise_on_failure": {
               "default": false,
               "title": "Raise On Failure",
               "type": "boolean"
            }
         },
         "title": "JobCounterSchema",
         "type": "object"
      },
      "LogLevel": {
         "enum": [
            "DEFAULT",
            "DEBUG",
            "INFO",
            "WARNING",
            "ERROR",
            "CRITICAL"
         ],
         "title": "LogLevel",
         "type": "string"
      },
      "MessageBrokerClientSchema": {
         "description": "Configuration schema for message broker client connections.\nSupports Redis or simple in-memory clients.",
         "properties": {
            "host": {
               "default": "redis",
               "description": "Hostname of the broker service.",
               "title": "Host",
               "type": "string"
            },
            "port": {
               "default": 6379,
               "description": "Port to connect to. Must be between 1 and 65535.",
               "exclusiveMaximum": 65536,
               "exclusiveMinimum": 0,
               "title": "Port",
               "type": "integer"
            },
            "client_type": {
               "default": "redis",
               "description": "Type of broker client. Supported values: 'redis', 'simple'.",
               "enum": [
                  "redis",
                  "simple"
               ],
               "title": "Client Type",
               "type": "string"
            },
            "broker_params": {
               "anyOf": [
                  {
                     "additionalProperties": true,
                     "type": "object"
                  },
                  {
                     "type": "null"
                  }
               ],
               "description": "Optional parameters passed to the broker client.",
               "title": "Broker Params"
            },
            "connection_timeout": {
               "default": 300,
               "description": "Connection timeout in seconds. Must be >= 0.",
               "minimum": 0,
               "title": "Connection Timeout",
               "type": "integer"
            },
            "max_backoff": {
               "default": 300,
               "description": "Maximum backoff time in seconds. Must be >= 0.",
               "minimum": 0,
               "title": "Max Backoff",
               "type": "integer"
            },
            "max_retries": {
               "default": 0,
               "description": "Maximum number of retries. Must be >= 0.",
               "minimum": 0,
               "title": "Max Retries",
               "type": "integer"
            }
         },
         "title": "MessageBrokerClientSchema",
         "type": "object"
      },
      "MessageBrokerTaskSinkSchema": {
         "properties": {
            "broker_client": {
               "$ref": "#/$defs/MessageBrokerClientSchema",
               "default": {
                  "host": "redis",
                  "port": 6379,
                  "client_type": "redis",
                  "broker_params": {},
                  "connection_timeout": 300,
                  "max_backoff": 300,
                  "max_retries": 0
               }
            },
            "raise_on_failure": {
               "default": false,
               "title": "Raise On Failure",
               "type": "boolean"
            },
            "progress_engines": {
               "default": 6,
               "minimum": 1,
               "title": "Progress Engines",
               "type": "integer"
            }
         },
         "title": "MessageBrokerTaskSinkSchema",
         "type": "object"
      },
      "MessageBrokerTaskSourceSchema": {
         "properties": {
            "broker_client": {
               "$ref": "#/$defs/MessageBrokerClientSchema",
               "default": {
                  "host": "redis",
                  "port": 6379,
                  "client_type": "redis",
                  "broker_params": {},
                  "connection_timeout": 300,
                  "max_backoff": 300,
                  "max_retries": 0
               }
            },
            "task_queue": {
               "default": "ingest_task_queue",
               "title": "Task Queue",
               "type": "string"
            },
            "raise_on_failure": {
               "default": false,
               "title": "Raise On Failure",
               "type": "boolean"
            },
            "progress_engines": {
               "default": 6,
               "minimum": 1,
               "title": "Progress Engines",
               "type": "integer"
            }
         },
         "title": "MessageBrokerTaskSourceSchema",
         "type": "object"
      },
      "MetadataInjectorSchema": {
         "additionalProperties": false,
         "properties": {
            "raise_on_failure": {
               "default": false,
               "title": "Raise On Failure",
               "type": "boolean"
            }
         },
         "title": "MetadataInjectorSchema",
         "type": "object"
      },
      "NemotronParseConfigSchema": {
         "additionalProperties": false,
         "description": "Configuration schema for Nemotron Parse endpoints and options.\n\nParameters\n----------\nauth_token : Optional[str], default=None\n    Authentication token required for secure services.\n\nnemotron_parse_endpoints : Tuple[str, str]\n    A tuple containing the gRPC and HTTP services for the nemotron_parse endpoint.\n    Either the gRPC or HTTP service can be empty, but not both.\n\nMethods\n-------\nvalidate_endpoints(values)\n    Validates that at least one of the gRPC or HTTP services is provided for each endpoint.\n\nRaises\n------\nValueError\n    If both gRPC and HTTP services are empty for any endpoint.\n\nConfig\n------\nextra : str\n    Pydantic config option to forbid extra fields.",
         "properties": {
            "auth_token": {
               "anyOf": [
                  {
                     "type": "string"
                  },
                  {
                     "type": "null"
                  }
               ],
               "default": null,
               "title": "Auth Token"
            },
            "yolox_endpoints": {
               "default": [
                  null,
                  null
               ],
               "maxItems": 2,
               "minItems": 2,
               "prefixItems": [
                  {
                     "anyOf": [
                        {
                           "type": "string"
                        },
                        {
                           "type": "null"
                        }
                     ]
                  },
                  {
                     "anyOf": [
                        {
                           "type": "string"
                        },
                        {
                           "type": "null"
                        }
                     ]
                  }
               ],
               "title": "Yolox Endpoints",
               "type": "array"
            },
            "yolox_infer_protocol": {
               "default": "",
               "title": "Yolox Infer Protocol",
               "type": "string"
            },
            "nemotron_parse_endpoints": {
               "default": [
                  null,
                  null
               ],
               "maxItems": 2,
               "minItems": 2,
               "prefixItems": [
                  {
                     "anyOf": [
                        {
                           "type": "string"
                        },
                        {
                           "type": "null"
                        }
                     ]
                  },
                  {
                     "anyOf": [
                        {
                           "type": "string"
                        },
                        {
                           "type": "null"
                        }
                     ]
                  }
               ],
               "title": "Nemotron Parse Endpoints",
               "type": "array"
            },
            "nemotron_parse_infer_protocol": {
               "default": "",
               "title": "Nemotron Parse Infer Protocol",
               "type": "string"
            },
            "nemotron_parse_model_name": {
               "anyOf": [
                  {
                     "type": "string"
                  },
                  {
                     "type": "null"
                  }
               ],
               "default": "nvidia/nemotron-parse",
               "title": "Nemotron Parse Model Name"
            },
            "timeout": {
               "default": 300.0,
               "title": "Timeout",
               "type": "number"
            },
            "workers_per_progress_engine": {
               "default": 5,
               "title": "Workers Per Progress Engine",
               "type": "integer"
            }
         },
         "title": "NemotronParseConfigSchema",
         "type": "object"
      },
      "OpenTelemetryMeterSchema": {
         "additionalProperties": false,
         "properties": {
            "broker_client": {
               "$ref": "#/$defs/MessageBrokerClientSchema",
               "default": {
                  "host": "redis",
                  "port": 6379,
                  "client_type": "redis",
                  "broker_params": {},
                  "connection_timeout": 300,
                  "max_backoff": 300,
                  "max_retries": 0
               }
            },
            "otel_endpoint": {
               "default": "localhost:4317",
               "title": "Otel Endpoint",
               "type": "string"
            },
            "raise_on_failure": {
               "default": false,
               "title": "Raise On Failure",
               "type": "boolean"
            }
         },
         "title": "OpenTelemetryMeterSchema",
         "type": "object"
      },
      "OpenTelemetryTracerSchema": {
         "additionalProperties": false,
         "properties": {
            "otel_endpoint": {
               "default": "localhost:4317",
               "title": "Otel Endpoint",
               "type": "string"
            },
            "raise_on_failure": {
               "default": false,
               "title": "Raise On Failure",
               "type": "boolean"
            }
         },
         "title": "OpenTelemetryTracerSchema",
         "type": "object"
      },
      "PDFExtractorSchema": {
         "additionalProperties": false,
         "description": "Configuration schema for the PDF extractor settings.\n\nParameters\n----------\nmax_queue_size : int, default=1\n    The maximum number of items allowed in the processing queue.\n\nn_workers : int, default=16\n    The number of worker threads to use for processing.\n\nraise_on_failure : bool, default=False\n    A flag indicating whether to raise an exception on processing failure.\n\npdfium_config : Optional[PDFiumConfigSchema], default=None\n    Configuration for the PDFium service endpoints.",
         "properties": {
            "max_queue_size": {
               "default": 1,
               "title": "Max Queue Size",
               "type": "integer"
            },
            "n_workers": {
               "default": 16,
               "title": "N Workers",
               "type": "integer"
            },
            "raise_on_failure": {
               "default": false,
               "title": "Raise On Failure",
               "type": "boolean"
            },
            "pdfium_config": {
               "anyOf": [
                  {
                     "$ref": "#/$defs/PDFiumConfigSchema"
                  },
                  {
                     "type": "null"
                  }
               ],
               "default": null
            },
            "nemotron_parse_config": {
               "anyOf": [
                  {
                     "$ref": "#/$defs/NemotronParseConfigSchema"
                  },
                  {
                     "type": "null"
                  }
               ],
               "default": null
            }
         },
         "title": "PDFExtractorSchema",
         "type": "object"
      },
      "PDFiumConfigSchema": {
         "additionalProperties": false,
         "description": "Configuration schema for PDFium endpoints and options.\n\nParameters\n----------\nauth_token : Optional[str], default=None\n    Authentication token required for secure services.\n\nyolox_endpoints : Tuple[str, str]\n    A tuple containing the gRPC and HTTP services for the yolox endpoint.\n    Either the gRPC or HTTP service can be empty, but not both.\n\nMethods\n-------\nvalidate_endpoints(values)\n    Validates that at least one of the gRPC or HTTP services is provided for each endpoint.\n\nRaises\n------\nValueError\n    If both gRPC and HTTP services are empty for any endpoint.\n\nConfig\n------\nextra : str\n    Pydantic config option to forbid extra fields.",
         "properties": {
            "auth_token": {
               "anyOf": [
                  {
                     "type": "string"
                  },
                  {
                     "type": "null"
                  }
               ],
               "default": null,
               "title": "Auth Token"
            },
            "yolox_endpoints": {
               "default": [
                  null,
                  null
               ],
               "maxItems": 2,
               "minItems": 2,
               "prefixItems": [
                  {
                     "anyOf": [
                        {
                           "type": "string"
                        },
                        {
                           "type": "null"
                        }
                     ]
                  },
                  {
                     "anyOf": [
                        {
                           "type": "string"
                        },
                        {
                           "type": "null"
                        }
                     ]
                  }
               ],
               "title": "Yolox Endpoints",
               "type": "array"
            },
            "yolox_infer_protocol": {
               "default": "",
               "title": "Yolox Infer Protocol",
               "type": "string"
            },
            "nim_batch_size": {
               "default": 4,
               "title": "Nim Batch Size",
               "type": "integer"
            },
            "workers_per_progress_engine": {
               "default": 5,
               "title": "Workers Per Progress Engine",
               "type": "integer"
            }
         },
         "title": "PDFiumConfigSchema",
         "type": "object"
      },
      "PPTXConfigSchema": {
         "additionalProperties": false,
         "description": "Configuration schema for PPTX extraction endpoints and options.\n\nParameters\n----------\nauth_token : Optional[str], default=None\n    Authentication token required for secure services.\n\nyolox_endpoints : Tuple[str, str]\n    A tuple containing the gRPC and HTTP services for the yolox endpoint.\n    Either the gRPC or HTTP service can be empty, but not both.\n\nMethods\n-------\nvalidate_endpoints(values)\n    Validates that at least one of the gRPC or HTTP services is provided for each endpoint.\n\nRaises\n------\nValueError\n    If both gRPC and HTTP services are empty for any endpoint.\n\nConfig\n------\nextra : str\n    Pydantic config option to forbid extra fields.",
         "properties": {
            "auth_token": {
               "anyOf": [
                  {
                     "type": "string"
                  },
                  {
                     "type": "null"
                  }
               ],
               "default": null,
               "title": "Auth Token"
            },
            "yolox_endpoints": {
               "default": [
                  null,
                  null
               ],
               "maxItems": 2,
               "minItems": 2,
               "prefixItems": [
                  {
                     "anyOf": [
                        {
                           "type": "string"
                        },
                        {
                           "type": "null"
                        }
                     ]
                  },
                  {
                     "anyOf": [
                        {
                           "type": "string"
                        },
                        {
                           "type": "null"
                        }
                     ]
                  }
               ],
               "title": "Yolox Endpoints",
               "type": "array"
            },
            "yolox_infer_protocol": {
               "default": "",
               "title": "Yolox Infer Protocol",
               "type": "string"
            }
         },
         "title": "PPTXConfigSchema",
         "type": "object"
      },
      "PPTXExtractorSchema": {
         "additionalProperties": false,
         "description": "Configuration schema for the PPTX extractor settings.\n\nParameters\n----------\nmax_queue_size : int, default=1\n    The maximum number of items allowed in the processing queue.\n\nn_workers : int, default=16\n    The number of worker threads to use for processing.\n\nraise_on_failure : bool, default=False\n    A flag indicating whether to raise an exception on processing failure.\n\npptx_extraction_config : Optional[PPTXConfigSchema], default=None\n    Configuration schema for the PPTX extraction stage.",
         "properties": {
            "max_queue_size": {
               "default": 1,
               "title": "Max Queue Size",
               "type": "integer"
            },
            "n_workers": {
               "default": 16,
               "title": "N Workers",
               "type": "integer"
            },
            "raise_on_failure": {
               "default": false,
               "title": "Raise On Failure",
               "type": "boolean"
            },
            "pptx_extraction_config": {
               "anyOf": [
                  {
                     "$ref": "#/$defs/PPTXConfigSchema"
                  },
                  {
                     "type": "null"
                  }
               ],
               "default": null
            },
            "pdfium_config": {
               "anyOf": [
                  {
                     "$ref": "#/$defs/PDFiumConfigSchema"
                  },
                  {
                     "type": "null"
                  }
               ],
               "default": null
            }
         },
         "title": "PPTXExtractorSchema",
         "type": "object"
      },
      "TableExtractorConfigSchema": {
         "additionalProperties": false,
         "description": "Configuration schema for the table extraction stage settings.\n\nParameters\n----------\nauth_token : Optional[str], default=None\n    Authentication token required for secure services.\n\nocr_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)\n    A tuple containing the gRPC and HTTP services for the ocr endpoint.\n    Either the gRPC or HTTP service can be empty, but not both.\n\nMethods\n-------\nvalidate_endpoints(values)\n    Validates that at least one of the gRPC or HTTP services is provided for the yolox endpoint.\n\nRaises\n------\nValueError\n    If both gRPC and HTTP services are empty for the yolox endpoint.\n\nConfig\n------\nextra : str\n    Pydantic config option to forbid extra fields.",
         "properties": {
            "auth_token": {
               "anyOf": [
                  {
                     "type": "string"
                  },
                  {
                     "type": "null"
                  }
               ],
               "default": null,
               "title": "Auth Token"
            },
            "yolox_endpoints": {
               "default": [
                  null,
                  null
               ],
               "maxItems": 2,
               "minItems": 2,
               "prefixItems": [
                  {
                     "anyOf": [
                        {
                           "type": "string"
                        },
                        {
                           "type": "null"
                        }
                     ]
                  },
                  {
                     "anyOf": [
                        {
                           "type": "string"
                        },
                        {
                           "type": "null"
                        }
                     ]
                  }
               ],
               "title": "Yolox Endpoints",
               "type": "array"
            },
            "yolox_infer_protocol": {
               "default": "",
               "title": "Yolox Infer Protocol",
               "type": "string"
            },
            "ocr_endpoints": {
               "default": [
                  null,
                  null
               ],
               "maxItems": 2,
               "minItems": 2,
               "prefixItems": [
                  {
                     "anyOf": [
                        {
                           "type": "string"
                        },
                        {
                           "type": "null"
                        }
                     ]
                  },
                  {
                     "anyOf": [
                        {
                           "type": "string"
                        },
                        {
                           "type": "null"
                        }
                     ]
                  }
               ],
               "title": "Ocr Endpoints",
               "type": "array"
            },
            "ocr_infer_protocol": {
               "default": "",
               "title": "Ocr Infer Protocol",
               "type": "string"
            },
            "nim_batch_size": {
               "default": 2,
               "title": "Nim Batch Size",
               "type": "integer"
            },
            "workers_per_progress_engine": {
               "default": 5,
               "title": "Workers Per Progress Engine",
               "type": "integer"
            }
         },
         "title": "TableExtractorConfigSchema",
         "type": "object"
      },
      "TableExtractorSchema": {
         "additionalProperties": false,
         "description": "Configuration schema for the table extraction processing settings.\n\nParameters\n----------\nmax_queue_size : int, default=1\n    The maximum number of items allowed in the processing queue.\n\nn_workers : int, default=2\n    The number of worker threads to use for processing.\n\nraise_on_failure : bool, default=False\n    A flag indicating whether to raise an exception if a failure occurs during table extraction.\n\nendpoint_config : Optional[TableExtractorConfigSchema], default=None\n    Configuration for the table extraction stage, including yolox service endpoints.",
         "properties": {
            "max_queue_size": {
               "default": 1,
               "title": "Max Queue Size",
               "type": "integer"
            },
            "n_workers": {
               "default": 2,
               "title": "N Workers",
               "type": "integer"
            },
            "raise_on_failure": {
               "default": false,
               "title": "Raise On Failure",
               "type": "boolean"
            },
            "endpoint_config": {
               "anyOf": [
                  {
                     "$ref": "#/$defs/TableExtractorConfigSchema"
                  },
                  {
                     "type": "null"
                  }
               ],
               "default": null
            }
         },
         "title": "TableExtractorSchema",
         "type": "object"
      },
      "TextEmbeddingSchema": {
         "additionalProperties": false,
         "properties": {
            "api_key": {
               "default": "",
               "title": "Api Key",
               "type": "string"
            },
            "batch_size": {
               "default": 4,
               "title": "Batch Size",
               "type": "integer"
            },
            "embedding_model": {
               "default": "nvidia/llama-3.2-nv-embedqa-1b-v2",
               "title": "Embedding Model",
               "type": "string"
            },
            "embedding_nim_endpoint": {
               "default": "http://embedding:8000/v1",
               "title": "Embedding Nim Endpoint",
               "type": "string"
            },
            "encoding_format": {
               "default": "float",
               "title": "Encoding Format",
               "type": "string"
            },
            "httpx_log_level": {
               "$ref": "#/$defs/LogLevel",
               "default": "WARNING"
            },
            "input_type": {
               "default": "passage",
               "title": "Input Type",
               "type": "string"
            },
            "raise_on_failure": {
               "default": false,
               "title": "Raise On Failure",
               "type": "boolean"
            },
            "truncate": {
               "default": "END",
               "title": "Truncate",
               "type": "string"
            },
            "text_elements_modality": {
               "default": "text",
               "title": "Text Elements Modality",
               "type": "string"
            },
            "image_elements_modality": {
               "default": "text",
               "title": "Image Elements Modality",
               "type": "string"
            },
            "structured_elements_modality": {
               "default": "text",
               "title": "Structured Elements Modality",
               "type": "string"
            },
            "audio_elements_modality": {
               "default": "text",
               "title": "Audio Elements Modality",
               "type": "string"
            },
            "custom_content_field": {
               "anyOf": [
                  {
                     "type": "string"
                  },
                  {
                     "type": "null"
                  }
               ],
               "default": null,
               "title": "Custom Content Field"
            },
            "result_target_field": {
               "anyOf": [
                  {
                     "type": "string"
                  },
                  {
                     "type": "null"
                  }
               ],
               "default": null,
               "title": "Result Target Field"
            },
            "dimensions": {
               "anyOf": [
                  {
                     "type": "integer"
                  },
                  {
                     "type": "null"
                  }
               ],
               "default": null,
               "title": "Dimensions"
            }
         },
         "title": "TextEmbeddingSchema",
         "type": "object"
      },
      "TextSplitterSchema": {
         "additionalProperties": false,
         "properties": {
            "tokenizer": {
               "anyOf": [
                  {
                     "type": "string"
                  },
                  {
                     "type": "null"
                  }
               ],
               "default": null,
               "title": "Tokenizer"
            },
            "chunk_size": {
               "default": 1024,
               "exclusiveMinimum": 0,
               "title": "Chunk Size",
               "type": "integer"
            },
            "chunk_overlap": {
               "default": 150,
               "minimum": 0,
               "title": "Chunk Overlap",
               "type": "integer"
            },
            "raise_on_failure": {
               "default": false,
               "title": "Raise On Failure",
               "type": "boolean"
            }
         },
         "title": "TextSplitterSchema",
         "type": "object"
      },
      "VdbTaskSinkSchema": {
         "additionalProperties": false,
         "properties": {
            "recreate": {
               "default": false,
               "title": "Recreate",
               "type": "boolean"
            },
            "service": {
               "default": "milvus",
               "title": "Service",
               "type": "string"
            },
            "is_service_serialized": {
               "default": false,
               "title": "Is Service Serialized",
               "type": "boolean"
            },
            "default_resource_name": {
               "default": "nv_ingest_collection",
               "title": "Default Resource Name",
               "type": "string"
            },
            "resource_schemas": {
               "additionalProperties": true,
               "default": {
                  "nv_ingest_collection": {
                     "index_conf": {
                        "field_name": "vector",
                        "index_type": "GPU_CAGRA",
                        "metric_type": "L2",
                        "params": {
                           "build_algo": "NN_DESCENT",
                           "graph_degree": 64,
                           "intermediate_graph_degree": 128
                        }
                     },
                     "schema_conf": {
                        "description": "NV-INGEST collection schema",
                        "enable_dynamic_field": true,
                        "schema_fields": [
                           {
                              "auto_id": true,
                              "description": "Primary key for the collection",
                              "is_primary": true,
                              "name": "pk",
                              "type": 5
                           },
                           {
                              "description": "Extracted content",
                              "name": "text",
                              "params": {
                                 "max_length": 65535
                              },
                              "type": 21
                           },
                           {
                              "description": "Embedding vectors",
                              "name": "vector",
                              "params": {
                                 "dim": 1024
                              },
                              "type": 101
                           },
                           {
                              "description": "Source document and raw data extracted content",
                              "name": "source",
                              "type": 23
                           },
                           {
                              "description": "Content metadata",
                              "name": "content_metadata",
                              "type": 23
                           }
                        ]
                     }
                  }
               },
               "title": "Resource Schemas",
               "type": "object"
            },
            "resource_kwargs": {
               "additionalProperties": true,
               "title": "Resource Kwargs",
               "type": "object"
            },
            "service_kwargs": {
               "additionalProperties": true,
               "default": {},
               "title": "Service Kwargs",
               "type": "object"
            },
            "batch_size": {
               "default": 5120,
               "title": "Batch Size",
               "type": "integer"
            },
            "write_time_interval": {
               "default": 1.0,
               "title": "Write Time Interval",
               "type": "number"
            },
            "retry_interval": {
               "default": 60.0,
               "title": "Retry Interval",
               "type": "number"
            },
            "raise_on_failure": {
               "default": false,
               "title": "Raise On Failure",
               "type": "boolean"
            },
            "progress_engines": {
               "default": 1,
               "minimum": 1,
               "title": "Progress Engines",
               "type": "integer"
            }
         },
         "title": "VdbTaskSinkSchema",
         "type": "object"
      }
   },
   "additionalProperties": false
}

Config:
  • extra: str = forbid

Fields:
field audio_extractor_schema: AudioExtractorSchema = AudioExtractorSchema(max_queue_size=1, n_workers=16, raise_on_failure=False, audio_extraction_config=None)#
field chart_extractor_module: ChartExtractorSchema = ChartExtractorSchema(max_queue_size=1, n_workers=2, raise_on_failure=False, endpoint_config=None)#
field embed_extractions_module: TextEmbeddingSchema = TextEmbeddingSchema(batch_size=4, embedding_model='nvidia/llama-3.2-nv-embedqa-1b-v2', embedding_nim_endpoint='http://embedding:8000/v1', encoding_format='float', httpx_log_level=<LogLevel.WARNING: 'WARNING'>, input_type='passage', raise_on_failure=False, truncate='END', text_elements_modality='text', image_elements_modality='text', structured_elements_modality='text', audio_elements_modality='text', custom_content_field=None, result_target_field=None, dimensions=None)#
field embedding_storage_module: EmbeddingStorageSchema = EmbeddingStorageSchema(raise_on_failure=False)#
field image_caption_extraction_module: ImageCaptionExtractionSchema = ImageCaptionExtractionSchema(endpoint_url='https://integrate.api.nvidia.com/v1/chat/completions', prompt='Caption the content of this image:', system_prompt='/no_think', model_name='nvidia/nemotron-nano-12b-v2-vl', raise_on_failure=False)#
field image_dedup_module: ImageDedupSchema = ImageDedupSchema(raise_on_failure=False)#
field image_filter_module: ImageFilterSchema = ImageFilterSchema(raise_on_failure=False, cpu_only=False)#
field image_storage_module: ImageStorageModuleSchema = ImageStorageModuleSchema(structured=True, images=True, storage_uri='s3://nv-ingest/artifacts/store/images', storage_options={}, public_base_url=None, raise_on_failure=False)#
field infographic_extractor_module: InfographicExtractorSchema = InfographicExtractorSchema(max_queue_size=1, n_workers=2, raise_on_failure=False, endpoint_config=None)#
field job_counter_module: JobCounterSchema = JobCounterSchema(name='job_counter', raise_on_failure=False)#
field metadata_injection_module: MetadataInjectorSchema = MetadataInjectorSchema(raise_on_failure=False)#
field otel_meter_module: OpenTelemetryMeterSchema = OpenTelemetryMeterSchema(broker_client=MessageBrokerClientSchema(host='redis', port=6379, client_type='redis', broker_params={}, connection_timeout=300, max_backoff=300, max_retries=0), otel_endpoint='localhost:4317', raise_on_failure=False)#
field otel_tracer_module: OpenTelemetryTracerSchema = OpenTelemetryTracerSchema(otel_endpoint='localhost:4317', raise_on_failure=False)#
field pdf_extractor_module: PDFExtractorSchema = PDFExtractorSchema(max_queue_size=1, n_workers=16, raise_on_failure=False, pdfium_config=None, nemotron_parse_config=None)#
field pptx_extractor_module: PPTXExtractorSchema = PPTXExtractorSchema(max_queue_size=1, n_workers=16, raise_on_failure=False, pptx_extraction_config=None, pdfium_config=None)#
field redis_task_sink: MessageBrokerTaskSinkSchema = MessageBrokerTaskSinkSchema(broker_client=MessageBrokerClientSchema(host='redis', port=6379, client_type='redis', broker_params={}, connection_timeout=300, max_backoff=300, max_retries=0), raise_on_failure=False, progress_engines=6)#
field redis_task_source: MessageBrokerTaskSourceSchema = MessageBrokerTaskSourceSchema(broker_client=MessageBrokerClientSchema(host='redis', port=6379, client_type='redis', broker_params={}, connection_timeout=300, max_backoff=300, max_retries=0), task_queue='ingest_task_queue', raise_on_failure=False, progress_engines=6)#
field table_extractor_module: TableExtractorSchema = TableExtractorSchema(max_queue_size=1, n_workers=2, raise_on_failure=False, endpoint_config=None)#
field text_splitter_module: TextSplitterSchema = TextSplitterSchema(tokenizer=None, chunk_size=1024, chunk_overlap=150, raise_on_failure=False)#
field vdb_task_sink: VdbTaskSinkSchema = VdbTaskSinkSchema(recreate=False, service='milvus', is_service_serialized=False, default_resource_name='nv_ingest_collection', resource_schemas={'nv_ingest_collection': {'index_conf': {'field_name': 'vector', 'metric_type': 'L2', 'index_type': 'GPU_CAGRA', 'params': {'intermediate_graph_degree': 128, 'graph_degree': 64, 'build_algo': 'NN_DESCENT'}}, 'schema_conf': {'enable_dynamic_field': True, 'schema_fields': [{'name': 'pk', 'description': 'Primary key for the collection', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': True}, {'name': 'text', 'description': 'Extracted content', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 65535}}, {'name': 'vector', 'description': 'Embedding vectors', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 1024}}, {'name': 'source', 'description': 'Source document and raw data extracted content', 'type': <DataType.JSON: 23>}, {'name': 'content_metadata', 'description': 'Content metadata', 'type': <DataType.JSON: 23>}], 'description': 'NV-INGEST collection schema'}}}, resource_kwargs={}, service_kwargs={}, batch_size=5120, write_time_interval=1.0, retry_interval=60.0, raise_on_failure=False, progress_engines=1)#

nv_ingest.framework.schemas.framework_job_counter_schema module#

pydantic model nv_ingest.framework.schemas.framework_job_counter_schema.JobCounterSchema[source]#

Bases: BaseModel

Show JSON schema
{
   "title": "JobCounterSchema",
   "type": "object",
   "properties": {
      "name": {
         "default": "job_counter",
         "title": "Name",
         "type": "string"
      },
      "raise_on_failure": {
         "default": false,
         "title": "Raise On Failure",
         "type": "boolean"
      }
   },
   "additionalProperties": false
}

Config:
  • extra: str = forbid

Fields:
field name: str = 'job_counter'#
field raise_on_failure: bool = False#

nv_ingest.framework.schemas.framework_message_broker_sink_schema module#

pydantic model nv_ingest.framework.schemas.framework_message_broker_sink_schema.MessageBrokerTaskSinkSchema[source]#

Bases: BaseModel

Show JSON schema
{
   "title": "MessageBrokerTaskSinkSchema",
   "type": "object",
   "properties": {
      "broker_client": {
         "$ref": "#/$defs/MessageBrokerClientSchema",
         "default": {
            "host": "redis",
            "port": 6379,
            "client_type": "redis",
            "broker_params": {},
            "connection_timeout": 300,
            "max_backoff": 300,
            "max_retries": 0
         }
      },
      "raise_on_failure": {
         "default": false,
         "title": "Raise On Failure",
         "type": "boolean"
      },
      "progress_engines": {
         "default": 6,
         "minimum": 1,
         "title": "Progress Engines",
         "type": "integer"
      }
   },
   "$defs": {
      "MessageBrokerClientSchema": {
         "description": "Configuration schema for message broker client connections.\nSupports Redis or simple in-memory clients.",
         "properties": {
            "host": {
               "default": "redis",
               "description": "Hostname of the broker service.",
               "title": "Host",
               "type": "string"
            },
            "port": {
               "default": 6379,
               "description": "Port to connect to. Must be between 1 and 65535.",
               "exclusiveMaximum": 65536,
               "exclusiveMinimum": 0,
               "title": "Port",
               "type": "integer"
            },
            "client_type": {
               "default": "redis",
               "description": "Type of broker client. Supported values: 'redis', 'simple'.",
               "enum": [
                  "redis",
                  "simple"
               ],
               "title": "Client Type",
               "type": "string"
            },
            "broker_params": {
               "anyOf": [
                  {
                     "additionalProperties": true,
                     "type": "object"
                  },
                  {
                     "type": "null"
                  }
               ],
               "description": "Optional parameters passed to the broker client.",
               "title": "Broker Params"
            },
            "connection_timeout": {
               "default": 300,
               "description": "Connection timeout in seconds. Must be >= 0.",
               "minimum": 0,
               "title": "Connection Timeout",
               "type": "integer"
            },
            "max_backoff": {
               "default": 300,
               "description": "Maximum backoff time in seconds. Must be >= 0.",
               "minimum": 0,
               "title": "Max Backoff",
               "type": "integer"
            },
            "max_retries": {
               "default": 0,
               "description": "Maximum number of retries. Must be >= 0.",
               "minimum": 0,
               "title": "Max Retries",
               "type": "integer"
            }
         },
         "title": "MessageBrokerClientSchema",
         "type": "object"
      }
   }
}

Fields:
field broker_client: MessageBrokerClientSchema = MessageBrokerClientSchema(host='redis', port=6379, client_type='redis', broker_params={}, connection_timeout=300, max_backoff=300, max_retries=0)#
field progress_engines: Annotated[int, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=1)])] = 6#
Constraints:
  • ge = 1

field raise_on_failure: bool = False#

nv_ingest.framework.schemas.framework_message_broker_source_schema module#

pydantic model nv_ingest.framework.schemas.framework_message_broker_source_schema.MessageBrokerTaskSourceSchema[source]#

Bases: BaseModel

Show JSON schema
{
   "title": "MessageBrokerTaskSourceSchema",
   "type": "object",
   "properties": {
      "broker_client": {
         "$ref": "#/$defs/MessageBrokerClientSchema",
         "default": {
            "host": "redis",
            "port": 6379,
            "client_type": "redis",
            "broker_params": {},
            "connection_timeout": 300,
            "max_backoff": 300,
            "max_retries": 0
         }
      },
      "task_queue": {
         "default": "ingest_task_queue",
         "title": "Task Queue",
         "type": "string"
      },
      "raise_on_failure": {
         "default": false,
         "title": "Raise On Failure",
         "type": "boolean"
      },
      "progress_engines": {
         "default": 6,
         "minimum": 1,
         "title": "Progress Engines",
         "type": "integer"
      }
   },
   "$defs": {
      "MessageBrokerClientSchema": {
         "description": "Configuration schema for message broker client connections.\nSupports Redis or simple in-memory clients.",
         "properties": {
            "host": {
               "default": "redis",
               "description": "Hostname of the broker service.",
               "title": "Host",
               "type": "string"
            },
            "port": {
               "default": 6379,
               "description": "Port to connect to. Must be between 1 and 65535.",
               "exclusiveMaximum": 65536,
               "exclusiveMinimum": 0,
               "title": "Port",
               "type": "integer"
            },
            "client_type": {
               "default": "redis",
               "description": "Type of broker client. Supported values: 'redis', 'simple'.",
               "enum": [
                  "redis",
                  "simple"
               ],
               "title": "Client Type",
               "type": "string"
            },
            "broker_params": {
               "anyOf": [
                  {
                     "additionalProperties": true,
                     "type": "object"
                  },
                  {
                     "type": "null"
                  }
               ],
               "description": "Optional parameters passed to the broker client.",
               "title": "Broker Params"
            },
            "connection_timeout": {
               "default": 300,
               "description": "Connection timeout in seconds. Must be >= 0.",
               "minimum": 0,
               "title": "Connection Timeout",
               "type": "integer"
            },
            "max_backoff": {
               "default": 300,
               "description": "Maximum backoff time in seconds. Must be >= 0.",
               "minimum": 0,
               "title": "Max Backoff",
               "type": "integer"
            },
            "max_retries": {
               "default": 0,
               "description": "Maximum number of retries. Must be >= 0.",
               "minimum": 0,
               "title": "Max Retries",
               "type": "integer"
            }
         },
         "title": "MessageBrokerClientSchema",
         "type": "object"
      }
   }
}

Fields:
field broker_client: MessageBrokerClientSchema = MessageBrokerClientSchema(host='redis', port=6379, client_type='redis', broker_params={}, connection_timeout=300, max_backoff=300, max_retries=0)#
field progress_engines: Annotated[int, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=1)])] = 6#
Constraints:
  • ge = 1

field raise_on_failure: bool = False#
field task_queue: str = 'ingest_task_queue'#

nv_ingest.framework.schemas.framework_message_wrapper_schema module#

pydantic model nv_ingest.framework.schemas.framework_message_wrapper_schema.MessageWrapper[source]#

Bases: BaseModel

Show JSON schema
{
   "title": "MessageWrapper",
   "type": "object",
   "properties": {
      "payload": {
         "title": "Payload",
         "type": "string"
      }
   },
   "required": [
      "payload"
   ]
}

Fields:
field payload: str [Required]#

nv_ingest.framework.schemas.framework_metadata_injector_schema module#

pydantic model nv_ingest.framework.schemas.framework_metadata_injector_schema.MetadataInjectorSchema[source]#

Bases: BaseModel

Show JSON schema
{
   "title": "MetadataInjectorSchema",
   "type": "object",
   "properties": {
      "raise_on_failure": {
         "default": false,
         "title": "Raise On Failure",
         "type": "boolean"
      }
   },
   "additionalProperties": false
}

Config:
  • extra: str = forbid

Fields:
field raise_on_failure: bool = False#

nv_ingest.framework.schemas.framework_otel_meter_schema module#

pydantic model nv_ingest.framework.schemas.framework_otel_meter_schema.OpenTelemetryMeterSchema[source]#

Bases: BaseModel

Show JSON schema
{
   "title": "OpenTelemetryMeterSchema",
   "type": "object",
   "properties": {
      "broker_client": {
         "$ref": "#/$defs/MessageBrokerClientSchema",
         "default": {
            "host": "redis",
            "port": 6379,
            "client_type": "redis",
            "broker_params": {},
            "connection_timeout": 300,
            "max_backoff": 300,
            "max_retries": 0
         }
      },
      "otel_endpoint": {
         "default": "localhost:4317",
         "title": "Otel Endpoint",
         "type": "string"
      },
      "raise_on_failure": {
         "default": false,
         "title": "Raise On Failure",
         "type": "boolean"
      }
   },
   "$defs": {
      "MessageBrokerClientSchema": {
         "description": "Configuration schema for message broker client connections.\nSupports Redis or simple in-memory clients.",
         "properties": {
            "host": {
               "default": "redis",
               "description": "Hostname of the broker service.",
               "title": "Host",
               "type": "string"
            },
            "port": {
               "default": 6379,
               "description": "Port to connect to. Must be between 1 and 65535.",
               "exclusiveMaximum": 65536,
               "exclusiveMinimum": 0,
               "title": "Port",
               "type": "integer"
            },
            "client_type": {
               "default": "redis",
               "description": "Type of broker client. Supported values: 'redis', 'simple'.",
               "enum": [
                  "redis",
                  "simple"
               ],
               "title": "Client Type",
               "type": "string"
            },
            "broker_params": {
               "anyOf": [
                  {
                     "additionalProperties": true,
                     "type": "object"
                  },
                  {
                     "type": "null"
                  }
               ],
               "description": "Optional parameters passed to the broker client.",
               "title": "Broker Params"
            },
            "connection_timeout": {
               "default": 300,
               "description": "Connection timeout in seconds. Must be >= 0.",
               "minimum": 0,
               "title": "Connection Timeout",
               "type": "integer"
            },
            "max_backoff": {
               "default": 300,
               "description": "Maximum backoff time in seconds. Must be >= 0.",
               "minimum": 0,
               "title": "Max Backoff",
               "type": "integer"
            },
            "max_retries": {
               "default": 0,
               "description": "Maximum number of retries. Must be >= 0.",
               "minimum": 0,
               "title": "Max Retries",
               "type": "integer"
            }
         },
         "title": "MessageBrokerClientSchema",
         "type": "object"
      }
   },
   "additionalProperties": false
}

Config:
  • extra: str = forbid

Fields:
field broker_client: MessageBrokerClientSchema = MessageBrokerClientSchema(host='redis', port=6379, client_type='redis', broker_params={}, connection_timeout=300, max_backoff=300, max_retries=0)#
field otel_endpoint: str = 'localhost:4317'#
field raise_on_failure: bool = False#

nv_ingest.framework.schemas.framework_otel_tracer_schema module#

pydantic model nv_ingest.framework.schemas.framework_otel_tracer_schema.OpenTelemetryTracerSchema[source]#

Bases: BaseModel

Show JSON schema
{
   "title": "OpenTelemetryTracerSchema",
   "type": "object",
   "properties": {
      "otel_endpoint": {
         "default": "localhost:4317",
         "title": "Otel Endpoint",
         "type": "string"
      },
      "raise_on_failure": {
         "default": false,
         "title": "Raise On Failure",
         "type": "boolean"
      }
   },
   "additionalProperties": false
}

Config:
  • extra: str = forbid

Fields:
field otel_endpoint: str = 'localhost:4317'#
field raise_on_failure: bool = False#

nv_ingest.framework.schemas.framework_processing_job_schema module#

class nv_ingest.framework.schemas.framework_processing_job_schema.ConversionStatus(*values)[source]#

Bases: str, Enum

FAILED = 'failed'#
IN_PROGRESS = 'in_progress'#
SUCCESS = 'success'#

pydantic model nv_ingest.framework.schemas.framework_processing_job_schema.ProcessingJob[source]#

Bases: BaseModel

Show JSON schema
{
   "title": "ProcessingJob",
   "type": "object",
   "properties": {
      "submitted_job_id": {
         "title": "Submitted Job Id",
         "type": "string"
      },
      "filename": {
         "title": "Filename",
         "type": "string"
      },
      "raw_result": {
         "default": "",
         "title": "Raw Result",
         "type": "string"
      },
      "content": {
         "default": "",
         "title": "Content",
         "type": "string"
      },
      "status": {
         "$ref": "#/$defs/ConversionStatus"
      },
      "error": {
         "anyOf": [
            {
               "type": "string"
            },
            {
               "type": "null"
            }
         ],
         "default": null,
         "title": "Error"
      }
   },
   "$defs": {
      "ConversionStatus": {
         "enum": [
            "in_progress",
            "success",
            "failed"
         ],
         "title": "ConversionStatus",
         "type": "string"
      }
   },
   "additionalProperties": false,
   "required": [
      "submitted_job_id",
      "filename",
      "status"
   ]
}

Config:
  • extra: str = forbid

Fields:
field content: str = ''#
field error: str | None = None#
field filename: str [Required]#
field raw_result: str = ''#
field status: ConversionStatus [Required]#
field submitted_job_id: str [Required]#

nv_ingest.framework.schemas.framework_task_injection_schema module#

pydantic model nv_ingest.framework.schemas.framework_task_injection_schema.TaskInjectionSchema[source]#

Bases: BaseModel

Show JSON schema
{
   "title": "TaskInjectionSchema",
   "type": "object",
   "properties": {
      "raise_on_failure": {
         "default": false,
         "title": "Raise On Failure",
         "type": "boolean"
      }
   },
   "additionalProperties": false
}

Config:
  • extra: str = forbid

Fields:
field raise_on_failure: bool = False#

nv_ingest.framework.schemas.framework_vdb_task_sink_schema module#

pydantic model nv_ingest.framework.schemas.framework_vdb_task_sink_schema.VdbTaskSinkSchema[source]#

Bases: BaseModel

Show JSON schema
{
   "title": "VdbTaskSinkSchema",
   "type": "object",
   "properties": {
      "recreate": {
         "default": false,
         "title": "Recreate",
         "type": "boolean"
      },
      "service": {
         "default": "milvus",
         "title": "Service",
         "type": "string"
      },
      "is_service_serialized": {
         "default": false,
         "title": "Is Service Serialized",
         "type": "boolean"
      },
      "default_resource_name": {
         "default": "nv_ingest_collection",
         "title": "Default Resource Name",
         "type": "string"
      },
      "resource_schemas": {
         "additionalProperties": true,
         "default": {
            "nv_ingest_collection": {
               "index_conf": {
                  "field_name": "vector",
                  "index_type": "GPU_CAGRA",
                  "metric_type": "L2",
                  "params": {
                     "build_algo": "NN_DESCENT",
                     "graph_degree": 64,
                     "intermediate_graph_degree": 128
                  }
               },
               "schema_conf": {
                  "description": "NV-INGEST collection schema",
                  "enable_dynamic_field": true,
                  "schema_fields": [
                     {
                        "auto_id": true,
                        "description": "Primary key for the collection",
                        "is_primary": true,
                        "name": "pk",
                        "type": 5
                     },
                     {
                        "description": "Extracted content",
                        "name": "text",
                        "params": {
                           "max_length": 65535
                        },
                        "type": 21
                     },
                     {
                        "description": "Embedding vectors",
                        "name": "vector",
                        "params": {
                           "dim": 1024
                        },
                        "type": 101
                     },
                     {
                        "description": "Source document and raw data extracted content",
                        "name": "source",
                        "type": 23
                     },
                     {
                        "description": "Content metadata",
                        "name": "content_metadata",
                        "type": 23
                     }
                  ]
               }
            }
         },
         "title": "Resource Schemas",
         "type": "object"
      },
      "resource_kwargs": {
         "additionalProperties": true,
         "title": "Resource Kwargs",
         "type": "object"
      },
      "service_kwargs": {
         "additionalProperties": true,
         "default": {},
         "title": "Service Kwargs",
         "type": "object"
      },
      "batch_size": {
         "default": 5120,
         "title": "Batch Size",
         "type": "integer"
      },
      "write_time_interval": {
         "default": 1.0,
         "title": "Write Time Interval",
         "type": "number"
      },
      "retry_interval": {
         "default": 60.0,
         "title": "Retry Interval",
         "type": "number"
      },
      "raise_on_failure": {
         "default": false,
         "title": "Raise On Failure",
         "type": "boolean"
      },
      "progress_engines": {
         "default": 1,
         "minimum": 1,
         "title": "Progress Engines",
         "type": "integer"
      }
   },
   "additionalProperties": false
}

Config:
  • extra: str = forbid

Fields:
Validators:
  • validate_resource_name » default_resource_name
  • validate_service » service

field batch_size: int = 5120#
field default_resource_name: str = 'nv_ingest_collection'#
Validated by:
  • validate_resource_name

field is_service_serialized: bool = False#
field progress_engines: Annotated[int, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=1)])] = 1#
Constraints:
  • ge = 1

field raise_on_failure: bool = False#
field recreate: bool = False#
field resource_kwargs: dict [Optional]#
field resource_schemas: dict = {'nv_ingest_collection': {'index_conf': {'field_name': 'vector', 'index_type': 'GPU_CAGRA', 'metric_type': 'L2', 'params': {'build_algo': 'NN_DESCENT', 'graph_degree': 64, 'intermediate_graph_degree': 128}}, 'schema_conf': {'description': 'NV-INGEST collection schema', 'enable_dynamic_field': True, 'schema_fields': [{'auto_id': True, 'description': 'Primary key for the collection', 'is_primary': True, 'name': 'pk', 'type': DataType.INT64}, {'description': 'Extracted content', 'name': 'text', 'params': {'max_length': 65535}, 'type': DataType.VARCHAR}, {'description': 'Embedding vectors', 'name': 'vector', 'params': {'dim': 1024}, 'type': DataType.FLOAT_VECTOR}, {'description': 'Source document and raw data extracted content', 'name': 'source', 'type': DataType.JSON}, {'description': 'Content metadata', 'name': 'content_metadata', 'type': DataType.JSON}]}}}#
field retry_interval: float = 60.0#
field service: str = 'milvus'#
Validated by:
  • validate_service

field service_kwargs: dict = {}#
field write_time_interval: float = 1.0#
validator validate_resource_name  »  default_resource_name[source]#
validator validate_service  »  service[source]#
nv_ingest.framework.schemas.framework_vdb_task_sink_schema.build_default_milvus_config(embedding_size: int = 1024) → Dict[str, Any][source]#

Builds the configuration for Milvus.

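This function creates a dictionary configuration for a Milvus collection. It includes the index configuration (index type, metric type, and build parameters for the vector field) and the schema configuration, whose fields are pk, text, vector, source, and content_metadata. The vector field's dimension is set from embedding_size.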

Parameters:

embedding_size (int) – The size of the embedding vector.

Returns:

A dictionary containing the configuration settings for Milvus.

Return type:

Dict[str, Any]

Module contents#