nv_ingest_client.primitives.tasks package#
Submodules#
nv_ingest_client.primitives.tasks.caption module#
- class nv_ingest_client.primitives.tasks.caption.CaptionTask(
- api_key: str | None = None,
- endpoint_url: str | None = None,
- prompt: str | None = None,
- model_name: str | None = None,
Bases:
Task
- class nv_ingest_client.primitives.tasks.caption.CaptionTaskSchema(
- *,
- api_key: str | None = None,
- endpoint_url: str | None = None,
- prompt: str | None = None,
- model_name: str | None = None,
Bases:
BaseModel
- api_key: str | None#
- endpoint_url: str | None#
- model_config: ClassVar[ConfigDict] = {'extra': 'forbid', 'protected_namespaces': ()}#
Configuration for the model; should be a dictionary conforming to pydantic.config.ConfigDict.
- model_name: str | None#
- prompt: str | None#
nv_ingest_client.primitives.tasks.chart_extraction module#
nv_ingest_client.primitives.tasks.dedup module#
- class nv_ingest_client.primitives.tasks.dedup.DedupTask(
- content_type: Literal['image'] = 'image',
- filter: bool = False,
Bases:
Task
Object for document dedup task
- class nv_ingest_client.primitives.tasks.dedup.DedupTaskSchema(*, content_type: str = 'image', filter: bool = False)[source]#
Bases:
BaseModel
- content_type: str#
- filter: bool#
- model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#
Configuration for the model; should be a dictionary conforming to pydantic.config.ConfigDict.
nv_ingest_client.primitives.tasks.embed module#
- class nv_ingest_client.primitives.tasks.embed.EmbedTask(
- endpoint_url: str | None = None,
- model_name: str | None = None,
- api_key: str | None = None,
- text: bool | None = None,
- tables: bool | None = None,
- filter_errors: bool = False,
Bases:
Task
Object for document embedding tasks.
This class encapsulates the configuration and runtime state for an embedding task, including details like the endpoint URL, model name, and API key.
- class nv_ingest_client.primitives.tasks.embed.EmbedTaskSchema(
- *,
- endpoint_url: str | None = None,
- model_name: str | None = None,
- api_key: str | None = None,
- filter_errors: bool = False,
Bases:
BaseModel
Schema for embed task configuration.
This schema contains configuration details for an embedding task, including the endpoint URL, model name, API key, and error filtering flag.
- endpoint_url#
URL of the embedding endpoint. Default is None.
- Type:
Optional[str]
- model_name#
Name of the embedding model. Default is None.
- Type:
Optional[str]
- api_key#
API key for authentication with the embedding service. Default is None.
- Type:
Optional[str]
- filter_errors#
Flag to indicate whether errors should be filtered. Default is False.
- Type:
bool
- api_key: str | None#
- endpoint_url: str | None#
- filter_errors: bool#
- classmethod handle_deprecated_fields(
- values: Dict[str, Any],
Handle deprecated fields before model validation.
This validator checks for the presence of deprecated keys (‘text’ and ‘tables’) in the input dictionary and removes them. Warnings are issued if these keys are found.
- Parameters:
values (Dict[str, Any]) – Input dictionary of model values.
- Returns:
The updated dictionary with deprecated fields removed.
- Return type:
Dict[str, Any]
- model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#
Configuration for the model; should be a dictionary conforming to pydantic.config.ConfigDict.
- model_name: str | None#
nv_ingest_client.primitives.tasks.extract module#
- class nv_ingest_client.primitives.tasks.extract.ExtractTask(
- document_type,
- extract_method: Literal['adobe', 'nemoretriever_parse', 'haystack', 'llama_parse', 'pdfium', 'tika', 'unstructured_io'] = 'pdfium',
- extract_text: bool = False,
- extract_images: bool = False,
- extract_tables: bool = False,
- extract_charts: bool | None = None,
- extract_audio_params: Dict[str, Any] | None = None,
- extract_images_method: Literal['simple', 'group'] = 'group',
- extract_images_params: Dict[str, Any] | None = None,
- extract_tables_method: Literal['yolox', 'pdfium', 'nemoretriever_parse'] = 'yolox',
- extract_infographics: bool = False,
- text_depth: str = 'document',
- paddle_output_format: str = 'pseudo_markdown',
Bases:
Task
Object for document extraction task
- property document_type#
- class nv_ingest_client.primitives.tasks.extract.ExtractTaskSchema(
- *,
- document_type: str,
- extract_method: str = None,
- extract_text: bool = True,
- extract_images: bool = True,
- extract_images_method: str = 'group',
- extract_images_params: Dict[str, Any] | None = None,
- extract_tables: bool = True,
- extract_tables_method: str = 'yolox',
- extract_charts: bool | None = None,
- extract_infographics: bool = False,
- extract_audio_params: Dict[str, Any] | None = None,
- text_depth: str = 'document',
- paddle_output_format: str = 'pseudo_markdown',
Bases:
BaseModel
- document_type: str#
- extract_audio_params: Dict[str, Any] | None#
- extract_charts: bool | None#
- extract_images: bool#
- extract_images_method: str#
- extract_images_params: Dict[str, Any] | None#
- extract_infographics: bool#
- extract_method: str#
- extract_tables: bool#
- extract_tables_method: str#
- extract_text: bool#
- model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#
Configuration for the model; should be a dictionary conforming to pydantic.config.ConfigDict.
- paddle_output_format: str#
- text_depth: str#
nv_ingest_client.primitives.tasks.filter module#
- class nv_ingest_client.primitives.tasks.filter.FilterTask(
- content_type: Literal['image'] = 'image',
- min_size: int = 128,
- max_aspect_ratio: int | float = 5.0,
- min_aspect_ratio: int | float = 0.2,
- filter: bool = False,
Bases:
Task
Object for document filter task
- class nv_ingest_client.primitives.tasks.filter.FilterTaskSchema(
- *,
- content_type: str = 'image',
- min_size: int = 128,
- max_aspect_ratio: float | int = 5.0,
- min_aspect_ratio: float | int = 0.2,
- filter: bool = False,
Bases:
BaseModel
- content_type: str#
- filter: bool#
- max_aspect_ratio: float | int#
- min_aspect_ratio: float | int#
- min_size: int#
- model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#
Configuration for the model; should be a dictionary conforming to pydantic.config.ConfigDict.
nv_ingest_client.primitives.tasks.infographic_extraction module#
nv_ingest_client.primitives.tasks.split module#
- class nv_ingest_client.primitives.tasks.split.SplitTask(
- tokenizer: str | None = None,
- chunk_size: int = 1024,
- chunk_overlap: int = 150,
- params: dict = {},
Bases:
Task
Object for document splitting task
- class nv_ingest_client.primitives.tasks.split.SplitTaskSchema(
- *,
- tokenizer: str | None = None,
- chunk_size: int = 1024,
- chunk_overlap: int = 150,
- params: dict = {},
Bases:
BaseModel
- chunk_overlap: int#
- chunk_size: int#
- model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#
Configuration for the model; should be a dictionary conforming to pydantic.config.ConfigDict.
- params: dict#
- tokenizer: str | None#
nv_ingest_client.primitives.tasks.store module#
- class nv_ingest_client.primitives.tasks.store.StoreEmbedTask(params: dict | None = None, **extra_params)[source]#
Bases:
Task
Object for embedding storage task.
- class nv_ingest_client.primitives.tasks.store.StoreEmbedTaskSchema(**extra_data: Any)[source]#
Bases:
BaseModel
- model_config: ClassVar[ConfigDict] = {'extra': 'allow'}#
Configuration for the model; should be a dictionary conforming to pydantic.config.ConfigDict.
- class nv_ingest_client.primitives.tasks.store.StoreTask(
- structured: bool = True,
- images: bool = False,
- store_method: Literal['minio'] | None = None,
- params: dict | None = None,
- **extra_params,
Bases:
Task
Object for image storage task.
- class nv_ingest_client.primitives.tasks.store.StoreTaskSchema(
- *,
- store_method: str = None,
- **extra_data: Any,
Bases:
BaseModel
- model_config: ClassVar[ConfigDict] = {'extra': 'allow'}#
Configuration for the model; should be a dictionary conforming to pydantic.config.ConfigDict.
- store_method: str#
nv_ingest_client.primitives.tasks.table_extraction module#
nv_ingest_client.primitives.tasks.task_base module#
- class nv_ingest_client.primitives.tasks.task_base.TaskType(value)[source]#
Bases:
Enum
Enumeration of task types.
- CAPTION = 1#
- CHART_DATA_EXTRACT = 12#
- DEDUP = 2#
- EMBED = 3#
- EXTRACT = 4#
- FILTER = 5#
- INFOGRAPHIC_DATA_EXTRACT = 13#
- SPLIT = 6#
- STORE = 9#
- STORE_EMBEDDING = 8#
- TABLE_DATA_EXTRACT = 11#
- TRANSFORM = 7#
- VDB_UPLOAD = 10#
- nv_ingest_client.primitives.tasks.task_base.is_valid_task_type(task_type_str: str) → bool[source]#
Checks if the provided string is a valid TaskType enum value.
- Parameters:
task_type_str (str) – The string to check against the TaskType enum values.
- Returns:
True if the string is a valid TaskType enum value, False otherwise.
- Return type:
bool
nv_ingest_client.primitives.tasks.task_factory module#
- class nv_ingest_client.primitives.tasks.task_factory.TaskUnimplemented(**kwargs)[source]#
Bases:
Task
Placeholder for unimplemented tasks
nv_ingest_client.primitives.tasks.transform module#
nv_ingest_client.primitives.tasks.vdb_upload module#
- class nv_ingest_client.primitives.tasks.vdb_upload.VdbUploadTask(
- filter_errors: bool = False,
- bulk_ingest: bool = False,
- bulk_ingest_path: str = 'embeddings/',
- params: dict | None = None,
Bases:
Task
Object for vector database (VDB) upload task
- class nv_ingest_client.primitives.tasks.vdb_upload.VdbUploadTaskSchema(
- *,
- filter_errors: bool = False,
- bulk_ingest: bool = False,
- bulk_ingest_path: str = '',
- params: dict = None,
Bases:
BaseModel
- bulk_ingest: bool#
- bulk_ingest_path: str#
- filter_errors: bool#
- model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#
Configuration for the model; should be a dictionary conforming to pydantic.config.ConfigDict.
- params: dict#
Module contents#
- class nv_ingest_client.primitives.tasks.AudioExtractionTask(
- auth_token: str | None = None,
- grpc_endpoint: str | None = None,
- infer_protocol: str | None = None,
- function_id: str | None = None,
- use_ssl: bool | None = None,
- ssl_cert: str | None = None,
- segment_audio: bool | None = None,
Bases:
Task
- class nv_ingest_client.primitives.tasks.CaptionTask(
- api_key: str | None = None,
- endpoint_url: str | None = None,
- prompt: str | None = None,
- model_name: str | None = None,
Bases:
Task
- class nv_ingest_client.primitives.tasks.ChartExtractionTask[source]#
Bases:
Task
Object for chart extraction task
- class nv_ingest_client.primitives.tasks.DedupTask(
- content_type: Literal['image'] = 'image',
- filter: bool = False,
Bases:
Task
Object for document dedup task
- class nv_ingest_client.primitives.tasks.EmbedTask(
- endpoint_url: str | None = None,
- model_name: str | None = None,
- api_key: str | None = None,
- text: bool | None = None,
- tables: bool | None = None,
- filter_errors: bool = False,
Bases:
Task
Object for document embedding tasks.
This class encapsulates the configuration and runtime state for an embedding task, including details like the endpoint URL, model name, and API key.
- class nv_ingest_client.primitives.tasks.ExtractTask(
- document_type,
- extract_method: Literal['adobe', 'nemoretriever_parse', 'haystack', 'llama_parse', 'pdfium', 'tika', 'unstructured_io'] = 'pdfium',
- extract_text: bool = False,
- extract_images: bool = False,
- extract_tables: bool = False,
- extract_charts: bool | None = None,
- extract_audio_params: Dict[str, Any] | None = None,
- extract_images_method: Literal['simple', 'group'] = 'group',
- extract_images_params: Dict[str, Any] | None = None,
- extract_tables_method: Literal['yolox', 'pdfium', 'nemoretriever_parse'] = 'yolox',
- extract_infographics: bool = False,
- text_depth: str = 'document',
- paddle_output_format: str = 'pseudo_markdown',
Bases:
Task
Object for document extraction task
- property document_type#
- class nv_ingest_client.primitives.tasks.FilterTask(
- content_type: Literal['image'] = 'image',
- min_size: int = 128,
- max_aspect_ratio: int | float = 5.0,
- min_aspect_ratio: int | float = 0.2,
- filter: bool = False,
Bases:
Task
Object for document filter task
- class nv_ingest_client.primitives.tasks.InfographicExtractionTask[source]#
Bases:
Task
Object for infographic extraction task
- class nv_ingest_client.primitives.tasks.SplitTask(
- tokenizer: str | None = None,
- chunk_size: int = 1024,
- chunk_overlap: int = 150,
- params: dict = {},
Bases:
Task
Object for document splitting task
- class nv_ingest_client.primitives.tasks.StoreEmbedTask(params: dict | None = None, **extra_params)[source]#
Bases:
Task
Object for embedding storage task.
- class nv_ingest_client.primitives.tasks.StoreTask(
- structured: bool = True,
- images: bool = False,
- store_method: Literal['minio'] | None = None,
- params: dict | None = None,
- **extra_params,
Bases:
Task
Object for image storage task.
- class nv_ingest_client.primitives.tasks.TableExtractionTask[source]#
Bases:
Task
Object for table extraction tasks
- class nv_ingest_client.primitives.tasks.TaskType(value)[source]#
Bases:
Enum
Enumeration of task types.
- CAPTION = 1#
- CHART_DATA_EXTRACT = 12#
- DEDUP = 2#
- EMBED = 3#
- EXTRACT = 4#
- FILTER = 5#
- INFOGRAPHIC_DATA_EXTRACT = 13#
- SPLIT = 6#
- STORE = 9#
- STORE_EMBEDDING = 8#
- TABLE_DATA_EXTRACT = 11#
- TRANSFORM = 7#
- VDB_UPLOAD = 10#
- nv_ingest_client.primitives.tasks.is_valid_task_type(task_type_str: str) → bool[source]#
Checks if the provided string is a valid TaskType enum value.
- Parameters:
task_type_str (str) – The string to check against the TaskType enum values.
- Returns:
True if the string is a valid TaskType enum value, False otherwise.
- Return type:
bool