nv_ingest_client.primitives.tasks package#
Submodules#
nv_ingest_client.primitives.tasks.caption module#
- class nv_ingest_client.primitives.tasks.caption.CaptionTask(
- api_key: str | None = None,
- endpoint_url: str | None = None,
- prompt: str | None = None,
- model_name: str | None = None,
Bases:
Task
- class nv_ingest_client.primitives.tasks.caption.CaptionTaskSchema(
- *,
- api_key: str | None = None,
- endpoint_url: str | None = None,
- prompt: str | None = None,
- model_name: str | None = None,
Bases:
BaseModel
- api_key: str | None#
- endpoint_url: str | None#
- model_config: ClassVar[ConfigDict] = {'extra': 'forbid', 'protected_namespaces': ()}#
Configuration for the model; should be a dictionary conforming to pydantic's ConfigDict (pydantic.config.ConfigDict).
- model_name: str | None#
- prompt: str | None#
nv_ingest_client.primitives.tasks.chart_extraction module#
nv_ingest_client.primitives.tasks.dedup module#
- class nv_ingest_client.primitives.tasks.dedup.DedupTask(
- content_type: Literal['image'] = 'image',
- filter: bool = False,
Bases:
Task
Object for document dedup task
- class nv_ingest_client.primitives.tasks.dedup.DedupTaskSchema(*, content_type: str = 'image', filter: bool = False)[source]#
Bases:
BaseModel
- content_type: str#
- filter: bool#
- model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#
Configuration for the model; should be a dictionary conforming to pydantic's ConfigDict (pydantic.config.ConfigDict).
nv_ingest_client.primitives.tasks.embed module#
- class nv_ingest_client.primitives.tasks.embed.EmbedTask(
- endpoint_url: str | None = None,
- model_name: str | None = None,
- api_key: str | None = None,
- text: bool | None = None,
- tables: bool | None = None,
- filter_errors: bool = False,
Bases:
Task
Object for document embedding task
- class nv_ingest_client.primitives.tasks.embed.EmbedTaskSchema(
- *,
- endpoint_url: str | None = None,
- model_name: str | None = None,
- api_key: str | None = None,
- filter_errors: bool = False,
Bases:
BaseModel
- api_key: str | None#
- endpoint_url: str | None#
- filter_errors: bool#
- model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#
Configuration for the model; should be a dictionary conforming to pydantic's ConfigDict (pydantic.config.ConfigDict).
- model_name: str | None#
nv_ingest_client.primitives.tasks.extract module#
- class nv_ingest_client.primitives.tasks.extract.ExtractTask(
- document_type,
- extract_method: Literal['adobe', 'nemoretriever_parse', 'haystack', 'llama_parse', 'pdfium', 'tika', 'unstructured_io'] = 'pdfium',
- extract_text: bool = False,
- extract_images: bool = False,
- extract_tables: bool = False,
- extract_charts: bool | None = None,
- extract_audio_params: Dict[str, Any] | None = None,
- extract_images_method: Literal['simple', 'group'] = 'group',
- extract_images_params: Dict[str, Any] | None = None,
- extract_tables_method: Literal['yolox', 'pdfium', 'nemoretriever_parse'] = 'yolox',
- extract_infographics: bool = False,
- text_depth: str = 'document',
- paddle_output_format: str = 'pseudo_markdown',
Bases:
Task
Object for document extraction task
- property document_type#
- class nv_ingest_client.primitives.tasks.extract.ExtractTaskSchema(
- *,
- document_type: str,
- extract_method: str = None,
- extract_text: bool = True,
- extract_images: bool = True,
- extract_images_method: str = 'group',
- extract_images_params: Dict[str, Any] | None = None,
- extract_tables: bool = True,
- extract_tables_method: str = 'yolox',
- extract_charts: bool | None = None,
- extract_infographics: bool = False,
- text_depth: str = 'document',
- paddle_output_format: str = 'pseudo_markdown',
Bases:
BaseModel
- document_type: str#
- extract_charts: bool | None#
- extract_images: bool#
- extract_images_method: str#
- extract_images_params: Dict[str, Any] | None#
- extract_infographics: bool#
- extract_method: str#
- extract_tables: bool#
- extract_tables_method: str#
- extract_text: bool#
- model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#
Configuration for the model; should be a dictionary conforming to pydantic's ConfigDict (pydantic.config.ConfigDict).
- paddle_output_format: str#
- text_depth: str#
nv_ingest_client.primitives.tasks.filter module#
- class nv_ingest_client.primitives.tasks.filter.FilterTask(
- content_type: Literal['image'] = 'image',
- min_size: int = 128,
- max_aspect_ratio: int | float = 5.0,
- min_aspect_ratio: int | float = 0.2,
- filter: bool = False,
Bases:
Task
Object for document filter task
- class nv_ingest_client.primitives.tasks.filter.FilterTaskSchema(
- *,
- content_type: str = 'image',
- min_size: int = 128,
- max_aspect_ratio: float | int = 5.0,
- min_aspect_ratio: float | int = 0.2,
- filter: bool = False,
Bases:
BaseModel
- content_type: str#
- filter: bool#
- max_aspect_ratio: float | int#
- min_aspect_ratio: float | int#
- min_size: int#
- model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#
Configuration for the model; should be a dictionary conforming to pydantic's ConfigDict (pydantic.config.ConfigDict).
nv_ingest_client.primitives.tasks.infographic_extraction module#
nv_ingest_client.primitives.tasks.split module#
- class nv_ingest_client.primitives.tasks.split.SplitTask(
- tokenizer: str | None = None,
- chunk_size: int = 1024,
- chunk_overlap: int = 150,
- params: dict = {},
Bases:
Task
Object for document splitting task
- class nv_ingest_client.primitives.tasks.split.SplitTaskSchema(
- *,
- tokenizer: str | None = None,
- chunk_size: int = 1024,
- chunk_overlap: int = 150,
- params: dict = {},
Bases:
BaseModel
- chunk_overlap: int#
- chunk_size: int#
- model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#
Configuration for the model; should be a dictionary conforming to pydantic's ConfigDict (pydantic.config.ConfigDict).
- params: dict#
- tokenizer: str | None#
nv_ingest_client.primitives.tasks.store module#
- class nv_ingest_client.primitives.tasks.store.StoreEmbedTask(params: dict | None = None, **extra_params)[source]#
Bases:
Task
Object for embedding storage task.
- class nv_ingest_client.primitives.tasks.store.StoreEmbedTaskSchema(**extra_data: Any)[source]#
Bases:
BaseModel
- model_config: ClassVar[ConfigDict] = {'extra': 'allow'}#
Configuration for the model; should be a dictionary conforming to pydantic's ConfigDict (pydantic.config.ConfigDict).
- class nv_ingest_client.primitives.tasks.store.StoreTask(
- structured: bool = True,
- images: bool = False,
- store_method: Literal['minio'] | None = None,
- params: dict | None = None,
- **extra_params,
Bases:
Task
Object for image storage task.
- class nv_ingest_client.primitives.tasks.store.StoreTaskSchema(
- *,
- store_method: str = None,
- **extra_data: Any,
Bases:
BaseModel
- model_config: ClassVar[ConfigDict] = {'extra': 'allow'}#
Configuration for the model; should be a dictionary conforming to pydantic's ConfigDict (pydantic.config.ConfigDict).
- store_method: str#
nv_ingest_client.primitives.tasks.table_extraction module#
nv_ingest_client.primitives.tasks.task_base module#
- class nv_ingest_client.primitives.tasks.task_base.TaskType(value)[source]#
Bases:
Enum
An enumeration.
- CAPTION = 1#
- CHART_DATA_EXTRACT = 12#
- DEDUP = 2#
- EMBED = 3#
- EXTRACT = 4#
- FILTER = 5#
- INFOGRAPHIC_DATA_EXTRACT = 13#
- SPLIT = 6#
- STORE = 9#
- STORE_EMBEDDING = 8#
- TABLE_DATA_EXTRACT = 11#
- TRANSFORM = 7#
- VDB_UPLOAD = 10#
- nv_ingest_client.primitives.tasks.task_base.is_valid_task_type(task_type_str: str) → bool [source]#
Checks if the provided string is a valid TaskType enum value.
- Parameters:
task_type_str (str) – The string to check against the TaskType enum values.
- Returns:
True if the string is a valid TaskType enum value, False otherwise.
- Return type:
bool
nv_ingest_client.primitives.tasks.task_factory module#
- class nv_ingest_client.primitives.tasks.task_factory.TaskUnimplemented(**kwargs)[source]#
Bases:
Task
Placeholder for unimplemented tasks
nv_ingest_client.primitives.tasks.transform module#
nv_ingest_client.primitives.tasks.vdb_upload module#
- class nv_ingest_client.primitives.tasks.vdb_upload.VdbUploadTask(
- filter_errors: bool = False,
- bulk_ingest: bool = False,
- bulk_ingest_path: str = 'embeddings/',
- params: dict | None = None,
Bases:
Task
Object for vector database (VDB) upload task
- class nv_ingest_client.primitives.tasks.vdb_upload.VdbUploadTaskSchema(
- *,
- filter_errors: bool = False,
- bulk_ingest: bool = False,
- bulk_ingest_path: str = '',
- params: dict = None,
Bases:
BaseModel
- bulk_ingest: bool#
- bulk_ingest_path: str#
- filter_errors: bool#
- model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#
Configuration for the model; should be a dictionary conforming to pydantic's ConfigDict (pydantic.config.ConfigDict).
- params: dict#
Module contents#
- class nv_ingest_client.primitives.tasks.AudioExtractionTask(
- auth_token: str | None = None,
- grpc_endpoint: str | None = None,
- infer_protocol: str | None = None,
- function_id: str | None = None,
- use_ssl: bool | None = None,
- ssl_cert: str | None = None,
Bases:
Task
- class nv_ingest_client.primitives.tasks.CaptionTask(
- api_key: str | None = None,
- endpoint_url: str | None = None,
- prompt: str | None = None,
- model_name: str | None = None,
Bases:
Task
- class nv_ingest_client.primitives.tasks.ChartExtractionTask[source]#
Bases:
Task
Object for chart extraction task
- class nv_ingest_client.primitives.tasks.DedupTask(
- content_type: Literal['image'] = 'image',
- filter: bool = False,
Bases:
Task
Object for document dedup task
- class nv_ingest_client.primitives.tasks.EmbedTask(
- endpoint_url: str | None = None,
- model_name: str | None = None,
- api_key: str | None = None,
- text: bool | None = None,
- tables: bool | None = None,
- filter_errors: bool = False,
Bases:
Task
Object for document embedding task
- class nv_ingest_client.primitives.tasks.ExtractTask(
- document_type,
- extract_method: Literal['adobe', 'nemoretriever_parse', 'haystack', 'llama_parse', 'pdfium', 'tika', 'unstructured_io'] = 'pdfium',
- extract_text: bool = False,
- extract_images: bool = False,
- extract_tables: bool = False,
- extract_charts: bool | None = None,
- extract_audio_params: Dict[str, Any] | None = None,
- extract_images_method: Literal['simple', 'group'] = 'group',
- extract_images_params: Dict[str, Any] | None = None,
- extract_tables_method: Literal['yolox', 'pdfium', 'nemoretriever_parse'] = 'yolox',
- extract_infographics: bool = False,
- text_depth: str = 'document',
- paddle_output_format: str = 'pseudo_markdown',
Bases:
Task
Object for document extraction task
- property document_type#
- class nv_ingest_client.primitives.tasks.FilterTask(
- content_type: Literal['image'] = 'image',
- min_size: int = 128,
- max_aspect_ratio: int | float = 5.0,
- min_aspect_ratio: int | float = 0.2,
- filter: bool = False,
Bases:
Task
Object for document filter task
- class nv_ingest_client.primitives.tasks.InfographicExtractionTask[source]#
Bases:
Task
Object for infographic extraction task
- class nv_ingest_client.primitives.tasks.SplitTask(
- tokenizer: str | None = None,
- chunk_size: int = 1024,
- chunk_overlap: int = 150,
- params: dict = {},
Bases:
Task
Object for document splitting task
- class nv_ingest_client.primitives.tasks.StoreEmbedTask(params: dict | None = None, **extra_params)[source]#
Bases:
Task
Object for embedding storage task.
- class nv_ingest_client.primitives.tasks.StoreTask(
- structured: bool = True,
- images: bool = False,
- store_method: Literal['minio'] | None = None,
- params: dict | None = None,
- **extra_params,
Bases:
Task
Object for image storage task.
- class nv_ingest_client.primitives.tasks.TableExtractionTask[source]#
Bases:
Task
Object for table extraction tasks
- class nv_ingest_client.primitives.tasks.TaskType(value)[source]#
Bases:
Enum
An enumeration.
- CAPTION = 1#
- CHART_DATA_EXTRACT = 12#
- DEDUP = 2#
- EMBED = 3#
- EXTRACT = 4#
- FILTER = 5#
- INFOGRAPHIC_DATA_EXTRACT = 13#
- SPLIT = 6#
- STORE = 9#
- STORE_EMBEDDING = 8#
- TABLE_DATA_EXTRACT = 11#
- TRANSFORM = 7#
- VDB_UPLOAD = 10#
- nv_ingest_client.primitives.tasks.is_valid_task_type(task_type_str: str) → bool [source]#
Checks if the provided string is a valid TaskType enum value.
- Parameters:
task_type_str (str) – The string to check against the TaskType enum values.
- Returns:
True if the string is a valid TaskType enum value, False otherwise.
- Return type:
bool