nv_ingest_client.primitives.tasks package#

Submodules#

nv_ingest_client.primitives.tasks.caption module#

class nv_ingest_client.primitives.tasks.caption.CaptionTask( api_key: str | None = None, endpoint_url: str | None = None, prompt: str | None = None, model_name: str | None = None, )[source]#

Bases: Task

to_dict() → Dict[source]#: Convert to a dict for submission to redis

class nv_ingest_client.primitives.tasks.caption.CaptionTaskSchema( *, api_key: str | None = None, endpoint_url: str | None = None, prompt: str | None = None, model_name: str | None = None, )[source]#

Bases: BaseModel

api_key: str | None#

endpoint_url: str | None#

model_config: ClassVar[ConfigDict] = {'extra': 'forbid', 'protected_namespaces': ()}#: Configuration for the model; should be a dictionary conforming to pydantic.config.ConfigDict.

model_name: str | None#

prompt: str | None#

nv_ingest_client.primitives.tasks.chart_extraction module#

class nv_ingest_client.primitives.tasks.chart_extraction.ChartExtractionSchema[source]#

Bases: BaseModel

class Config[source]#

Bases: object

extra = 'forbid'#

model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#: Configuration for the model; should be a dictionary conforming to pydantic.config.ConfigDict.

class nv_ingest_client.primitives.tasks.chart_extraction.ChartExtractionTask[source]#

Bases: Task

Object for chart extraction task

to_dict() → Dict[source]#: Convert to a dict for submission to redis

nv_ingest_client.primitives.tasks.dedup module#

class nv_ingest_client.primitives.tasks.dedup.DedupTask( content_type: Literal['image'] = 'image', filter: bool = False, )[source]#

Bases: Task

Object for document dedup task

to_dict() → Dict[source]#: Convert to a dict for submission to redis

class nv_ingest_client.primitives.tasks.dedup.DedupTaskSchema(*, content_type: str = 'image', filter: bool = False)[source]#

Bases: BaseModel

class Config[source]#

Bases: object

extra = 'forbid'#

content_type: str#

classmethod content_type_must_be_valid(v)[source]#

filter: bool#

model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#: Configuration for the model; should be a dictionary conforming to pydantic.config.ConfigDict.

nv_ingest_client.primitives.tasks.embed module#

class nv_ingest_client.primitives.tasks.embed.EmbedTask(...)[source]#

Bases: Task

Object for document embedding tasks.

This class encapsulates the configuration and runtime state for an embedding task, including details like the endpoint URL, model name, and API key.

to_dict() → Dict[str, Any][source]#

Convert the EmbedTask configuration to a dictionary for submission.

Returns:: A dictionary containing the task type and properties, suitable for submission (e.g., to a Redis database).
Return type:: Dict[str, Any]

class nv_ingest_client.primitives.tasks.embed.EmbedTaskSchema( *, endpoint_url: str | None = None, model_name: str | None = None, api_key: str | None = None, filter_errors: bool = False, )[source]#

Bases: BaseModel

Schema for embed task configuration.

This schema contains configuration details for an embedding task, including the endpoint URL, model name, API key, and error filtering flag.

endpoint_url#

URL of the embedding endpoint. Default is None.

Type:: Optional[str]

model_name#

Name of the embedding model. Default is None.

Type:: Optional[str]

api_key#

API key for authentication with the embedding service. Default is None.

Type:: Optional[str]

filter_errors#

Flag to indicate whether errors should be filtered. Default is False.

Type:: bool

api_key: str | None#

endpoint_url: str | None#

filter_errors: bool#

classmethod handle_deprecated_fields( values: Dict[str, Any], ) → Dict[str, Any][source]#

Handle deprecated fields before model validation.

This validator checks for the presence of deprecated keys (‘text’ and ‘tables’) in the input dictionary and removes them. Warnings are issued if these keys are found.

Parameters:: values (Dict[str, Any]) – Input dictionary of model values.
Returns:: The updated dictionary with deprecated fields removed.
Return type:: Dict[str, Any]

model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#: Configuration for the model; should be a dictionary conforming to pydantic.config.ConfigDict.

model_name: str | None#

nv_ingest_client.primitives.tasks.extract module#

class nv_ingest_client.primitives.tasks.extract.ExtractTask( document_type, extract_method: Literal['adobe', 'nemoretriever_parse', 'haystack', 'llama_parse', 'pdfium', 'tika', 'unstructured_io'] = 'pdfium', extract_text: bool = False, extract_images: bool = False, extract_tables: bool = False, extract_charts: bool | None = None, extract_audio_params: Dict[str, Any] | None = None, extract_images_method: Literal['simple', 'group'] = 'group', extract_images_params: Dict[str, Any] | None = None, extract_tables_method: Literal['yolox', 'pdfium', 'nemoretriever_parse'] = 'yolox', extract_infographics: bool = False, text_depth: str = 'document', paddle_output_format: str = 'pseudo_markdown', )[source]#

Bases: Task

Object for document extraction task

property document_type#

to_dict() → Dict[source]#: Convert to a dict for submission to redis (fixme)

class nv_ingest_client.primitives.tasks.extract.ExtractTaskSchema( *, document_type: str, extract_method: str = None, extract_text: bool = True, extract_images: bool = True, extract_images_method: str = 'group', extract_images_params: Dict[str, Any] | None = None, extract_tables: bool = True, extract_tables_method: str = 'yolox', extract_charts: bool | None = None, extract_infographics: bool = False, extract_audio_params: Dict[str, Any] | None = None, text_depth: str = 'document', paddle_output_format: str = 'pseudo_markdown', )[source]#

Bases: BaseModel

document_type: str#

classmethod document_type_must_be_supported(v)[source]#

extract_audio_params: Dict[str, Any] | None#

extract_charts: bool | None#

extract_images: bool#

extract_images_method: str#

classmethod extract_images_method_must_be_valid(v)[source]#

extract_images_params: Dict[str, Any] | None#

extract_infographics: bool#

extract_method: str#

classmethod extract_method_must_be_valid(v, values, **kwargs)[source]#

extract_tables: bool#

extract_tables_method: str#

classmethod extract_tables_method_must_be_valid(v, values, **kwargs)[source]#

extract_text: bool#

model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#: Configuration for the model; should be a dictionary conforming to pydantic.config.ConfigDict.

paddle_output_format: str#

classmethod set_default_extract_charts(v, values)[source]#

classmethod set_default_extract_method(values)[source]#

text_depth: str#

nv_ingest_client.primitives.tasks.filter module#

class nv_ingest_client.primitives.tasks.filter.FilterTask( content_type: Literal['image'] = 'image', min_size: int = 128, max_aspect_ratio: int | float = 5.0, min_aspect_ratio: int | float = 0.2, filter: bool = False, )[source]#

Bases: Task

Object for document filter task

to_dict() → Dict[source]#: Convert to a dict for submission to redis

class nv_ingest_client.primitives.tasks.filter.FilterTaskSchema( *, content_type: str = 'image', min_size: int = 128, max_aspect_ratio: float | int = 5.0, min_aspect_ratio: float | int = 0.2, filter: bool = False, )[source]#

Bases: BaseModel

class Config[source]#

Bases: object

extra = 'forbid'#

content_type: str#

classmethod content_type_must_be_valid(v)[source]#

filter: bool#

max_aspect_ratio: float | int#

min_aspect_ratio: float | int#

min_size: int#

model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#: Configuration for the model; should be a dictionary conforming to pydantic.config.ConfigDict.

nv_ingest_client.primitives.tasks.infographic_extraction module#

class nv_ingest_client.primitives.tasks.infographic_extraction.InfographicExtractionSchema[source]#

Bases: BaseModel

class Config[source]#

Bases: object

extra = 'forbid'#

model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#: Configuration for the model; should be a dictionary conforming to pydantic.config.ConfigDict.

class nv_ingest_client.primitives.tasks.infographic_extraction.InfographicExtractionTask[source]#

Bases: Task

Object for infographic extraction task

to_dict() → Dict[source]#: Convert to a dict for submission to redis

nv_ingest_client.primitives.tasks.split module#

class nv_ingest_client.primitives.tasks.split.SplitTask( tokenizer: str | None = None, chunk_size: int = 1024, chunk_overlap: int = 150, params: dict = {}, )[source]#

Bases: Task

Object for document splitting task

to_dict() → Dict[source]#: Convert to a dict for submission to redis

class nv_ingest_client.primitives.tasks.split.SplitTaskSchema( *, tokenizer: str | None = None, chunk_size: int = 1024, chunk_overlap: int = 150, params: dict = {}, )[source]#

Bases: BaseModel

class Config[source]#

Bases: object

extra = 'forbid'#

chunk_overlap: int#

chunk_size: int#

model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#: Configuration for the model; should be a dictionary conforming to pydantic.config.ConfigDict.

params: dict#

tokenizer: str | None#

nv_ingest_client.primitives.tasks.store module#

class nv_ingest_client.primitives.tasks.store.StoreEmbedTask(params: dict | None = None, **extra_params)[source]#

Bases: Task

Object for embedding storage task.

to_dict() → Dict[source]#: Convert to a dict for submission to redis (fixme)

class nv_ingest_client.primitives.tasks.store.StoreEmbedTaskSchema(**extra_data: Any)[source]#

Bases: BaseModel

class Config[source]#

Bases: object

extra = 'allow'#

model_config: ClassVar[ConfigDict] = {'extra': 'allow'}#: Configuration for the model; should be a dictionary conforming to pydantic.config.ConfigDict.

class nv_ingest_client.primitives.tasks.store.StoreTask( structured: bool = True, images: bool = False, store_method: Literal['minio'] | None = None, params: dict | None = None, **extra_params, )[source]#

Bases: Task

Object for image storage task.

to_dict() → Dict[source]#: Convert to a dict for submission to redis (fixme)

class nv_ingest_client.primitives.tasks.store.StoreTaskSchema(*, store_method: str = None, **extra_data: Any)[source]#

Bases: BaseModel

class Config[source]#

Bases: object

extra = 'allow'#

model_config: ClassVar[ConfigDict] = {'extra': 'allow'}#: Configuration for the model; should be a dictionary conforming to pydantic.config.ConfigDict.

classmethod set_default_store_method(values)[source]#

store_method: str#

nv_ingest_client.primitives.tasks.table_extraction module#

class nv_ingest_client.primitives.tasks.table_extraction.TableExtractionSchema[source]#

Bases: BaseModel

class Config[source]#

Bases: object

extra = 'forbid'#

model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#: Configuration for the model; should be a dictionary conforming to pydantic.config.ConfigDict.

class nv_ingest_client.primitives.tasks.table_extraction.TableExtractionTask[source]#

Bases: Task

Object for table extraction tasks

to_dict() → Dict[source]#: Convert to a dict for submission to redis

nv_ingest_client.primitives.tasks.task_base module#

class nv_ingest_client.primitives.tasks.task_base.Task[source]#

Bases: object

Generic task Object

to_dict() → Dict[source]#: Returns a dict with the task specification. This dict is used for constructing tasks that are then submitted to the redis client

class nv_ingest_client.primitives.tasks.task_base.TaskType(value)[source]#

Bases: Enum

An enumeration of the available task types.

CAPTION = 1#

CHART_DATA_EXTRACT = 12#

DEDUP = 2#

EMBED = 3#

EXTRACT = 4#

FILTER = 5#

INFOGRAPHIC_DATA_EXTRACT = 13#

SPLIT = 6#

STORE = 9#

STORE_EMBEDDING = 8#

TABLE_DATA_EXTRACT = 11#

TRANSFORM = 7#

VDB_UPLOAD = 10#

nv_ingest_client.primitives.tasks.task_base.is_valid_task_type(task_type_str: str) → bool[source]#

Checks if the provided string is a valid TaskType enum value.

Parameters:: task_type_str (str) – The string to check against the TaskType enum values.
Returns:: True if the string is a valid TaskType enum value, False otherwise.
Return type:: bool

nv_ingest_client.primitives.tasks.task_factory module#

class nv_ingest_client.primitives.tasks.task_factory.TaskUnimplemented(**kwargs)[source]#

Bases: Task

Placeholder for unimplemented tasks

nv_ingest_client.primitives.tasks.task_factory.task_factory(task_type: TaskType | str, **kwargs) → Task[source]#

Factory method for creating tasks based on the provided task type.

Parameters:

task_type (TaskType | str) – The type of the task to create, given either as a TaskType member or as a string.
**kwargs (dict) – Additional keyword arguments to pass to the task’s constructor.

Returns:

An instance of the task corresponding to the given task type.

Return type:

Task

Raises:

ValueError – If an invalid task type is provided.

nv_ingest_client.primitives.tasks.transform module#

nv_ingest_client.primitives.tasks.vdb_upload module#

class nv_ingest_client.primitives.tasks.vdb_upload.VdbUploadTask( filter_errors: bool = False, bulk_ingest: bool = False, bulk_ingest_path: str = 'embeddings/', params: dict | None = None, )[source]#

Bases: Task

Object for vector database (VDB) upload task

to_dict() → Dict[source]#: Convert to a dict for submission to redis

class nv_ingest_client.primitives.tasks.vdb_upload.VdbUploadTaskSchema( *, filter_errors: bool = False, bulk_ingest: bool = False, bulk_ingest_path: str = '', params: dict = None, )[source]#

Bases: BaseModel

class Config[source]#

Bases: object

extra = 'forbid'#

bulk_ingest: bool#

bulk_ingest_path: str#

filter_errors: bool#

model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#: Configuration for the model; should be a dictionary conforming to pydantic.config.ConfigDict.

params: dict#

Module contents#

Bases: Task

to_dict() → Dict[source]#: Convert to a dict for submission to redis

class nv_ingest_client.primitives.tasks.CaptionTask( api_key: str | None = None, endpoint_url: str | None = None, prompt: str | None = None, model_name: str | None = None, )[source]#

Bases: Task

to_dict() → Dict[source]#: Convert to a dict for submission to redis

class nv_ingest_client.primitives.tasks.ChartExtractionTask[source]#

Bases: Task

Object for chart extraction task

to_dict() → Dict[source]#: Convert to a dict for submission to redis

class nv_ingest_client.primitives.tasks.DedupTask( content_type: Literal['image'] = 'image', filter: bool = False, )[source]#

Bases: Task

Object for document dedup task

to_dict() → Dict[source]#: Convert to a dict for submission to redis

class nv_ingest_client.primitives.tasks.EmbedTask(...)[source]#

Bases: Task

Object for document embedding tasks.

This class encapsulates the configuration and runtime state for an embedding task, including details like the endpoint URL, model name, and API key.

to_dict() → Dict[str, Any][source]#

Convert the EmbedTask configuration to a dictionary for submission.

Returns:: A dictionary containing the task type and properties, suitable for submission (e.g., to a Redis database).
Return type:: Dict[str, Any]

class nv_ingest_client.primitives.tasks.ExtractTask( document_type, extract_method: Literal['adobe', 'nemoretriever_parse', 'haystack', 'llama_parse', 'pdfium', 'tika', 'unstructured_io'] = 'pdfium', extract_text: bool = False, extract_images: bool = False, extract_tables: bool = False, extract_charts: bool | None = None, extract_audio_params: Dict[str, Any] | None = None, extract_images_method: Literal['simple', 'group'] = 'group', extract_images_params: Dict[str, Any] | None = None, extract_tables_method: Literal['yolox', 'pdfium', 'nemoretriever_parse'] = 'yolox', extract_infographics: bool = False, text_depth: str = 'document', paddle_output_format: str = 'pseudo_markdown', )[source]#

Bases: Task

Object for document extraction task

property document_type#

to_dict() → Dict[source]#: Convert to a dict for submission to redis (fixme)

class nv_ingest_client.primitives.tasks.FilterTask( content_type: Literal['image'] = 'image', min_size: int = 128, max_aspect_ratio: int | float = 5.0, min_aspect_ratio: int | float = 0.2, filter: bool = False, )[source]#

Bases: Task

Object for document filter task

to_dict() → Dict[source]#: Convert to a dict for submission to redis

class nv_ingest_client.primitives.tasks.InfographicExtractionTask[source]#

Bases: Task

Object for infographic extraction task

to_dict() → Dict[source]#: Convert to a dict for submission to redis

class nv_ingest_client.primitives.tasks.SplitTask( tokenizer: str | None = None, chunk_size: int = 1024, chunk_overlap: int = 150, params: dict = {}, )[source]#

Bases: Task

Object for document splitting task

to_dict() → Dict[source]#: Convert to a dict for submission to redis

class nv_ingest_client.primitives.tasks.StoreEmbedTask(params: dict | None = None, **extra_params)[source]#

Bases: Task

Object for embedding storage task.

to_dict() → Dict[source]#: Convert to a dict for submission to redis (fixme)

class nv_ingest_client.primitives.tasks.StoreTask( structured: bool = True, images: bool = False, store_method: Literal['minio'] | None = None, params: dict | None = None, **extra_params, )[source]#

Bases: Task

Object for image storage task.

to_dict() → Dict[source]#: Convert to a dict for submission to redis (fixme)

class nv_ingest_client.primitives.tasks.TableExtractionTask[source]#

Bases: Task

Object for table extraction tasks

to_dict() → Dict[source]#: Convert to a dict for submission to redis

class nv_ingest_client.primitives.tasks.Task[source]#

Bases: object

Generic task Object

to_dict() → Dict[source]#: Returns a dict with the task specification. This dict is used for constructing tasks that are then submitted to the redis client

class nv_ingest_client.primitives.tasks.TaskType(value)[source]#

Bases: Enum

An enumeration of the available task types.

CAPTION = 1#

CHART_DATA_EXTRACT = 12#

DEDUP = 2#

EMBED = 3#

EXTRACT = 4#

FILTER = 5#

INFOGRAPHIC_DATA_EXTRACT = 13#

SPLIT = 6#

STORE = 9#

STORE_EMBEDDING = 8#

TABLE_DATA_EXTRACT = 11#

TRANSFORM = 7#

VDB_UPLOAD = 10#

nv_ingest_client.primitives.tasks.is_valid_task_type(task_type_str: str) → bool[source]#

Checks if the provided string is a valid TaskType enum value.

Parameters:: task_type_str (str) – The string to check against the TaskType enum values.
Returns:: True if the string is a valid TaskType enum value, False otherwise.
Return type:: bool

nv_ingest_client.primitives.tasks.task_factory(task_type: TaskType | str, **kwargs) → Task[source]#

Factory method for creating tasks based on the provided task type.

Parameters:

task_type (TaskType | str) – The type of the task to create, given either as a TaskType member or as a string.
**kwargs (dict) – Additional keyword arguments to pass to the task’s constructor.

Returns:

An instance of the task corresponding to the given task type.

Return type:

Task

Raises:

ValueError – If an invalid task type is provided.