nv_ingest_client.primitives.tasks package#

Submodules#

nv_ingest_client.primitives.tasks.caption module#

class nv_ingest_client.primitives.tasks.caption.CaptionTask(
api_key: str | None = None,
endpoint_url: str | None = None,
prompt: str | None = None,
model_name: str | None = None,
)[source]#

Bases: Task

to_dict() Dict[source]#

Convert to a dict for submission to redis

class nv_ingest_client.primitives.tasks.caption.CaptionTaskSchema(
*,
api_key: str | None = None,
endpoint_url: str | None = None,
prompt: str | None = None,
model_name: str | None = None,
)[source]#

Bases: BaseModel

api_key: str | None#
endpoint_url: str | None#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid', 'protected_namespaces': ()}#

Configuration for the model; should be a dictionary conforming to pydantic.config.ConfigDict.

model_name: str | None#
prompt: str | None#

nv_ingest_client.primitives.tasks.chart_extraction module#

class nv_ingest_client.primitives.tasks.chart_extraction.ChartExtractionSchema[source]#

Bases: BaseModel

class Config[source]#

Bases: object

extra = 'forbid'#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model; should be a dictionary conforming to pydantic.config.ConfigDict.

class nv_ingest_client.primitives.tasks.chart_extraction.ChartExtractionTask[source]#

Bases: Task

Object for chart extraction task

to_dict() Dict[source]#

Convert to a dict for submission to redis

nv_ingest_client.primitives.tasks.dedup module#

class nv_ingest_client.primitives.tasks.dedup.DedupTask(
content_type: Literal['image'] = 'image',
filter: bool = False,
)[source]#

Bases: Task

Object for document dedup task

to_dict() Dict[source]#

Convert to a dict for submission to redis

class nv_ingest_client.primitives.tasks.dedup.DedupTaskSchema(*, content_type: str = 'image', filter: bool = False)[source]#

Bases: BaseModel

class Config[source]#

Bases: object

extra = 'forbid'#
content_type: str#
classmethod content_type_must_be_valid(v)[source]#
filter: bool#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model; should be a dictionary conforming to pydantic.config.ConfigDict.

nv_ingest_client.primitives.tasks.embed module#

class nv_ingest_client.primitives.tasks.embed.EmbedTask(
endpoint_url: str | None = None,
model_name: str | None = None,
api_key: str | None = None,
text: bool | None = None,
tables: bool | None = None,
filter_errors: bool = False,
)[source]#

Bases: Task

Object for document embedding task

to_dict() Dict[source]#

Convert to a dict for submission to redis

class nv_ingest_client.primitives.tasks.embed.EmbedTaskSchema(
*,
endpoint_url: str | None = None,
model_name: str | None = None,
api_key: str | None = None,
filter_errors: bool = False,
)[source]#

Bases: BaseModel

class Config[source]#

Bases: object

extra = 'forbid'#
api_key: str | None#
endpoint_url: str | None#
filter_errors: bool#
classmethod handle_deprecated_fields(values)[source]#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model; should be a dictionary conforming to pydantic.config.ConfigDict.

model_name: str | None#

nv_ingest_client.primitives.tasks.extract module#

class nv_ingest_client.primitives.tasks.extract.ExtractTask(
document_type,
extract_method: Literal['adobe', 'nemoretriever_parse', 'haystack', 'llama_parse', 'pdfium', 'tika', 'unstructured_io'] = 'pdfium',
extract_text: bool = False,
extract_images: bool = False,
extract_tables: bool = False,
extract_charts: bool | None = None,
extract_audio_params: Dict[str, Any] | None = None,
extract_images_method: Literal['simple', 'group'] = 'group',
extract_images_params: Dict[str, Any] | None = None,
extract_tables_method: Literal['yolox', 'pdfium', 'nemoretriever_parse'] = 'yolox',
extract_infographics: bool = False,
text_depth: str = 'document',
paddle_output_format: str = 'pseudo_markdown',
)[source]#

Bases: Task

Object for document extraction task

property document_type#
to_dict() Dict[source]#

Convert to a dict for submission to redis

class nv_ingest_client.primitives.tasks.extract.ExtractTaskSchema(
*,
document_type: str,
extract_method: str = None,
extract_text: bool = True,
extract_images: bool = True,
extract_images_method: str = 'group',
extract_images_params: Dict[str, Any] | None = None,
extract_tables: bool = True,
extract_tables_method: str = 'yolox',
extract_charts: bool | None = None,
extract_infographics: bool = False,
text_depth: str = 'document',
paddle_output_format: str = 'pseudo_markdown',
)[source]#

Bases: BaseModel

document_type: str#
classmethod document_type_must_be_supported(v)[source]#
extract_charts: bool | None#
extract_images: bool#
extract_images_method: str#
classmethod extract_images_method_must_be_valid(v)[source]#
extract_images_params: Dict[str, Any] | None#
extract_infographics: bool#
extract_method: str#
classmethod extract_method_must_be_valid(v, values, **kwargs)[source]#
extract_tables: bool#
extract_tables_method: str#
classmethod extract_tables_method_must_be_valid(
v,
values,
**kwargs,
)[source]#
extract_text: bool#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model; should be a dictionary conforming to pydantic.config.ConfigDict.

paddle_output_format: str#
classmethod set_default_extract_charts(v, values)[source]#
classmethod set_default_extract_method(values)[source]#
text_depth: str#

nv_ingest_client.primitives.tasks.filter module#

class nv_ingest_client.primitives.tasks.filter.FilterTask(
content_type: Literal['image'] = 'image',
min_size: int = 128,
max_aspect_ratio: int | float = 5.0,
min_aspect_ratio: int | float = 0.2,
filter: bool = False,
)[source]#

Bases: Task

Object for document filter task

to_dict() Dict[source]#

Convert to a dict for submission to redis

class nv_ingest_client.primitives.tasks.filter.FilterTaskSchema(
*,
content_type: str = 'image',
min_size: int = 128,
max_aspect_ratio: float | int = 5.0,
min_aspect_ratio: float | int = 0.2,
filter: bool = False,
)[source]#

Bases: BaseModel

class Config[source]#

Bases: object

extra = 'forbid'#
content_type: str#
classmethod content_type_must_be_valid(v)[source]#
filter: bool#
max_aspect_ratio: float | int#
min_aspect_ratio: float | int#
min_size: int#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model; should be a dictionary conforming to pydantic.config.ConfigDict.

nv_ingest_client.primitives.tasks.infographic_extraction module#

class nv_ingest_client.primitives.tasks.infographic_extraction.InfographicExtractionSchema[source]#

Bases: BaseModel

class Config[source]#

Bases: object

extra = 'forbid'#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model; should be a dictionary conforming to pydantic.config.ConfigDict.

class nv_ingest_client.primitives.tasks.infographic_extraction.InfographicExtractionTask[source]#

Bases: Task

Object for infographic extraction task

to_dict() Dict[source]#

Convert to a dict for submission to redis

nv_ingest_client.primitives.tasks.split module#

class nv_ingest_client.primitives.tasks.split.SplitTask(
tokenizer: str | None = None,
chunk_size: int = 1024,
chunk_overlap: int = 150,
params: dict = {},
)[source]#

Bases: Task

Object for document splitting task

to_dict() Dict[source]#

Convert to a dict for submission to redis

class nv_ingest_client.primitives.tasks.split.SplitTaskSchema(
*,
tokenizer: str | None = None,
chunk_size: int = 1024,
chunk_overlap: int = 150,
params: dict = {},
)[source]#

Bases: BaseModel

class Config[source]#

Bases: object

extra = 'forbid'#
chunk_overlap: int#
chunk_size: int#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model; should be a dictionary conforming to pydantic.config.ConfigDict.

params: dict#
tokenizer: str | None#

nv_ingest_client.primitives.tasks.store module#

class nv_ingest_client.primitives.tasks.store.StoreEmbedTask(params: dict | None = None, **extra_params)[source]#

Bases: Task

Object for embedding storage task.

to_dict() Dict[source]#

Convert to a dict for submission to redis

class nv_ingest_client.primitives.tasks.store.StoreEmbedTaskSchema(**extra_data: Any)[source]#

Bases: BaseModel

class Config[source]#

Bases: object

extra = 'allow'#
model_config: ClassVar[ConfigDict] = {'extra': 'allow'}#

Configuration for the model; should be a dictionary conforming to pydantic.config.ConfigDict.

class nv_ingest_client.primitives.tasks.store.StoreTask(
structured: bool = True,
images: bool = False,
store_method: Literal['minio'] | None = None,
params: dict | None = None,
**extra_params,
)[source]#

Bases: Task

Object for image storage task.

to_dict() Dict[source]#

Convert to a dict for submission to redis

class nv_ingest_client.primitives.tasks.store.StoreTaskSchema(
*,
store_method: str = None,
**extra_data: Any,
)[source]#

Bases: BaseModel

class Config[source]#

Bases: object

extra = 'allow'#
model_config: ClassVar[ConfigDict] = {'extra': 'allow'}#

Configuration for the model; should be a dictionary conforming to pydantic.config.ConfigDict.

classmethod set_default_store_method(values)[source]#
store_method: str#

nv_ingest_client.primitives.tasks.table_extraction module#

class nv_ingest_client.primitives.tasks.table_extraction.TableExtractionSchema[source]#

Bases: BaseModel

class Config[source]#

Bases: object

extra = 'forbid'#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model; should be a dictionary conforming to pydantic.config.ConfigDict.

class nv_ingest_client.primitives.tasks.table_extraction.TableExtractionTask[source]#

Bases: Task

Object for table extraction tasks

to_dict() Dict[source]#

Convert to a dict for submission to redis

nv_ingest_client.primitives.tasks.task_base module#

class nv_ingest_client.primitives.tasks.task_base.Task[source]#

Bases: object

Generic task Object

to_dict() Dict[source]#

Returns a dict with the task specification. This dict is used for constructing tasks that are then submitted to the redis client.

class nv_ingest_client.primitives.tasks.task_base.TaskType(value)[source]#

Bases: Enum

An enumeration of the supported task types.

CAPTION = 1#
CHART_DATA_EXTRACT = 12#
DEDUP = 2#
EMBED = 3#
EXTRACT = 4#
FILTER = 5#
INFOGRAPHIC_DATA_EXTRACT = 13#
SPLIT = 6#
STORE = 9#
STORE_EMBEDDING = 8#
TABLE_DATA_EXTRACT = 11#
TRANSFORM = 7#
VDB_UPLOAD = 10#
nv_ingest_client.primitives.tasks.task_base.is_valid_task_type(task_type_str: str) bool[source]#

Checks if the provided string is a valid TaskType enum value.

Parameters:

task_type_str (str) – The string to check against the TaskType enum values.

Returns:

True if the string is a valid TaskType enum value, False otherwise.

Return type:

bool

nv_ingest_client.primitives.tasks.task_factory module#

class nv_ingest_client.primitives.tasks.task_factory.TaskUnimplemented(**kwargs)[source]#

Bases: Task

Placeholder for unimplemented tasks

nv_ingest_client.primitives.tasks.task_factory.task_factory(
task_type: TaskType | str,
**kwargs,
) Task[source]#

Factory method for creating tasks based on the provided task type.

Parameters:
  • task_type (TaskType | str) – The type of the task to create.

  • **kwargs (dict) – Additional keyword arguments to pass to the task’s constructor.

Returns:

An instance of the task corresponding to the given task type.

Return type:

Task

Raises:

ValueError – If an invalid task type is provided.

nv_ingest_client.primitives.tasks.transform module#

nv_ingest_client.primitives.tasks.vdb_upload module#

class nv_ingest_client.primitives.tasks.vdb_upload.VdbUploadTask(
filter_errors: bool = False,
bulk_ingest: bool = False,
bulk_ingest_path: str = 'embeddings/',
params: dict | None = None,
)[source]#

Bases: Task

Object for vector database upload task

to_dict() Dict[source]#

Convert to a dict for submission to redis

class nv_ingest_client.primitives.tasks.vdb_upload.VdbUploadTaskSchema(
*,
filter_errors: bool = False,
bulk_ingest: bool = False,
bulk_ingest_path: str = '',
params: dict = None,
)[source]#

Bases: BaseModel

class Config[source]#

Bases: object

extra = 'forbid'#
bulk_ingest: bool#
bulk_ingest_path: str#
filter_errors: bool#
model_config: ClassVar[ConfigDict] = {'extra': 'forbid'}#

Configuration for the model; should be a dictionary conforming to pydantic.config.ConfigDict.

params: dict#

Module contents#

class nv_ingest_client.primitives.tasks.AudioExtractionTask(
auth_token: str | None = None,
grpc_endpoint: str | None = None,
infer_protocol: str | None = None,
function_id: str | None = None,
use_ssl: bool | None = None,
ssl_cert: str | None = None,
)[source]#

Bases: Task

to_dict() Dict[source]#

Convert to a dict for submission to redis

class nv_ingest_client.primitives.tasks.CaptionTask(
api_key: str | None = None,
endpoint_url: str | None = None,
prompt: str | None = None,
model_name: str | None = None,
)[source]#

Bases: Task

to_dict() Dict[source]#

Convert to a dict for submission to redis

class nv_ingest_client.primitives.tasks.ChartExtractionTask[source]#

Bases: Task

Object for chart extraction task

to_dict() Dict[source]#

Convert to a dict for submission to redis

class nv_ingest_client.primitives.tasks.DedupTask(
content_type: Literal['image'] = 'image',
filter: bool = False,
)[source]#

Bases: Task

Object for document dedup task

to_dict() Dict[source]#

Convert to a dict for submission to redis

class nv_ingest_client.primitives.tasks.EmbedTask(
endpoint_url: str | None = None,
model_name: str | None = None,
api_key: str | None = None,
text: bool | None = None,
tables: bool | None = None,
filter_errors: bool = False,
)[source]#

Bases: Task

Object for document embedding task

to_dict() Dict[source]#

Convert to a dict for submission to redis

class nv_ingest_client.primitives.tasks.ExtractTask(
document_type,
extract_method: Literal['adobe', 'nemoretriever_parse', 'haystack', 'llama_parse', 'pdfium', 'tika', 'unstructured_io'] = 'pdfium',
extract_text: bool = False,
extract_images: bool = False,
extract_tables: bool = False,
extract_charts: bool | None = None,
extract_audio_params: Dict[str, Any] | None = None,
extract_images_method: Literal['simple', 'group'] = 'group',
extract_images_params: Dict[str, Any] | None = None,
extract_tables_method: Literal['yolox', 'pdfium', 'nemoretriever_parse'] = 'yolox',
extract_infographics: bool = False,
text_depth: str = 'document',
paddle_output_format: str = 'pseudo_markdown',
)[source]#

Bases: Task

Object for document extraction task

property document_type#
to_dict() Dict[source]#

Convert to a dict for submission to redis

class nv_ingest_client.primitives.tasks.FilterTask(
content_type: Literal['image'] = 'image',
min_size: int = 128,
max_aspect_ratio: int | float = 5.0,
min_aspect_ratio: int | float = 0.2,
filter: bool = False,
)[source]#

Bases: Task

Object for document filter task

to_dict() Dict[source]#

Convert to a dict for submission to redis

class nv_ingest_client.primitives.tasks.InfographicExtractionTask[source]#

Bases: Task

Object for infographic extraction task

to_dict() Dict[source]#

Convert to a dict for submission to redis

class nv_ingest_client.primitives.tasks.SplitTask(
tokenizer: str | None = None,
chunk_size: int = 1024,
chunk_overlap: int = 150,
params: dict = {},
)[source]#

Bases: Task

Object for document splitting task

to_dict() Dict[source]#

Convert to a dict for submission to redis

class nv_ingest_client.primitives.tasks.StoreEmbedTask(params: dict | None = None, **extra_params)[source]#

Bases: Task

Object for embedding storage task.

to_dict() Dict[source]#

Convert to a dict for submission to redis

class nv_ingest_client.primitives.tasks.StoreTask(
structured: bool = True,
images: bool = False,
store_method: Literal['minio'] | None = None,
params: dict | None = None,
**extra_params,
)[source]#

Bases: Task

Object for image storage task.

to_dict() Dict[source]#

Convert to a dict for submission to redis

class nv_ingest_client.primitives.tasks.TableExtractionTask[source]#

Bases: Task

Object for table extraction tasks

to_dict() Dict[source]#

Convert to a dict for submission to redis

class nv_ingest_client.primitives.tasks.Task[source]#

Bases: object

Generic task Object

to_dict() Dict[source]#

Returns a dict with the task specification. This dict is used for constructing tasks that are then submitted to the redis client.

class nv_ingest_client.primitives.tasks.TaskType(value)[source]#

Bases: Enum

An enumeration of the supported task types.

CAPTION = 1#
CHART_DATA_EXTRACT = 12#
DEDUP = 2#
EMBED = 3#
EXTRACT = 4#
FILTER = 5#
INFOGRAPHIC_DATA_EXTRACT = 13#
SPLIT = 6#
STORE = 9#
STORE_EMBEDDING = 8#
TABLE_DATA_EXTRACT = 11#
TRANSFORM = 7#
VDB_UPLOAD = 10#
nv_ingest_client.primitives.tasks.is_valid_task_type(task_type_str: str) bool[source]#

Checks if the provided string is a valid TaskType enum value.

Parameters:

task_type_str (str) – The string to check against the TaskType enum values.

Returns:

True if the string is a valid TaskType enum value, False otherwise.

Return type:

bool

nv_ingest_client.primitives.tasks.task_factory(
task_type: TaskType | str,
**kwargs,
) Task[source]#

Factory method for creating tasks based on the provided task type.

Parameters:
  • task_type (TaskType | str) – The type of the task to create.

  • **kwargs (dict) – Additional keyword arguments to pass to the task’s constructor.

Returns:

An instance of the task corresponding to the given task type.

Return type:

Task

Raises:

ValueError – If an invalid task type is provided.