Source code for nv_ingest_client.primitives.tasks.split
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# pylint: disable=too-few-public-methods
# pylint: disable=too-many-arguments
import logging
from typing import Dict
from typing import Optional
from pydantic import BaseModel
from .task_base import Task
logger = logging.getLogger(__name__)
[docs]
class SplitTaskSchema(BaseModel):
    # Validation schema for the options of a split task.
    #
    # Documented with comments rather than a class docstring on purpose:
    # pydantic exposes a model's docstring as its JSON-schema description,
    # so adding one would alter the generated schema.

    # Tokenizer to use; unset by default.
    # NOTE(review): presumably a tokenizer name/identifier -- confirm with the
    # service-side schema.
    tokenizer: Optional[str] = None

    # NOTE(review): units (tokens vs. characters) are not visible here -- confirm.
    chunk_size: int = 1024
    chunk_overlap: int = 150

    # Free-form extra parameters. A literal ``{}`` default is safe on a
    # pydantic model because field defaults are copied per instance.
    params: dict = {}

    class Config:
        # Reject any field that is not declared above.
        extra = "forbid"
[docs]
class SplitTask(Task):
    """
    Object for document splitting task

    Holds the split configuration (tokenizer, chunk size, chunk overlap and
    free-form extra params) and serializes it with :meth:`to_dict`.
    """

    def __init__(
        self,
        tokenizer: Optional[str] = None,
        chunk_size: int = 1024,
        chunk_overlap: int = 150,
        params: Optional[dict] = None,
    ) -> None:
        """
        Setup Split Task Config

        Parameters
        ----------
        tokenizer : Optional[str]
            Tokenizer to use; omitted from the serialized task when None.
        chunk_size : int
            Chunk size setting (default 1024).
        chunk_overlap : int
            Chunk overlap setting (default 150).
        params : Optional[dict]
            Extra key/value parameters. ``None`` (the default) means an
            empty dict. A dict passed by the caller is stored as-is (not
            copied).
        """
        super().__init__()
        self._tokenizer = tokenizer
        self._chunk_size = chunk_size
        self._chunk_overlap = chunk_overlap
        # Build a fresh dict per instance. A literal ``{}`` default in the
        # signature would be created once and shared by every SplitTask
        # constructed without ``params``.
        self._params = {} if params is None else params

    def __str__(self) -> str:
        """
        Returns a string with the object's config and run time state
        """
        parts = [
            "Split Task:\n",
            f" tokenizer: {self._tokenizer}\n",
            f" chunk_size: {self._chunk_size}\n",
            f" chunk_overlap: {self._chunk_overlap}\n",
        ]
        # ``or {}`` keeps printing safe even if ``_params`` was reset to None
        # after construction.
        parts.extend(f" {key}: {value}\n" for key, value in (self._params or {}).items())
        return "".join(parts)

    def to_dict(self) -> Dict:
        """
        Convert to a dict for submission to redis

        Returns
        -------
        Dict
            ``{"type": "split", "task_properties": {...}}`` where
            ``task_properties`` contains only the settings that are not None.
        """
        candidates = {
            "tokenizer": self._tokenizer,
            "chunk_size": self._chunk_size,
            "chunk_overlap": self._chunk_overlap,
            "params": self._params,
        }
        # Drop unset (None) entries; insertion order of the rest is preserved.
        split_params = {key: value for key, value in candidates.items() if value is not None}
        return {"type": "split", "task_properties": split_params}