Source code for nv_ingest.schemas.text_splitter_schema

# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from pydantic import Field, BaseModel, field_validator

from typing import Optional

from typing_extensions import Annotated


class TextSplitterSchema(BaseModel):
    """Validated configuration for the text splitter.

    Field constraints enforced at construction time:

    * ``chunk_size`` must be strictly positive.
    * ``chunk_overlap`` must be non-negative and strictly smaller than
      ``chunk_size``.

    Raises
    ------
    pydantic.ValidationError
        If any constraint above is violated, including when ``chunk_overlap``
        is left at its default but ``chunk_size`` is set to a value that the
        default overlap does not fit under.
    """

    # Optional tokenizer identifier; None when not supplied.
    # NOTE(review): how None is resolved is decided by the consumer — confirm.
    tokenizer: Optional[str] = None

    # Size of each chunk; must be > 0.
    chunk_size: Annotated[int, Field(gt=0)] = 1024

    # Overlap between consecutive chunks; must be >= 0 and < chunk_size.
    # validate_default=True is required so the cross-field validator below
    # also runs when the caller relies on the default: pydantic skips
    # validators for defaults otherwise, which would let e.g. chunk_size=100
    # pass with the default overlap of 150.
    chunk_overlap: Annotated[int, Field(ge=0, validate_default=True)] = 150

    # NOTE(review): presumably tells the consumer to raise instead of
    # swallowing errors — not used inside this schema; confirm against caller.
    raise_on_failure: bool = False

    @field_validator("chunk_overlap")
    @classmethod
    def check_chunk_overlap(cls, v, values, **kwargs):
        """Reject a ``chunk_overlap`` that is not smaller than ``chunk_size``.

        Parameters
        ----------
        v : int
            The candidate ``chunk_overlap`` value.
        values : pydantic.ValidationInfo
            Validation context; ``values.data`` holds the fields validated
            so far.

        Returns
        -------
        int
            ``v`` unchanged when it is acceptable.

        Raises
        ------
        ValueError
            If ``v`` is greater than or equal to the validated ``chunk_size``.
        """
        # chunk_size is absent from values.data when it failed its own
        # validation; in that case there is nothing to compare against and
        # the chunk_size error is reported on its own.
        chunk_size = values.data.get("chunk_size")
        if v is not None and chunk_size is not None and v >= chunk_size:
            raise ValueError("chunk_overlap must be less than chunk_size")
        return v