nemo_curator.stages.deduplication.id_generator

View as Markdown

Module Contents

Classes

| Name | Description |
| --- | --- |
| IdGenerator | Ray actor version of IdGenerator. |
| IdGeneratorBase | Base IdGenerator class without Ray decorator for testing and direct use. |

Functions

Data

CURATOR_DEDUP_ID_STR

CURATOR_ID_GENERATOR_ACTOR_NAME

API

class nemo_curator.stages.deduplication.id_generator.IdGenerator()

Bases: IdGeneratorBase

Ray actor version of IdGenerator.

nemo_curator.stages.deduplication.id_generator.IdGenerator.wait() -> None

Function used by create_id_generator_actor to make sure the actor is started.

class nemo_curator.stages.deduplication.id_generator.IdGeneratorBase(
start_id: int = 0,
batch_registry: dict[str, tuple[int, int]] | None = None
)

Base IdGenerator class without Ray decorator for testing and direct use.

batch_registry = batch_registry or {}
nemo_curator.stages.deduplication.id_generator.IdGeneratorBase.from_disk(
filepath: str,
storage_options: dict[str, typing.Any] | None = None
) -> nemo_curator.stages.deduplication.id_generator.IdGeneratorBase
(from_disk is a classmethod.)
nemo_curator.stages.deduplication.id_generator.IdGeneratorBase.get_batch_range(
files: str | list[str] | None,
key: str | None
) -> tuple[int, int]
nemo_curator.stages.deduplication.id_generator.IdGeneratorBase.hash_files(
filepath: str | list[str]
) -> str
nemo_curator.stages.deduplication.id_generator.IdGeneratorBase.register_batch(
files: str | list[str],
count: int
) -> int
nemo_curator.stages.deduplication.id_generator.IdGeneratorBase.to_disk(
filepath: str,
storage_options: dict[str, typing.Any] | None = None
) -> None
nemo_curator.stages.deduplication.id_generator.create_id_generator_actor(
filepath: str | None = None,
storage_options: dict[str, typing.Any] | None = None
) -> None

Create an id generator actor.

Parameters:

filepath
str | None. Defaults to None.

Path to the id generator state JSON file to load. If None, a new actor is created.

storage_options
dict[str, Any] | None. Defaults to None.

Storage options to pass to fsspec.open.

nemo_curator.stages.deduplication.id_generator.get_id_generator_actor() -> ray.actor.ActorHandle[nemo_curator.stages.deduplication.id_generator.IdGenerator]
nemo_curator.stages.deduplication.id_generator.kill_id_generator_actor() -> None
nemo_curator.stages.deduplication.id_generator.write_id_generator_to_disk(
filepath: str,
storage_options: dict[str, typing.Any] | None = None
) -> None
nemo_curator.stages.deduplication.id_generator.CURATOR_DEDUP_ID_STR = '_curator_dedup_id'
nemo_curator.stages.deduplication.id_generator.CURATOR_ID_GENERATOR_ACTOR_NAME = 'curator_deduplication_id_generator'