stages.deduplication.id_generator#

Module Contents#

Classes#

IdGenerator

Ray actor version of IdGeneratorBase.

IdGeneratorBase

Base IdGenerator class without Ray decorator for testing and direct use.

Functions#

Data#

API#

stages.deduplication.id_generator.CURATOR_DEDUP_ID_STR#

'_curator_dedup_id'

stages.deduplication.id_generator.CURATOR_ID_GENERATOR_ACTOR_NAME#

'curator_deduplication_id_generator'

class stages.deduplication.id_generator.IdGenerator(
start_id: int = 0,
batch_registry: dict[str, tuple[int, int]] | None = None,
)#

Bases: stages.deduplication.id_generator.IdGeneratorBase

Ray actor version of IdGeneratorBase.

Initialization

class stages.deduplication.id_generator.IdGeneratorBase(
start_id: int = 0,
batch_registry: dict[str, tuple[int, int]] | None = None,
)#

Base IdGenerator class without Ray decorator for testing and direct use.

Initialization

classmethod from_disk(
filepath: str,
storage_options: dict[str, Any] | None = None,
) -> stages.deduplication.id_generator.IdGeneratorBase#

get_batch_range(
files: str | list[str] | None,
key: str | None,
) -> tuple[int, int]#

hash_files(filepath: str | list[str]) -> str#

register_batch(files: str | list[str], count: int) -> int#

to_disk(
filepath: str,
storage_options: dict[str, Any] | None = None,
) -> None#
stages.deduplication.id_generator.create_id_generator_actor(
filepath: str | None = None,
storage_options: dict[str, Any] | None = None,
) -> None#

Create an id generator actor.

Args:

- filepath (str | None): Path to the JSON file from which the id generator state is loaded. If None, a new actor is created.
- storage_options (dict[str, Any] | None): Storage options to pass to fsspec.open.

stages.deduplication.id_generator.get_id_generator_actor() -> ray.actor.ActorHandle[stages.deduplication.id_generator.IdGenerator]#

stages.deduplication.id_generator.kill_id_generator_actor() -> None#

stages.deduplication.id_generator.write_id_generator_to_disk(
filepath: str,
storage_options: dict[str, Any] | None = None,
) -> None#