nemo_curator.stages.image.deduplication.removal

View as Markdown

Module Contents

Classes

Name | Description
ImageDuplicatesRemovalStage | Filter stage that removes images whose IDs appear in a Parquet file.

API

class nemo_curator.stages.image.deduplication.removal.ImageDuplicatesRemovalStage(
removal_parquets_dir: str,
duplicate_id_field: str = 'id',
verbose: bool = False,
num_workers_per_node: int | None = None,
name: str = 'image_dedup_filter',
_ids_to_remove: set[str] = field(default_factory=set)
)
Dataclass

Bases: ProcessingStage[ImageBatch, ImageBatch]

Filter stage that removes images whose IDs appear in a Parquet file.

The Parquet files in `removal_parquets_dir` must contain a column with image identifiers; by default this column is assumed to be `id`, to match writer metadata. You can change the column name via `duplicate_id_field`.

Parameters:

removal_parquets_dir
str

Directory containing Parquet files with image IDs to remove

duplicate_id_field
str — Defaults to 'id'

Name of the column containing image IDs to remove

verbose
bool — Defaults to False

Whether to log verbose output

num_workers_per_node
int | None — Defaults to None

Number of workers per node for the stage. This is sometimes needed to avoid OOM when concurrently running actors on one node loading the same removal parquet files into memory.

_ids_to_remove
set[str] = field(default_factory=set)
duplicate_id_field
str = 'id'
name
str = 'image_dedup_filter'
num_workers_per_node
int | None = None
removal_parquets_dir
str
verbose
bool = False
nemo_curator.stages.image.deduplication.removal.ImageDuplicatesRemovalStage.inputs() -> tuple[list[str], list[str]]
nemo_curator.stages.image.deduplication.removal.ImageDuplicatesRemovalStage.outputs() -> tuple[list[str], list[str]]
nemo_curator.stages.image.deduplication.removal.ImageDuplicatesRemovalStage.process(
task: nemo_curator.tasks.ImageBatch
) -> nemo_curator.tasks.ImageBatch
nemo_curator.stages.image.deduplication.removal.ImageDuplicatesRemovalStage.setup(
_worker_metadata = None
) -> None
nemo_curator.stages.image.deduplication.removal.ImageDuplicatesRemovalStage.xenna_stage_spec() -> dict[str, typing.Any]