modules.base#

Module Contents#

Classes#

BaseDeduplicationModule

Base class for all NeMo Curator deduplication modules.

BaseModule

Base class for all NeMo Curator modules.

API#

class modules.base.BaseDeduplicationModule(
id_field: str,
text_field: str,
perform_removal: bool = False,
logger: logging.LoggerAdapter | str = './',
profile_dir: str | None = None,
cache_dir: str | None = None,
input_backend: Literal['pandas', 'cudf', 'any'] = 'any',
**kwargs,
)#

Bases: modules.base.BaseModule

Base class for all NeMo Curator deduplication modules.

Initialization

Constructs a Module

Args:
input_backend (Literal["pandas", "cudf", "any"]): The backend the input dataframe must be on for the module to work.
name (str, Optional): The name of the module. If None, defaults to self.__class__.__name__.

call(
dataset: nemo_curator.datasets.DocumentDataset,
) → nemo_curator.datasets.DocumentDataset#

Execute the deduplication process.

Args: dataset (DocumentDataset): Input dataset for deduplication.
Returns: DocumentDataset: The identified duplicates if perform_removal is False; otherwise the input dataset with duplicates removed.

abstractmethod identify_duplicates(
dataset: nemo_curator.datasets.DocumentDataset,
) → nemo_curator.datasets.DocumentDataset#

Identifies duplicates in a dataset

Args: dataset (DocumentDataset): The dataset to identify duplicates in

abstractmethod remove(
dataset: nemo_curator.datasets.DocumentDataset,
duplicates_to_remove: nemo_curator.datasets.DocumentDataset,
) → nemo_curator.datasets.DocumentDataset#

Removes duplicates from a dataset

Args:
dataset (DocumentDataset): The dataset to remove duplicates from.
duplicates_to_remove (DocumentDataset): The duplicates to remove from the dataset.

class modules.base.BaseModule(
input_backend: Literal['pandas', 'cudf', 'any'],
name: str | None = None,
)#

Bases: abc.ABC

Base class for all NeMo Curator modules.

Handles validating that data lives on the correct device for each module

Initialization

Constructs a Module

Args:
input_backend (Literal["pandas", "cudf", "any"]): The backend the input dataframe must be on for the module to work.
name (str, Optional): The name of the module. If None, defaults to self.__class__.__name__.

SUPPORTED_BACKENDS#

('pandas', 'cudf', 'any')

abstractmethod call(
dataset: nemo_curator.datasets.DocumentDataset,
) → nemo_curator.datasets.DocumentDataset#

Performs an arbitrary operation on a dataset

Args: dataset (DocumentDataset): The dataset to operate on