modules.base#

Module Contents#

Classes#

API#

class modules.base.BaseDeduplicationModule(
id_field: str,
text_field: str,
perform_removal: bool = False,
logger: logging.LoggerAdapter | str = './',
profile_dir: str | None = None,
cache_dir: str | None = None,
input_backend: Literal['pandas', 'cudf', 'any'] = 'any',
**kwargs,
)#

Bases: modules.base.BaseModule

Initialization

call(
dataset: nemo_curator.datasets.DocumentDataset,
) -> nemo_curator.datasets.DocumentDataset#
abstractmethod identify_duplicates(
dataset: nemo_curator.datasets.DocumentDataset,
) -> nemo_curator.datasets.DocumentDataset#
abstractmethod remove(
dataset: nemo_curator.datasets.DocumentDataset,
duplicates_to_remove: nemo_curator.datasets.DocumentDataset,
) -> nemo_curator.datasets.DocumentDataset#
class modules.base.BaseModule(
input_backend: Literal['pandas', 'cudf', 'any'],
name: str | None = None,
)#

Bases: abc.ABC

Initialization

SUPPORTED_BACKENDS#

('pandas', 'cudf', 'any')

abstractmethod call(
dataset: nemo_curator.datasets.DocumentDataset,
) -> nemo_curator.datasets.DocumentDataset#