modules.fuzzy_dedup.fuzzyduplicates

Module Contents

Classes

API

class modules.fuzzy_dedup.fuzzyduplicates.FuzzyDuplicates(
config: nemo_curator.modules.config.FuzzyDuplicatesConfig,
logger: logging.LoggerAdapter | str = './',
perform_removal: bool = False,
)

Bases: nemo_curator.modules.base.BaseDeduplicationModule

Initialization

identify_duplicates(
dataset: nemo_curator.datasets.DocumentDataset,
) -> nemo_curator.datasets.DocumentDataset | None
remove(
dataset: nemo_curator.datasets.DocumentDataset,
duplicates_to_remove: nemo_curator.datasets.DocumentDataset | None,
) -> nemo_curator.datasets.DocumentDataset