modules.semantic_dedup.semdedup#

Module Contents#

Classes#

API#

class modules.semantic_dedup.semdedup.SemDedup(
config: nemo_curator.modules.config.SemDedupConfig,
input_column: str = 'text',
id_column: str = 'id',
perform_removal: bool = False,
logger: logging.Logger | str = './',
)#

Bases: nemo_curator.modules.base.BaseDeduplicationModule

Initialization

identify_duplicates(
dataset: nemo_curator.datasets.DocumentDataset,
) → nemo_curator.datasets.DocumentDataset#
remove(
dataset: nemo_curator.datasets.DocumentDataset,
duplicates_to_remove: nemo_curator.datasets.DocumentDataset,
) → nemo_curator.datasets.DocumentDataset#