modules.semantic_dedup.semdedup
Module Contents
Classes
API
- class modules.semantic_dedup.semdedup.SemDedup(
    config: nemo_curator.modules.config.SemDedupConfig,
    input_column: str = 'text',
    id_column: str = 'id',
    perform_removal: bool = False,
    logger: logging.Logger | str = './',
  )
Bases: nemo_curator.modules.base.BaseDeduplicationModule
Initialization
- identify_duplicates(
    dataset: nemo_curator.datasets.DocumentDataset,
  )
- remove(
    dataset: nemo_curator.datasets.DocumentDataset,
    duplicates_to_remove: nemo_curator.datasets.DocumentDataset,
  )