modules.exact_dedup#
Module Contents#
Classes#
API#
- class modules.exact_dedup.ExactDuplicates(
- logger: logging.LoggerAdapter | str = './',
- id_field: str = 'id',
- text_field: str = 'text',
- hash_method: str = 'md5',
- perform_removal: bool = False,
- profile_dir: str | None = None,
- cache_dir: str | None = None,
- )
Bases: nemo_curator.modules.base.BaseDeduplicationModule
Initialization
- SUPPORTED_HASHES#
‘frozenset(…)’
- hash_documents(
- df: cudf.Series | pandas.Series,
- identify_duplicates(
- dataset: nemo_curator.datasets.DocumentDataset,
- remove(
- dataset: nemo_curator.datasets.DocumentDataset,
- duplicates_to_remove: nemo_curator.datasets.DocumentDataset | None,