modules.base#
Module Contents#
Classes#
API#
- class modules.base.BaseDeduplicationModule(
- id_field: str,
- text_field: str,
- perform_removal: bool = False,
- logger: logging.LoggerAdapter | str = './',
- profile_dir: str | None = None,
- cache_dir: str | None = None,
- input_backend: Literal['pandas', 'cudf', 'any'] = 'any',
- **kwargs,
)
Bases:
modules.base.BaseModuleInitialization
- call(
- dataset: nemo_curator.datasets.DocumentDataset,
)
- abstractmethod identify_duplicates(
- dataset: nemo_curator.datasets.DocumentDataset,
)
- abstractmethod remove(
- dataset: nemo_curator.datasets.DocumentDataset,
- duplicates_to_remove: nemo_curator.datasets.DocumentDataset,
)