modules.exact_dedup

Module Contents

Classes

API

class modules.exact_dedup.ExactDuplicates(
logger: logging.LoggerAdapter | str = './',
id_field: str = 'id',
text_field: str = 'text',
hash_method: str = 'md5',
perform_removal: bool = False,
profile_dir: str | None = None,
cache_dir: str | None = None,
)

Bases: nemo_curator.modules.base.BaseDeduplicationModule

Initialization

SUPPORTED_HASHES

'frozenset(…)'

hash_documents(
df: cudf.Series | pandas.Series,
) -> cudf.Series | pandas.Series
identify_duplicates(
dataset: nemo_curator.datasets.DocumentDataset,
) -> nemo_curator.datasets.DocumentDataset
remove(
dataset: nemo_curator.datasets.DocumentDataset,
duplicates_to_remove: nemo_curator.datasets.DocumentDataset | None,
) -> nemo_curator.datasets.DocumentDataset