modules.task

Module Contents

Classes

API

class modules.task.TaskDecontamination(
tasks: nemo_curator.tasks.downstream_task.DownstreamTask | collections.abc.Iterable[nemo_curator.tasks.downstream_task.DownstreamTask],
text_field: str = 'text',
max_ngram_size: int = 13,
max_matches: int = 10,
min_document_length: int = 200,
remove_char_each_side: int = 200,
max_splits: int = 10,
removed_dir: str | None = None,
)

Bases: nemo_curator.modules.base.BaseModule

Initialization

call(
dataset: nemo_curator.datasets.DocumentDataset,
) -> nemo_curator.datasets.DocumentDataset
find_matching_ngrams(
task_ngrams: dict,
dataset: nemo_curator.datasets.DocumentDataset,
) -> dict
prepare_task_ngram_count() -> dict
remove_matching_ngrams(
matched_ngrams: dict,
ngram_freq: list[tuple],
dataset: nemo_curator.datasets.DocumentDataset,
) -> nemo_curator.datasets.DocumentDataset