modules.task
Module Contents
Classes
API
- class modules.task.TaskDecontamination(
- tasks: nemo_curator.tasks.downstream_task.DownstreamTask | collections.abc.Iterable[nemo_curator.tasks.downstream_task.DownstreamTask],
- text_field: str = 'text',
- max_ngram_size: int = 13,
- max_matches: int = 10,
- min_document_length: int = 200,
- remove_char_each_side: int = 200,
- max_splits: int = 10,
- removed_dir: str | None = None,
- )
Bases: nemo_curator.modules.base.BaseModule
Initialization
- call(
- dataset: nemo_curator.datasets.DocumentDataset,
- )
- find_matching_ngrams(
- task_ngrams: dict,
- dataset: nemo_curator.datasets.DocumentDataset,
- )
- prepare_task_ngram_count() -> dict
- remove_matching_ngrams(
- matched_ngrams: dict,
- ngram_freq: list[tuple],
- dataset: nemo_curator.datasets.DocumentDataset,
- )