Task Decontamination#
Base Class#
- class nemo_curator.tasks.DownstreamTask#
- class nemo_curator.tasks.import_task(task_path: str)#
Module#
- class nemo_curator.TaskDecontamination(
- tasks: DownstreamTask | Iterable[DownstreamTask],
- text_field: str = 'text',
- max_ngram_size: int = 13,
- max_matches: int = 10,
- min_document_length: int = 200,
- remove_char_each_side: int = 200,
- max_splits: int = 10,
- removed_dir: str | None = None,
- )#
- call(
- dataset: DocumentDataset,
- )#
Performs an arbitrary operation on a dataset.
- Parameters:
dataset (DocumentDataset) – The dataset to operate on
- prepare_task_ngram_count() → dict#
Computes a dictionary whose keys are all of the n-grams in each task, with every value set to 0.
Tasks#
- class nemo_curator.tasks.Race(min_ngram_size: int = 8, max_ngram_size: int = 13)#
- class nemo_curator.tasks.Squad(min_ngram_size: int = 8, max_ngram_size: int = 13)#
- class nemo_curator.tasks.ArcEasy(min_ngram_size: int = 8, max_ngram_size: int = 13)#
- class nemo_curator.tasks.ArcChallenge(min_ngram_size: int = 8, max_ngram_size: int = 13)#
- class nemo_curator.tasks.OpenBookQA(min_ngram_size: int = 8, max_ngram_size: int = 13)#
- class nemo_curator.tasks.BoolQ(min_ngram_size: int = 8, max_ngram_size: int = 13)#
- class nemo_curator.tasks.Copa(min_ngram_size: int = 8, max_ngram_size: int = 13)#
- class nemo_curator.tasks.RTE(min_ngram_size: int = 8, max_ngram_size: int = 13)#
- class nemo_curator.tasks.MultiRC(min_ngram_size: int = 8, max_ngram_size: int = 13)#
- class nemo_curator.tasks.WSC(min_ngram_size: int = 8, max_ngram_size: int = 13)#
- class nemo_curator.tasks.CB(min_ngram_size: int = 8, max_ngram_size: int = 13)#
- class nemo_curator.tasks.ANLI(min_ngram_size: int = 8, max_ngram_size: int = 13)#
- class nemo_curator.tasks.Record(min_ngram_size: int = 8, max_ngram_size: int = 13)#
- class nemo_curator.tasks.COQA(
- file_path: str | None = None,
- min_ngram_size: int = 8,
- max_ngram_size: int = 13,
- )#
- class nemo_curator.tasks.TriviaQA(min_ngram_size: int = 8, max_ngram_size: int = 13)#
- class nemo_curator.tasks.Quac(min_ngram_size: int = 8, max_ngram_size: int = 13)#
- class nemo_curator.tasks.WebQA(min_ngram_size: int = 8, max_ngram_size: int = 13)#
- class nemo_curator.tasks.Drop(min_ngram_size: int = 8, max_ngram_size: int = 13)#
- class nemo_curator.tasks.WiC(min_ngram_size: int = 8, max_ngram_size: int = 13)#
- class nemo_curator.tasks.MMLU(
- path: str | None = None,
- min_ngram_size: int = 8,
- max_ngram_size: int = 13,
- )#
- class nemo_curator.tasks.BigBenchHard(
- path: str | None = None,
- min_ngram_size: int = 8,
- max_ngram_size: int = 13,
- )#
- class nemo_curator.tasks.BigBenchLight(
- path: str | None = None,
- min_ngram_size: int = 8,
- max_ngram_size: int = 13,
- )#
- class nemo_curator.tasks.Multilingual(
- path: str | None = None,
- min_ngram_size: int = 8,
- max_ngram_size: int = 13,
- )#
- class nemo_curator.tasks.PIQA(min_ngram_size: int = 8, max_ngram_size: int = 13)#
- class nemo_curator.tasks.Winogrande(min_ngram_size: int = 8, max_ngram_size: int = 13)#
- class nemo_curator.tasks.Lambada(
- file_path: str,
- min_ngram_size: int = 8,
- max_ngram_size: int = 13,
- )#
- class nemo_curator.tasks.NumDasc(
- n: int,
- file_path: str,
- min_ngram_size: int = 8,
- max_ngram_size: int = 13,
- )#
- class nemo_curator.tasks.StoryCloze(
- file_path: str,
- min_ngram_size: int = 8,
- max_ngram_size: int = 13,