Task Decontamination#

Base Class#

class nemo_curator.tasks.DownstreamTask#
nemo_curator.tasks.import_task(task_path: str)#

Module#

class nemo_curator.TaskDecontamination(
tasks: DownstreamTask | Iterable[DownstreamTask],
text_field: str = 'text',
max_ngram_size: int = 13,
max_matches: int = 10,
min_document_length: int = 200,
remove_char_each_side: int = 200,
max_splits: int = 10,
removed_dir: str | None = None,
)#
call(
dataset: DocumentDataset,
) → DocumentDataset#

Performs an arbitrary operation on a dataset.

Parameters:

dataset (DocumentDataset) – The dataset to operate on

prepare_task_ngram_count() → dict#

Computes a dictionary whose keys are all of the n-grams found in each task, with every value initialized to 0.

Tasks#

class nemo_curator.tasks.Race(min_ngram_size: int = 8, max_ngram_size: int = 13)#
class nemo_curator.tasks.Squad(min_ngram_size: int = 8, max_ngram_size: int = 13)#
class nemo_curator.tasks.ArcEasy(min_ngram_size: int = 8, max_ngram_size: int = 13)#
class nemo_curator.tasks.ArcChallenge(min_ngram_size: int = 8, max_ngram_size: int = 13)#
class nemo_curator.tasks.OpenBookQA(min_ngram_size: int = 8, max_ngram_size: int = 13)#
class nemo_curator.tasks.BoolQ(min_ngram_size: int = 8, max_ngram_size: int = 13)#
class nemo_curator.tasks.Copa(min_ngram_size: int = 8, max_ngram_size: int = 13)#
class nemo_curator.tasks.RTE(min_ngram_size: int = 8, max_ngram_size: int = 13)#
class nemo_curator.tasks.MultiRC(min_ngram_size: int = 8, max_ngram_size: int = 13)#
class nemo_curator.tasks.WSC(min_ngram_size: int = 8, max_ngram_size: int = 13)#
class nemo_curator.tasks.CB(min_ngram_size: int = 8, max_ngram_size: int = 13)#
class nemo_curator.tasks.ANLI(min_ngram_size: int = 8, max_ngram_size: int = 13)#
class nemo_curator.tasks.Record(min_ngram_size: int = 8, max_ngram_size: int = 13)#
class nemo_curator.tasks.COQA(
file_path: str | None = None,
min_ngram_size: int = 8,
max_ngram_size: int = 13,
)#
class nemo_curator.tasks.TriviaQA(min_ngram_size: int = 8, max_ngram_size: int = 13)#
class nemo_curator.tasks.Quac(min_ngram_size: int = 8, max_ngram_size: int = 13)#
class nemo_curator.tasks.WebQA(min_ngram_size: int = 8, max_ngram_size: int = 13)#
class nemo_curator.tasks.Drop(min_ngram_size: int = 8, max_ngram_size: int = 13)#
class nemo_curator.tasks.WiC(min_ngram_size: int = 8, max_ngram_size: int = 13)#
class nemo_curator.tasks.MMLU(
path: str | None = None,
min_ngram_size: int = 8,
max_ngram_size: int = 13,
)#
class nemo_curator.tasks.BigBenchHard(
path: str | None = None,
min_ngram_size: int = 8,
max_ngram_size: int = 13,
)#
class nemo_curator.tasks.BigBenchLight(
path: str | None = None,
min_ngram_size: int = 8,
max_ngram_size: int = 13,
)#
class nemo_curator.tasks.Multilingual(
path: str | None = None,
min_ngram_size: int = 8,
max_ngram_size: int = 13,
)#
class nemo_curator.tasks.PIQA(min_ngram_size: int = 8, max_ngram_size: int = 13)#
class nemo_curator.tasks.Winogrande(min_ngram_size: int = 8, max_ngram_size: int = 13)#
class nemo_curator.tasks.Lambada(
file_path: str,
min_ngram_size: int = 8,
max_ngram_size: int = 13,
)#
class nemo_curator.tasks.NumDasc(
n: int,
file_path: str,
min_ngram_size: int = 8,
max_ngram_size: int = 13,
)#
class nemo_curator.tasks.StoryCloze(
file_path: str,
min_ngram_size: int = 8,
max_ngram_size: int = 13,
)#