nemo_rl.data#

Subpackages#

Submodules#

Package Contents#

Classes#

DataConfig

MMLUEvalDataConfig

Config for MMLU and multilingual MMLU datasets.

MMLUProEvalDataConfig

Config for MMLU Pro dataset.

AIMEEvalDataConfig

Config for AIME datasets.

GPQAEvalDataConfig

Config for GPQA datasets.

MathEvalDataConfig

Config for Math datasets.

LocalMathEvalDataConfig

Config for local math datasets loaded from files.

Data#

API#

class nemo_rl.data.DataConfig[source]#

Bases: typing.TypedDict

max_input_seq_length: int#

None

prompt_file: NotRequired[str | None]#

None

system_prompt_file: NotRequired[str | None]#

None

dataset_name: str#

None

val_dataset_name: NotRequired[str]#

None

add_bos: NotRequired[bool]#

None

add_eos: NotRequired[bool]#

None

input_key: NotRequired[str]#

None

output_key: NotRequired[str | None]#

None

add_generation_prompt: NotRequired[bool]#

None

add_system_prompt: NotRequired[bool]#

None

split: NotRequired[str | None]#

None

shuffle: bool#

None

seed: NotRequired[int | None]#

None

download_dir: NotRequired[str]#

None

train_data_path: NotRequired[str]#

None

val_data_paths: NotRequired[dict[str, str]]#

None

num_workers: NotRequired[int]#

None

class nemo_rl.data.MMLUEvalDataConfig[source]#

Bases: typing.TypedDict

Config for MMLU and multilingual MMLU datasets.

Supports dataset_name: "mmlu" or "mmlu_{language}", where language is one of: AR-XY, BN-BD, DE-DE, EN-US, ES-LA, FR-FR, HI-IN, ID-ID, IT-IT, JA-JP, KO-KR, PT-BR, ZH-CN, SW-KE, YO-NG

Initialization

Initialize self. See help(type(self)) for accurate signature.

max_input_seq_length: int#

None

dataset_name: Literal["mmlu", "mmlu_AR-XY", "mmlu_BN-BD", "mmlu_DE-DE", "mmlu_EN-US", "mmlu_ES-LA", "mmlu_FR-FR", "mmlu_HI-IN", "mmlu_ID-ID", "mmlu_IT-IT", "mmlu_JA-JP", "mmlu_KO-KR", "mmlu_PT-BR", "mmlu_ZH-CN", "mmlu_SW-KE", "mmlu_YO-NG"]#

None

prompt_file: NotRequired[str | None]#

None

system_prompt_file: NotRequired[str | None]#

None

class nemo_rl.data.MMLUProEvalDataConfig[source]#

Bases: typing.TypedDict

Config for MMLU Pro dataset.

Initialization

Initialize self. See help(type(self)) for accurate signature.

max_input_seq_length: int#

None

dataset_name: Literal["mmlu_pro"]#

None

prompt_file: NotRequired[str | None]#

None

system_prompt_file: NotRequired[str | None]#

None

class nemo_rl.data.AIMEEvalDataConfig[source]#

Bases: typing.TypedDict

Config for AIME datasets.

Initialization

Initialize self. See help(type(self)) for accurate signature.

max_input_seq_length: int#

None

dataset_name: Literal["aime2024", "aime2025"]#

None

prompt_file: NotRequired[str | None]#

None

system_prompt_file: NotRequired[str | None]#

None

class nemo_rl.data.GPQAEvalDataConfig[source]#

Bases: typing.TypedDict

Config for GPQA datasets.

Initialization

Initialize self. See help(type(self)) for accurate signature.

max_input_seq_length: int#

None

dataset_name: Literal["gpqa", "gpqa_diamond"]#

None

prompt_file: NotRequired[str | None]#

None

system_prompt_file: NotRequired[str | None]#

None

class nemo_rl.data.MathEvalDataConfig[source]#

Bases: typing.TypedDict

Config for Math datasets.

Initialization

Initialize self. See help(type(self)) for accurate signature.

max_input_seq_length: int#

None

dataset_name: Literal["math", "math500"]#

None

prompt_file: NotRequired[str | None]#

None

system_prompt_file: NotRequired[str | None]#

None

class nemo_rl.data.LocalMathEvalDataConfig[source]#

Bases: typing.TypedDict

Config for local math datasets loaded from files.

dataset_name can be a URL or a local file path. Requires the additional fields problem_key, solution_key, and file_format; split is an optional additional field.

Initialization

Initialize self. See help(type(self)) for accurate signature.

max_input_seq_length: int#

None

dataset_name: str#

None

problem_key: str#

None

solution_key: str#

None

file_format: Literal["csv", "json"]#

None

split: NotRequired[str | None]#

None

prompt_file: NotRequired[str | None]#

None

system_prompt_file: NotRequired[str | None]#

None

nemo_rl.data.EvalDataConfigType#

None