nemo_rl.evals.eval#

Module Contents#

Classes#

Functions#

setup

Set up components for model evaluation.

eval_pass_k

Evaluate pass@k score using an unbiased estimator.

run_env_eval

Main entry point for running evaluation using environment.

_run_env_eval_impl

Unified implementation for both sync and async evaluation.

_generate_texts

Generate texts using either sync or async method.

_print_results

Print evaluation results.

API#

class nemo_rl.evals.eval.EvalConfig[source]#

Bases: typing.TypedDict

metric: str#

None

num_tests_per_prompt: int#

None

seed: int#

None

pass_k_value: int#

None

class nemo_rl.evals.eval.MasterConfig[source]#

Bases: typing.TypedDict

eval: nemo_rl.evals.eval.EvalConfig#

None

generate: nemo_rl.models.generation.interfaces.GenerationConfig#

None

data: nemo_rl.data.MathDataConfig#

None

env: nemo_rl.environments.math_environment.MathEnvConfig#

None

cluster: nemo_rl.distributed.virtual_cluster.ClusterConfig#

None

nemo_rl.evals.eval.setup(
master_config: nemo_rl.evals.eval.MasterConfig,
tokenizer: transformers.AutoTokenizer,
dataset: nemo_rl.data.datasets.AllTaskProcessedDataset,
) → tuple[nemo_rl.models.generation.vllm.VllmGeneration, torch.utils.data.DataLoader, nemo_rl.evals.eval.MasterConfig][source]#

Set up components for model evaluation.

Initializes the VLLM model and data loader.

Parameters:
  • master_config – Configuration settings.

  • tokenizer – Tokenizer used to process model inputs.

  • dataset – Dataset to evaluate on.

Returns:

VLLM model, data loader, and config.

nemo_rl.evals.eval.eval_pass_k(
rewards: torch.Tensor,
num_tests_per_prompt: int,
k: int,
) → float[source]#

Evaluate pass@k score using an unbiased estimator.

Reference: https://github.com/huggingface/evaluate/blob/32546aafec25cdc2a5d7dd9f941fc5be56ba122f/metrics/code_eval/code_eval.py#L198-L213

Parameters:
  • rewards – Tensor of shape (batch_size * num_tests_per_prompt)

  • num_tests_per_prompt – int (number of generated samples per prompt)

  • k – int (pass@k value)

Returns:

pass_k_score

Return type:

float

nemo_rl.evals.eval.run_env_eval(vllm_generation, dataloader, env, master_config)[source]#

Main entry point for running evaluation using environment.

Generates model responses and evaluates them by env.

Parameters:
  • vllm_generation – Model for generating responses.

  • dataloader – Data loader with evaluation samples.

  • env – Environment that scores responses.

  • master_config – Configuration settings.

async nemo_rl.evals.eval._run_env_eval_impl(
vllm_generation,
dataloader,
env,
master_config,
use_async=False,
)[source]#

Unified implementation for both sync and async evaluation.

async nemo_rl.evals.eval._generate_texts(vllm_generation, inputs, use_async)[source]#

Generate texts using either sync or async method.

nemo_rl.evals.eval._print_results(
master_config,
generation_config,
score,
dataset_size,
metric,
pass_k_value,
num_tests_per_prompt,
)[source]#

Print evaluation results.