nat.plugins.eval.runtime.eval_harness#

Lightweight ATIF-only evaluator harness.

This harness is intentionally narrow in scope:

- it evaluates ATIF-native evaluators only (evaluate_atif_fn)
- it runs evaluators concurrently
- it returns per-evaluator EvalOutput objects

Example:

    harness = EvaluationHarness()
    results = await harness.evaluate(
        evaluators={"trajectory": trajectory_evaluator},
        atif_samples=atif_samples,
    )
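
Because evaluate is a coroutine, it must be driven from an event loop; a usage sketch follows (trajectory_evaluator and atif_samples are assumed to be built elsewhere by the surrounding evaluation pipeline):

    import asyncio

    # Hypothetical driver: trajectory_evaluator and atif_samples are
    # assumed to come from the surrounding evaluation pipeline.
    results = asyncio.run(
        EvaluationHarness().evaluate(
            evaluators={"trajectory": trajectory_evaluator},
            atif_samples=atif_samples,
        )
    )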

Attributes#

logger

Classes#

EvaluationHarness

Run ATIF-native evaluators against a shared sample list.

Module Contents#

logger#
class EvaluationHarness(logger_instance: logging.Logger | None = None)#

Run ATIF-native evaluators against a shared sample list.

_logger#
async _evaluate_single(
    evaluator_name: str,
    evaluator: nat.plugins.eval.evaluator.atif_evaluator.AtifEvaluator,
    atif_samples: nat.plugins.eval.evaluator.atif_evaluator.AtifEvalSampleList,
) → tuple[str, nat.plugins.eval.data_models.evaluator_io.EvalOutput] | None#

Evaluate one evaluator using the ATIF lane.

Returns:

A tuple of evaluator name and result on success, otherwise None.
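
A minimal sketch of this failure-isolation behavior, assuming each AtifEvaluator exposes an async evaluate_atif_fn as named in the module summary (the exact call signature is an assumption, not confirmed by this page):

    async def _evaluate_single(self, evaluator_name, evaluator, atif_samples):
        # Sketch: isolate each evaluator so one failure cannot abort
        # the whole run. evaluate_atif_fn is named in the module
        # summary; its exact signature is assumed here.
        try:
            result = await evaluator.evaluate_atif_fn(atif_samples)
        except Exception:
            self._logger.exception("Evaluator '%s' failed", evaluator_name)
            return None
        return evaluator_name, result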

async evaluate(
    evaluators: dict[str, nat.plugins.eval.evaluator.atif_evaluator.AtifEvaluator],
    atif_samples: nat.plugins.eval.evaluator.atif_evaluator.AtifEvalSampleList,
) → dict[str, nat.plugins.eval.data_models.evaluator_io.EvalOutput]#

Evaluate ATIF-native evaluators concurrently.

Args:

evaluators: Evaluators keyed by evaluator name.
atif_samples: Pre-built ATIF samples shared by all evaluators.

Returns:

A mapping of evaluator name to EvalOutput for successful evaluators.
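
A sketch of how the concurrent fan-out could be built on asyncio.gather, reusing the _evaluate_single contract above (illustrative only; the actual implementation may differ):

    import asyncio

    async def evaluate(self, evaluators, atif_samples):
        # Fan out: one task per evaluator, all sharing the same samples.
        results = await asyncio.gather(
            *(
                self._evaluate_single(name, evaluator, atif_samples)
                for name, evaluator in evaluators.items()
            )
        )
        # Keep only evaluators that succeeded; _evaluate_single
        # returns None on failure, so failed evaluators are dropped.
        return dict(pair for pair in results if pair is not None)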