Source code for nemo_evaluator.api.api_dataclasses

# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import warnings
from enum import Enum
from typing import Any, Dict, Optional

import jinja2
from pydantic import BaseModel, ConfigDict, Field, ValidationError, model_validator

from nemo_evaluator.adapters.adapter_config import AdapterConfig
from nemo_evaluator.core.utils import get_jinja2_environment

# NOTE: For ApiEndpoint, EvaluationTarget, ConfigParams, and EvaluationConfig, all
#       fields are Optional with default=None because, depending on the command run
#       (run_eval or ls), we may or may not require them. We also don't require the
#       user to provide all of them; framework.yml often provides the defaults.


class EndpointType(str, Enum):
    """EndpointType determines the appropriate URL, payload structure, or native harness inference class."""

    UNDEFINED = "undefined"
    CHAT = "chat"
    COMPLETIONS = "completions"
    VLM = "vlm"
    EMBEDDING = "embedding"


class ApiEndpoint(BaseModel):
    """API endpoint configuration: the endpoint location, the targeted model name,
    and the adapter applied before prompting the endpoint."""

    model_config = ConfigDict(use_enum_values=True, extra="forbid")

    api_key: Optional[str] = Field(
        description="[DEPRECATED] Use 'api_key_name' instead. Name of the environment variable that stores the API key for the model",
        default=None,
        deprecated=True,
    )
    api_key_name: Optional[str] = Field(
        description="Name of the environment variable that stores the API key for the model",
        default=None,
    )
    model_id: Optional[str] = Field(description="Name of the model", default=None)
    stream: Optional[bool] = Field(
        description="Whether responses should be streamed", default=None
    )
    type: Optional[EndpointType] = Field(
        description="The type of the target", default=None
    )
    url: Optional[str] = Field(description="URL of the model", default=None)
    adapter_config: Optional[AdapterConfig] = Field(
        description="Adapter configuration", default=None
    )

    @model_validator(mode="before")
    @classmethod
    def handle_api_key_deprecation(cls, values):
        """Handle deprecation of api_key in favor of api_key_name."""
        if isinstance(values, dict):
            api_key = values.get("api_key")
            api_key_name = values.get("api_key_name")

            # If both are set to different values, raise an error
            if (
                api_key is not None
                and api_key_name is not None
                and api_key != api_key_name
            ):
                raise ValueError(
                    "Both 'api_key' and 'api_key_name' are set and they are different. "
                    "'api_key' is deprecated, please use only 'api_key_name'."
                )

            # If only api_key is set, copy it to api_key_name and warn
            if api_key is not None and api_key_name is None:
                warnings.warn(
                    "'api_key' is deprecated and will be removed in a future version. "
                    "Please use 'api_key_name' instead.",
                    DeprecationWarning,
                    stacklevel=2,
                )
                values["api_key_name"] = api_key

        return values
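

# Illustrative sketch (not part of the original module): the validator above lets
# legacy configs that set only `api_key` keep working by copying it into
# `api_key_name` with a DeprecationWarning. URL and model name are made-up
# placeholders.
def _example_api_key_deprecation() -> ApiEndpoint:
    endpoint = ApiEndpoint(
        url="http://localhost:8000/v1/chat/completions",  # hypothetical URL
        model_id="my-model",  # hypothetical model name
        type=EndpointType.CHAT,
        api_key="MY_API_KEY",  # deprecated spelling; emits DeprecationWarning
    )
    assert endpoint.api_key_name == "MY_API_KEY"  # copied over by the validator
    return endpoint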


class EndpointModelConfig(BaseModel):
    """Supporting model configuration."""

    model_id: str = Field(description="Name of the model")
    url: str = Field(description="URL of the model")
    api_key_name: Optional[str] = Field(
        description="Name of the environment variable that stores the API key",
        default=None,
    )
    stream: Optional[bool] = Field(
        description="Whether responses should be streamed", default=None
    )
    type: Optional[EndpointType] = Field(
        description="The type of the target", default=None
    )
    adapter_config: Optional[AdapterConfig] = Field(
        description="Adapter configuration", default=None
    )
    temperature: Optional[float] = Field(description="Temperature", default=None)
    top_p: Optional[float] = Field(description="Top p", default=None)
    max_new_tokens: Optional[int] = Field(description="Max new tokens", default=None)
    max_retries: Optional[int] = Field(description="Max retries", default=None)
    parallelism: Optional[int] = Field(description="Parallelism", default=None)
    request_timeout: Optional[int] = Field(description="Request timeout", default=None)
    is_base_url: Optional[bool] = Field(
        description="Whether the URL is a base URL", default=False
    )
    # NOTE: `extra` is not used yet, but it will allow customization when needed.
    extra: Optional[Dict[str, Any]] = Field(description="Extra", default=None)


class EvaluationTarget(BaseModel):
    """Target configuration for API endpoints."""

    model_config = ConfigDict(extra="forbid")

    api_endpoint: Optional[ApiEndpoint] = Field(
        description="API endpoint to be used for evaluation", default=None
    )


class ConfigParams(BaseModel):
    """Parameters for evaluation execution."""

    model_config = ConfigDict(extra="forbid")

    def __init__(self, **data):
        try:
            super().__init__(**data)
        except ValidationError as e:
            # If any error is extra_forbidden, re-raise with a hint listing the valid fields
            for err in e.errors():
                if err.get("type") == "extra_forbidden":
                    valid_fields = list(ConfigParams.model_fields.keys())
                    raise ValueError(
                        f"Invalid parameter '{err['loc'][0]}'. "
                        f"Valid params: {valid_fields}"
                    ) from e
            raise

    limit_samples: Optional[int | float] = Field(
        description="Limit number of evaluation samples", default=None
    )
    max_new_tokens: Optional[int] = Field(
        description="Max tokens to generate", default=None
    )
    max_retries: Optional[int] = Field(
        description="Number of REST request retries", default=None
    )
    parallelism: Optional[int] = Field(
        description="Parallelism to be used", default=None
    )
    task: Optional[str] = Field(description="Name of the task", default=None)
    temperature: Optional[float] = Field(
        description="Float value between 0 and 1. A temperature of 0 indicates greedy decoding, where the highest-probability token is always chosen; note that temperature can't currently be set to 0.0",
        default=None,
    )
    request_timeout: Optional[int] = Field(
        description="REST response timeout", default=None
    )
    top_p: Optional[float] = Field(
        description="Float value between 0 and 1; limits sampling to the top tokens within a cumulative probability. top_p=0 means the model will only consider the single most likely token for the next prediction",
        default=None,
    )
    extra: Optional[Dict[str, Any]] = Field(
        description="Framework-specific parameters to be used for evaluation",
        default_factory=dict,
    )
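

# Illustrative sketch (not part of the original module): a mistyped parameter is
# rejected with a hint listing the valid field names, courtesy of the custom
# __init__ above.
def _example_config_params_typo() -> None:
    try:
        ConfigParams(temprature=0.7)  # misspelled on purpose
    except ValueError as e:
        print(e)  # "Invalid parameter 'temprature'. Valid params: [...]"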


class EvaluationConfig(BaseModel):
    """Configuration for evaluation runs."""

    model_config = ConfigDict(extra="forbid")

    output_dir: Optional[str] = Field(
        description="Directory to output the results", default=None
    )
    params: Optional[ConfigParams] = Field(
        description="Parameters to be used for evaluation", default=None
    )
    supported_endpoint_types: Optional[list[str]] = Field(
        description="Supported endpoint types like chat or completions", default=None
    )
    type: Optional[str] = Field(description="Type of the task", default=None)
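

# Illustrative sketch (not part of the original module): per the NOTE at the top
# of this file, every field is Optional, so a partially specified config is valid
# on its own; completeness is enforced by the command that consumes it. The output
# path is a made-up placeholder.
def _example_partial_evaluation_config() -> EvaluationConfig:
    return EvaluationConfig(output_dir="/tmp/results")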


class EvaluationMetadata(dict):
    """Holds assorted evaluation metadata that does not influence the evaluation itself."""


class Evaluation(BaseModel):
    """A single evaluation run: the command template plus its configuration and target."""

    model_config = ConfigDict(extra="forbid")

    command: str = Field(description="Jinja template of the command to be executed")
    framework_name: str = Field(description="Name of the framework")
    pkg_name: str = Field(description="Name of the package")
    config: EvaluationConfig
    target: EvaluationTarget

    def render_command(self):
        """Render the command template against this model's own dump, re-rendering
        until a fixed point is reached so that nested templates are also expanded."""
        values = self.model_dump()
        env = get_jinja2_environment()

        def recursive_render(tpl):
            prev = tpl
            while True:
                try:
                    curr = env.from_string(prev).render(values)
                    if curr != prev:
                        prev = curr
                    else:
                        return curr
                except jinja2.exceptions.UndefinedError as e:
                    raise ValueError(
                        f"Missing required configuration field: {e}"
                    ) from e

        return recursive_render(self.command)
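

# Illustrative sketch (not part of the original module): render_command expands
# Jinja placeholders against the model's dump, so templates can reference nested
# fields such as config.params.task. All names and paths below are made up.
def _example_render_command() -> str:
    evaluation = Evaluation(
        command="run_eval --task {{ config.params.task }} --output {{ config.output_dir }}",
        framework_name="example-framework",  # hypothetical
        pkg_name="example_pkg",  # hypothetical
        config=EvaluationConfig(
            output_dir="/tmp/results",
            params=ConfigParams(task="my_task"),
        ),
        target=EvaluationTarget(
            api_endpoint=ApiEndpoint(url="http://localhost:8000")
        ),
    )
    # -> "run_eval --task my_task --output /tmp/results"
    return evaluation.render_command()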


class ScoreStats(BaseModel):
    """Stats for a score."""

    count: Optional[int] = Field(
        default=None,
        description="The number of values used for computing the score.",
    )
    sum: Optional[float] = Field(
        default=None,
        description="The sum of all values used for computing the score.",
    )
    sum_squared: Optional[float] = Field(
        default=None,
        description="The sum of the squares of all values used for computing the score.",
    )
    min: Optional[float] = Field(
        default=None,
        description="The minimum of all values used for computing the score.",
    )
    max: Optional[float] = Field(
        default=None,
        description="The maximum of all values used for computing the score.",
    )
    mean: Optional[float] = Field(
        default=None,
        description="The mean of all values used for computing the score.",
    )
    variance: Optional[float] = Field(
        default=None,
        description="The population variance, not the sample variance.",
    )
    stddev: Optional[float] = Field(
        default=None,
        description="The population standard deviation, not the sample standard deviation.",
    )
    stderr: Optional[float] = Field(default=None, description="The standard error.")


class Score(BaseModel):
    """Atomic class that holds the value of a particular metric and the corresponding stats."""

    value: float = Field(description="The value/score produced on this metric")
    stats: ScoreStats = Field(description="Statistics associated with this metric")


class MetricResult(BaseModel):
    """Defines mapping from metric name to its scores."""

    scores: Dict[str, Score] = Field(
        default_factory=dict, description="Mapping from metric name to scores."
    )


class TaskResult(BaseModel):
    """Defines the set of metrics that were calculated for a particular task."""

    metrics: Dict[str, MetricResult] = Field(
        default_factory=dict,
        description="The values for all the metrics computed for the task",
    )


class GroupResult(BaseModel):
    """Some tasks can be grouped or logically split; this class defines results at the group level."""

    groups: Optional[Dict[str, "GroupResult"]] = Field(
        default=None, description="The results for the subgroups."
    )
    metrics: Dict[str, MetricResult] = Field(
        default_factory=dict,
        description="The values for all the metrics computed for the group.",
    )


class EvaluationResult(BaseModel):
    """EvaluationResult bundles per-task and per-group results."""

    tasks: Optional[Dict[str, TaskResult]] = Field(
        default_factory=dict, description="The results at the task level"
    )
    groups: Optional[Dict[str, GroupResult]] = Field(
        default_factory=dict, description="The results at the group level"
    )
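

# Illustrative sketch (not part of the original module): assembling a result tree
# from the classes above. Task, group, metric names, and values are made-up
# placeholders.
def _example_evaluation_result() -> EvaluationResult:
    accuracy = MetricResult(
        scores={
            "accuracy": Score(
                value=0.82,  # hypothetical score
                stats=ScoreStats(count=100, mean=0.82, stderr=0.04),
            )
        }
    )
    return EvaluationResult(
        tasks={"my_task": TaskResult(metrics={"accuracy": accuracy})},
        groups={"my_group": GroupResult(metrics={"accuracy": accuracy})},
    )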