Create an Evaluation Configuration#

To create a configuration for an evaluation, send a POST request to the /v1/evaluation/configs endpoint of the Evaluator API. The URL of the Evaluator API depends on where you deploy the evaluator microservice and how you configure it. For more information, refer to Job Target and Configuration Matrix.

Prerequisites#

  • Set your EVALUATOR_BASE_URL environment variable to your evaluator service endpoint (a quick connectivity check is sketched after this list):

    export EVALUATOR_BASE_URL="https://your-evaluator-service-endpoint"
    
  • Review the available evaluation configuration types.
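
Before you create a configuration, you can optionally confirm that EVALUATOR_BASE_URL points at a reachable Evaluator service. The following is a minimal sketch, assuming the API exposes a GET /v1/evaluation/configs list endpoint and returns its items in a "data" field; adjust the path and response handling to match your deployment.

import os
import requests

# Connectivity check (sketch): list existing evaluation configurations.
# The list endpoint path and the "data" field in the response are assumptions;
# add authentication headers if your deployment requires them.
base_url = os.environ['EVALUATOR_BASE_URL']
response = requests.get(f"{base_url}/v1/evaluation/configs", timeout=30)
response.raise_for_status()
print(f"Evaluator reachable; {len(response.json().get('data', []))} configurations found")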

The following example creates a gsm8k configuration with the NeMo Microservices Python SDK:

import os
from nemo_microservices import NeMoMicroservices

# Initialize the client
client = NeMoMicroservices(
    base_url=os.environ['EVALUATOR_BASE_URL']
)

# Create an evaluation config
config = client.evaluation.configs.create(
    type="gsm8k",
    name="my-configuration-lm-harness-gsm8k-1",
    namespace="my-organization",
    params={
        "temperature": 0.00001,
        "top_p": 0.00001,
        "max_tokens": 256,
        "stop": ["<|eot|>"],
        "extra": {
            "num_fewshot": 8,
            "batch_size": 16,
            "bootstrap_iters": 100000,
            "dataset_seed": 42,
            "use_greedy": True,
            "top_k": 1,
            "hf_token": "<my-token>",
            "tokenizer_backend": "hf",
            "tokenizer": "meta-llama/Llama-3.1-8B-Instruct",
            "apply_chat_template": True,
            "fewshot_as_multiturn": True
        }
    }
)

print("Evaluation config created successfully")
curl -X "POST" "${EVALUATOR_BASE_URL}/evaluation/configs" \
    -H 'accept: application/json' \
    -H 'Content-Type: application/json' \
    -d '{
        "type": "gsm8k",
        "name": "my-configuration-lm-harness-gsm8k-1",
        "namespace": "my-organization",
        "params": {
            "temperature": 0.00001,
            "top_p": 0.00001,
            "max_tokens": 256,
            "stop": ["<|eot|>"],
            "extra": {
                "num_fewshot": 8,
                "batch_size": 16,
                "bootstrap_iters": 100000,
                "dataset_seed": 42,
                "use_greedy": true,
                "top_k": 1,
                "hf_token": "<my-token>",
                "tokenizer_backend": "hf",
                "tokenizer": "meta-llama/Llama-3.1-8B-Instruct",
                "apply_chat_template": true,
                "fewshot_as_multiturn": true
            }
        }
    }'

Options#

API#

  1. Perform a POST request to the /v1/evaluation/configs endpoint.

    curl -X "POST" "${EVALUATOR_SERVICE_URL}/v1/evaluation/configs" \
        -H 'accept: application/json' \
        -H 'Content-Type: application/json' \
        -d '
        {
            "type": "<evaluation-type>",
            "name": "<my-configuration-name>",
            "namespace": "<my-namespace>",
            // More config details
        }'
    
    Or send the same request with the Python requests library:

    import os
    import requests

    data = {
        "type": "<evaluation-type>",
        "name": "<my-configuration-name>",
        "namespace": "<my-namespace>",
        # More config details
    }

    endpoint = f"{os.environ['EVALUATOR_BASE_URL']}/v1/evaluation/configs"

    response = requests.post(endpoint, json=data).json()
    
  2. Review the returned configuration. A sketch for retrieving the stored configuration later appears after the example response.

    Example Response
    {
        "created_at": "2025-03-19T22:50:02.206136",
        "updated_at": "2025-03-19T22:50:02.206138",
        "id": "eval-config-MNOP1234QRST5678",
        "name": "my-configuration-lm-harness-gsm8k-1",
        "namespace": "my-organization",
        "type": "gsm8k",
        "params": {
            "temperature": 0.00001,      
            "top_p": 0.00001,
            "max_tokens": 256,
            "stop": ["<|eot|>"],
            "extra": {
                "num_fewshot": 8,
                "batch_size": 16,
                "bootstrap_iters": 100000,
                "dataset_seed": 42,
                "use_greedy": true,
                "top_k": 1,
                "hf_token": "<my-token>",
                "tokenizer_backend": "hf",
                "tokenizer": "meta-llama/Llama-3.1-8B-Instruct",
                "apply_chat_template": true,
                "fewshot_as_multiturn": true
            }
        },
        "custom_fields": {}
    }
    
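    You typically reference this configuration by its id, or by "namespace/name", when you create an evaluation job. As a quick check, you can also fetch the stored configuration back. The following is a minimal sketch, assuming a retrieval endpoint of the form GET /v1/evaluation/configs/{namespace}/{name}; verify the exact path against the Evaluator API reference for your deployment.

    import os
    import requests

    # Sketch: fetch the configuration created above.
    # The retrieval path is an assumption; adjust it to your deployment.
    base_url = os.environ['EVALUATOR_BASE_URL']
    namespace = "my-organization"
    name = "my-configuration-lm-harness-gsm8k-1"

    response = requests.get(
        f"{base_url}/v1/evaluation/configs/{namespace}/{name}", timeout=30
    )
    response.raise_for_status()
    config = response.json()
    print(config["id"], config["type"])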

SDK#

The following example uses the synchronous client.

import os
from nemo_microservices import NeMoMicroservices

# Initialize the client
client = NeMoMicroservices(
    base_url=os.environ['EVALUATOR_BASE_URL']
)

# Create an evaluation config
config = client.evaluation.configs.create(
    type="gsm8k",
    name="my-configuration-lm-harness-gsm8k-1",
    namespace="my-organization",
    params={
        "temperature": 0.00001,
        "top_p": 0.00001,
        "max_tokens": 256,
        "stop": ["<|eot|>"],
        "extra": {
            "num_fewshot": 8,
            "batch_size": 16,
            "bootstrap_iters": 100000,
            "dataset_seed": 42,
            "use_greedy": True,
            "top_k": 1,
            "hf_token": "<my-token>",
            "tokenizer_backend": "hf",
            "tokenizer": "meta-llama/Llama-3.1-8B-Instruct",
            "apply_chat_template": True,
            "fewshot_as_multiturn": True
        }
    }
)

print("Evaluation config created successfully")

To use the asynchronous client instead:

import asyncio
import os
from nemo_microservices import AsyncNeMoMicroservices

async def create_evaluation_config():
    # Initialize the async client
    client = AsyncNeMoMicroservices(
        base_url=os.environ['EVALUATOR_BASE_URL']
    )
    
    # Create an evaluation config
    config = await client.evaluation.configs.create(
        type="gsm8k",
        name="my-configuration-lm-harness-gsm8k-1",
        namespace="my-organization",
        params={
            "temperature": 0.00001,
            "top_p": 0.00001,
            "max_tokens": 256,
            "stop": ["<|eot|>"],
            "extra": {
                "num_fewshot": 8,
                "batch_size": 16,
                "bootstrap_iters": 100000,
                "dataset_seed": 42,
                "use_greedy": True,
                "top_k": 1,
                "hf_token": "<my-token>",
                "tokenizer_backend": "hf",
                "tokenizer": "meta-llama/Llama-3.1-8B-Instruct",
                "apply_chat_template": True,
                "fewshot_as_multiturn": True
            }
        }
    )
    
    print("Evaluation config created successfully")
    return config

# Run the async function
asyncio.run(create_evaluation_config())
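
Whichever client you use, the create call returns the stored configuration. The following is a short sketch of inspecting the object returned by the synchronous example above; the attribute names mirror the fields in the example response, and attribute-style access on the SDK response object is an assumption.

# Sketch: inspect the configuration returned by configs.create() above.
# Attribute names follow the example response (id, namespace, name, type).
print(f"Config id: {config.id}")
print(f"Reference: {config.namespace}/{config.name}")
print(f"Type:      {config.type}")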