Get Evaluation Results#

To get evaluation results as a JSON response, send a GET request to the `evaluation/jobs/<job-id>/results` endpoint (prefixed with the API version, `/v1` or `/v2`). You must provide the ID of the job, as shown in the following code.

v2 (Preview)#

Warning

v2 API Preview: The v2 API is available for testing and feedback but is not yet recommended for production use. Breaking changes may occur before the stable release.

The v2 API provides structured access to different types of results.

import os
from nemo_microservices import NeMoMicroservices

# Create an SDK client pointed at the Evaluator service
client = NeMoMicroservices(
    base_url=os.environ["EVALUATOR_BASE_URL"]
)

job_id = "job-id"

# Enumerate the result types this job produced (v2 API)
available_results = client.v2.evaluation.jobs.results.list(job_id)

# Fetch the structured evaluation metrics for the job
evaluation_results = client.v2.evaluation.jobs.results.evaluation_results.retrieve(job_id)

result_names = [entry.result_name for entry in available_results.data]
print(f"Available results: {result_names}")
print(f"Evaluation results: {evaluation_results}")
# List the result types available for the job (v2 API)
curl -X "GET" "${EVALUATOR_BASE_URL}/v2/evaluation/jobs/<job-id>/results" \
  -H 'accept: application/json'

# Download the structured evaluation results
curl -X "GET" "${EVALUATOR_BASE_URL}/v2/evaluation/jobs/<job-id>/results/evaluation-results/download" \
  -H 'accept: application/json'

# Retrieve the job artifacts (logs, intermediate files, and other outputs)
curl -X "GET" "${EVALUATOR_BASE_URL}/v2/evaluation/jobs/<job-id>/results/artifacts" \
  -H 'accept: application/json'
v2 Available Results Response
[
  {
    "result_name": "evaluation-results",
    "job_id": "job-dq1pjj6vj5p64xaeqgvuk4",
    "created_at": "2025-09-08T19:21:43.078131",
    "artifact_url": "hf://default/job-results-job-dq1pjj6vj5p64xaeqgvuk4/evaluation-results",
    "artifact_storage_type": "nds"
  },
  {
    "result_name": "artifacts",
    "job_id": "job-dq1pjj6vj5p64xaeqgvuk4",
    "created_at": "2025-09-08T19:21:39.665664",
    "artifact_url": "hf://default/job-results-job-dq1pjj6vj5p64xaeqgvuk4/artifacts",
    "artifact_storage_type": "nds"
  }
]
v2 Evaluation Results Response
{
  "id": "evaluation_result-SWUrgtWYsFCu1TgeYPpRSN",
  "job": "job-dq1pjj6vj5p64xaeqgvuk4",
  "tasks": null,
  "groups": {
    "bfcl": {
      "metrics": {
        "live_overall_acc": {
          "scores": {
            "live_overall_acc": {
              "value": 0,
              "stats": {}
            }
          }
        },
        "multi_turn_overall_acc": {
          "scores": {
            "multi_turn_overall_acc": {
              "value": 0,
              "stats": {}
            }
          }
        },
        "non_live_overall_acc": {
          "scores": {
            "non_live_overall_acc": {
              "value": 0,
              "stats": {}
            }
          }
        }
      }
    }
  }
}

v2 Result Types#

The v2 API distinguishes between the following result types:

  • evaluation-results: The structured evaluation metrics and scores (equivalent to v1 /results)

  • artifacts: Job artifacts including logs, intermediate files, and other outputs

v1 (Current)#

import os
from nemo_microservices import NeMoMicroservices

# Create an SDK client pointed at the Evaluator service
client = NeMoMicroservices(
    base_url=os.environ["EVALUATOR_BASE_URL"]
)

# Fetch the results for a completed job (v1 API)
results = client.evaluation.jobs.results("job-id")

# Inspect the top-level fields of the result object
for label, value in [
    ("Result ID", results.id),
    ("Job ID", results.job),
    ("Tasks", results.tasks),
    ("Groups", results.groups),
]:
    print(f"{label}: {value}")
# Fetch the v1 job results as a JSON response
curl -X "GET" "${EVALUATOR_BASE_URL}/v1/evaluation/jobs/<job-id>/results" \
  -H 'accept: application/json'
Example Response
{
    "created_at": "2025-03-19T22:53:43.619932",
    "updated_at": "2025-03-19T22:53:43.619934",
    "id": "evaluation_result-1234ABCD5678EFGH",
    "job": "eval-UVW123XYZ456",
    "tasks": {
        "exact_match": {
            "metrics": {
                "exact_match": {
                    "scores": {
                        "gsm8k-metric_ranking-1": {
                            "value": 0.0
                        },
                        "gsm8k-metric_ranking-3": {
                            "value": 0.8
                        }
                    }
                }
            }
        },
        "exact_match_stderr": {
            "metrics": {
                "exact_match_stderr": {
                    "scores": {
                        "gsm8k-metric_ranking-2": {
                            "value": 0.0
                        },
                        "gsm8k-metric_ranking-4": {
                            "value": 0.19999999999999998
                        }
                    }
                }
            }
        }
    },
    "groups": {
        "evaluation": {
            "metrics": {
                "evaluation": {
                    "scores": {
                        "exact_match": {
                            "value": 0.4
                        },
                        "exact_match_stderr": {
                            "value": 0.09999999999999999
                        }
                    }
                }
            }
        }
    },
    "namespace": "default",
    "custom_fields": {}
}