Run an LLM Judge Eval#
Learn how to run an LLM Judge evaluation over a custom dataset.
Evaluation type: custom, using the LLM-as-a-Judge flow
Dataset: HelpSteer2
Tip
This tutorial takes around 3 minutes to complete.
Prerequisites#
Set up Evaluator. Refer to the demo cluster setup prerequisites, or to the production deployment guides for deploying the platform and Evaluator individually.
Store your service URLs as variables for use in code.
import os

# Replace with your endpoints
os.environ["EVALUATOR_BASE_URL"] = "http(s)://<your evaluator service endpoint>"
os.environ["NEMO_DATASTORE_URL"] = "http(s)://<your datastore endpoint>"

# Namespace and dataset name used for the hf:// URL
NAMESPACE = "default"
DATASET_NAME = "my-repo"

# Required tokens
os.environ["NVIDIA_NIM_API_KEY"] = "<your model API key>"
os.environ["HF_TOKEN"] = "<your HF token>"
export EVALUATOR_BASE_URL="http(s)://<your evaluator service endpoint>"
export NEMO_DATASTORE_URL="http(s)://<your datastore endpoint>"
export NAMESPACE="default"
export DATASET_NAME="my-repo"
export NVIDIA_NIM_API_KEY="<your model API key>"
export HF_TOKEN="<your HF token>"
Ensure your Hugging Face token can create and write to datasets.
Set up a judge LLM in your cluster, or use an external one such as a model from build.nvidia.com (for example, Llama 3.3 70B). A quick endpoint check is sketched after this list.
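If you want to confirm the judge endpoint before submitting jobs, the following is a minimal sketch, assuming an OpenAI-compatible /chat/completions route on your judge URL. The URL and model ID are placeholders you must fill in, and the request reuses the NVIDIA_NIM_API_KEY variable from the setup step.

import os
import requests

# Hypothetical placeholders: replace with your judge endpoint and model ID
JUDGE_URL = "<insert LLM Judge model URL here>"
JUDGE_MODEL_ID = "<insert model ID here>"

# Minimal OpenAI-style chat completion to confirm the judge responds
resp = requests.post(
    f"{JUDGE_URL}/chat/completions",
    headers={"Authorization": f"Bearer {os.environ['NVIDIA_NIM_API_KEY']}"},
    json={
        "model": JUDGE_MODEL_ID,
        "messages": [{"role": "user", "content": "Reply with OK."}],
        "max_tokens": 5,
    },
    timeout=30,
)
print(resp.status_code, resp.json()["choices"][0]["message"]["content"])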
1. Prepare Your Dataset#
First, we'll prepare a custom dataset from HelpSteer2 by extracting only the prompt and response columns for evaluation. Later, we will compare the LLM judge's predictions with the original HelpSteer2 metrics.
Download and process the dataset.
import pandas as pd

# Download the HelpSteer2 dataset from Hugging Face
df = pd.read_json("hf://datasets/nvidia/HelpSteer2/train.jsonl.gz", lines=True)

# Extract only the prompt and response columns for evaluation
df = df[["prompt", "response"]].head(30)

# Save to a local file
file_name = "helpsteer2.jsonl"
df.to_json(file_name, orient="records", lines=True)

print(f"Dataset prepared with {len(df)} samples")
print("Sample data:")
print(df.head())
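Because we will later compare the judge's scores against HelpSteer2's human annotations, it can help to also keep the original metric columns for the same 30 samples. This optional sketch assumes the standard HelpSteer2 column names (helpfulness, correctness, coherence, complexity, verbosity); it re-reads the dataset rather than modifying the snippet above and writes to a hypothetical local file, helpsteer2_reference_scores.jsonl.

# Optional: keep the human-annotated metrics for the same 30 samples for later comparison
ref = pd.read_json("hf://datasets/nvidia/HelpSteer2/train.jsonl.gz", lines=True).head(30)
ref_cols = ["helpfulness", "correctness", "coherence", "complexity", "verbosity"]
ref[ref_cols].to_json("helpsteer2_reference_scores.jsonl", orient="records", lines=True)
print(ref[ref_cols].mean())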
Upload the dataset to NeMo Data Store.
import os
from huggingface_hub import HfApi

HF_ENDPOINT = f"{os.environ['NEMO_DATASTORE_URL']}/v1/hf"
hf_api = HfApi(endpoint=HF_ENDPOINT, token=os.environ["HF_TOKEN"])

repo_id = f"{NAMESPACE}/{DATASET_NAME}"

# Create the dataset repo if it doesn't exist
hf_api.create_repo(repo_id=repo_id, repo_type="dataset", exist_ok=True)

# Upload the file
result = hf_api.upload_file(
    path_or_fileobj=file_name,
    path_in_repo=file_name,
    repo_id=repo_id,
    repo_type="dataset",
    revision="main",
    commit_message=f"Eval dataset in {repo_id}",
)
print(f"Dataset uploaded: {result}")
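To confirm the upload landed where the evaluation job expects it, you can list the files in the repository. This is a small sketch assuming the Data Store's Hugging Face-compatible API supports the standard file-listing route.

# Optional: confirm the dataset file is visible in the repository
print(hf_api.list_repo_files(repo_id=repo_id, repo_type="dataset"))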
2. Submit the Evaluation Job#
v2 (Preview)#
Warning
v2 API Preview: The v2 API is available for testing and feedback but is not yet recommended for production use. Breaking changes may occur before the stable release.
import os
from nemo_microservices import NeMoMicroservices
client = NeMoMicroservices(base_url=os.environ["EVALUATOR_BASE_URL"])
# Model endpoint settings for the judge
MODEL_BASE_URL = "<insert LLM Judge model URL here>"  # Any OpenAI-compatible endpoint works, such as an OpenAI or NIM endpoint
MODEL_ID = "<insert model ID here>"  # Replace as needed
files_url = f"hf://datasets/{NAMESPACE}/{DATASET_NAME}"
# Inline config mirrors the developer notebook
config = {
"type": "custom",
"tasks": {
"my-task": {
"type": "data",
"metrics": {
"my_eval": {
"type": "llm-judge",
"params": {
"model": {
"api_endpoint": {
"url": MODEL_BASE_URL,
"model_id": MODEL_ID,
"format": "openai",
"api_key": os.environ["NVIDIA_NIM_API_KEY"]
}
},
"template": {
"messages": [
{"role": "system", "content": "You are an expert evaluator for answers to user queries. Your task is to assess responses to user queries based on helpfulness, relevance, accuracy, and clarity."},
{"role": "user", "content": "Calculate the following metrics for the response: User Query: {{item.prompt}} Model Response: {{item.response}} Metrics: 1. Helpfulness (0-4): How well does the response help the user? 2. Correctness (0-4): Is the information correct? 3. Coherence (0-4): Is the response logically consistent and well-structured? 4. Complexity (0-4): How sophisticated is the response? 5. Verbosity (0-4): Is the response appropriately detailed? Instructions: Assign a score from 0 (poor) to 4 (excellent) for each metric. Respond in JSON format only: { \"helpfulness\": ..., \"correctness\": ..., \"coherence\": ..., \"complexity\": ..., \"verbosity\": ... }"}
]
},
"scores": {
"helpfulness": {"type": "int", "parser": {"type": "regex", "pattern": "(?s).*\"helpfulness\": *(\\d+)"}},
"correctness": {"type": "int", "parser": {"type": "regex", "pattern": "(?s).*\"correctness\": *(\\d+)"}},
"coherence": {"type": "int", "parser": {"type": "regex", "pattern": "(?s).*\"coherence\": *(\\d+)"}},
"complexity": {"type": "int", "parser": {"type": "regex", "pattern": "(?s).*\"complexity\": *(\\d+)"}},
"verbosity": {"type": "int", "parser": {"type": "regex", "pattern": "(?s).*\"verbosity\": *(\\d+)"}}
}
}
}
},
"dataset": {"files_url": files_url}
}
}
}
target = {"type": "dataset", "dataset": {"files_url": files_url}}
job = client.v2.evaluation.jobs.create(
spec={
"target": target,
"config": config
}
)
job_id = job.id
curl -X POST "${EVALUATOR_BASE_URL}/v2/evaluation/jobs" \
-H 'accept: application/json' \
-H 'Content-Type: application/json' \
-d '{
"spec": {
"target": {"type": "dataset", "dataset": {"files_url": "hf://datasets/'${NAMESPACE}'/'${DATASET_NAME}'"}},
"config": {
"type": "custom",
"tasks": {
"my-task": {
"type": "data",
"metrics": {
"my_eval": {
"type": "llm-judge",
"params": {
"model": {
"api_endpoint": {
"url": "<insert LLM Judge model URL here>",
"model_id": "<insert model ID here>",
"format": "openai",
"api_key": "'${NVIDIA_NIM_API_KEY}'"
}
},
"template": {
"messages": [
{"role": "system", "content": "You are an expert evaluator for answers to user queries. Your task is to assess responses to user queries based on helpfulness, relevance, accuracy, and clarity."},
{"role": "user", "content": "Calculate the following metrics for the response: User Query: {{item.prompt}} Model Response: {{item.response}} Metrics: 1. Helpfulness (0-4): How well does the response help the user? 2. Correctness (0-4): Is the information correct? 3. Coherence (0-4): Is the response logically consistent and well-structured? 4. Complexity (0-4): How sophisticated is the response? 5. Verbosity (0-4): Is the response appropriately detailed? Instructions: Assign a score from 0 (poor) to 4 (excellent) for each metric. Respond in JSON format only: { \"helpfulness\": ..., \"correctness\": ..., \"coherence\": ..., \"complexity\": ..., \"verbosity\": ... }"}
]
},
"scores": {
"helpfulness": {"type": "int", "parser": {"type": "regex", "pattern": "(?s).*\"helpfulness\": *(\\\\d+)"}},
"correctness": {"type": "int", "parser": {"type": "regex", "pattern": "(?s).*\"correctness\": *(\\\\d+)"}},
"coherence": {"type": "int", "parser": {"type": "regex", "pattern": "(?s).*\"coherence\": *(\\\\d+)"}},
"complexity": {"type": "int", "parser": {"type": "regex", "pattern": "(?s).*\"complexity\": *(\\\\d+)"}},
"verbosity": {"type": "int", "parser": {"type": "regex", "pattern": "(?s).*\"verbosity\": *(\\\\d+)"}}
}
}
}
},
"dataset": {"files_url": "hf://datasets/'${NAMESPACE}'/'${DATASET_NAME}'"}
}
}
}
}
}'
v1 (Current)#
import os
from nemo_microservices import NeMoMicroservices
client = NeMoMicroservices(base_url=os.environ["EVALUATOR_BASE_URL"])
# Model endpoint settings for the judge
MODEL_BASE_URL = "<insert LLM Judge model URL here>"  # Any OpenAI-compatible endpoint works, such as an OpenAI or NIM endpoint
MODEL_ID = "<insert model ID here>"  # Replace as needed
files_url = f"hf://datasets/{NAMESPACE}/{DATASET_NAME}"
# Inline config mirrors the developer notebook
config = {
"type": "custom",
"name": "my-config",
"namespace": NAMESPACE,
"tasks": {
"my-task": {
"type": "data",
"metrics": {
"my_eval": {
"type": "llm-judge",
"params": {
"model": {
"api_endpoint": {
"url": MODEL_BASE_URL,
"model_id": MODEL_ID,
"format": "openai",
"api_key": os.environ["NVIDIA_NIM_API_KEY"]
}
},
"template": {
"messages": [
{"role": "system", "content": "You are an expert evaluator for answers to user queries. Your task is to assess responses to user queries based on helpfulness, relevance, accuracy, and clarity."},
{"role": "user", "content": "Calculate the following metrics for the response: User Query: {{item.prompt}} Model Response: {{item.response}} Metrics: 1. Helpfulness (0-4): How well does the response help the user? 2. Correctness (0-4): Is the information correct? 3. Coherence (0-4): Is the response logically consistent and well-structured? 4. Complexity (0-4): How sophisticated is the response? 5. Verbosity (0-4): Is the response appropriately detailed? Instructions: Assign a score from 0 (poor) to 4 (excellent) for each metric. Respond in JSON format only: { \"helpfulness\": ..., \"correctness\": ..., \"coherence\": ..., \"complexity\": ..., \"verbosity\": ... }"}
]
},
"scores": {
"helpfulness": {"type": "int", "parser": {"type": "regex", "pattern": "(?s).*\"helpfulness\": *(\\d+)"}},
"correctness": {"type": "int", "parser": {"type": "regex", "pattern": "(?s).*\"correctness\": *(\\d+)"}},
"coherence": {"type": "int", "parser": {"type": "regex", "pattern": "(?s).*\"coherence\": *(\\d+)"}},
"complexity": {"type": "int", "parser": {"type": "regex", "pattern": "(?s).*\"complexity\": *(\\d+)"}},
"verbosity": {"type": "int", "parser": {"type": "regex", "pattern": "(?s).*\"verbosity\": *(\\d+)"}}
}
}
}
},
"dataset": {"files_url": files_url}
}
}
}
target = {"type": "dataset", "dataset": {"files_url": files_url}}
job = client.evaluation.jobs.create(
namespace=NAMESPACE,
target=target,
config=config
)
job_id = job.id
curl -X POST "${EVALUATOR_BASE_URL}/v1/evaluation/jobs" \
-H 'accept: application/json' \
-H 'Content-Type: application/json' \
-d '{
"namespace": "'${NAMESPACE}'",
"target": {"type": "dataset", "dataset": {"files_url": "hf://datasets/'${NAMESPACE}'/'${DATASET_NAME}'"}},
"config": {
"type": "custom",
"name": "my-config",
"namespace": "'${NAMESPACE}'",
"tasks": {
"my-task": {
"type": "data",
"metrics": {
"my_eval": {
"type": "llm-judge",
"params": {
"model": {
"api_endpoint": {
"url": "https://integrate.api.nvidia.com/v1",
"model_id": "<insert model ID here>",
"format": "openai",
"api_key": "'${NVIDIA_NIM_API_KEY}'"
}
},
"template": {
"messages": [
{"role": "system", "content": "You are an expert evaluator for answers to user queries. Your task is to assess responses to user queries based on helpfulness, relevance, accuracy, and clarity."},
{"role": "user", "content": "Calculate the following metrics for the response: User Query: {{item.prompt}} Model Response: {{item.response}} Metrics: 1. Helpfulness (0-4): How well does the response help the user? 2. Correctness (0-4): Is the information correct? 3. Coherence (0-4): Is the response logically consistent and well-structured? 4. Complexity (0-4): How sophisticated is the response? 5. Verbosity (0-4): Is the response appropriately detailed? Instructions: Assign a score from 0 (poor) to 4 (excellent) for each metric. Respond in JSON format only: { \"helpfulness\": ..., \"correctness\": ..., \"coherence\": ..., \"complexity\": ..., \"verbosity\": ... }"}
]
},
"scores": {
"helpfulness": {"type": "int", "parser": {"type": "regex", "pattern": "(?s).*\\\"helpfulness\\\": *(\\\\d+)"}},
"correctness": {"type": "int", "parser": {"type": "regex", "pattern": "(?s).*\\\"correctness\\\": *(\\\\d+)"}},
"coherence": {"type": "int", "parser": {"type": "regex", "pattern": "(?s).*\\\"coherence\\\": *(\\\\d+)"}},
"complexity": {"type": "int", "parser": {"type": "regex", "pattern": "(?s).*\\\"complexity\\\": *(\\\\d+)"}},
"verbosity": {"type": "int", "parser": {"type": "regex", "pattern": "(?s).*\\\"verbosity\\\": *(\\\\d+)"}}
}
}
}
},
"dataset": {"files_url": "hf://datasets/'${NAMESPACE}'/'${DATASET_NAME}'"}
}
}
}
}'
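In both versions of the config, the `{{item.prompt}}` and `{{item.response}}` placeholders in the judge messages are filled from the corresponding columns of the uploaded JSONL file, and each entry under `scores` pulls an integer out of the judge's JSON reply with a regular expression. The snippet below is a standalone illustration of how one of those patterns behaves on a sample judge response; it runs locally and is not part of the job submission.

import re

# A sample judge reply in the JSON format requested by the prompt
sample_reply = '{ "helpfulness": 3, "correctness": 4, "coherence": 4, "complexity": 2, "verbosity": 2 }'

# Same pattern as the "helpfulness" score parser in the config
pattern = r'(?s).*"helpfulness": *(\d+)'
match = re.match(pattern, sample_reply)
print(int(match.group(1)))  # 3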
3. Check Job Status#
Check job status until it completes.
v2 (Preview)#
In v2, status information is consolidated into the main job details response.
# v2 - Get all job details including status in one call
job_details = client.v2.evaluation.jobs.retrieve(job.id)
print(f"Job status: {job_details.status}")
print(f"Status details: {job_details.status_details}")
if job_details.status_details and 'progress' in job_details.status_details:
print(f"Progress: {job_details.status_details['progress']}%")
curl -X "GET" "${EVALUATOR_BASE_URL}/v2/evaluation/jobs/${job_id}" \
-H 'accept: application/json'
v1 (Current)#
# Get the status of the evaluation job using the job_id from the previous step
job_status = client.evaluation.jobs.status(job_id)
print(f"Job status: {job_status.message}")
print(f"Progress: {job_status.progress}%")
curl -X "GET" "${EVALUATOR_BASE_URL}/v1/evaluation/jobs/${job_id}/status" \
-H 'accept: application/json'
Example response:
{
"message": "completed",
"task_status": {},
"progress": 100
}
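If you prefer to block until the job finishes instead of checking manually, a simple polling loop works. The sketch below uses the v1 status call shown above; the terminal messages it checks for are assumptions that you may need to adjust for your deployment.

import time

# Poll the status endpoint until the job reaches an assumed terminal message
while True:
    job_status = client.evaluation.jobs.status(job_id)
    print(f"{job_status.message} ({job_status.progress}%)")
    if job_status.message in ("completed", "failed", "cancelled"):
        break
    time.sleep(10)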
4. Retrieve Evaluation Results#
Once the job completes, retrieve the evaluation results to analyze the LLM judge’s assessments.
As Download#
Download results ZIP to a local file.
v2 (Preview)#
# v2 - Download job artifacts (includes logs, intermediate files, etc.)
artifacts_zip = client.v2.evaluation.jobs.results.artifacts.retrieve(job_id)
artifacts_zip.write_to_file("evaluation_artifacts.zip")
print("Saved artifacts to evaluation_artifacts.zip")
# v2 - Download evaluation results separately
eval_results = client.v2.evaluation.jobs.results.evaluation_results.retrieve(job_id)
with open("evaluation_results.json", "w") as f:
f.write(eval_results.model_dump_json(indent=2, exclude_none=True))
print("Saved results to evaluation_results.json")
# Download job artifacts
curl -X GET "${EVALUATOR_BASE_URL}/v2/evaluation/jobs/${job_id}/results/artifacts/download" \
-H 'accept: application/zip' \
-o evaluation_artifacts.zip
# Download evaluation results
curl -X GET "${EVALUATOR_BASE_URL}/v2/evaluation/jobs/${job_id}/results/evaluation-results/download" \
-H 'accept: application/zip' \
-o evaluation_results.zip
v1 (Current)#
zip_response = client.evaluation.jobs.download_results(job_id)
zip_response.write_to_file("evaluation_results.zip")
print("Saved to evaluation_results.zip")
curl -X GET "${EVALUATOR_BASE_URL}/v1/evaluation/jobs/${job_id}/download-results" \
-H 'accept: application/zip' \
-o evaluation_results.zip
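After downloading, you can inspect the archive with the standard library before extracting it; the exact file layout inside the ZIP depends on the evaluation type and release.

import zipfile

# List the contents of the downloaded results archive
with zipfile.ZipFile("evaluation_results.zip") as zf:
    for name in zf.namelist():
        print(name)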
As JSON#
View results as JSON.
v2 (Preview)#
# v2 - Get structured evaluation results
results = client.v2.evaluation.jobs.results.evaluation_results.retrieve(job_id)
# Access the results
print(results)
# v2 - List available result types
available_results = client.v2.evaluation.jobs.results.list(job_id)
print(f"Available results: {[r.result_name for r in available_results.data]}")
# Get structured evaluation results
curl -X GET "${EVALUATOR_BASE_URL}/v2/evaluation/jobs/${job_id}/results/evaluation-results/download" \
-H 'accept: application/json'
# List available result types
curl -X GET "${EVALUATOR_BASE_URL}/v2/evaluation/jobs/${job_id}/results" \
-H 'accept: application/json'
v1 (Current)#
results = client.evaluation.jobs.results(job_id)
# Access the results
print(f"Result ID: {results.id}")
print(f"Job ID: {results.job}")
print(f"Tasks: {results.tasks}")
print(f"Groups: {results.groups}")
curl -X GET "${EVALUATOR_BASE_URL}/v1/evaluation/jobs/${job_id}/results" -H 'accept: application/json'