Run an LLM Judge Eval#
Learn how to run an LLM Judge evaluation over a custom dataset.
- Evaluation type: custom, using the LLM-as-a-Judge flow
- Dataset: HelpSteer2 
Tip
This tutorial takes around 3 minutes to complete.
Prerequisites#
- Set up Evaluator. Refer to the demo cluster setup prerequisites, or to the production deployment guides for the platform and for Evaluator individually. 
- Store your service URLs as variables for use in code.

  Python:

  import os

  # Replace with your endpoints
  os.environ["EVALUATOR_BASE_URL"] = "http(s)://<your evaluator service endpoint>"
  os.environ["NEMO_DATASTORE_URL"] = "http(s)://<your datastore endpoint>"

  # Namespace and dataset name used for hf:// URL
  NAMESPACE = "default"
  DATASET_NAME = "my-repo"

  # Required tokens
  os.environ["NVIDIA_NIM_API_KEY"] = "<your model API key>"
  os.environ["HF_TOKEN"] = "<your HF token>"

  Shell:

  export EVALUATOR_BASE_URL="http(s)://<your evaluator service endpoint>"
  export NEMO_DATASTORE_URL="http(s)://<your datastore endpoint>"
  export NAMESPACE="default"
  export DATASET_NAME="my-repo"
  export NVIDIA_NIM_API_KEY="<your model API key>"
  export HF_TOKEN="<your HF token>"
- Ensure your Hugging Face token can create and write to datasets. 
- Set up a judge LLM in your cluster, or use an external endpoint such as one from build.nvidia.com (for example, Llama 3.3 70B). A quick connectivity check is sketched after this list.
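Before wiring the judge into an evaluation config, it can help to confirm that the endpoint is reachable. The sketch below assumes an OpenAI-compatible chat completions API; the URL and model ID shown (`meta/llama-3.3-70b-instruct`) are placeholder examples, so adapt both to your judge endpoint.

import os
import requests

# Assumed example values; replace with your own judge endpoint and model ID
JUDGE_URL = "https://integrate.api.nvidia.com/v1"
JUDGE_MODEL_ID = "meta/llama-3.3-70b-instruct"

# Send one trivial chat request to confirm the judge responds
resp = requests.post(
    f"{JUDGE_URL}/chat/completions",
    headers={"Authorization": f"Bearer {os.environ['NVIDIA_NIM_API_KEY']}"},
    json={
        "model": JUDGE_MODEL_ID,
        "messages": [{"role": "user", "content": "Reply with the single word: ready"}],
        "max_tokens": 5,
    },
    timeout=60,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])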
1. Prepare Your Dataset#
First, we’ll prepare a custom dataset from HelpSteer2 by extracting only the prompt and response columns for evaluation. Later, we’ll compare the LLM judge’s predictions with the dataset’s original metrics.
- Download and process the dataset.

  import pandas as pd

  # Download the HelpSteer2 dataset from Hugging Face
  df = pd.read_json("hf://datasets/nvidia/HelpSteer2/train.jsonl.gz", lines=True)

  # Extract only the prompt and response columns for evaluation
  df = df[["prompt", "response"]].head(30)

  # Save to a local file
  file_name = "helpsteer2.jsonl"
  df.to_json(file_name, orient="records", lines=True)

  print(f"Dataset prepared with {len(df)} samples")
  print("Sample data:")
  print(df.head())
- Upload the dataset to NeMo Data Store (a quick verification sketch follows after this list).

  import os
  from huggingface_hub import HfApi

  HF_ENDPOINT = f"{os.environ['NEMO_DATASTORE_URL']}/v1/hf"
  hf_api = HfApi(endpoint=HF_ENDPOINT, token=os.environ["HF_TOKEN"])

  repo_id = f"{NAMESPACE}/{DATASET_NAME}"

  # Create the dataset repo if it doesn't exist
  hf_api.create_repo(repo_id=repo_id, repo_type="dataset", exist_ok=True)

  # Upload the file
  result = hf_api.upload_file(
      path_or_fileobj=file_name,
      path_in_repo=file_name,
      repo_id=repo_id,
      repo_type="dataset",
      revision="main",
      commit_message=f"Eval dataset in {repo_id}"
  )
  print(f"Dataset uploaded: {result}")
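To confirm the upload landed where the evaluation job expects it, you can list the files in the repo. This is a minimal sketch that reuses the `hf_api` client and `repo_id` from the upload step; `list_repo_files` is part of `huggingface_hub`.

# List the files now stored in the dataset repo on NeMo Data Store
files = hf_api.list_repo_files(repo_id=repo_id, repo_type="dataset")
print(f"Files in {repo_id}: {files}")

# The files_url used later in the evaluation config points at this repo
print(f"hf://datasets/{repo_id}")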
2. Submit the Evaluation Job#
import os
from nemo_microservices import NeMoMicroservices
client = NeMoMicroservices(base_url=os.environ["EVALUATOR_BASE_URL"])
# Model endpoint settings for the judge
MODEL_BASE_URL = "<insert LLM Judge model URL here>"  # URL of your judge endpoint; any OpenAI-compatible endpoint works, such as an OpenAI or NIM endpoint
MODEL_ID = "<insert model ID here>"                   # ID of the judge model served at that endpoint
files_url = f"hf://datasets/{NAMESPACE}/{DATASET_NAME}"
# Inline config mirrors the developer notebook
config = {
    "type": "custom",
    "name": "my-config",
    "namespace": NAMESPACE,
    "tasks": {
        "my-task": {
            "type": "data",
            "metrics": {
                "my_eval": {
                    "type": "llm-judge",
                    "params": {
                        "model": {
                            "api_endpoint": {
                                "url": MODEL_BASE_URL,
                                "model_id": MODEL_ID,
                                "format": "openai",
                                "api_key": os.environ["NVIDIA_NIM_API_KEY"]
                            }
                        },
                        "template": {
                            "messages": [
                                {"role": "system", "content": "You are an expert evaluator for answers to user queries. Your task is to assess responses to user queries based on helpfulness, relevance, accuracy, and clarity."},
                                {"role": "user", "content": "Calculate the following metrics for the response: User Query: {{item.prompt}} Model Response: {{item.response}} Metrics: 1. Helpfulness (0-4): How well does the response help the user? 2. Correctness (0-4): Is the information correct? 3. Coherence (0-4): Is the response logically consistent and well-structured? 4. Complexity (0-4): How sophisticated is the response? 5. Verbosity (0-4): Is the response appropriately detailed? Instructions: Assign a score from 0 (poor) to 4 (excellent) for each metric. Respond in JSON format only: { \"helpfulness\": ..., \"correctness\": ..., \"coherence\": ..., \"complexity\": ..., \"verbosity\": ... }"}
                            ]
                        },
                        "scores": {
                            "helpfulness": {"type": "int", "parser": {"type": "regex", "pattern": "(?s).*\"helpfulness\": *(\\d+)"}},
                            "correctness": {"type": "int", "parser": {"type": "regex", "pattern": "(?s).*\"correctness\": *(\\d+)"}},
                            "coherence":   {"type": "int", "parser": {"type": "regex", "pattern": "(?s).*\"coherence\": *(\\d+)"}},
                            "complexity":  {"type": "int", "parser": {"type": "regex", "pattern": "(?s).*\"complexity\": *(\\d+)"}},
                            "verbosity":   {"type": "int", "parser": {"type": "regex", "pattern": "(?s).*\"verbosity\": *(\\d+)"}}
                        }
                    }
                }
            },
            "dataset": {"files_url": files_url}
        }
    }
}
target = {"type": "dataset", "dataset": {"files_url": files_url}}
job = client.evaluation.jobs.create(
    namespace=NAMESPACE,
    target=target,
    config=config
)
job_id = job.id
curl -X POST "${EVALUATOR_BASE_URL}/v1/evaluation/jobs" \
  -H 'accept: application/json' \
  -H 'Content-Type: application/json' \
  -d '{
    "namespace": "'${NAMESPACE}'",
    "target": {"type": "dataset", "dataset": {"files_url": "hf://datasets/'${NAMESPACE}'/'${DATASET_NAME}'"}},
    "config": {
      "type": "custom",
      "name": "my-config",
      "namespace": "'${NAMESPACE}'",
      "tasks": {
        "my-task": {
          "type": "data",
          "metrics": {
            "my_eval": {
              "type": "llm-judge",
              "params": {
                "model": {
                  "api_endpoint": {
                    "url": "https://integrate.api.nvidia.com/v1",
                    "model_id": "<insert model ID here>",
                    "format": "openai",
                    "api_key": "'${NVIDIA_NIM_API_KEY}'"
                  }
                },
                "template": {
                  "messages": [
                    {"role": "system", "content": "You are an expert evaluator for answers to user queries. Your task is to assess responses to user queries based on helpfulness, relevance, accuracy, and clarity."},
                    {"role": "user", "content": "Calculate the following metrics for the response: User Query: {{item.prompt}} Model Response: {{item.response}} Metrics: 1. Helpfulness (0-4): How well does the response help the user? 2. Correctness (0-4): Is the information correct? 3. Coherence (0-4): Is the response logically consistent and well-structured? 4. Complexity (0-4): How sophisticated is the response? 5. Verbosity (0-4): Is the response appropriately detailed? Instructions: Assign a score from 0 (poor) to 4 (excellent) for each metric. Respond in JSON format only: { \"helpfulness\": ..., \"correctness\": ..., \"coherence\": ..., \"complexity\": ..., \"verbosity\": ... }"}
                  ]
                },
                "scores": {
                  "helpfulness": {"type": "int", "parser": {"type": "regex", "pattern": "(?s).*\\\"helpfulness\\\": *(\\\\d+)"}},
                  "correctness": {"type": "int", "parser": {"type": "regex", "pattern": "(?s).*\\\"correctness\\\": *(\\\\d+)"}},
                  "coherence":   {"type": "int", "parser": {"type": "regex", "pattern": "(?s).*\\\"coherence\\\": *(\\\\d+)"}},
                  "complexity":  {"type": "int", "parser": {"type": "regex", "pattern": "(?s).*\\\"complexity\\\": *(\\\\d+)"}},
                  "verbosity":   {"type": "int", "parser": {"type": "regex", "pattern": "(?s).*\\\"verbosity\\\": *(\\\\d+)"}}
                }
              }
            }
          },
          "dataset": {"files_url": "hf://datasets/'${NAMESPACE}'/'${DATASET_NAME}'"}
        }
      }
    }
  }'
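The `scores` section tells Evaluator how to extract each integer metric from the judge's reply with a regex parser. If you want to see how those patterns behave before running a job, you can exercise them locally against a sample reply; this sketch is illustrative only and is not part of the job submission.

import json
import re

# A sample reply in the JSON format the prompt asks the judge to produce (illustrative only)
sample_judge_reply = json.dumps(
    {"helpfulness": 3, "correctness": 4, "coherence": 4, "complexity": 2, "verbosity": 3}
)

# The same patterns used in the config's "scores" section
patterns = {
    "helpfulness": r'(?s).*"helpfulness": *(\d+)',
    "correctness": r'(?s).*"correctness": *(\d+)',
    "coherence":   r'(?s).*"coherence": *(\d+)',
    "complexity":  r'(?s).*"complexity": *(\d+)',
    "verbosity":   r'(?s).*"verbosity": *(\d+)',
}

for name, pattern in patterns.items():
    match = re.match(pattern, sample_judge_reply)
    print(name, "->", int(match.group(1)) if match else "no match")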
3. Check Job Status#
Check job status until it completes.
# Get the status of the evaluation job using the job_id from the previous step
job_status = client.evaluation.jobs.status(job_id)
print(f"Job status: {job_status.message}")
print(f"Progress: {job_status.progress}%")
curl -X "GET" "${EVALUATOR_BASE_URL}/v1/evaluation/jobs/${job_id}/status" \
  -H 'accept: application/json'
Example response:
{
  "message": "completed",
  "task_status": {},
  "progress": 100
}
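The calls above return the status once. To block until the job finishes, you can poll in a loop; this is a minimal sketch that reuses `client` and `job_id` from step 2 and assumes `completed`, `failed`, and `cancelled` are the terminal status messages.

import time

# Poll every 10 seconds until the job reaches a terminal state
# (terminal message names are an assumption; "completed" matches the example response above)
while True:
    status = client.evaluation.jobs.status(job_id)
    print(f"Status: {status.message} ({status.progress}%)")
    if status.message in ("completed", "failed", "cancelled"):
        break
    time.sleep(10)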
4. Retrieve Evaluation Results#
Once the job completes, retrieve the evaluation results to analyze the LLM judge’s assessments.
As Download#
Download the results ZIP to a local file.
zip_response = client.evaluation.jobs.download_results(job_id)
zip_response.write_to_file("evaluation_results.zip")
print("Saved to evaluation_results.zip")
curl -X GET "${EVALUATOR_BASE_URL}/v1/evaluation/jobs/${job_id}/download-results" \
  -H 'accept: application/zip' \
  -o evaluation_results.zip
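After the download completes, you can inspect the archive with the Python standard library before opening individual files. The exact layout of the archive depends on the evaluation type, so this sketch only lists and extracts its contents.

import zipfile

# List and extract the contents of the downloaded results archive
with zipfile.ZipFile("evaluation_results.zip") as zf:
    for name in zf.namelist():
        print(name)
    zf.extractall("evaluation_results")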
As JSON#
View results as JSON.
results = client.evaluation.jobs.results(job_id)
# Access the results
print(f"Result ID: {results.id}")
print(f"Job ID: {results.job}")
print(f"Tasks: {results.tasks}")
print(f"Groups: {results.groups}")
curl -X GET "${EVALUATOR_BASE_URL}/v1/evaluation/jobs/${job_id}/results" -H 'accept: application/json'
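To follow up on the comparison mentioned in step 1, you can compute the original HelpSteer2 annotations for the same 30 samples as a reference point for the judge's scores. The sketch below only derives the reference means from the source dataset; where the judge's aggregate scores sit inside `results.tasks` depends on the results schema, so inspect that object to line the two up.

import pandas as pd

# Reload the same 30 HelpSteer2 rows, this time keeping the human-annotated metrics
ref = pd.read_json("hf://datasets/nvidia/HelpSteer2/train.jsonl.gz", lines=True).head(30)
metric_cols = ["helpfulness", "correctness", "coherence", "complexity", "verbosity"]

print("HelpSteer2 reference means (first 30 samples):")
print(ref[metric_cols].mean().round(2))

# Judge-side aggregates are reported under results.tasks; inspect its structure to compare
print("Judge results:", results.tasks)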