Run an LLM Judge Eval#

Learn how to run an LLM Judge evaluation over a custom dataset.

Tip

This tutorial takes around 3 minutes to complete.

Prerequisites#

  1. Set up Evaluator. Refer to the demo cluster setup prerequisites, or to the production deployment guides to install the platform and Evaluator individually.

  2. Store your service URLs as variables for use in code.

    import os
    
    # Replace with your endpoints
    os.environ["EVALUATOR_BASE_URL"] = "http(s)://<your evaluator service endpoint>"
    os.environ["NEMO_DATASTORE_URL"] = "http(s)://<your datastore endpoint>"
    
    # Namespace and dataset name used for hf:// URL
    NAMESPACE = "default"
    DATASET_NAME = "my-repo"
    
    # Required tokens
    os.environ["NVIDIA_NIM_API_KEY"] = "<your model API key>"
    os.environ["HF_TOKEN"] = "<your HF token>"
    
    export EVALUATOR_BASE_URL="http(s)://<your evaluator service endpoint>"
    export NEMO_DATASTORE_URL="http(s)://<your datastore endpoint>"
    export NAMESPACE="default"
    export DATASET_NAME="my-repo"
    export NVIDIA_NIM_API_KEY="<your model API key>"
    export HF_TOKEN="<your HF token>"
    
  3. Ensure your Hugging Face token can create and write to datasets.

  4. Set up a Judge LLM in your cluster, or use an external one such as one from build.nvidia.com (Llama 3.3 70B example). A quick connectivity check is sketched after this list.
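
To confirm the judge endpoint is reachable before submitting a job, you can send it a minimal request. This is a sketch only: it assumes an OpenAI-compatible /chat/completions route (the same "openai" format the evaluation config uses later), and MODEL_BASE_URL and MODEL_ID are placeholders for your judge's endpoint and model name.

import os
import requests

MODEL_BASE_URL = "<insert LLM Judge model URL here>"  # e.g. https://integrate.api.nvidia.com/v1
MODEL_ID = "<insert model ID here>"

# Minimal chat completion request to verify connectivity and credentials
response = requests.post(
    f"{MODEL_BASE_URL}/chat/completions",
    headers={"Authorization": f"Bearer {os.environ['NVIDIA_NIM_API_KEY']}"},
    json={
        "model": MODEL_ID,
        "messages": [{"role": "user", "content": "Reply with OK."}],
        "max_tokens": 5,
    },
    timeout=60,
)
response.raise_for_status()
print(response.json()["choices"][0]["message"]["content"])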


1. Prepare Your Dataset#

First, we’ll prepare a custom dataset from HelpSteer2 by extracting only the prompt and response columns for evaluation. Later, we will compare the LLM judge’s predictions with the dataset’s original human annotations.

  1. Download and process the dataset.

    import pandas as pd
    
    # Download the HelpSteer2 dataset from Hugging Face
    # (reading hf:// paths requires the huggingface_hub package to be installed)
    df = pd.read_json("hf://datasets/nvidia/HelpSteer2/train.jsonl.gz", lines=True)
    
    # Keep only the prompt and response columns and the first 30 rows for evaluation
    df = df[["prompt", "response"]].head(30)
    
    # Save to a local file
    file_name = "helpsteer2.jsonl"
    df.to_json(file_name, orient="records", lines=True)
    
    print(f"Dataset prepared with {len(df)} samples")
    print("Sample data:")
    print(df.head())
    
  2. Upload the dataset to NeMo Data Store. A quick way to verify the upload is sketched after this list.

    import os
    from huggingface_hub import HfApi
    
    HF_ENDPOINT = f"{os.environ['NEMO_DATASTORE_URL']}/v1/hf"
    
    hf_api = HfApi(endpoint=HF_ENDPOINT, token=os.environ["HF_TOKEN"])
    repo_id = f"{NAMESPACE}/{DATASET_NAME}"
    
    # Create the dataset repo if it doesn't exist
    hf_api.create_repo(repo_id=repo_id, repo_type="dataset", exist_ok=True)
    
    # Upload the file
    result = hf_api.upload_file(
        path_or_fileobj=file_name,
        path_in_repo=file_name,
        repo_id=repo_id,
        repo_type="dataset",
        revision="main",
        commit_message=f"Eval dataset in {repo_id}"
    )
    
    print(f"Dataset uploaded: {result}")
    

2. Submit the Evaluation Job#

import os
from nemo_microservices import NeMoMicroservices

client = NeMoMicroservices(base_url=os.environ["EVALUATOR_BASE_URL"])

# Model endpoint settings for the judge
MODEL_BASE_URL = "<insert LLM Judge model URL here>"  # Any OpenAI-compatible endpoint works, such as a NIM endpoint or build.nvidia.com
MODEL_ID = "<insert model ID here>"                   # Replace with your judge model's ID

files_url = f"hf://datasets/{NAMESPACE}/{DATASET_NAME}"

# Inline config mirrors the developer notebook
config = {
    "type": "custom",
    "name": "my-config",
    "namespace": NAMESPACE,
    "tasks": {
        "my-task": {
            "type": "data",
            "metrics": {
                "my_eval": {
                    "type": "llm-judge",
                    "params": {
                        "model": {
                            "api_endpoint": {
                                "url": MODEL_BASE_URL,
                                "model_id": MODEL_ID,
                                "format": "openai",
                                "api_key": os.environ["NVIDIA_NIM_API_KEY"]
                            }
                        },
                        "template": {
                            "messages": [
                                {"role": "system", "content": "You are an expert evaluator for answers to user queries. Your task is to assess responses to user queries based on helpfulness, relevance, accuracy, and clarity."},
                                {"role": "user", "content": "Calculate the following metrics for the response: User Query: {{item.prompt}} Model Response: {{item.response}} Metrics: 1. Helpfulness (0-4): How well does the response help the user? 2. Correctness (0-4): Is the information correct? 3. Coherence (0-4): Is the response logically consistent and well-structured? 4. Complexity (0-4): How sophisticated is the response? 5. Verbosity (0-4): Is the response appropriately detailed? Instructions: Assign a score from 0 (poor) to 4 (excellent) for each metric. Respond in JSON format only: { \"helpfulness\": ..., \"correctness\": ..., \"coherence\": ..., \"complexity\": ..., \"verbosity\": ... }"}
                            ]
                        },
                        "scores": {
                            "helpfulness": {"type": "int", "parser": {"type": "regex", "pattern": "(?s).*\"helpfulness\": *(\\d+)"}},
                            "correctness": {"type": "int", "parser": {"type": "regex", "pattern": "(?s).*\"correctness\": *(\\d+)"}},
                            "coherence":   {"type": "int", "parser": {"type": "regex", "pattern": "(?s).*\"coherence\": *(\\d+)"}},
                            "complexity":  {"type": "int", "parser": {"type": "regex", "pattern": "(?s).*\"complexity\": *(\\d+)"}},
                            "verbosity":   {"type": "int", "parser": {"type": "regex", "pattern": "(?s).*\"verbosity\": *(\\d+)"}}
                        }
                    }
                }
            },
            "dataset": {"files_url": files_url}
        }
    }
}

target = {"type": "dataset", "dataset": {"files_url": files_url}}

job = client.evaluation.jobs.create(
    namespace=NAMESPACE,
    target=target,
    config=config
)

job_id = job.id
curl -X POST "${EVALUATOR_BASE_URL}/v1/evaluation/jobs" \
  -H 'accept: application/json' \
  -H 'Content-Type: application/json' \
  -d '{
    "namespace": "'${NAMESPACE}'",
    "target": {"type": "dataset", "dataset": {"files_url": "hf://datasets/'${NAMESPACE}'/'${DATASET_NAME}'"}},
    "config": {
      "type": "custom",
      "name": "my-config",
      "namespace": "'${NAMESPACE}'",
      "tasks": {
        "my-task": {
          "type": "data",
          "metrics": {
            "my_eval": {
              "type": "llm-judge",
              "params": {
                "model": {
                  "api_endpoint": {
                    "url": "https://integrate.api.nvidia.com/v1",
                    "model_id": "<insert model ID here>",
                    "format": "openai",
                    "api_key": "'${NVIDIA_NIM_API_KEY}'"
                  }
                },
                "template": {
                  "messages": [
                    {"role": "system", "content": "You are an expert evaluator for answers to user queries. Your task is to assess responses to user queries based on helpfulness, relevance, accuracy, and clarity."},
                    {"role": "user", "content": "Calculate the following metrics for the response: User Query: {{item.prompt}} Model Response: {{item.response}} Metrics: 1. Helpfulness (0-4): How well does the response help the user? 2. Correctness (0-4): Is the information correct? 3. Coherence (0-4): Is the response logically consistent and well-structured? 4. Complexity (0-4): How sophisticated is the response? 5. Verbosity (0-4): Is the response appropriately detailed? Instructions: Assign a score from 0 (poor) to 4 (excellent) for each metric. Respond in JSON format only: { \"helpfulness\": ..., \"correctness\": ..., \"coherence\": ..., \"complexity\": ..., \"verbosity\": ... }"}
                  ]
                },
                "scores": {
                  "helpfulness": {"type": "int", "parser": {"type": "regex", "pattern": "(?s).*\\\"helpfulness\\\": *(\\\\d+)"}},
                  "correctness": {"type": "int", "parser": {"type": "regex", "pattern": "(?s).*\\\"correctness\\\": *(\\\\d+)"}},
                  "coherence":   {"type": "int", "parser": {"type": "regex", "pattern": "(?s).*\\\"coherence\\\": *(\\\\d+)"}},
                  "complexity":  {"type": "int", "parser": {"type": "regex", "pattern": "(?s).*\\\"complexity\\\": *(\\\\d+)"}},
                  "verbosity":   {"type": "int", "parser": {"type": "regex", "pattern": "(?s).*\\\"verbosity\\\": *(\\\\d+)"}}
                }
              }
            }
          },
          "dataset": {"files_url": "hf://datasets/'${NAMESPACE}'/'${DATASET_NAME}'"}
        }
      }
    }
  }'
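
The {{item.prompt}} and {{item.response}} variables in the template are filled from each row of the uploaded JSONL file, and each entry under "scores" tells Evaluator how to extract a numeric score from the judge's raw reply: the judge is asked to answer in JSON, and the regex captures the integer that follows the corresponding key. A minimal sketch of how one of these patterns behaves (the sample reply below is illustrative, not actual Evaluator output):

import re

# Illustrative judge reply in the JSON format requested by the prompt template
sample_reply = '{ "helpfulness": 3, "correctness": 4, "coherence": 3, "complexity": 2, "verbosity": 2 }'

# Same pattern as the "helpfulness" parser in the config above
match = re.match(r'(?s).*"helpfulness": *(\d+)', sample_reply)
print(int(match.group(1)))  # 3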

3. Check Job Status#

Check job status until it completes.

# Get the status of the evaluation job using the job_id from the previous step
job_status = client.evaluation.jobs.status(job_id)
print(f"Job status: {job_status.message}")
print(f"Progress: {job_status.progress}%")
curl -X "GET" "${EVALUATOR_BASE_URL}/v1/evaluation/jobs/${job_id}/status" \
  -H 'accept: application/json'

Example response:

{
  "message": "completed",
  "task_status": {},
  "progress": 100
}
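
If you're running this unattended, you can poll the status endpoint until the job reaches a terminal state. A minimal sketch reusing the client and job_id from the previous steps; the terminal message strings ("completed", "failed") are assumed based on the example response above and may differ in your deployment.

import time

# Poll the job status until it finishes (reuses `client` and `job_id` from earlier steps)
while True:
    job_status = client.evaluation.jobs.status(job_id)
    print(f"Status: {job_status.message} ({job_status.progress}%)")
    if job_status.message in ("completed", "failed"):  # assumed terminal states
        break
    time.sleep(10)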

4. Retrieve Evaluation Results#

Once the job completes, retrieve the evaluation results to analyze the LLM judge’s assessments.

As Download#

Download results ZIP to a local file.

zip_response = client.evaluation.jobs.download_results(job_id)
zip_response.write_to_file("evaluation_results.zip")
print("Saved to evaluation_results.zip")
curl -X GET "${EVALUATOR_BASE_URL}/v1/evaluation/jobs/${job_id}/download-results" \
  -H 'accept: application/zip' \
  -o evaluation_results.zip
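
You can inspect what the archive contains without assuming any particular file layout inside it:

import zipfile

# List the files bundled in the downloaded results archive
with zipfile.ZipFile("evaluation_results.zip") as zf:
    for name in zf.namelist():
        print(name)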

As JSON#

View results as JSON.

results = client.evaluation.jobs.results(job_id)

# Access the results
print(f"Result ID: {results.id}")
print(f"Job ID: {results.job}")
print(f"Tasks: {results.tasks}")
print(f"Groups: {results.groups}")
curl -X GET "${EVALUATOR_BASE_URL}/v1/evaluation/jobs/${job_id}/results" -H 'accept: application/json'
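
To follow up on the comparison mentioned in step 1, you can put the judge's scores side by side with HelpSteer2's original human annotations (its helpfulness, correctness, coherence, complexity, and verbosity columns). The sketch below computes the human-annotated means for the same 30 samples; compare them against the per-metric aggregates reported in the results above (the exact shape of the results object depends on your Evaluator version, so inspect it interactively).

import pandas as pd

# Reload the same 30 HelpSteer2 rows, this time keeping the human-annotated score columns
df = pd.read_json("hf://datasets/nvidia/HelpSteer2/train.jsonl.gz", lines=True).head(30)

# Mean human score per metric, for comparison with the judge's aggregates
print(df[["helpfulness", "correctness", "coherence", "complexity", "verbosity"]].mean())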