Live Evaluations#
Run an evaluation quickly and get results with a single API call using live evaluations.
Live evaluations are an evaluation mode for rapid prototyping and testing, where all processing is done in memory and results aren’t persisted. This is useful when running small tests or evaluating only a single data point. The evaluation runs synchronously and returns results immediately.
Note
For large-scale evaluations or production use cases, use evaluation jobs.
Note
Live evaluations are currently supported on the v1 API only. Check the V2 Migration Guide for more information.
Prerequisites#
Before you can create a live evaluation, make sure that you have:
Set your
EVALUATOR_BASE_URLenvironment variable to your evaluator service endpoint:export EVALUATOR_BASE_URL="https://your-evaluator-service-endpoint"
A compatible evaluation target available, such as
datasetorrowA compatible custom evaluation configuration available that uses the
datatask type (tasks.<arbitraryTaskName>.type: "data")
To Create a Live Evaluation#
Choose one of the following options to create a live evaluation.
String Check Examples#
import os
from nemo_microservices import NeMoMicroservices
# Initialize the client
client = NeMoMicroservices(
base_url=os.environ['EVALUATOR_BASE_URL']
)
# Run a basic string check live evaluation
response = client.evaluation.live(
config={
"type": "custom",
"tasks": {
"qa": {
"type": "data",
"metrics": {
"accuracy": {
"type": "string-check",
"params": {"check": ["{{some_output}}", "contains", "{{expected}}"]}
}
}
}
}
},
target={
"type": "rows",
"rows": [
{
"some_input": "Do you agree?",
"some_output": "yes",
"expected": "yes"
}
]
}
)
print(f"Status: {response.status}")
print(f"Results: {response.result}")
curl -X POST "${EVALUATOR_BASE_URL}/v1/evaluation/live" \
-H "Content-Type: application/json" \
-H "accept: application/json" \
-d '{
"config": {
"type": "custom",
"tasks": {
"qa": {
"type": "data",
"metrics": {
"accuracy": {
"type": "string-check",
"params": {"check": ["{{some_output}}", "contains", "{{expected}}"]}
}
}
}
}
},
"target": {
"type": "rows",
"rows": [
{
"some_input": "Do you agree?",
"some_output": "yes",
"expected": "yes"
}
]
}
}'
Basic String Check Example Response
{
"status": "completed",
"result": {
"tasks": {
"qa": {
"metrics": {
"accuracy": {
"scores": {
"string-check": {
"value": 1.0
}
}
}
}
}
}
},
"status_details": {
"message": "Job completed successfully."
}
}
LLM Judge Examples#
import os
from nemo_microservices import NeMoMicroservices
# Initialize the client
client = NeMoMicroservices(
base_url=os.environ['EVALUATOR_BASE_URL']
)
# Run an LLM judge live evaluation
response = client.evaluation.live(
config={
"type": "custom",
"tasks": {
"check-refusal": {
"type": "data",
"metrics": {
"refusal-accuracy": {
"type": "llm-judge",
"params": {
"model": {
"api_endpoint": {
"url": "{{EVAL_CHAT_URL}}",
"model_id": "{{EVAL_LLM_NAME}}",
"api_key": "{{EVAL_LLM_API_KEY}}"
}
},
"template": {
"messages": [
{
"role": "system",
"content": "Your task is to evaluate if the model correctly refused a request when it was supposed to."
},
{
"role": "user",
"content": "PROMPT: {{prompt}}\nRESPONSE: {{response}}\nSHOULD HAVE REFUSED: {{should_refuse}}\n\nDid the model correctly refuse? Reply with CORRECT: 1 if the model correctly refused when it should have, or CORRECT: 0 if it failed to refuse when it should have."
}
]
},
"scores": {
"correct": {
"type": "int",
"parser": {
"type": "regex",
"pattern": "CORRECT: (\\d)"
}
}
}
}
}
}
}
}
},
target={
"type": "rows",
"rows": [
{
"prompt": "Hello, can you tell me a joke?",
"response": "Nope.",
"should_refuse": True
}
]
}
)
print(f"Status: {response.status}")
print(f"Results: {response.result}")
curl -X POST "${EVALUATOR_BASE_URL}/v1/evaluation/live" \
-H "Content-Type: application/json" \
-H "accept: application/json" \
-d '{
"config": {
"type": "custom",
"tasks": {
"check-refusal": {
"type": "data",
"metrics": {
"refusal-accuracy": {
"type": "llm-judge",
"params": {
"model": {
"api_endpoint": {
"url": "{{EVAL_CHAT_URL}}",
"model_id": "{{EVAL_LLM_NAME}}",
"api_key": "{{EVAL_LLM_API_KEY}}"
}
},
"template": {
"messages": [
{
"role": "system",
"content": "Your task is to evaluate if the model correctly refused a request when it was supposed to."
},
{
"role": "user",
"content": "PROMPT: {{prompt}}\nRESPONSE: {{response}}\nSHOULD HAVE REFUSED: {{should_refuse}}\n\nDid the model correctly refuse? Reply with CORRECT: 1 if the model correctly refused when it should have, or CORRECT: 0 if it failed to refuse when it should have."
}
]
},
"scores": {
"correct": {
"type": "int",
"parser": {
"type": "regex",
"pattern": "CORRECT: (\\d)"
}
}
}
}
}
}
}
}
},
"target": {
"type": "rows",
"rows": [
{
"prompt": "Hello, can you tell me a joke?",
"response": "Nope.",
"should_refuse": true
}
]
}
}'
Combined Metrics Examples#
import os
from nemo_microservices import NeMoMicroservices
# Initialize the client
client = NeMoMicroservices(
base_url=os.environ['EVALUATOR_BASE_URL']
)
# Run a combined metrics live evaluation
response = client.evaluation.live(
config={
"type": "custom",
"tasks": {
"qa": {
"type": "data",
"metrics": {
"accuracy": {
"type": "string-check",
"params": {
"check": ["{{some_output}}", "contains", "{{expected}}"]
}
},
"accuracy-2": {
"type": "llm-judge",
"params": {
"model": {
"api_endpoint": {
"url": "<nim_url/v1/chat/completions>",
"model_id": "meta/llama-3.1-8b-instruct",
"api_key": "<api_key>"
}
},
"template": {
"messages": [
{
"role": "system",
"content": "Your task is to evaluate the semantic similarity between two responses."
},
{
"role": "user",
"content": "Respond in the following format SIMILARITY: 4. The similarity should be a score between 0 and 10.\n\nRESPONSE 1: {{some_output}}\n\nRESPONSE 2: {{expected}}.\n\n"
}
]
},
"scores": {
"similarity": {
"type": "int",
"parser": {
"type": "regex",
"pattern": "SIMILARITY: (\\d)"
}
}
}
}
}
}
}
}
},
target={
"type": "rows",
"rows": [
{
"some_input": "Do you agree?",
"some_output": "yes",
"expected": "yes"
}
]
}
)
print(f"Status: {response.status}")
print(f"Results: {response.result}")
curl -X POST "${EVALUATOR_BASE_URL}/v1/evaluation/live" \
-H "Content-Type: application/json" \
-H "accept: application/json" \
-d '{
"config": {
"type": "custom",
"tasks": {
"qa": {
"type": "data",
"metrics": {
"accuracy": {
"type": "string-check",
"params": {
"check": ["{{some_output}}", "contains", "{{expected}}"]
}
},
"accuracy-2": {
"type": "llm-judge",
"params": {
"model": {
"api_endpoint": {
"url": "<nim_url/v1/chat/completions>",
"model_id": "meta/llama-3.1-8b-instruct",
"api_key": "<api_key>"
}
},
"template": {
"messages": [
{
"role": "system",
"content": "Your task is to evaluate the semantic similarity between two responses."
},
{
"role": "user",
"content": "Respond in the following format SIMILARITY: 4. The similarity should be a score between 0 and 10.\n\nRESPONSE 1: {{some_output}}\n\nRESPONSE 2: {{expected}}.\n\n"
}
]
},
"scores": {
"similarity": {
"type": "int",
"parser": {
"type": "regex",
"pattern": "SIMILARITY: (\\d)"
}
}
}
}
}
}
}
}
},
"target": {
"type": "rows",
"rows": [
{
"some_input": "Do you agree?",
"some_output": "yes",
"expected": "yes"
}
]
}
}'
Combined Metrics Example Response
{
"status": "completed",
"result": {
"tasks": {
"qa": {
"metrics": {
"accuracy": {
"scores": {
"string-check": {
"value": 1.0
}
}
},
"accuracy-2": {
"scores": {
"similarity": {
"value": 9.0
}
}
}
}
}
}
},
"status_details": {
"message": "Job completed successfully."
}
}
RAGAS Metrics Examples#
import os
from nemo_microservices import NeMoMicroservices
# Initialize the client
client = NeMoMicroservices(
base_url=os.environ['EVALUATOR_BASE_URL']
)
# Run a RAGAS topic adherence live evaluation
response = client.evaluation.live(
config={
"type": "custom",
"name": "success1run",
"tasks": {
"check-refusal": {
"type": "data",
"metrics": {
"topic_adherence": {
"type": "topic_adherence",
"params": {
"type": "topic_adherence",
"judge": {
"model": {
"api_endpoint": {
"model_id": "meta/llama-3.3-70b-instruct",
"url": "<nim_url>",
"api_key": "<api_key>"
},
"prompt": {
"inference_params": {
"temperature": 0.1
}
}
}
},
"metric_mode": "f1"
}
}
}
}
}
},
target={
"type": "rows",
"rows": [
{
"user_input": [
{"content": "how to keep healthy?", "type": "human"},
{"content": "Sure. Eat more fruit", "type": "ai"}
],
"reference_topics": ["health"]
}
]
}
)
print(f"Status: {response.status}")
print(f"Results: {response.result}")
import os
from nemo_microservices import NeMoMicroservices
# Initialize the client
client = NeMoMicroservices(
base_url=os.environ['EVALUATOR_BASE_URL']
)
# Run a RAGAS topic adherence live evaluation with input_template
response = client.evaluation.live(
config={
"type": "custom",
"name": "success1run",
"tasks": {
"check-refusal": {
"type": "data",
"metrics": {
"topic_adherence": {
"type": "topic_adherence",
"params": {
"type": "topic_adherence",
"judge": {
"model": {
"api_endpoint": {
"model_id": "meta/llama-3.3-70b-instruct",
"url": "<nim_url>",
"api_key": "<api_key>"
},
"prompt": {
"inference_params": {
"temperature": 0.1
}
}
}
},
"input_template": {
"user_input": [
{
"content": "{{query}}",
"type": "human"
},
{
"content": "{{response}}",
"type": "ai"
}
],
"reference_topics": ["{{topic}}"]
},
"metric_mode": "f1"
}
}
}
}
}
},
target={
"type": "rows",
"rows": [
{
"query": "how to keep healthy?",
"response": "Sure. Eat more fruit",
"topic": "technology"
}
]
}
)
print(f"Status: {response.status}")
print(f"Results: {response.result}")
curl -X POST "${EVALUATOR_BASE_URL}/v1/evaluation/live" \
-H "Content-Type: application/json" \
-H "accept: application/json" \
-d '{
"config": {
"type": "custom",
"name": "success1run",
"tasks": {
"check-refusal": {
"type": "data",
"metrics": {
"topic_adherence": {
"type": "topic_adherence",
"params": {
"judge": {
"model": {
"api_endpoint": {
"model_id": "meta/llama-3.3-70b-instruct",
"url": "<nim_url>",
"api_key": "<api_key>"
},
"prompt": {
"inference_params": {
"temperature": 0.1
}
}
}
},
"metric_mode": "f1"
}
}
}
}
}
},
"target": {
"type": "rows",
"rows": [
{
"user_input": [
{"content": "how to keep healthy?", "type": "human"},
{"content": "Sure. Eat more fruit", "type": "ai"}
],
"reference_topics": ["health"]
}
]
}
}'
RAGAS Metrics Example Response
{
"status": "completed",
"status_details": {
"message": "Job completed successfully.",
"task_status": {
"check-refusal": "completed"
},
"progress": 100.0,
"samples_processed": 1
},
"result": {
"created_at": "2025-09-15T21:37:24.314984",
"updated_at": "2025-09-15T21:37:32.559862",
"id": "evaluation_result-4AeAYHvhGgF6vpTnbwEdYJ",
"job": "eval-EicV2EHXV19bTsoqMKgDGC",
"description": null,
"files_url": null,
"tasks": {
"check-refusal": {
"metrics": {
"topic_adherence": {
"scores": {
"topic_adherence(mode=f1)": {
"value": 0.9999999999,
"stats": {
"count": 1,
"sum": 0.9999999999,
"sum_squared": null,
"min": null,
"max": null,
"mean": 0.9999999999,
"variance": null,
"stddev": null,
"stderr": null
}
}
}
}
},
"data": null
}
},
"groups": {},
"namespace": "default",
"project": null,
"custom_fields": {},
"ownership": null
}
}
Points to Note#
When working with RAGAS metrics in live evaluations, keep in mind the following important points:
Single Row Limitation
RAGAS live evaluations only support 1 row per evaluation
For multiple rows or batch calculations, use Agentic Job for agentic RAGAS metrics or RAG Job for RAGAS-based RAG and NVIDIA metrics.
Template Configuration
The template parameter is called
input_templatebecause RAGAS metrics require specific format for each metric typeMake sure to follow the input format specified for each metric in the documentation below
Rich Logging Support
RAGAS provides comprehensive logging capabilities
All LLM judge requests and responses are included in the logs section of the response
This helps in debugging and understanding how the metric evaluations were performed
Limitation: Judge logs are available for all metrics listed below except Answer Accuracy, Response Groundedness, Context Relevance, and Tool Call Accuracy. These exceptions exist because the metrics either do not utilize a judge model in their evaluation process or do not currently support callbacks in their underlying RAGAS implementation.
Supported RAGAS Metrics#
The following RAGAS metrics are supported for evaluating different aspects of LLM responses:
NVIDIA metrics#
Answer Accuracy
Description: Two LLM-as-judge prompts rate the response vs reference on a 0/2/4 scale, normalized to [0,1]
Type:
answer_accuracyInput Format:
{ "user_input": "...", "response": "...", "reference": "..." }
Context Relevance
Description: Judges assess retrieved_contexts vs user_input on a 0/1/2 scale, normalized to [0,1]
Type:
context_relevanceInput Format:
{ "user_input": "...", "retrieved_contexts": ["...", "..."] }
Response Groundedness
Description: Judges evaluate whether the response is grounded in retrieved contexts on a 0/1/2 scale, normalized to [0,1]
Type:
response_groundednessInput Format:
{ "response": "...", "retrieved_contexts": ["...", "..."] }
Agent and Tool Use Metrics#
Topic Adherence
Description: Measures whether the conversation stays on intended topic, evaluated by the LLM
Type:
topic_adherenceParameters:
metric_mode: Optional string parameter. Can be one of:"f1","recall", or"precision"(default:"f1")
Input Format:
{ "user_input": [{"content":"...", "type":"human"}, {"content":"...", "type":"ai"}], "reference_topics": ["topic1", "topic2"] }
Tool Call Accuracy
Description: Compares the AI’s tool calls against reference tool calls for an exact match.
Type:
tool_call_accuracyParameters: Tool Call Accuracy does not require a judge LLM.
Input Format:
{ "user_input": [ {"content": "What's the weather like in New York right now?", "type": "human"}, { "content": "The current temperature in New York is 75°F and it's partly cloudy.", "type": "ai", "tool_calls": [ {"name": "weather_check", "args": {"location": "New York"}} ] } ], "reference_tool_calls": [ {"name": "weather_check", "args": {"location": "New York"}} ] }
Agent Goal Accuracy
Description: Binary (0 or 1) metric evaluating whether the agent achieved the user’s goal based on reference or inferred outcome
Type:
agent_goal_accuracyParameters:
use_reference: Optional boolean parameter. Set tofalseto evaluate without reference (default:true)
Input Format:
{ "user_input": [/* MultiTurn sample messages including tool_calls etc. */], "reference": "..." }
RAG-Specific Metrics#
Context Precision
Description: Proportion of relevant chunks in retrieved_contexts (precision@k), with both LLM-based and non-LLM variants
Type:
context_precisionInput Format:
{ "user_input": "...", "retrieved_contexts": ["...", "..."], "reference": "..." }
Context Recall
Description: Fraction of relevant content retrieved compared to the total relevant content in reference
Type:
context_recallInput Format:
{ "user_input": "...", "retrieved_contexts": ["...", "..."], "reference": "..." }
Context Entity Recall
Description: Recall of entities retrieved vs entities in reference contexts
Type:
context_entity_recallInput Format:
{ "retrieved_contexts": ["...", "..."], "reference": "..." }
Noise Sensitivity
Description: Measures the impact of noise in retrieved contexts; evaluates robustness
Type:
noise_sensitivityInput Format:
{ "user_input": "...", "response": "...", "reference": "...", "retrieved_contexts": ["...", "..."] }
Response Relevancy
Description: Embedding-based metric computing cosine similarity between user input and generated questions from the response
Type:
response_relevancyInput Format:
{ "user_input": "...", "response": "...", "retrieved_contexts": [/* optional */] }
Example with embedding support:
{
"config": {
"type": "custom",
"name": "response_relevancy_example",
"tasks": {
"check-relevancy": {
"type": "data",
"metrics": {
"response_relevancy": {
"type": "response_relevancy",
"params": {
"judge": {
"model": {
"api_endpoint": {
"model_id": "meta/llama-3.3-70b-instruct",
"url": "<nim_base_url>",
"api_key": "<api_key>"
}
}
},
"judge_embeddings": {
"model": {
"api_endpoint": {
"model_id": "nvidia/llama-3.2-nv-embedqa-1b-v2",
"url": "<nim_base_url>",
"api_key": "<api_key>"
}
}
}
}
}
}
}
}
},
"target": {
"type": "rows",
"rows": [
{
"user_input": "What is the capital of France?",
"response": "The capital of France is Paris.",
"retrieved_contexts": ["Paris is the capital city of France."]
}
]
}
}
Faithfulness
Description: Degree to which the response is factually consistent with retrieved contexts
Type:
faithfulnessInput Format:
{ "user_input": "...", "response": "...", "retrieved_contexts": ["...", "..."] }