RAG Evaluation Flow#
RAG (Retrieval-Augmented Generation) evaluations measure the effectiveness of pipelines that retrieve relevant documents and generate answers grounded in the retrieved content. Use this evaluation type to assess both retrieval quality and answer quality in RAG systems.
Prerequisites#
Before running RAG evaluations, ensure you have:
- Access to embedding models for retrieval and evaluation
- A judge LLM and embedding models for answer evaluation metrics
- Proper API endpoints configured for your pipeline components
Custom Datasets#
1. Upload your dataset to NeMo Data Store using the Hugging Face CLI or SDK.
2. Register your dataset in NeMo Entity Store using the Dataset APIs.
3. Format your data according to the RAG data format requirements (BEIR, SQuAD, or RAGAS).
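If you script these steps, the sketch below shows the general shape of the workflow: it uploads files through the Data Store's Hugging Face-compatible endpoint and registers the dataset with a plain HTTP call to the Entity Store. The base URLs, namespace, dataset name, and the exact Entity Store payload are placeholders and assumptions; consult the Dataset APIs for the authoritative request format.

import requests
from huggingface_hub import HfApi

# Placeholder endpoints -- substitute the Data Store and Entity Store URLs for your deployment.
DATASTORE_URL = "http://<nemo-data-store>/v1/hf"
ENTITY_STORE_URL = "http://<nemo-entity-store>"
NAMESPACE = "my-organization"
DATASET_NAME = "my-rag-dataset"
repo_id = f"{NAMESPACE}/{DATASET_NAME}"

# 1. Upload dataset files through the Data Store's Hugging Face-compatible API.
hf_api = HfApi(endpoint=DATASTORE_URL, token="")
hf_api.create_repo(repo_id=repo_id, repo_type="dataset", exist_ok=True)
hf_api.upload_file(
    path_or_fileobj="ragas.json",
    path_in_repo="ragas.json",
    repo_id=repo_id,
    repo_type="dataset",
)

# 2. Register the dataset with the Entity Store so evaluation configs can reference it.
resp = requests.post(
    f"{ENTITY_STORE_URL}/v1/datasets",
    json={
        "name": DATASET_NAME,
        "namespace": NAMESPACE,
        "files_url": f"hf://datasets/{repo_id}",
    },
)
resp.raise_for_status()
print(resp.json())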
Tip
For a complete dataset creation walkthrough, see the dataset management tutorials or follow the end-to-end evaluation example.
Authentication for External Services#
Use API keys to authenticate to external providers in your pipeline components, such as embedding models, reranking services, LLMs, and judge models.
Tip
For comprehensive authentication configuration examples and security best practices, refer to API Key Authentication.
Add `api_key` to an `api_endpoint` configuration:
{
"api_endpoint": {
"url": "https://api.openai.com/v1/embeddings",
"model_id": "text-embedding-3-large",
"api_key": "sk-your-openai-key"
}
}
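To keep keys out of version control, you can read them from the environment when assembling the configuration. A minimal sketch (the environment variable name is an arbitrary choice):

import json
import os

# Read the provider key from the environment instead of hard-coding it in the config.
api_key = os.environ["OPENAI_API_KEY"]

endpoint_config = {
    "api_endpoint": {
        "url": "https://api.openai.com/v1/embeddings",
        "model_id": "text-embedding-3-large",
        "api_key": api_key,
    }
}
print(json.dumps(endpoint_config, indent=2))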
Retrieval + Answer Generation + Answer Evaluation (Standard Data)#
{
"type": "rag",
"name": "rag-standard",
"namespace": "my-organization",
"tasks": {
"my-beir-task": {
"type": "beir",
"dataset": {
"files_url": "file://nfcorpus/"
},
"params": {
"judge_llm": {
"api_endpoint": {
"url": "<my-judge-llm-url>",
"model_id": "<my-judge-llm-model>"
}
},
"judge_embeddings": {
"api_endpoint": {
"url": "<my-judge-embedding-url>",
"model_id": "<my-judge-embedding-model>"
}
},
"judge_timeout": 300,
"judge_max_retries": 5,
"judge_max_workers": 16
},
"metrics": {
"retriever_recall_5": {"type": "pytrec_eval"},
"retriever_ndcg_cut_5": {"type": "pytrec_eval"},
"retriever_recall_10": {"type": "pytrec_eval"},
"retriever_ndcg_cut_10": {"type": "pytrec_eval"},
"rag_faithfulness": {"type": "ragas"},
"rag_answer_relevancy": {"type": "ragas"}
}
}
}
}
{
"query": "What is the capital of France?",
"retrieved_docs": [
{"title": "France", "text": "Paris is the capital of France."}
],
"reference": "Paris",
"output": "Paris"
}
{
"job": "eval-abc123def456",
"files_url": "hf://datasets/evaluation-results/eval-abc123def456",
"tasks": {
"my-beir-task": {
"metrics": {
"rag_answer_relevancy": {
"scores": {
"answer_relevancy": {
"value": 1.0,
"stats": {}
}
}
},
"rag_faithfulness": {
"scores": {
"faithfulness": {
"value": 1.0,
"stats": {}
}
}
},
"retriever_retriever.ndcg_cut_5": {
"scores": {
"ndcg_cut_5": {
"value": 0.9,
"stats": {}
}
}
},
"retriever_retriever.recall_5": {
"scores": {
"recall_5": {
"value": 1.0,
"stats": {}
}
}
}
}
}
},
"namespace": "my-organization",
"custom_fields": {}
}
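To run an evaluation like the one above programmatically, you typically create the config and then launch a job that pairs it with a RAG target. The sketch below assumes the Evaluator microservice exposes /v1/evaluation/configs and /v1/evaluation/jobs routes and that a target named my-organization/<my-rag-target> already exists; see the evaluation job APIs for the exact payloads.

import json
import requests

EVALUATOR_URL = "http://<nemo-evaluator>"  # placeholder base URL for the Evaluator service

# Load the "rag-standard" config shown above from disk.
with open("rag-standard-config.json") as f:
    config = json.load(f)

# Create the evaluation config.
resp = requests.post(f"{EVALUATOR_URL}/v1/evaluation/configs", json=config)
resp.raise_for_status()

# Launch a job that pairs the config with an existing RAG target.
job = requests.post(
    f"{EVALUATOR_URL}/v1/evaluation/jobs",
    json={
        "config": "my-organization/rag-standard",
        "target": "my-organization/<my-rag-target>",
    },
)
job.raise_for_status()
print(job.json())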
Retrieval + Answer Generation + Answer Evaluation (Custom Data)#
{
"type": "rag",
"name": "rag-custom",
"namespace": "my-organization",
"tasks": {
"my-beir-task": {
"type": "beir",
"dataset": {
"files_url": "hf://datasets/<my-dataset-namespace>/<my-dataset-name>/<my-dataset-file-path>"
},
"params": {
"judge_llm": {
"api_endpoint": {
"url": "<my-judge-llm-url>",
"model_id": "<my-judge-llm-model>"
}
},
"judge_embeddings": {
"api_endpoint": {
"url": "<my-judge-embedding-url>",
"model_id": "<my-judge-embedding-model>"
}
},
"judge_timeout": 300,
"judge_max_retries": 5,
"judge_max_workers": 16
},
"metrics": {
"retriever_recall_5": {"type": "pytrec_eval"},
"retriever_ndcg_cut_5": {"type": "pytrec_eval"},
"retriever_recall_10": {"type": "pytrec_eval"},
"retriever_ndcg_cut_10": {"type": "pytrec_eval"},
"rag_faithfulness": {"type": "ragas"},
"rag_answer_relevancy": {"type": "ragas"}
}
}
}
}
{
"query": "Who wrote Les Misérables?",
"retrieved_docs": [
{"title": "Les Misérables", "text": "Victor Hugo wrote Les Misérables."}
],
"reference": "Victor Hugo",
"output": "Victor Hugo"
}
{
"job": "eval-def789ghi012",
"files_url": "hf://datasets/evaluation-results/eval-def789ghi012",
"tasks": {
"my-beir-task": {
"metrics": {
"rag_answer_relevancy": {
"scores": {
"answer_relevancy": {
"value": 1.0,
"stats": {}
}
}
},
"rag_faithfulness": {
"scores": {
"faithfulness": {
"value": 1.0,
"stats": {}
}
}
},
"retriever_retriever.ndcg_cut_5": {
"scores": {
"ndcg_cut_5": {
"value": 0.95,
"stats": {}
}
}
},
"retriever_retriever.recall_5": {
"scores": {
"recall_5": {
"value": 1.0,
"stats": {}
}
}
}
}
}
},
"namespace": "my-organization",
"custom_fields": {}
}
Answer Evaluation (Pre-generated Answers)#
{
"type": "rag",
"name": "rag-ans-eval-pregen",
"namespace": "my-organization",
"tasks": {
"my-ragas-task": {
"type": "ragas",
"dataset": {
"files_url": "hf://datasets/<my-dataset-namespace>/<my-dataset-name>/<my-dataset-file-path>"
},
"params": {
"judge_llm": {
"api_endpoint": {
"url": "<my-judge-llm-url>",
"model_id": "<my-judge-llm-model>"
}
},
"judge_embeddings": {
"api_endpoint": {
"url": "<my-query-embedding-url>",
"model_id": "<my-query-embedding-model>"
}
},
"judge_timeout": 300,
"judge_max_retries": 5,
"judge_max_workers": 16
},
"metrics": {
"retriever_recall_5": {"type": "pytrec_eval"},
"retriever_ndcg_cut_5": {"type": "pytrec_eval"},
"retriever_recall_10": {"type": "pytrec_eval"},
"retriever_ndcg_cut_10": {"type": "pytrec_eval"},
"rag_faithfulness": {"type": "ragas"}
}
}
}
}
{
"query": "What is the main theme of Les Misérables?",
"reference": "Redemption",
"output": "Redemption"
}
{
"job": "eval-ghi345jkl678",
"files_url": "hf://datasets/evaluation-results/eval-ghi345jkl678",
"tasks": {
"my-ragas-task": {
"metrics": {
"rag_faithfulness": {
"scores": {
"faithfulness": {
"value": 1.0,
"stats": {}
}
}
},
"retriever_retriever.ndcg_cut_5": {
"scores": {
"ndcg_cut_5": {
"value": 0.8,
"stats": {}
}
}
},
"retriever_retriever.recall_5": {
"scores": {
"recall_5": {
"value": 1.0,
"stats": {}
}
}
}
}
}
},
"namespace": "my-organization",
"custom_fields": {}
}
RAG (OpenAI-compatible Judge LLM)#
{
"type": "rag",
"name": "rag-openai-judge",
"namespace": "my-organization",
"tasks": {
"my-beir-task": {
"type": "beir",
"dataset": {
"files_url": "file://nfcorpus/"
},
"params": {
"judge_llm": {
"api_endpoint": {
"url": "<my-nim-deployment-base-url>/v1/completions",
"model_id": "<my-model>",
"api_key": "<openai-api-key>"
}
},
"judge_embeddings": {
"api_endpoint": {
"url": "<my-query-embedding-url>",
"model_id": "<my-query-embedding-model>",
"api_key": "<openai-api-key>"
}
},
"judge_timeout": 300,
"judge_max_retries": 5,
"judge_max_workers": 16
},
"metrics": {
"retriever_recall_5": {"type": "pytrec_eval"},
"retriever_ndcg_cut_5": {"type": "pytrec_eval"},
"retriever_recall_10": {"type": "pytrec_eval"},
"retriever_ndcg_cut_10": {"type": "pytrec_eval"},
"rag_faithfulness": {"type": "ragas"},
"rag_answer_relevancy": {"type": "ragas"}
}
}
}
}
{
"query": "What is the population of Paris?",
"retrieved_docs": [
{"title": "Paris", "text": "The population of Paris is over 2 million."}
],
"reference": "over 2 million",
"output": "over 2 million"
}
{
"job": "eval-jkl901mno234",
"files_url": "hf://datasets/evaluation-results/eval-jkl901mno234",
"tasks": {
"my-beir-task": {
"metrics": {
"rag_answer_relevancy": {
"scores": {
"answer_relevancy": {
"value": 1.0,
"stats": {}
}
}
},
"rag_faithfulness": {
"scores": {
"faithfulness": {
"value": 1.0,
"stats": {}
}
}
},
"retriever_retriever.ndcg_cut_5": {
"scores": {
"ndcg_cut_5": {
"value": 0.92,
"stats": {}
}
}
},
"retriever_retriever.recall_5": {
"scores": {
"recall_5": {
"value": 1.0,
"stats": {}
}
}
}
}
}
},
"namespace": "my-organization",
"custom_fields": {}
}
RAG (using RAGAS NVIDIA Metrics)#
{
"target": {
"type": "rag",
"rag": {
"pipeline": {
"retriever": {
"pipeline": {
"query_embedding_model": {
"api_endpoint": {
"url": "<my-query-embedding-url>",
"model_id": "<my-query-embedding-model>",
"format": "nim"
}
},
"index_embedding_model": {
"api_endpoint": {
"url": "<my-index-embedding-url>",
"model_id": "<my-index-embedding-model>",
"format": "nim"
}
},
"top_k": 1
}
},
"model": {
"api_endpoint": {
"url": "<my-model-url>",
"model_id": "<my-model-id>",
"format": "nim"
}
}
}
}
},
"config": {
"type": "rag",
"tasks": {
"rag": {
"type": "ragas",
"dataset": {
"files_url": "<my-dataset-url>",
"format": "ragas"
},
"metrics": {
"retriever_recall_5": {
"type": "pytrec_eval"
},
"rag_answer_accuracy": {
"type": "ragas"
},
"rag_context_relevance": {
"type": "ragas"
},
"rag_response_groundedness": {
"type": "ragas"
}
},
"params": {
"judge_llm": {
"api_endpoint": {
"url": "<my-judge-llm-url>",
"model_id": "<my-judge-llm-model>"
}
},
"judge_embeddings": {
"api_endpoint": {
"url": "<my-judge-embedding-url>",
"model_id": "<my-judge-embedding-model>"
}
},
"judge_timeout": 120,
"judge_max_retries": 5,
"judge_max_workers": 24,
"judge_max_token": 2048,
"judge_llm_top_p": 1.0
}
}
}
}
}
{
"question": "What are the key features of NVIDIA NIMs?",
"contexts": [
"NVIDIA NIMs are containerized microservices that provide optimized inference for AI models."
],
"answer": "NVIDIA NIMs offer containerized AI inference with optimized performance and scalability.",
"ground_truth": ["NVIDIA NIMs are optimized containerized inference services for AI models."]
}
{
"tasks": {
"rag": {
"metrics": {
"rag_nv_accuracy": {
"scores": {
"nv_accuracy": {
"value": 0.95,
"stats": {}
}
}
},
"rag_nv_context_relevance": {
"scores": {
"nv_context_relevance": {
"value": 0.92,
"stats": {}
}
}
},
"rag_nv_response_groundedness": {
"scores": {
"nv_response_groundedness": {
"value": 0.98,
"stats": {}
}
}
},
"retriever_retriever.recall_5": {
"scores": {
"recall_5": {
"value": 1.0,
"stats": {}
}
}
}
}
}
}
}
Metrics#
The RAG Pipeline evaluation includes two categories of metrics: document retrieval and answer generation.
Document Retrieval#
The following table summarizes the key document retrieval metrics available for RAG evaluation:
| Metric Name | Description | How k is set | Notes |
|---|---|---|---|
| `recall_k` | Fraction of relevant documents retrieved in the top k results | User sets k (1 to top_k) | top_k is the retriever's configured value |
| `ndcg_k` | Normalized Discounted Cumulative Gain at rank k (ranking quality up to k) | User sets k (1 to top_k) | Range: 0.0 - 1.0 |
| `ndcg_cut_k` | NDCG at rank k (cutoff variant, often equivalent to ndcg_k) | User sets k (1 to top_k) | Range: 0.0 - 1.0 |
| `P_k` | Precision at rank k (fraction of retrieved documents that are relevant) | User sets k (1 to top_k) | Range: 0.0 - 1.0 |
| `map_cut_k` | Mean Average Precision with cutoff at rank k | User sets k (1 to top_k) | Range: 0.0 - 1.0 |
Note
For an exhaustive list of supported retriever metrics, see Retriever Metrics. When using retrieval metrics in the RAG spec, use the `retriever_` prefix with the `pytrec_eval` type.
Retriever metrics are only computed when a retriever pipeline is specified in the RAG target configuration.
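For intuition about what these scores mean, the following standalone sketch computes recall@k and NDCG@k for a single query from a ranked list of document IDs and a set of relevance judgments. It illustrates the formulas only; the service itself computes these metrics with pytrec_eval.

import math

def recall_at_k(ranked_ids, relevant_ids, k):
    """Fraction of the relevant documents that appear in the top-k results."""
    hits = sum(1 for doc_id in ranked_ids[:k] if doc_id in relevant_ids)
    return hits / len(relevant_ids) if relevant_ids else 0.0

def ndcg_at_k(ranked_ids, relevance, k):
    """NDCG@k with graded relevance judgments (doc_id -> gain)."""
    dcg = sum(relevance.get(doc_id, 0) / math.log2(rank + 2)
              for rank, doc_id in enumerate(ranked_ids[:k]))
    ideal_gains = sorted(relevance.values(), reverse=True)[:k]
    idcg = sum(gain / math.log2(rank + 2) for rank, gain in enumerate(ideal_gains))
    return dcg / idcg if idcg > 0 else 0.0

ranked = ["doc3", "doc1", "doc7", "doc2", "doc9"]   # retriever output for one query
qrels = {"doc1": 1, "doc2": 1}                      # relevance judgments for that query
print(recall_at_k(ranked, set(qrels), 5))           # 1.0 -- both relevant docs are in the top 5
print(ndcg_at_k(ranked, qrels, 5))                  # < 1.0 because the relevant docs are not ranked first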
Answer Generation#
The following table summarizes the available answer generation metrics for RAG evaluation, including their requirements and dependencies:
| Metric Name | Description | Dataset Format(s) | Required Columns | Eval Config Model Endpoints |
|---|---|---|---|---|
| `faithfulness` | Factual consistency of the answer vs. context (0–1, higher is better) | beir, squad, ragas | question, answer, contexts | judge_llm |
| `answer_relevancy` | Relevancy of answer to prompt; penalizes incomplete/redundant answers | beir, squad, ragas | question, answer | judge_llm, judge_embeddings |
| `answer_correctness` | Accuracy vs. ground truth (0–1, higher is better) | ragas | question, answer, ground_truth | judge_llm, judge_embeddings |
| `answer_similarity` | Semantic similarity to ground truth (0–1, higher is better) | ragas | ground_truth, answer | judge_llm, judge_embeddings |
| `context_precision` | Precision of context ranking for ground-truth items (0–1, higher is better) | ragas | question, contexts, ground_truth | judge_llm |
| `context_recall` | Recall: does context align with ground-truth answer? (0–1, higher is better) | ragas | question, contexts, ground_truth | judge_llm |
| `answer_accuracy` | Agreement between model response and reference ground truth via dual LLM-as-a-judge evaluation (0, 1, 2; higher is better) | ragas | question, answer, ground_truth | judge_llm, judge_embeddings |
| `context_relevance` | Evaluates whether retrieved contexts are pertinent to user input via dual LLM-as-a-judge assessment (0, 1, 2; higher is better) | ragas | question, contexts | judge_llm |
| `response_groundedness` | Measures how well response claims are supported by retrieved contexts and can be found within them (0, 1, 2; higher is better) | ragas | question, answer, contexts | judge_llm |
| `context_entity_recall` | Recall of entities in context compared to ground truth (0–1, higher is better) | ragas | question, contexts, ground_truth | judge_llm |
| `noise_sensitivity` | Measures robustness to irrelevant context (0–1, lower is better) | ragas | question, answer, contexts | judge_llm |
| `response_relevancy` | Overall relevancy of the response to the query (0–1, higher is better) | ragas | question, answer | judge_llm, judge_embeddings |
Legend:
- `judge_llm`: Metric uses a large language model as a judge.
- `judge_embeddings`: Metric uses embedding-based similarity.
RAG Generation and Judge Parameters#
The following table summarizes the available parameters for configuring RAG answer generation and judge models:
| Parameter Name | Type | Description | Default | Notes |
|---|---|---|---|---|
| **Document Processing Parameters** | | | | |
| `truncate_long_documents` | string | Truncation strategy for documents exceeding the Milvus 65k character limit | None | Options: "start", "end" |
| **Generation Parameters** | | | | |
| `generation_max_tokens` | int | Maximum number of tokens to generate for RAG responses | None | Controls response length |
| `generation_max_workers` | int | Maximum number of concurrent workers for answer generation | None | Controls generation parallelism |
| `generation_temperature` | float | Temperature for answer generation (0.0-1.0) | None | Higher values increase randomness |
| **Judge LLM Parameters** | | | | |
| `judge_llm` | object | Judge LLM model configuration with api_endpoint | Required | Contains model_id, url, api_key |
| `judge_llm_temperature` | float | Temperature for judge LLM (0.0-1.0) | None | Lower values for more consistent judging |
| `judge_llm_top_p` | float | Top-p sampling for judge LLM (0.0-1.0) | None | Controls token selection diversity |
| `judge_llm_max_tokens` | int | Maximum tokens for judge LLM responses | None | Limits judge response length |
| **Judge Embeddings Parameters** | | | | |
| `judge_embeddings` | object | Configuration for the embedding model used by the judge (e.g., for answer_similarity) | Required | For similarity-based metrics |
| **Request Management** | | | | |
| `judge_timeout` | int | Timeout for judge requests (seconds) | None | |
| `judge_max_retries` | int | Maximum retries for failed judge requests | None | Error handling for judge calls |
| `judge_max_workers` | int | Maximum concurrent judge workers | None | Controls judge evaluation parallelism |
Usage Example:
params:
judge_llm:
api_endpoint:
url: "https://integrate.api.nvidia.com/v1"
model_id: "nvdev/meta/llama-3.1-8b-instruct"
judge_embeddings:
api_endpoint:
url: "https://integrate.api.nvidia.com/v1"
model_id: "nvidia/nv-embedqa-e5-v5"
judge_timeout: 120
judge_max_retries: 5
judge_max_workers: 2
truncate_long_documents: "end"
generation_max_tokens: 512
generation_temperature: 0.8
judge_llm_temperature: 0.1
judge_llm_top_p: 0.9
judge_llm_max_tokens: 1024
Custom Dataset Format#
BEIR#
The BEIR (Benchmarking Information Retrieval) framework supports various datasets for evaluating retrieval systems. Supported BEIR datasets include:
- `fiqa` - Financial question answering dataset
- `nfcorpus` - Natural language corpus for biomedical information retrieval
- `scidocs` - Scientific document retrieval and citation recommendation
- `scifact` - Scientific fact verification dataset
Note
For a complete list of available BEIR datasets, refer to the BEIR repository.
corpus.jsonl (BEIR)#
For BEIR, the `corpus.jsonl` file contains a list of dictionaries with the following fields:

| Field | Type | Required | Description |
|---|---|---|---|
| `_id` | string | Yes | Unique document identifier. |
| `title` | string | No | Document title (optional). |
| `text` | string | Yes | Document paragraph or passage. |
{"_id": "doc1", "title": "Albert Einstein", "text": "Albert Einstein was a German-born...."}
queries.jsonl (BEIR)#
The `queries.jsonl` file contains a list of dictionaries with the following fields:

| Field | Type | Required | Description |
|---|---|---|---|
| `_id` | string | Yes | Unique query identifier. |
| `text` | string | Yes | Query text. |
{"_id": "q1", "text": "Who developed the mass-energy equivalence formula?"}
qrels.tsv (BEIR)#
The `qrels.tsv` file is a tab-separated file with three columns: `query-id`, `corpus-id`, and `score`. The first row should be a header.

| Field | Type | Required | Description |
|---|---|---|---|
| `query-id` | string | Yes | Query identifier (matches `_id` in `queries.jsonl`). |
| `corpus-id` | string | Yes | Document identifier (matches `_id` in `corpus.jsonl`). |
| `score` | integer | Yes | Relevance score (typically 1 for relevant, 0 for not relevant). |
query-id corpus-id score
q1 doc1 1
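As a concrete illustration of the three files, the following sketch writes a minimal BEIR-formatted dataset directory. File names follow the descriptions above; adjust the directory layout to match how your dataset is packaged and referenced by `files_url`.

import csv
import json
from pathlib import Path

out_dir = Path("my-beir-dataset")
out_dir.mkdir(exist_ok=True)

corpus = [{"_id": "doc1", "title": "Albert Einstein",
           "text": "Albert Einstein was a German-born theoretical physicist."}]
queries = [{"_id": "q1", "text": "Who developed the mass-energy equivalence formula?"}]
qrels = [("q1", "doc1", 1)]

# corpus.jsonl and queries.jsonl: one JSON object per line.
with (out_dir / "corpus.jsonl").open("w") as f:
    f.writelines(json.dumps(row) + "\n" for row in corpus)
with (out_dir / "queries.jsonl").open("w") as f:
    f.writelines(json.dumps(row) + "\n" for row in queries)

# qrels.tsv: tab-separated with a header row.
with (out_dir / "qrels.tsv").open("w", newline="") as f:
    writer = csv.writer(f, delimiter="\t")
    writer.writerow(["query-id", "corpus-id", "score"])
    writer.writerows(qrels)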
SQuAD#
squad.json (SQuAD)#
For SQuAD, the `squad.json` file contains question-answer pairs with their corresponding context passages in a structured format. It follows the official SQuAD dataset structure with nested fields for data, paragraphs, and question-answer pairs.

| Field | Type | Required | Description |
|---|---|---|---|
| `data` | list of objects | Yes | List of data entries, each with paragraphs. |
| `paragraphs` | list of objects | Yes | List of paragraphs for each data entry. |
| `context` | string | Yes | Context passage for the questions. |
| `document_id` | string | Yes | Document identifier. |
| `qas` | list of objects | Yes | List of question-answer pairs. |
| `question` | string | Yes | The question being asked. |
| `id` | string | Yes | Unique identifier for the question. |
| `answers` | list of objects | Yes | List of answers, each with a `text` field. |
| `text` | string | Yes | The answer text (inside `answers`). |
{
"data": [
{
"paragraphs": [
{
"context": "my context",
"document_id": "my id",
"qas": [
{
"question": "my question",
"id": "my id",
"answers": [
{"text": "my answer"}
]
}
]
}
]
}
]
}
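A short sketch that assembles flat question/answer/context records into this nested structure and writes `squad.json` (the input records and their field names are hypothetical; only the output layout follows the format above):

import json

# Hypothetical flat records to convert into the nested SQuAD layout.
records = [
    {"document_id": "doc-einstein",
     "context": "Albert Einstein developed the theory of relativity.",
     "question": "Who developed the theory of relativity?",
     "answer": "Albert Einstein",
     "qid": "q1"},
]

squad = {"data": []}
for rec in records:
    squad["data"].append({
        "paragraphs": [{
            "context": rec["context"],
            "document_id": rec["document_id"],
            "qas": [{
                "question": rec["question"],
                "id": rec["qid"],
                "answers": [{"text": rec["answer"]}],
            }],
        }]
    })

with open("squad.json", "w") as f:
    json.dump(squad, f, indent=2)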
Ragas#
ragas.json (Ragas)#
For Ragas, the `ragas.json` file contains questions, contexts, answers, and ground truth for evaluating RAG systems. This format allows for comprehensive assessment of retrieval and generation quality.

| Field | Type | Required | Description |
|---|---|---|---|
| `question` | list of strings | Yes | List of questions. |
| `contexts` | list of lists of strings | No | List of context passages for each question. |
| `answer` | list of strings | No | List of predicted answers for each question. |
| `ground_truth` | list of strings | No | List of ground truth answers for each question. |
{
"question": ["question #1", "question #2"],
# Optional. Used for Answer Generation and Answer Evaluation (for some specific RAG metrics)
"contexts": [["context #1 for question #1", "context #2 for question #1"], ["context #1 for question #2", "context #2 for question #2"]],
# Optional. Used for Answer Evaluation (for some specific RAG metrics)
"answer": ["predicted answer for question #1", "predicted answer for question #2"],
# Optional. Used for Answer Evaluation (for some specific RAG metrics)
"ground_truth": ["ground truth answer for question #1", "ground truth answer for question #2"]
}
{
"question": [
"When did the 2024 Paris Olympics opening ceremony take place?",
"Where was the 2024 Paris Olympics opening ceremony held?",
"Who lit the Olympic cauldron during the 2024 Paris Olympics opening ceremony?"
],
"contexts": [
[
"The 2024 Paris Olympics officially began with the opening ceremony on July 26, 2024.",
"The ceremony was staged along the River Seine in Paris, marking the first time an Olympic opening was held outside a traditional stadium.",
"French swimmer Marie Wattel had the honor of lighting the Olympic cauldron."
],
[
"The 2024 Paris Olympics officially began with the opening ceremony on July 26, 2024.",
"The ceremony was staged along the River Seine in Paris, marking the first time an Olympic opening was held outside a traditional stadium.",
"French swimmer Marie Wattel had the honor of lighting the Olympic cauldron."
],
[
"The 2024 Paris Olympics officially began with the opening ceremony on July 26, 2024.",
"The ceremony was staged along the River Seine in Paris, marking the first time an Olympic opening was held outside a traditional stadium.",
"French swimmer Marie Wattel had the honor of lighting the Olympic cauldron."
]
],
"ground_truth": [
"July 26, 2024",
"Along the River Seine, Paris",
"French swimmer Marie Wattel"
]
}
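To produce a `ragas.json` from per-question records, you can pivot row-wise data into the column-wise lists shown above. A minimal sketch (the input records are hypothetical; only the output layout follows the format above):

import json

# Hypothetical per-question records to pivot into the column-wise ragas.json layout.
records = [
    {"question": "When did the 2024 Paris Olympics opening ceremony take place?",
     "contexts": ["The 2024 Paris Olympics officially began with the opening ceremony on July 26, 2024."],
     "ground_truth": "July 26, 2024"},
]

ragas_data = {
    "question": [r["question"] for r in records],
    "contexts": [r["contexts"] for r in records],
    "ground_truth": [r["ground_truth"] for r in records],
}

with open("ragas.json", "w") as f:
    json.dump(ragas_data, f, indent=2)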