Frontier Judge QA Filter | NVIDIA NeMo Data Designer

Download Recipe

1 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2 # SPDX-License-Identifier: Apache-2.0
3 # /// script
4 # requires-python = ">=3.10"
5 # dependencies = [
6 #     "data-designer>=0.5.6",
7 # ]
8 # ///
9 """Long-Document Understanding Frontier Model QA Judge Recipe
10 
11 Use a frontier VLM as an LLM-as-a-judge to evaluate the quality
12 of (question, answer) pairs generated by the upstream visual QA recipes. The
13 judge scores each example across five rubrics:
14 
15   1. **Answer Correctness** – factual accuracy against the visible document
16   2. **Question Quality** – reasoning depth, ambiguity, specificity
17   3. **Visual Grounding** – reliance on visual elements vs. plain text
18   4. **Format Compliance** – answer format matches the question type
19   5. **Training Signal Strength** – overall value as VLM training data
20 
21 A weighted composite score (0–1) is computed from the five rubric scores.
22 
23 Prerequisites:
24     - A seed parquet file containing output from an upstream QA recipe
25       (e.g. 05-visual-qa-sdg.py, 06-single-page-qa-sdg.py, or
26       08-whole-document-qa-sdg.py) with at least:
27         * `png_images_base64` – JSON array of base64-encoded PNG(s) of
28           document pages.
29         * `question_type`  – classification of the question.
30         * `question`       – the generated question.
31         * `answer`         – the generated answer.
32     - Access to a frontier model endpoint that exposes an OpenAI-compatible
33       API. Provide the model ID, endpoint URL, and the name of the
34       environment variable holding the API key via the CLI flags
35       ``--model-id``, ``--endpoint``, and ``--api-key-env``.
36 
37 Run:
38     # Basic usage (judges 5 records by default)
39     uv run 09-frontier-judge-sdg.py --seed-path my_qa_output.parquet \
40         --model-id <model-id> --endpoint <endpoint-url> --api-key-env <ENV_VAR>
41 
42     # Custom record count
43     uv run 09-frontier-judge-sdg.py --seed-path my_qa_output.parquet \
44         --model-id <model-id> --endpoint <endpoint-url> --api-key-env <ENV_VAR> \
45         --num-records 100
46 
47     # For help message and available options
48     uv run 09-frontier-judge-sdg.py --help
49 """
50 
import argparse
from pathlib import Path

import data_designer.config as dd
from data_designer.interface import DataDesigner, DatasetCreationResults
55 
# Provider name shared by the judge's model config (build_config) and the
# provider definition (create_dataset); both sides must use this same value.
PROVIDER_NAME = "frontier"
57 
58 # =============================================================================
59 # Score weights for the weighted composite
60 # =============================================================================
61 
# Relative weight of each rubric in the composite score, keyed by rubric name.
# The weights add up to 1.0, so dividing the weighted sum of 0-5 rubric scores
# by 5 (as compute_weighted_score does) yields a value in the 0-1 range.
# Keep the keys in sync with the `name` of each dd.Score defined below.
FINAL_SCORE_WEIGHTS: dict[str, float] = {
    "Answer Correctness": 0.35,
    "Training Signal Strength": 0.30,
    "Question Quality": 0.15,
    "Visual Grounding": 0.10,
    "Format Compliance": 0.10,
}
69 
70 # =============================================================================
71 # Custom column: weighted composite score
72 # =============================================================================
73 
74 
@dd.custom_column_generator(required_columns=["qa_quality_judge"])
def compute_weighted_score(row: dict) -> dict:
    """Add the weighted composite of the judge's rubric scores to ``row``.

    For every rubric listed in ``FINAL_SCORE_WEIGHTS`` the judge's ``score``
    is converted to ``float`` and multiplied by the rubric's weight. The
    weighted sum is divided by 5.0 (the top rubric score) and rounded to two
    decimals.

    Args:
        row: Record containing ``qa_quality_judge``, a mapping from rubric
            name to an entry that has a ``"score"`` field.

    Returns:
        The same ``row`` object with ``weighted_quality_score`` set.

    Raises:
        KeyError: If a weighted rubric or its ``"score"`` field is missing.
        ValueError: If a score cannot be converted to ``float``.
    """
    rubric_results = row["qa_quality_judge"]

    weighted_total = 0
    for rubric_name, weight in FINAL_SCORE_WEIGHTS.items():
        rubric_score = float(rubric_results[rubric_name]["score"])
        weighted_total += rubric_score * weight

    # Rubric scores top out at 5, so this rescales the composite to 0-1.
    row["weighted_quality_score"] = round(weighted_total / 5.0, 2)
    return row
82 
83 
84 # =============================================================================
85 # Judge prompt
86 # =============================================================================
87 
# Prompt template for the judge column. The `{{ question_type }}`,
# `{{ question }}` and `{{ answer }}` placeholders are presumably rendered from
# the seed columns of the same names (verify against Data Designer's templating).
# The page images are not part of this text; build_config attaches them to the
# judge column through `multi_modal_context`.
PROMPT_JUDGE = """\
You are an expert evaluator of visual document question-answering (VQA) training data
for the MMLongBench-Doc benchmark.

Your task is to assess the quality of a (question, answer) pair generated from a PDF
document image. The goal is to determine how strong of a training signal this example
would provide for improving VLM performance.

You will be given:
- One or more images of document pages (with tables, charts, diagrams, etc.)
- A question type classification
- A question about the document
- An answer to the question

<question-type>
{{ question_type }}
</question-type>

<question>
{{ question }}
</question>

<answer>
{{ answer }}
</answer>

Evaluate the example across the following rubrics. For each rubric, provide a brief
reasoning and a score. Be objective and critical -- do not inflate scores.

{
  "Answer Correctness": {
    "reasoning": "Your brief analysis here",
    "score": "X"
  },
  "Question Quality": {
    "reasoning": "Your brief analysis here",
    "score": "X"
  },
  "Visual Grounding": {
    "reasoning": "Your brief analysis here",
    "score": "X"
  },
  "Format Compliance": {
    "reasoning": "Your brief analysis here",
    "score": "X"
  },
  "Training Signal Strength": {
    "reasoning": "Your brief analysis here",
    "score": "X"
  }
}

Provide your evaluation in the exact JSON format above with ALL 5 rubrics.
Keep your reasoning for each rubric short and to the point.
"""
143 
144 # =============================================================================
145 # Score rubric definitions
146 # =============================================================================
147 
# Rubric 1 of 5: factual accuracy of the answer against the visible document,
# graded from "0" (refusal / nonsensical) to "5" (exactly correct).
# The name must match its key in FINAL_SCORE_WEIGHTS (weight 0.35), which
# compute_weighted_score uses to index the judge output.
answer_correctness_score = dd.Score(
    name="Answer Correctness",
    description=(
        "Is the answer factually correct given the visible document content? "
        "Verify by examining the image yourself. For calculations, redo the math. "
        "For counts, recount. For lists, check completeness."
    ),
    options={
        "5": "Exactly correct: answer matches the visible content precisely, calculations are accurate, lists are complete",
        "4": "Substantially correct: answer is right with minor imprecision (e.g., rounding differences within +/-5%, equivalent formats like '25%' vs '0.25')",
        "3": "Partially correct: core answer is right but has notable issues (missing list items, slightly off calculation, incomplete but not wrong)",
        "2": "Mostly incorrect: answer has the right idea but wrong values, wrong entity, or significant calculation errors",
        "1": "Incorrect: answer contradicts the visible content, uses wrong data, or is completely off",
        "0": "Not answerable or refused: answer is a refusal, 'Not answerable', or nonsensical when a real answer exists",
    },
)
164 
# Rubric 2 of 5: how well-formed, unambiguous and reasoning-heavy the question
# is, graded from "0" (invalid) to "5" (excellent).
# The name must match its key in FINAL_SCORE_WEIGHTS (weight 0.15), which
# compute_weighted_score uses to index the judge output.
question_quality_score = dd.Score(
    name="Question Quality",
    description=(
        "Is the question well-formed, unambiguous, and appropriately challenging? "
        "Does it require genuine reasoning (comparison, calculation, counting) rather than trivial lookup? "
        "Is it specific to the visual content and not generic?"
    ),
    options={
        "5": "Excellent: requires clear reasoning (comparison, calculation, or cross-element synthesis), unambiguous, has exactly one correct answer, well-matched to the visual element type",
        "4": "Good: requires some reasoning, mostly unambiguous, well-grounded in the visual content with minor issues",
        "3": "Adequate: reasonable question but either too easy (direct lookup), slightly ambiguous, or not well-matched to the visual element type",
        "2": "Poor: trivial lookup, ambiguous wording, or asks about content not well-suited to the visual element type",
        "1": "Very poor: unanswerable from the image, contains the answer, or is about irrelevant content",
        "0": "Invalid: nonsensical, empty, or completely unrelated to the document",
    },
)
181 
# Rubric 3 of 5: whether answering requires the visual elements (tables,
# charts, diagrams) rather than plain text, graded from "0" (no grounding) to
# "5" (excellent).
# The name must match its key in FINAL_SCORE_WEIGHTS (weight 0.10), which
# compute_weighted_score uses to index the judge output.
visual_grounding_score = dd.Score(
    name="Visual Grounding",
    description=(
        "Does the question target the actual visual elements (tables, charts, diagrams) in the image? "
        "Does answering require examining the visual structure, not just reading plain text? "
        "Is the question grounded in specific, identifiable elements?"
    ),
    options={
        "5": "Excellent: question directly targets specific visual elements (chart data, table cells, diagram nodes), answering requires visual perception and spatial understanding",
        "4": "Good: question is grounded in visual content with clear references to identifiable elements, requires examining the visual structure",
        "3": "Adequate: question relates to visual content but could partially be answered from text alone, or uses vague references ('the table' without specificity)",
        "2": "Poor: question mostly targets plain text content, minimal visual grounding, could be answered without seeing the visual elements",
        "1": "Very poor: question has no meaningful connection to the visual elements, purely text-based",
        "0": "No grounding: question is about content not present in the image at all",
    },
)
198 
# Rubric 4 of 5: whether the answer's format fits its question type and is free
# of reasoning traces, graded from "0" (no compliance) to "5" (perfect).
# The name must match its key in FINAL_SCORE_WEIGHTS (weight 0.10), which
# compute_weighted_score uses to index the judge output.
format_compliance_score = dd.Score(
    name="Format Compliance",
    description=(
        "Does the answer match the expected format for its question type? "
        "Check: multiple choice uses 'A. option' format, yes/no is exactly 'Yes'/'No', "
        "percentages include '%', integers are digits only, lists are JSON arrays, "
        "and the answer contains no reasoning traces or meta-commentary."
    ),
    options={
        "5": "Perfect compliance: answer format exactly matches the question type requirements, no extraneous content",
        "4": "Good compliance: correct format with trivial deviations (e.g., extra whitespace, minor punctuation)",
        "3": "Adequate: answer is usable but has format issues (e.g., missing units, prose instead of JSON array, includes 'Based on the image...')",
        "2": "Poor: significant format violations (e.g., includes reasoning steps, wrong answer structure, contains <think> tags)",
        "1": "Very poor: answer format is fundamentally wrong for the question type",
        "0": "No compliance: answer is empty, garbled, or completely ignores format requirements",
    },
)
216 
# Rubric 5 of 5: overall value of the pair as VLM training data, graded from
# "0" (harmful to training) to "5" (excellent, multi-page evidence + reasoning).
# The name must match its key in FINAL_SCORE_WEIGHTS (weight 0.30), which
# compute_weighted_score uses to index the judge output.
training_signal_score = dd.Score(
    name="Training Signal Strength",
    description=(
        "Overall, how valuable is this (question, answer) pair as training data for improving "
        "VLM performance on document understanding? Consider: does it exercise visual perception, "
        "require non-trivial reasoning, demand multi-page evidence gathering, and provide a clear learning signal?"
    ),
    options={
        "5": "Excellent: requires combining evidence from multiple pages, exercises visual perception + reasoning, non-trivial, clear correct answer. Would meaningfully improve a VLM on document QA benchmarks",
        "4": "Strong: good training example with cross-page reasoning or strong single-page visual grounding and reasoning, minor issues don't significantly reduce value",
        "3": "Moderate: decent training signal but answerable from a single page, or doesn't fully exercise multi-page or visual understanding",
        "2": "Weak: limited training value -- trivial question, wrong answer, single-page lookup, or doesn't require visual reasoning",
        "1": "Very weak: almost no training value -- incorrect, ambiguous, or completely text-based with no multi-page dependency",
        "0": "No value: harmful to training -- wrong answer presented as correct, nonsensical, or would teach bad patterns",
    },
)
233 
234 
235 # =============================================================================
236 # Config builder
237 # =============================================================================
238 
239 
def build_config(
    seed_path: str = "seed.parquet",
    model_alias: str = "frontier-judge-vlm",
    model_id: str = "",
) -> dd.DataDesignerConfigBuilder:
    """Assemble the Data Designer pipeline that judges seed QA pairs.

    The returned builder registers one judge model, reads the seed file with
    the ordered sampling strategy, and defines two columns:
    ``qa_quality_judge`` (the five-rubric judge, given the page images from
    ``png_images_base64``) and ``weighted_quality_score`` (the composite
    derived by ``compute_weighted_score``).

    Args:
        seed_path: Path to the seed file with the upstream QA recipe output.
        model_alias: Alias linking the judge column to its model config.
        model_id: Model identifier passed to the ``PROVIDER_NAME`` provider.

    Returns:
        The configured ``dd.DataDesignerConfigBuilder``.
    """
    judge_model = dd.ModelConfig(
        alias=model_alias,
        model=model_id,
        provider=PROVIDER_NAME,
        inference_parameters=dd.ChatCompletionInferenceParams(
            timeout=300,
            max_tokens=40000,
            max_parallel_requests=32,
        ),
    )
    builder = dd.DataDesignerConfigBuilder(model_configs=[judge_model])

    seed_source = dd.LocalFileSeedSource(path=seed_path)
    builder.with_seed_dataset(seed_source, sampling_strategy=dd.SamplingStrategy.ORDERED)

    # The judge sees the rendered prompt plus the base64 PNG page images.
    page_images = dd.ImageContext(
        column_name="png_images_base64",
        data_type=dd.ModalityDataType.BASE64,
        image_format=dd.ImageFormat.PNG,
    )
    rubrics = [
        answer_correctness_score,
        question_quality_score,
        visual_grounding_score,
        format_compliance_score,
        training_signal_score,
    ]
    judge_column = dd.LLMJudgeColumnConfig(
        name="qa_quality_judge",
        model_alias=model_alias,
        prompt=PROMPT_JUDGE,
        scores=rubrics,
        multi_modal_context=[page_images],
    )
    builder.add_column(judge_column)

    # Added after the judge column because it consumes "qa_quality_judge".
    composite_column = dd.CustomColumnConfig(
        name="weighted_quality_score",
        generator_function=compute_weighted_score,
    )
    builder.add_column(composite_column)

    return builder
296 
297 
298 # =============================================================================
299 # Dataset creation
300 # =============================================================================
301 
302 
def create_dataset(
    config_builder: dd.DataDesignerConfigBuilder,
    num_records: int,
    endpoint: str = "",
    api_key_env: str = "",
    artifact_path: Path | str | None = None,
) -> DatasetCreationResults:
    """Run the judging pipeline and return the creation results.

    Registers a single ``"openai"``-type provider named ``PROVIDER_NAME`` for
    the given endpoint, applies a run config with early shutdown disabled and
    the TUI enabled, and creates the dataset under the name
    ``"frontier_judge"``.

    Args:
        config_builder: Pipeline definition, e.g. from ``build_config``.
        num_records: Number of records to create.
        endpoint: URL of the OpenAI-compatible API endpoint.
        api_key_env: Name of the environment variable holding the API key;
            forwarded as the provider's ``api_key``.
        artifact_path: Location handed to ``DataDesigner`` for its artifacts.

    Returns:
        The ``DatasetCreationResults`` returned by ``DataDesigner.create``.
    """
    judge_provider = dd.ModelProvider(
        name=PROVIDER_NAME,
        endpoint=endpoint,
        provider_type="openai",
        api_key=api_key_env,
    )
    designer = DataDesigner(
        artifact_path=artifact_path,
        model_providers=[judge_provider],
    )

    run_config = dd.RunConfig(disable_early_shutdown=True, display_tui=True)
    designer.set_run_config(run_config)

    return designer.create(config_builder, num_records=num_records, dataset_name="frontier_judge")
328 
329 
330 # =============================================================================
331 # CLI entry point
332 # =============================================================================
333 
334 if __name__ == "__main__":
335     from argparse import ArgumentParser
336 
337     parser = ArgumentParser()
338     parser.add_argument("--seed-path", type=str, required=True, help="Path to the seed parquet file")
339     parser.add_argument("--model-alias", type=str, default="frontier-judge-vlm")
340     parser.add_argument("--model-id", type=str, required=True, help="ID of the model to use for judging")
341     parser.add_argument("--endpoint", type=str, required=True, help="OpenAI-compatible API endpoint URL")
342     parser.add_argument(
343         "--api-key-env", type=str, required=True, help="Environment variable name containing the API key"
344     )
345     parser.add_argument("--num-records", type=int, default=5)
346     parser.add_argument("--artifact-path", type=str, default=None)
347     args = parser.parse_args()
348 
349     config_builder = build_config(
350         seed_path=args.seed_path,
351         model_alias=args.model_alias,
352         model_id=args.model_id,
353     )
354     results = create_dataset(
355         config_builder,
356         num_records=args.num_records,
357         endpoint=args.endpoint,
358         api_key_env=args.api_key_env,
359         artifact_path=args.artifact_path,
360     )
361 
362     print(f"Dataset saved to: {results.artifact_storage.final_dataset_path}")
363 
364     results.load_analysis().to_report()