| 1 | # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. |
| 2 | # SPDX-License-Identifier: Apache-2.0 |
| 3 | # /// script |
| 4 | # requires-python = ">=3.10" |
| 5 | # dependencies = [ |
| 6 | # "data-designer>=0.5.6", |
| 7 | # ] |
| 8 | # /// |
| 9 | """Long-Document Understanding Frontier Model QA Judge Recipe |
| 10 | |
| 11 | Use a frontier VLM as an LLM-as-a-judge to evaluate the quality |
| 12 | of (question, answer) pairs generated by the upstream visual QA recipes. The |
| 13 | judge scores each example across five rubrics: |
| 14 | |
| 15 | 1. **Answer Correctness** – factual accuracy against the visible document |
| 16 | 2. **Question Quality** – reasoning depth, ambiguity, specificity |
| 17 | 3. **Visual Grounding** – reliance on visual elements vs. plain text |
| 18 | 4. **Format Compliance** – answer format matches the question type |
| 19 | 5. **Training Signal Strength** – overall value as VLM training data |
| 20 | |
| 21 | A weighted composite score (0–1) is computed from the five rubric scores. |
| 22 | |
| 23 | Prerequisites: |
| 24 | - A seed parquet file containing output from an upstream QA recipe |
| 25 | (e.g. 05-visual-qa-sdg.py, 06-single-page-qa-sdg.py, or |
| 26 | 08-whole-document-qa-sdg.py) with at least: |
| 27 | * `png_images_base64` – JSON array of base64-encoded PNG(s) of |
| 28 | document pages. |
| 29 | * `question_type` – classification of the question. |
| 30 | * `question` – the generated question. |
| 31 | * `answer` – the generated answer. |
| 32 | - Access to a frontier model endpoint that exposes an OpenAI-compatible |
| 33 | API. Provide the model ID, endpoint URL, and the name of the |
| 34 | environment variable holding the API key via the CLI flags |
| 35 | ``--model-id``, ``--endpoint``, and ``--api-key-env``. |
| 36 | |
| 37 | Run: |
| 38 | # Basic usage (judges 5 records by default) |
| 39 | uv run 09-frontier-judge-sdg.py --seed-path my_qa_output.parquet \ |
| 40 | --model-id <model-id> --endpoint <endpoint-url> --api-key-env <ENV_VAR> |
| 41 | |
| 42 | # Custom record count |
| 43 | uv run 09-frontier-judge-sdg.py --seed-path my_qa_output.parquet \ |
| 44 | --model-id <model-id> --endpoint <endpoint-url> --api-key-env <ENV_VAR> \ |
| 45 | --num-records 100 |
| 46 | |
| 47 | # For help message and available options |
| 48 | uv run 09-frontier-judge-sdg.py --help |
| 49 | """ |
| 50 | |
| 51 | from pathlib import Path |
| 52 | |
| 53 | import data_designer.config as dd |
| 54 | from data_designer.interface import DataDesigner, DatasetCreationResults |
| 55 | |
# Provider name shared by the ModelConfig created in build_config() and the
# ModelProvider registered in create_dataset(); the two must match.
PROVIDER_NAME = "frontier"

# =============================================================================
# Score weights for the weighted composite
# =============================================================================

# Per-rubric weights used by compute_weighted_score(). Each key must equal the
# `name` of one of the dd.Score rubrics defined below, because the judge output
# is indexed by these keys. The five weights add up to 1.0, so the composite
# stays on the same 0-5 scale as the individual rubric scores before it is
# divided by 5.
FINAL_SCORE_WEIGHTS = {
    "Answer Correctness": 0.35,
    "Training Signal Strength": 0.30,
    "Question Quality": 0.15,
    "Visual Grounding": 0.10,
    "Format Compliance": 0.10,
}
| 69 | |
| 70 | # ============================================================================= |
| 71 | # Custom column: weighted composite score |
| 72 | # ============================================================================= |
| 73 | |
| 74 | |
@dd.custom_column_generator(required_columns=["qa_quality_judge"])
def compute_weighted_score(row: dict) -> dict:
    """Attach the weighted composite quality score to a judged row.

    Reads the per-rubric results stored under ``row["qa_quality_judge"]``,
    multiplies each rubric's ``"score"`` (converted with ``float``) by its
    weight from ``FINAL_SCORE_WEIGHTS``, sums the products, and divides by
    5.0 -- the highest option offered by every rubric -- so the result lands
    in 0-1 as long as the weights sum to 1.

    Args:
        row: Record containing a ``qa_quality_judge`` mapping with one entry
            per key of ``FINAL_SCORE_WEIGHTS``; each entry must expose a
            ``"score"`` that ``float`` accepts.

    Returns:
        The same ``row`` object, mutated in place with the key
        ``weighted_quality_score`` set to the composite rounded to two
        decimals.

    Raises:
        KeyError: If a rubric or its ``"score"`` entry is missing.
        TypeError, ValueError: If a score cannot be converted to ``float``.
    """
    verdicts = row["qa_quality_judge"]
    contributions = [
        float(verdicts[rubric]["score"]) * weight
        for rubric, weight in FINAL_SCORE_WEIGHTS.items()
    ]
    composite = sum(contributions) / 5.0
    row["weighted_quality_score"] = round(composite, 2)
    return row
| 82 | |
| 83 | |
| 84 | # ============================================================================= |
| 85 | # Judge prompt |
| 86 | # ============================================================================= |
| 87 | |
# Prompt template handed to the LLM judge column in build_config().
# The double-brace placeholders (question_type, question, answer) share their
# names with the seed columns listed in the module docstring; presumably Data
# Designer substitutes them per record -- confirm against the library docs.
# The trailing backslash after the opening quotes keeps the prompt from
# starting with an empty line. The JSON skeleton names the same five rubrics
# that are passed as `scores` to the judge column.
PROMPT_JUDGE = """\
You are an expert evaluator of visual document question-answering (VQA) training data
for the MMLongBench-Doc benchmark.

Your task is to assess the quality of a (question, answer) pair generated from a PDF
document image. The goal is to determine how strong of a training signal this example
would provide for improving VLM performance.

You will be given:
- One or more images of document pages (with tables, charts, diagrams, etc.)
- A question type classification
- A question about the document
- An answer to the question

<question-type>
{{ question_type }}
</question-type>

<question>
{{ question }}
</question>

<answer>
{{ answer }}
</answer>

Evaluate the example across the following rubrics. For each rubric, provide a brief
reasoning and a score. Be objective and critical -- do not inflate scores.

{
"Answer Correctness": {
"reasoning": "Your brief analysis here",
"score": "X"
},
"Question Quality": {
"reasoning": "Your brief analysis here",
"score": "X"
},
"Visual Grounding": {
"reasoning": "Your brief analysis here",
"score": "X"
},
"Format Compliance": {
"reasoning": "Your brief analysis here",
"score": "X"
},
"Training Signal Strength": {
"reasoning": "Your brief analysis here",
"score": "X"
}
}

Provide your evaluation in the exact JSON format above with ALL 5 rubrics.
Keep your reasoning for each rubric short and to the point.
"""
| 143 | |
| 144 | # ============================================================================= |
| 145 | # Score rubric definitions |
| 146 | # ============================================================================= |
| 147 | |
# Rubric: factual accuracy of the answer against the document image.
# Options are keyed by the string scores "5" (best) down to "0"; the name must
# match the "Answer Correctness" key in FINAL_SCORE_WEIGHTS (weight 0.35).
answer_correctness_score = dd.Score(
    name="Answer Correctness",
    description=(
        "Is the answer factually correct given the visible document content? "
        "Verify by examining the image yourself. For calculations, redo the math. "
        "For counts, recount. For lists, check completeness."
    ),
    options={
        "5": "Exactly correct: answer matches the visible content precisely, calculations are accurate, lists are complete",
        "4": "Substantially correct: answer is right with minor imprecision (e.g., rounding differences within +/-5%, equivalent formats like '25%' vs '0.25')",
        "3": "Partially correct: core answer is right but has notable issues (missing list items, slightly off calculation, incomplete but not wrong)",
        "2": "Mostly incorrect: answer has the right idea but wrong values, wrong entity, or significant calculation errors",
        "1": "Incorrect: answer contradicts the visible content, uses wrong data, or is completely off",
        "0": "Not answerable or refused: answer is a refusal, 'Not answerable', or nonsensical when a real answer exists",
    },
)
| 164 | |
# Rubric: how well-formed, unambiguous, and reasoning-heavy the question is.
# Options are keyed by the string scores "5" (best) down to "0"; the name must
# match the "Question Quality" key in FINAL_SCORE_WEIGHTS (weight 0.15).
question_quality_score = dd.Score(
    name="Question Quality",
    description=(
        "Is the question well-formed, unambiguous, and appropriately challenging? "
        "Does it require genuine reasoning (comparison, calculation, counting) rather than trivial lookup? "
        "Is it specific to the visual content and not generic?"
    ),
    options={
        "5": "Excellent: requires clear reasoning (comparison, calculation, or cross-element synthesis), unambiguous, has exactly one correct answer, well-matched to the visual element type",
        "4": "Good: requires some reasoning, mostly unambiguous, well-grounded in the visual content with minor issues",
        "3": "Adequate: reasonable question but either too easy (direct lookup), slightly ambiguous, or not well-matched to the visual element type",
        "2": "Poor: trivial lookup, ambiguous wording, or asks about content not well-suited to the visual element type",
        "1": "Very poor: unanswerable from the image, contains the answer, or is about irrelevant content",
        "0": "Invalid: nonsensical, empty, or completely unrelated to the document",
    },
)
| 181 | |
# Rubric: whether answering depends on visual elements (tables, charts,
# diagrams) rather than plain text.
# Options are keyed by the string scores "5" (best) down to "0"; the name must
# match the "Visual Grounding" key in FINAL_SCORE_WEIGHTS (weight 0.10).
visual_grounding_score = dd.Score(
    name="Visual Grounding",
    description=(
        "Does the question target the actual visual elements (tables, charts, diagrams) in the image? "
        "Does answering require examining the visual structure, not just reading plain text? "
        "Is the question grounded in specific, identifiable elements?"
    ),
    options={
        "5": "Excellent: question directly targets specific visual elements (chart data, table cells, diagram nodes), answering requires visual perception and spatial understanding",
        "4": "Good: question is grounded in visual content with clear references to identifiable elements, requires examining the visual structure",
        "3": "Adequate: question relates to visual content but could partially be answered from text alone, or uses vague references ('the table' without specificity)",
        "2": "Poor: question mostly targets plain text content, minimal visual grounding, could be answered without seeing the visual elements",
        "1": "Very poor: question has no meaningful connection to the visual elements, purely text-based",
        "0": "No grounding: question is about content not present in the image at all",
    },
)
| 198 | |
# Rubric: whether the answer's format fits its question type (multiple choice,
# yes/no, percentage, integer, list) and is free of reasoning traces.
# Options are keyed by the string scores "5" (best) down to "0"; the name must
# match the "Format Compliance" key in FINAL_SCORE_WEIGHTS (weight 0.10).
format_compliance_score = dd.Score(
    name="Format Compliance",
    description=(
        "Does the answer match the expected format for its question type? "
        "Check: multiple choice uses 'A. option' format, yes/no is exactly 'Yes'/'No', "
        "percentages include '%', integers are digits only, lists are JSON arrays, "
        "and the answer contains no reasoning traces or meta-commentary."
    ),
    options={
        "5": "Perfect compliance: answer format exactly matches the question type requirements, no extraneous content",
        "4": "Good compliance: correct format with trivial deviations (e.g., extra whitespace, minor punctuation)",
        "3": "Adequate: answer is usable but has format issues (e.g., missing units, prose instead of JSON array, includes 'Based on the image...')",
        "2": "Poor: significant format violations (e.g., includes reasoning steps, wrong answer structure, contains <think> tags)",
        "1": "Very poor: answer format is fundamentally wrong for the question type",
        "0": "No compliance: answer is empty, garbled, or completely ignores format requirements",
    },
)
| 216 | |
# Rubric: overall value of the (question, answer) pair as VLM training data,
# with emphasis on multi-page evidence and visual reasoning.
# Options are keyed by the string scores "5" (best) down to "0"; the name must
# match the "Training Signal Strength" key in FINAL_SCORE_WEIGHTS (weight 0.30).
training_signal_score = dd.Score(
    name="Training Signal Strength",
    description=(
        "Overall, how valuable is this (question, answer) pair as training data for improving "
        "VLM performance on document understanding? Consider: does it exercise visual perception, "
        "require non-trivial reasoning, demand multi-page evidence gathering, and provide a clear learning signal?"
    ),
    options={
        "5": "Excellent: requires combining evidence from multiple pages, exercises visual perception + reasoning, non-trivial, clear correct answer. Would meaningfully improve a VLM on document QA benchmarks",
        "4": "Strong: good training example with cross-page reasoning or strong single-page visual grounding and reasoning, minor issues don't significantly reduce value",
        "3": "Moderate: decent training signal but answerable from a single page, or doesn't fully exercise multi-page or visual understanding",
        "2": "Weak: limited training value -- trivial question, wrong answer, single-page lookup, or doesn't require visual reasoning",
        "1": "Very weak: almost no training value -- incorrect, ambiguous, or completely text-based with no multi-page dependency",
        "0": "No value: harmful to training -- wrong answer presented as correct, nonsensical, or would teach bad patterns",
    },
)
| 233 | |
| 234 | |
| 235 | # ============================================================================= |
| 236 | # Config builder |
| 237 | # ============================================================================= |
| 238 | |
| 239 | |
def build_config(
    seed_path: str = "seed.parquet",
    model_alias: str = "frontier-judge-vlm",
    model_id: str = "",
) -> dd.DataDesignerConfigBuilder:
    """Assemble the Data Designer configuration for the frontier-model judge.

    The configuration registers a single judge model, reads seed records from
    a local file in order, adds an LLM-judge column scored on the five
    rubrics defined in this module, and adds a custom column holding the
    weighted composite score.

    Args:
        seed_path: Path to the seed file with the upstream QA output.
        model_alias: Alias under which the judge model is registered and by
            which the judge column refers to it.
        model_id: Model identifier forwarded to the ``PROVIDER_NAME`` provider.

    Returns:
        A config builder with the seed dataset, the ``qa_quality_judge``
        column, and the ``weighted_quality_score`` column attached.
    """
    # Register the judge model first; the column below refers to it by alias.
    judge_inference = dd.ChatCompletionInferenceParams(
        timeout=300,
        max_tokens=40000,
        max_parallel_requests=32,
    )
    judge_model = dd.ModelConfig(
        alias=model_alias,
        model=model_id,
        provider=PROVIDER_NAME,
        inference_parameters=judge_inference,
    )
    builder = dd.DataDesignerConfigBuilder(model_configs=[judge_model])

    # Seed records are consumed in file order (ORDERED sampling strategy).
    seed_source = dd.LocalFileSeedSource(path=seed_path)
    builder.with_seed_dataset(seed_source, sampling_strategy=dd.SamplingStrategy.ORDERED)

    # The judge sees the page images from the seed column alongside the prompt.
    rubrics = [
        answer_correctness_score,
        question_quality_score,
        visual_grounding_score,
        format_compliance_score,
        training_signal_score,
    ]
    page_images = dd.ImageContext(
        column_name="png_images_base64",
        data_type=dd.ModalityDataType.BASE64,
        image_format=dd.ImageFormat.PNG,
    )
    judge_column = dd.LLMJudgeColumnConfig(
        name="qa_quality_judge",
        model_alias=model_alias,
        prompt=PROMPT_JUDGE,
        scores=rubrics,
        multi_modal_context=[page_images],
    )
    builder.add_column(judge_column)

    # Composite score derived from the judge column by compute_weighted_score.
    composite_column = dd.CustomColumnConfig(
        name="weighted_quality_score",
        generator_function=compute_weighted_score,
    )
    builder.add_column(composite_column)

    return builder
| 296 | |
| 297 | |
| 298 | # ============================================================================= |
| 299 | # Dataset creation |
| 300 | # ============================================================================= |
| 301 | |
| 302 | |
def create_dataset(
    config_builder: dd.DataDesignerConfigBuilder,
    num_records: int,
    endpoint: str = "",
    api_key_env: str = "",
    artifact_path: Path | str | None = None,
) -> DatasetCreationResults:
    """Run the judge configuration and return the creation results.

    Registers one provider named ``PROVIDER_NAME`` of type ``"openai"`` that
    points at ``endpoint``, applies a run config with early shutdown disabled
    and the progress bar enabled, and creates a dataset named
    ``"frontier_judge"``.

    Args:
        config_builder: Configuration produced by ``build_config``.
        num_records: Number of records to request from Data Designer.
        endpoint: Endpoint URL of the OpenAI-compatible API.
        api_key_env: Value passed as the provider's ``api_key``; the CLI
            documents it as the name of the environment variable that holds
            the key.
        artifact_path: Forwarded unchanged to ``DataDesigner``.

    Returns:
        The results object returned by ``DataDesigner.create``.
    """
    frontier_provider = dd.ModelProvider(
        name=PROVIDER_NAME,
        endpoint=endpoint,
        provider_type="openai",
        api_key=api_key_env,
    )
    designer = DataDesigner(
        artifact_path=artifact_path,
        model_providers=[frontier_provider],
    )

    run_config = dd.RunConfig(disable_early_shutdown=True, progress_bar=True)
    designer.set_run_config(run_config)

    return designer.create(config_builder, num_records=num_records, dataset_name="frontier_judge")
| 328 | |
| 329 | |
| 330 | # ============================================================================= |
| 331 | # CLI entry point |
| 332 | # ============================================================================= |
| 333 | |
if __name__ == "__main__":
    # Script entry point: parse the CLI flags, build the judge configuration,
    # create the judged dataset, then print its location and render the
    # analysis report.
    from argparse import ArgumentParser

    parser = ArgumentParser(
        description="Judge generated (question, answer) pairs with a frontier VLM and compute a weighted quality score."
    )
    parser.add_argument("--seed-path", type=str, required=True, help="Path to the seed parquet file")
    parser.add_argument(
        "--model-alias",
        type=str,
        default="frontier-judge-vlm",
        help="Alias under which the judge model is registered (default: %(default)s)",
    )
    parser.add_argument("--model-id", type=str, required=True, help="ID of the model to use for judging")
    parser.add_argument("--endpoint", type=str, required=True, help="OpenAI-compatible API endpoint URL")
    parser.add_argument(
        "--api-key-env", type=str, required=True, help="Environment variable name containing the API key"
    )
    parser.add_argument(
        "--num-records",
        type=int,
        default=5,
        help="Number of records to judge (default: %(default)s)",
    )
    parser.add_argument(
        "--artifact-path",
        type=str,
        default=None,
        help="Artifact path passed to Data Designer; left unset when omitted",
    )
    args = parser.parse_args()

    config_builder = build_config(
        seed_path=args.seed_path,
        model_alias=args.model_alias,
        model_id=args.model_id,
    )
    results = create_dataset(
        config_builder,
        num_records=args.num_records,
        endpoint=args.endpoint,
        api_key_env=args.api_key_env,
        artifact_path=args.artifact_path,
    )

    print(f"Dataset saved to: {results.artifact_storage.final_dataset_path}")

    results.load_analysis().to_report()