| 1 | # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. |
| 2 | # SPDX-License-Identifier: Apache-2.0 |
| 3 | # /// script |
| 4 | # requires-python = ">=3.10" |
| 5 | # dependencies = [ |
| 6 | # "data-designer>=0.5.6", |
| 7 | # ] |
| 8 | # /// |
| 9 | """Long-Document Understanding Visual Question-Answering Recipe |
| 10 | |
| 11 | Generate question-answer pairs grounded in document page images using a |
| 12 | vision-language model (VLM). For each seed record the pipeline: |
| 13 | |
| 14 | 1. Samples a question type (multiple choice, yes/no, free-form, not answerable) |
| 15 | 2. Generates a question conditioned on the page image and its classification |
| 16 | 3. Generates an answer (with chain-of-thought reasoning captured separately) |
| 17 | 4. Evaluates question relevance against the visible content |
| 18 | 5. Evaluates answer correctness against the visible content |
| 19 | |
| 20 | Prerequisites: |
| 21 | - A seed parquet file containing: |
| 22 | * `png_images_base64` – JSON array of base64-encoded PNGs (one |
| 23 | element per page; single-page seeds have a one-element array). |
| 24 | * `page_classification` – JSON describing the visual element type and |
| 25 | reasoning complexity score (produced by 04-page-classification-sdg.py) |
| 26 | - A vLLM-compatible deployment of the VLM (default: Qwen/Qwen3-VL-235B-A22B-Thinking-FP8). |
| 27 | Recommended vLLM launch flags: |
| 28 | --tensor-parallel-size 4 |
| 29 | --max-model-len 50000 |
| 30 | --gpu-memory-utilization 0.90 |
| 31 | --reasoning-parser deepseek_r1 |
| 32 | --limit-mm-per-prompt '{"video": 0}' |
| 33 | --trust-remote-code |
| 34 | |
| 35 | Example launch script for 4× H100: |
| 36 | docker run --gpus all \ |
| 37 | -p 8000:8000 \ |
| 38 | vllm/vllm-openai:latest \ |
| 39 | --model Qwen/Qwen3-VL-235B-A22B-Thinking-FP8 \ |
| 40 | --tensor-parallel-size 4 \ |
| 41 | --max-model-len 50000 \ |
| 42 | --gpu-memory-utilization 0.90 \ |
| 43 | --reasoning-parser deepseek_r1 \ |
| 44 | --limit-mm-per-prompt '{"video": 0}' \ |
| 45 | --trust-remote-code |
| 46 | |
| 47 | Run: |
| 48 | # Basic usage (seed-path should point to the output of 04-page-classification-sdg.py) |
| 49 | uv run 05-visual-qa-sdg.py --vllm-endpoint http://localhost:8000/v1 --seed-path artifacts/page_classification/parquet-files/*.parquet |
| 50 | |
| 51 | # Custom record count |
| 52 | uv run 05-visual-qa-sdg.py --vllm-endpoint http://localhost:8000/v1 --seed-path artifacts/page_classification/parquet-files/*.parquet --num-records 100 |
| 53 | |
| 54 | # For help message and available options |
| 55 | uv run 05-visual-qa-sdg.py --help |
| 56 | """ |
| 57 | |
| 58 | from pathlib import Path |
| 59 | |
| 60 | import data_designer.config as dd |
| 61 | from data_designer.interface import DataDesigner, DatasetCreationResults |
| 62 | |
# Provider name for the vLLM endpoint.
# NOTE(review): presumably the name the endpoint is registered under as a
# model provider — the registration itself is outside this view; confirm.
VLLM_PROVIDER_NAME = "vllm"

# Default vision-language model; the module docstring's vLLM launch example
# serves this same model.
DEFAULT_VLM_MODEL = "Qwen/Qwen3-VL-235B-A22B-Thinking-FP8"
| 65 | |
| 66 | # ============================================================================= |
| 67 | # Image context helper |
| 68 | # ============================================================================= |
| 69 | |
# Image input description: points at the seed column `png_images_base64`,
# whose content is treated as base64-encoded PNG data.
# NOTE(review): a single-element JSON array (one page per seed record) is
# expected in that column — confirm against the upstream seed.
_PAGE_IMAGE_CONTEXT = dd.ImageContext(
    image_format=dd.ImageFormat.PNG,
    data_type=dd.ModalityDataType.BASE64,
    column_name="png_images_base64",
)

# Kept as a one-element list (presumably passed wherever a list of
# multimodal contexts is accepted — usage is outside this view).
IMAGE_CONTEXT = [_PAGE_IMAGE_CONTEXT]
| 78 | |
| 79 | # ============================================================================= |
| 80 | # Prompt templates |
| 81 | # ============================================================================= |
| 82 | |
# Prompt for the question-generation step.
#
# Placeholders filled per record: {{question_type}} and {{page_classification}}.
# The prompt steers the model towards ONE-step reasoning questions about the
# primary visual element and requires the response to contain only the
# question text (plus lettered options for multiple choice).
#
# The guidance for TEXT question types previously asked for "multi-step
# analysis", contradicting the one-step rule stated everywhere else in this
# prompt; it now asks for one step of reasoning or filtering.
PROMPT_QUESTION = """\
You are an expert in creating meaningful questions that test comprehension and reasoning about visual document content.

Your task: Create a targeted question of type <question-type> based on the visual element classification and visible content.

<question-type>
{{question_type}}
</question-type>

<page-classification>
{{page_classification}}
</page-classification>

CRITICAL: Focus on the Visual Element

The <page-classification> identifies the PRIMARY visual element type (e.g., TABULAR, QUANTITATIVE, LOGIC_DIAGRAMS) present in the image.

IMPORTANT: When creating your question, focus EXCLUSIVELY on the area of the image that contains the visual element identified in <page-classification>.
- If primary_categories shows TABULAR with subcategory SIMPLE_TABLE, focus your question on the TABLE content specifically
- If primary_categories shows QUANTITATIVE with subcategory BAR_CHART, focus your question on the CHART data specifically
- Ignore any surrounding text, headers, footers, or decorative elements that are not part of the primary visual element
- Your question should be about the DATA/CONTENT within the visual element, not about peripheral information

═══════════════════════════════════════════════════════════════════════════════
IMPORTANT: PREFER SIMPLE REASONING QUESTIONS
═══════════════════════════════════════════════════════════════════════════════

Create questions that require ONE STEP of reasoning or calculation - not just reading a value, but not overly complex either.

PREFERRED question types (aim for these):

1. SIMPLE COMPARISONS (no calculation needed):
✓ "Which category has the highest/lowest value?"
✓ "Is X greater than Y?"
✓ "Which period showed the largest increase?"
→ Requires comparing 2-4 values, no math

2. SIMPLE CALCULATIONS (one operation):
✓ "What is the difference between X and Y?" (subtraction)
✓ "What is the total of categories A and B?" (addition of 2-3 items)
✓ "What percentage of the total does X represent?" (one division)
→ Requires one simple calculation with clearly visible values

3. SIMPLE TRENDS/PATTERNS (observation):
✓ "Did revenue increase or decrease from Q1 to Q4?"
✓ "Which category grew the most?"
✓ "List all items above/below value X"
→ Requires identifying patterns without complex math

4. SIMPLE RATIOS (when values are obvious):
✓ "How many times larger is A than B?" (when A=200, B=100 → answer: 2)
→ Only when the ratio is simple (2x, 3x, etc.)

AVOID these question types:
✗ Direct single value lookup: "What was the revenue in Q3?"
✗ Multi-step calculations: "What is the average growth rate across all periods?"
✗ Complex aggregations: "What percentage do the top 5 categories represent combined?"
✗ Statistical measures: "What is the variance?" or "What is the correlation?"
✗ Ambiguous questions: "Which shows the most consistent pattern?"

KEY PRINCIPLE: Questions should require examining 2-3 data points and ONE simple operation (compare, add, subtract, or divide).

STEP 1: Analyze the Visual Element Type

The <page-classification> tells you what TYPE of visual content is present. Use this to create appropriate reasoning-based questions.

PRIMARY CATEGORIES and REASONING Question Strategies:

QUANTITATIVE (Charts/Graphs):
- Subcategories: BAR_CHART, LINE_GRAPH, SCATTER_PLOT, PIE_CHART, AREA_GRAPH, HISTOGRAM, BOX_PLOT, HEATMAP, BUBBLE_CHART
- Best question types: numerical (simple calculations), comparisons, multiple choice
- SIMPLE REASONING Examples:
✓ "Which category has the highest value?" (comparison)
✓ "What is the difference between the highest and lowest values?" (one subtraction)
✓ "Did sales increase or decrease from Q1 to Q4?" (simple trend)
✓ "What is the total of the two largest categories?" (simple addition)
✓ "Is Category A greater than Category B?" (simple comparison)
✓ "How many categories have values above 100?" (counting with condition)
- AVOID: Direct lookups ("What was Q3 revenue?"), complex calculations ("What's the average growth rate?")

TABULAR (Tables):
- Subcategories: SIMPLE_TABLE, NESTED_TABLE, PIVOT_TABLE, COMPARISON_TABLE, FINANCIAL_TABLE
- Best question types: numerical (simple calculations), comparisons, filtered lists
- SIMPLE REASONING Examples:
✓ "Which fund has the highest budget?" (comparison)
✓ "What is the total budget of Funds A and B?" (simple addition)
✓ "How many funds have a budget over $1000?" (counting with condition)
✓ "Is Fund A's budget greater than Fund B's?" (simple comparison)
✓ "List all funds with 'Education' as their purpose" (filtering)
✓ "What is the difference between the largest and smallest fund?" (one subtraction)
- AVOID: Direct cell lookups ("What is Fund 01's source?"), complex calculations ("What's the average of all funds meeting multiple conditions?")

LOGIC_DIAGRAMS (Flowcharts/Process):
- Subcategories: FLOWCHART, DECISION_TREE, PROCESS_MAP, ALGORITHM_DIAGRAM, STATE_DIAGRAM, SEQUENCE_DIAGRAM
- Best question types: text (simple path tracing), yes/no, list
- SIMPLE REASONING Examples:
✓ "If condition A is true, what is the next step?" (simple path following)
✓ "How many decision points are shown in the flowchart?" (counting)
✓ "Does the process include step X?" (yes/no)
✓ "What happens immediately after step X?" (one-step trace)
✓ "List all possible outcomes shown" (enumeration)
✓ "Which step comes before the final outcome?" (simple reverse trace)
- AVOID: Complex path analysis ("What sequence of 5 conditions leads to Z?"), multi-hop reasoning

HIERARCHICAL (Org Charts/Trees):
- Subcategories: ORG_CHART, MIND_MAP, TREE_STRUCTURE, TAXONOMY, DENDROGRAM
- Best question types: text (simple relationships), counting, lists
- SIMPLE REASONING Examples:
✓ "How many people directly report to Manager X?" (counting direct connections)
✓ "Who is Manager X's immediate supervisor?" (one-level relationship)
✓ "Which manager has the most direct reports?" (comparison)
✓ "List all people who report directly to the CEO" (enumeration)
✓ "How many levels are in the organizational hierarchy?" (counting layers)
✓ "Is Person A senior to Person B?" (relationship check)
- AVOID: Complex multi-level traversal ("How many total reports including indirect?"), percentage calculations

SPATIAL_RELATIONAL (Maps/Diagrams):
- Subcategories: FLOOR_PLAN, BLUEPRINT, CHOROPLETH_MAP, POINT_MAP, TOPOGRAPHIC_MAP, NETWORK_DIAGRAM
- Best question types: text (simple spatial), yes/no, counting
- SIMPLE REASONING Examples:
✓ "Which room is adjacent to Room X?" (one-step spatial)
✓ "How many rooms are on the first floor?" (counting)
✓ "Is Room A directly connected to Room B?" (yes/no spatial)
✓ "Which area is the largest?" (comparison)
✓ "List all rooms that connect to the hallway" (enumeration)
✓ "What is located north of Building X?" (directional)
- AVOID: Complex path finding ("shortest path through 5 rooms"), density calculations, percentage of area

SCHEMATIC (Technical Diagrams):
- Subcategories: CIRCUIT_DIAGRAM, MECHANICAL_DIAGRAM, ANATOMICAL_DIAGRAM, WIRING_DIAGRAM, PLUMBING_DIAGRAM
- Best question types: text (simple connections), counting, lists
- SIMPLE REASONING Examples:
✓ "What component is directly connected to component X?" (one-step connection)
✓ "How many components are of type X?" (counting)
✓ "Is component A connected to component B?" (yes/no)
✓ "List all components connected to the input" (enumeration)
✓ "Which component has the most connections?" (comparison)
✓ "What is the next component after X in the flow?" (one-step trace)
- AVOID: Path analysis ("all components in signal path"), failure analysis, impedance calculations

INFOGRAPHIC (Visual Narratives):
- Subcategories: TIMELINE, STATISTICAL_INFOGRAPHIC, PROCESS_INFOGRAPHIC, COMPARISON_INFOGRAPHIC
- Best question types: text (simple analysis), comparisons, counting
- SIMPLE REASONING Examples:
✓ "Which year had the most events?" (comparison)
✓ "How many events occurred between Year X and Year Y?" (counting)
✓ "Which category shows the largest value in the comparison?" (simple comparison)
✓ "Did the trend increase or decrease over time?" (direction)
✓ "List all events that occurred after Year X" (filtering)
✓ "Is Category A greater than Category B?" (simple comparison)
- AVOID: Complex calculations ("average time intervals"), growth rates, statistical measures

STEP 2: Match Question Type to Content

NUMERICAL question types (int, float, percentage %):
✓ Use for: QUANTITATIVE charts, TABULAR data with numbers, INFOGRAPHIC with statistics
✓ ALWAYS require calculation, comparison, or aggregation - NEVER direct lookup
✗ NEVER use for: LOGIC_DIAGRAMS, HIERARCHICAL, SCHEMATIC (unless they contain numerical labels)

TEXT question types (short answer, list of items, yes/no):
✓ Use for: TABULAR (if text content), LOGIC_DIAGRAMS, HIERARCHICAL, SPATIAL_RELATIONAL, SCHEMATIC, INFOGRAPHIC
✓ Should require ONE step of reasoning or filtering (e.g., a one-step trace, a filtered list)

MULTIPLE CHOICE:
✓ Good for any category - create options based on calculated or derived values, not direct readings
✓ Options should require the user to perform reasoning to eliminate incorrect choices

NOT ANSWERABLE:
✓ Create questions relevant to the visual element type but whose answer isn't present
✓ Example: For a 2023 revenue table, ask "What percentage increase occurred from 2024 Q1 to Q2?"

STEP 3: Match Complexity to Score

The reasoning_complexity_score (1-10) in <page-classification> indicates the appropriate depth.
IMPORTANT: Keep questions simple and achievable. Most questions should be in the 3-6 range.

- Score 1-3 (Low): Basic comparisons or simple observations
* Examples: "Which category has the highest value?", "Is A greater than B?"
* Requires comparing 2-3 values, no calculation needed

- Score 4-6 (Medium): ONE simple calculation or counting with a condition
* This is the TARGET for most questions - requires one step of reasoning
* Examples:
- "What is the total of A and B?" (simple addition)
- "What is the difference between highest and lowest?" (simple subtraction)
- "How many items are above 100?" (counting with condition)
* Questions should require examining 2-4 data points and ONE simple operation

- Score 7-8 (High): Use sparingly - slightly more complex but still straightforward
* Examples: "What percentage does X represent?" (requires division)
* Only use when the calculation is still simple and unambiguous

- Score 9-10 (Expert): AVOID - Too complex for reliable VLM answering
* Do not create questions requiring: multi-step calculations, averages of many items, growth rates, statistical measures
* These lead to calculation errors and incorrect answers

GENERAL RULE: If you need to do more than ONE calculation step in your head to answer it, the question is too complex.

═══════════════════════════════════════════════════════════════════════════════
CRITICAL: CREATE VERIFIABLE QUESTIONS
═══════════════════════════════════════════════════════════════════════════════

Before finalizing your question, ask yourself:
1. "Can I answer this question clearly by looking at the visual?"
2. "Can I verify if an answer is correct or incorrect?"
3. "Is there a clear, unambiguous correct answer?"

If you cannot easily answer and verify the question yourself, DO NOT use it.

Examples:
✓ GOOD: "Which category has the highest value?"
→ You can look and determine: "Category A = 150, Category B = 120, so Category A is correct"

✓ GOOD: "What is the difference between Product A and Product B?"
→ You can calculate: "Product A = 200, Product B = 150, difference = 50"

✗ BAD: "Which category shows the most consistent growth?"
→ Ambiguous - what does "most consistent" mean? Hard to verify.

✗ BAD: "What is the average of all values shown?"
→ If there are 10+ values, too tedious to verify correctly

✗ BAD: "What percentage of total do the top 5 categories represent?"
→ Requires identifying top 5, summing them, calculating percentage - too many steps to verify reliably

FORBIDDEN - DO NOT CREATE:
✗ Questions answerable by reading a single trivial value (unless complexity score is 1-3)
✗ Character/letter counting
✗ Word counting (unless semantically meaningful)
✗ Font style/size questions
✗ Trivial string manipulation
✗ Color or formatting questions
✗ Generic questions that ignore the visual element type

Question Framing Rules:
1. Create questions SPECIFIC to the visual element type identified in <page-classification>
2. Focus ONLY on the primary visual element (table, chart, diagram, etc.), not surrounding content
3. Do NOT use: "the page", "the image", "the document", "according to"
4. Ask about content directly using action verbs like: "Which", "What is", "How many", "Is"
5. Prefer simple reasoning questions (one comparison or one calculation) over direct lookups
6. Match question difficulty to the reasoning_complexity_score (target: 3-6)
7. CRITICAL: You must be able to answer the question yourself and verify if an answer is correct
8. Ensure questions have clear, unambiguous correct answers
9. Keep questions achievable - avoid ambiguous terms like "most consistent" or "optimal"

═══════════════════════════════════════════════════════════════════════════════
OUTPUT FORMAT INSTRUCTIONS
═══════════════════════════════════════════════════════════════════════════════

Your response MUST contain ONLY the question text.

If the question type is "multiple choices":
- Output the question on the first line
- Output each choice on a separate line, starting with a letter (A., B., C., D.)
- Example:
Which category has the highest value?
A. Category A
B. Category B
C. Category C
D. Category D

For all other question types:
- Output ONLY the question text, nothing else
- Example: What is the difference between Product A and Product B?

DO NOT include any explanations, reasoning, or additional text.\
"""
| 350 | |
| 351 | |
# Prompt for the answer-generation step.
#
# Placeholders filled per record: {{question_type}}, {{page_classification}}
# and {{question}}. The text starts directly after the opening quotes and the
# closing quotes follow the last word, so the template has no leading or
# trailing newline.
PROMPT_ANSWER = """You are an expert at providing accurate, comprehensive answers based on given information.

Your task is to answer the <question> using ONLY the information visible in the image.

<question-type>
{{question_type}}
</question-type>

<page-classification>
{{page_classification}}
</page-classification>

<question>
{{question}}
</question>

Answer Guidelines:
- Base your answer ENTIRELY on the visible content
- Do not make assumptions or add information not present in the visible content
- Use the <page-classification> to understand the content type and provide appropriate answers:
* For QUANTITATIVE/TABULAR with numbers: Perform calculations accurately using visible data
* For LOGIC_DIAGRAMS: Describe process steps or decision flows
* For HIERARCHICAL: Explain relationships or structures
* For SPATIAL_RELATIONAL: Describe locations or spatial relationships
* For SCHEMATIC: Explain component connections or technical details
* For INFOGRAPHIC: Extract key facts or statistics

CRITICAL - For Calculation-Based Questions:
When the question asks you to calculate percentages, ratios, differences, averages, or any derived values:
1. Extract the relevant data points from the visible content
2. Perform the calculation accurately
3. Provide the final answer with appropriate units (%, ratio, currency, etc.)
4. Round percentages to 1-2 decimal places (e.g., "25.5%" or "33.33%")
5. Round decimal numbers to 2-3 significant figures unless the question specifies otherwise

Examples of calculation questions:
- "What percentage of X?" → Calculate: (part/whole) × 100, answer as "XX.X%"
- "What is the ratio of A to B?" → Calculate: A/B, answer as "X:Y" or "X.XX"
- "What is the difference between X and Y?" → Calculate: |X - Y|, answer with units
- "What is the average of X, Y, Z?" → Calculate: (X+Y+Z)/3, answer with units

Special Cases:
- If the question type is "not answerable", respond with "Not answerable"
- For multiple choice questions: Select the correct option based on the visible content (perform calculations if needed)
- For yes/no questions: Respond with "Yes" or "No"
- For list questions: Format your answer as a clear list

Answer Format:
- Provide a direct answer without meta-commentary like "Based on the image..." or "According to the information provided..."
- Answer as if you are directly viewing the content
- Be precise and factual - do not speculate or infer beyond what is explicitly visible
- For numerical answers, include appropriate units and precision"""
| 406 | |
| 407 | |
# Prompt for the question-relevance evaluation step.
#
# Placeholders filled per record: {{question_type}}, {{page_classification}}
# and {{question}}. The model is told to answer with exactly one word,
# "Relevant" or "Irrelevant". The literal is written without line
# continuations, so it has no leading or trailing newline.
PROMPT_QUESTION_RELEVANCE = """You are an expert at evaluating question quality and relevance.

Your task is to determine if the <question> is relevant to the content visible in the image.

<question-type>
{{question_type}}
</question-type>

<page-classification>
{{page_classification}}
</page-classification>

<question>
{{question}}
</question>

═══════════════════════════════════════════════════════════════════════════════
CRITICAL: VERIFY STEP-BY-STEP BEFORE JUDGING
═══════════════════════════════════════════════════════════════════════════════

Do NOT make a snap judgment. Follow this systematic verification process:

STEP 1: Look at the image and identify what visual content is present
- What type of visual element do you see? (table, chart, diagram, etc.)
- What specific data or information is shown?
- What are the main topics, categories, or entities visible?

STEP 2: Analyze what the question is asking about
- What topic or data does the question reference?
- What type of information would be needed to answer it?
- Does the question align with the visual element type from <page-classification>?

STEP 3: Check if the question relates to visible content
- Are the entities/categories mentioned in the question actually present in the image?
- Is the type of data needed to answer visible in the image?
- Does the question make sense for this type of visual?

STEP 4: Make your decision
- Mark "Relevant" if: The question asks about data/content that IS present in the visible image
- Mark "Relevant" if: The question is "not answerable" type AND is about the right domain/topic but specific data is missing
- Mark "Irrelevant" ONLY if: The question asks about content clearly NOT in the image OR is inappropriate for the visual type

═══════════════════════════════════════════════════════════════════════════════
EVALUATION PHILOSOPHY: FOCUS ON CONTENT ALIGNMENT
═══════════════════════════════════════════════════════════════════════════════

After verifying step-by-step, apply these standards:

MARK as "Relevant" when:
- The question asks about data, entities, or topics that ARE visible in the image
- The question type matches the visual element type (charts for quantitative, tables for tabular, etc.)
- For "not answerable" questions: the domain/topic matches but specific data is missing

MARK as "Irrelevant" when:
- The question asks about entities or data clearly NOT present in the image
- The question type is inappropriate for the visual element (e.g., asking about flowchart steps when showing a bar chart)
- The question topic has no connection to the visible content

KEY PRINCIPLE: Verify that the question's subject matter aligns with what's actually visible in the image.

═══════════════════════════════════════════════════════════════════════════════

Detailed Evaluation Guidelines:

1. RELEVANT questions are those that:
✓ Ask about data, entities, or relationships visible in the image
✓ Are appropriate for the visual element type (e.g., asking about values in a chart, rows in a table)
✓ Can potentially be answered from or reasoned about using the visible content
✓ For "not answerable" type: relate to the domain/topic but specific answer is not present

2. IRRELEVANT questions are those that:
✗ Ask about entities, data, or topics completely absent from the image
✗ Are inappropriate for the visual type (e.g., asking about flowchart steps when image shows a bar chart)
✗ Reference information that has nothing to do with the visible content

Examples by Visual Type:

For TABULAR content:
- "What is Fund A's budget?" → Relevant (if Fund A is in the table)
- "Which fund has the highest value?" → Relevant (if funds and values are shown)
- "What is the CEO's salary?" → Irrelevant (if no CEO or salary data visible)

For QUANTITATIVE (Charts):
- "Which category has the highest value?" → Relevant (if categories are shown)
- "What is the total of A and B?" → Relevant (if A and B are in the chart)
- "What was the value in 2025?" → Irrelevant (if only 2020-2023 data shown)

For LOGIC_DIAGRAMS (Flowcharts):
- "What happens after step X?" → Relevant (if step X is in the flowchart)
- "How many decision points are there?" → Relevant (if diagram shows decision points)
- "What is the database schema?" → Irrelevant (if image shows a process flow, not database)

Special Case - "not answerable" questions:
- These should be relevant to the DOMAIN but the specific answer should not be present
- Example: Image shows 2023 revenue table, Question: "What was 2024 Q1 revenue?" → Relevant domain, but answer not present

Your response should be:
- "Relevant" - if the question relates to content visible in the image (DEFAULT choice)
- "Irrelevant" - ONLY if the question is clearly about something not in the image

═══════════════════════════════════════════════════════════════════════════════
OUTPUT FORMAT INSTRUCTIONS
═══════════════════════════════════════════════════════════════════════════════

Your response MUST contain ONLY ONE WORD:
- "Relevant" OR
- "Irrelevant"

DO NOT include any explanations, reasoning, or additional text.
Output ONLY the single word."""
| 520 | |
| 521 | |
| 522 | PROMPT_ANSWER_CORRECTNESS = """\ |
| 523 | You are an expert at evaluating answer accuracy and correctness. |
| 524 | |
| 525 | Your task is to determine if the <answer> reasonably addresses the <question> based on the visible content. |
| 526 | |
| 527 | <question-type> |
| 528 | {{question_type}} |
| 529 | </question-type> |
| 530 | |
| 531 | <page-classification> |
| 532 | {{page_classification}} |
| 533 | </page-classification> |
| 534 | |
| 535 | <question> |
| 536 | {{question}} |
| 537 | </question> |
| 538 | |
| 539 | <answer> |
| 540 | {{answer}} |
| 541 | </answer> |
| 542 | |
| 543 | ═══════════════════════════════════════════════════════════════════════════════ |
| 544 | CRITICAL: VERIFY STEP-BY-STEP BEFORE JUDGING |
| 545 | ═══════════════════════════════════════════════════════════════════════════════ |
| 546 | |
| 547 | Do NOT make a snap judgment. Follow this systematic verification process: |
| 548 | |
| 549 | STEP 1: Understand what the question is asking |
| 550 | - What type of answer is expected? (comparison, calculation, value, list, yes/no) |
| 551 | - What specific information needs to be extracted or computed? |
| 552 | |
| 553 | STEP 2: Look at the visible content and verify the answer yourself |
| 554 | - Identify the relevant data points in the image |
| 555 | - If the question requires calculation, do the calculation yourself |
| 556 | - If the question requires comparison, compare the values yourself |
| 557 | - If the question requires counting or listing, count/list them yourself |
| 558 | |
| 559 | STEP 3: Compare YOUR answer to the PROVIDED answer |
| 560 | - Does the provided answer match what you found? |
| 561 | - Is it in the right ballpark? (within ±5% for numbers) |
| 562 | - Is it semantically equivalent even if worded differently? |
| 563 | - Does it make sense given the data? |
| 564 | |
| 565 | STEP 4: Make your decision |
| 566 | - Mark "Correct" if: Your answer and provided answer align (exact or close enough) |
| 567 | - Mark "Incorrect" ONLY if: Provided answer is clearly wrong based on your verification |
| 568 | |
| 569 | ═══════════════════════════════════════════════════════════════════════════════ |
| 570 | EVALUATION PHILOSOPHY: FOCUS ON SUBSTANTIVE CORRECTNESS |
| 571 | ═══════════════════════════════════════════════════════════════════════════════ |
| 572 | |
| 573 | After verifying step-by-step, apply these standards: |
| 574 | |
| 575 | ACCEPT as "Correct" when: |
| 576 | - The answer is factually accurate based on the visible content |
| 577 | - Numbers are close enough (within ±5% for calculations due to rounding) |
| 578 | - Wording differs but the meaning/value is the same |
| 579 | - Format differs ("25%" vs "0.25" vs "1/4") but represents the same value |
| 580 | |
| 581 | MARK as "Incorrect" when: |
| 582 | - The answer contradicts the visible content |
| 583 | - Numbers are significantly wrong (beyond ±5% tolerance) |
| 584 | - The answer uses wrong data from the image |
| 585 | - The answer doesn't address what was asked |
| 586 | |
| 587 | KEY PRINCIPLE: Distinguish between minor variations (format, rounding) and actual errors (wrong data, wrong calculation). |
| 588 | |
| 589 | ═══════════════════════════════════════════════════════════════════════════════ |
| 590 | OUTPUT FORMAT INSTRUCTIONS |
| 591 | ═══════════════════════════════════════════════════════════════════════════════ |
| 592 | |
| 593 | Your response MUST contain ONLY ONE WORD: |
| 594 | - "Correct" OR |
| 595 | - "Incorrect" |
| 596 | |
| 597 | DO NOT include any explanations, reasoning, or additional text. |
| 598 | Output ONLY the single word. |
| 599 | |
| 600 | ═══════════════════════════════════════════════════════════════════════════════ |
| 601 | |
| 602 | Detailed Evaluation Guidelines: |
| 603 | |
| 604 | 1. FOR "not answerable" QUESTIONS: |
| 605 | - Mark "Correct" if the answer indicates it's not answerable (e.g., "Not answerable", "Cannot be determined", "Information not provided", etc.) |
| 606 | - Only mark "Incorrect" if the answer provides a specific answer when it should say "not answerable", OR if it says "not answerable" but the information is clearly present |
| 607 | |
| 608 | 2. FOR CALCULATION/REASONING QUESTIONS (percentages, ratios, trends, comparisons): |
| 609 | |
| 610 | MARK AS "Correct" IF ANY OF THESE ARE TRUE: |
| 611 | ✓ The answer is in the right ballpark (within ±5% for calculations) |
| 612 | ✓ The answer uses a reasonable interpretation of the data |
| 613 | ✓ The answer shows correct reasoning even if numbers differ slightly |
| 614 | ✓ The answer addresses the question asked, even if format varies |
| 615 | ✓ Minor calculation differences due to rounding (e.g., 33% vs 33.33%) |
| 616 | ✓ Equivalent representations (e.g., "1/4" vs "25%" vs "0.25") |
| 617 | ✓ Different but reasonable ways to express the same concept |
| 618 | |
| 619 | Examples of answers to mark "Correct": |
| 620 | - Question: "What percentage does X represent?" |
| 621 | Answer: "25%" when exact is 24.8% → CORRECT (close enough) |
| 622 | - Question: "What's the ratio of A to B?" |
| 623 | Answer: "about 2 to 1" when exact is 1.9:1 → CORRECT (reasonable) |
| 624 | - Question: "By how much did X increase?" |
| 625 | Answer: "doubled" when exact increase is 95% → CORRECT (reasonable interpretation) |
| 626 | |
| 627 | ONLY MARK AS "Incorrect" IF: |
| 628 | ✗ The answer is wildly off (e.g., says 80% when it's actually 20%) |
| 629 | ✗ The answer clearly misidentifies what data to use |
| 630 | ✗ The answer provides a specific value when asked for a calculation but didn't calculate |
| 631 | ✗ The answer is completely unrelated to what was asked |
| 632 | |
| 633 | 3. FOR EXTRACTION QUESTIONS (specific values, items from lists): |
| 634 | |
| 635 | MARK AS "Correct" IF: |
| 636 | ✓ The answer matches the visible content |
| 637 | ✓ Minor wording differences that don't change meaning |
| 638 | ✓ The answer captures the key information even if not word-for-word |
| 639 | |
| 640 | ONLY MARK AS "Incorrect" IF: |
| 641 | ✗ The answer states information not present in the visible content |
| 642 | ✗ The answer contradicts what's visible |
| 643 | |
| 644 | 4. FOR COMPARISON/ANALYSIS QUESTIONS: |
| 645 | |
| 646 | MARK AS "Correct" IF: |
| 647 | ✓ The answer shows reasonable analysis of the visible content |
| 648 | ✓ The conclusion is defensible based on the data |
| 649 | ✓ The reasoning makes sense even if you might analyze it differently |
| 650 | |
| 651 | ONLY MARK AS "Incorrect" IF: |
| 652 | ✗ The conclusion clearly contradicts the visible data |
| 653 | ✗ The reasoning is fundamentally flawed |
| 654 | |
| 655 | 5. FOR MULTIPLE CHOICE QUESTIONS: |
| 656 | |
| 657 | MARK AS "Correct" IF: |
| 658 | ✓ The selected option is correct or defensible |
| 659 | |
| 660 | MARK AS "Incorrect" IF: |
| 661 | ✗ The selected option is clearly wrong |
| 662 | |
| 663 | 6. FOR YES/NO QUESTIONS: |
| 664 | |
| 665 | MARK AS "Correct" IF: |
| 666 | ✓ The yes/no answer is reasonable based on visible content |
| 667 | |
| 668 | MARK AS "Incorrect" IF: |
| 669 | ✗ The yes/no answer clearly contradicts visible content\ |
| 670 | """ |
| 671 | |
| 672 | |
| 673 | # ============================================================================= |
| 674 | # Pipeline configuration |
| 675 | # ============================================================================= |
| 676 | |
| 677 | |
def build_config(
    seed_path: str = "seed.parquet",
    model_alias: str = "qwen-vl",
    model_id: str = DEFAULT_VLM_MODEL,
) -> dd.DataDesignerConfigBuilder:
    """Build the Data Designer configuration for the visual QA pipeline.

    Registers a single model config served through the ``VLLM_PROVIDER_NAME``
    provider, attaches the seed dataset with ordered sampling, and adds five
    columns in this order: a category-sampled ``question_type`` followed by
    the LLM text columns ``question``, ``answer``, ``question_relevance`` and
    ``answer_correctness``. Every LLM column uses the same model alias and
    ``IMAGE_CONTEXT`` as its multi-modal context; only ``answer`` sets
    ``extract_reasoning_content=True``.

    Args:
        seed_path: Path to the seed file handed to ``dd.LocalFileSeedSource``.
        model_alias: Alias the model is registered under and that every LLM
            column references.
        model_id: Model identifier passed to ``dd.ModelConfig``.

    Returns:
        The populated config builder.
    """
    inference_params = dd.ChatCompletionInferenceParams(
        timeout=1200,
        max_tokens=40000,
        max_parallel_requests=32,
    )
    vlm_config = dd.ModelConfig(
        alias=model_alias,
        model=model_id,
        provider=VLLM_PROVIDER_NAME,
        inference_parameters=inference_params,
    )
    builder = dd.DataDesignerConfigBuilder(model_configs=[vlm_config])

    seed_source = dd.LocalFileSeedSource(path=seed_path)
    builder.with_seed_dataset(seed_source, sampling_strategy=dd.SamplingStrategy.ORDERED)

    # Question type -> sampling weight; insertion order fixes the order of
    # the values/weights lists handed to the category sampler.
    question_type_weights = {
        "multiple choices": 0.05,
        "yes or no": 0.1,
        "number, word, phrase, short sentence (string), list of items (int, string, float or mixed)": 2,
        "not answerable": 0.01,
    }
    question_type_params = dd.CategorySamplerParams(
        values=list(question_type_weights.keys()),
        weights=list(question_type_weights.values()),
    )
    builder.add_column(
        dd.SamplerColumnConfig(
            name="question_type",
            sampler_type=dd.SamplerType.CATEGORY,
            params=question_type_params,
        )
    )

    # (column name, prompt template, extra keyword arguments), in the order
    # the columns are added to the builder.
    llm_column_specs = (
        ("question", PROMPT_QUESTION, {}),
        ("answer", PROMPT_ANSWER, {"extract_reasoning_content": True}),
        ("question_relevance", PROMPT_QUESTION_RELEVANCE, {}),
        ("answer_correctness", PROMPT_ANSWER_CORRECTNESS, {}),
    )
    for column_name, prompt_template, extra_kwargs in llm_column_specs:
        builder.add_column(
            dd.LLMTextColumnConfig(
                name=column_name,
                model_alias=model_alias,
                prompt=prompt_template,
                multi_modal_context=IMAGE_CONTEXT,
                **extra_kwargs,
            )
        )

    return builder
| 757 | |
| 758 | |
def create_dataset(
    config_builder: dd.DataDesignerConfigBuilder,
    num_records: int,
    vllm_endpoint: str,
    artifact_path: Path | str | None = None,
) -> DatasetCreationResults:
    """Run the configured pipeline against a vLLM endpoint.

    Creates a ``DataDesigner`` with one model provider named
    ``VLLM_PROVIDER_NAME`` that points at ``vllm_endpoint``, applies a run
    config with ``progress_bar=True`` and ``disable_early_shutdown=True``,
    and creates a dataset named ``"visual_qa"``.

    Args:
        config_builder: Pipeline configuration to execute.
        num_records: Number of records requested from ``DataDesigner.create``.
        vllm_endpoint: Endpoint of the vLLM server used for the provider.
        artifact_path: Artifact location forwarded to ``DataDesigner``;
            ``None`` is passed through unchanged.

    Returns:
        The results object returned by ``DataDesigner.create``.
    """
    vllm_provider = dd.ModelProvider(name=VLLM_PROVIDER_NAME, endpoint=vllm_endpoint)
    designer = DataDesigner(
        artifact_path=artifact_path,
        model_providers=[vllm_provider],
    )

    run_config = dd.RunConfig(progress_bar=True, disable_early_shutdown=True)
    designer.set_run_config(run_config)

    return designer.create(config_builder, num_records=num_records, dataset_name="visual_qa")
| 778 | |
| 779 | |
if __name__ == "__main__":
    # Script entry point: parse the CLI options, build the pipeline config,
    # generate the dataset, print where it was saved, then call to_report()
    # on the loaded analysis.
    from argparse import ArgumentParser

    # (flag, add_argument keyword arguments), registered in this order.
    option_specs = [
        (
            "--vllm-endpoint",
            {
                "type": str,
                "required": True,
                "help": "Base URL of the vLLM server hosting the VLM (e.g. http://localhost:8000/v1)",
            },
        ),
        ("--seed-path", {"type": str, "required": True, "help": "Path to the seed parquet file"}),
        ("--model-alias", {"type": str, "default": "qwen-vl"}),
        ("--model-id", {"type": str, "default": DEFAULT_VLM_MODEL}),
        ("--num-records", {"type": int, "default": 5}),
        ("--artifact-path", {"type": str, "default": None}),
    ]

    parser = ArgumentParser()
    for flag, option_kwargs in option_specs:
        parser.add_argument(flag, **option_kwargs)
    args = parser.parse_args()

    config_builder = build_config(
        seed_path=args.seed_path,
        model_alias=args.model_alias,
        model_id=args.model_id,
    )
    results = create_dataset(
        config_builder,
        num_records=args.num_records,
        vllm_endpoint=args.vllm_endpoint,
        artifact_path=args.artifact_path,
    )

    dataset_path = results.artifact_storage.final_dataset_path
    print(f"Dataset saved to: {dataset_path}")

    analysis = results.load_analysis()
    analysis.to_report()
|