Multi-Page Windowed QA
Multi-Page Windowed QA
Multi-Page Windowed QA
1 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 # SPDX-License-Identifier: Apache-2.0 3 # /// script 4 # requires-python = ">=3.10" 5 # dependencies = [ 6 # "data-designer>=0.5.6", 7 # ] 8 # /// 9 """Long-Document Understanding Multi-Page Windowed QA Recipe 10 11 Generate multi-page question-answer pairs from a sliding window of consecutive 12 PDF pages. Each question requires combining information from at least 2 pages 13 within the window, with strong anchoring so it remains unambiguous when 14 collated into a full-document training sample. 15 16 For each seed record the pipeline: 17 18 1. Samples a question type (multiple choice, yes/no, string, layout, 19 numerical int/float/percentage, list, not answerable) 20 2. Generates a question that requires examining 2+ pages within the window 21 3. Generates an answer with chain-of-thought reasoning (captured separately) 22 4. Evaluates overall quality including multi-page requirement, anchor quality, 23 answer correctness, reasoning thoroughness, and format compliance (0/1/2) 24 25 Prerequisites: 26 - A seed parquet file containing: 27 * `png_images_base64` – JSON array of base64-encoded PNGs for the 28 pages in each window (produced by 01-seed-dataset-preparation.py as 29 ``seed_windowed.parquet``). 30 - A vLLM-compatible deployment of the VLM 31 (default: Qwen/Qwen3-VL-235B-A22B-Thinking-FP8). 32 Recommended vLLM launch flags: 33 --tensor-parallel-size 4 34 --max-model-len 50000 35 --gpu-memory-utilization 0.90 36 --reasoning-parser deepseek_r1 37 --limit-mm-per-prompt '{"video": 0}' 38 --trust-remote-code 39 40 Example launch script for 4× H100: 41 docker run --gpus all \ 42 -p 8000:8000 \ 43 vllm/vllm-openai:latest \ 44 --model Qwen/Qwen3-VL-235B-A22B-Thinking-FP8 \ 45 --tensor-parallel-size 4 \ 46 --max-model-len 50000 \ 47 --gpu-memory-utilization 0.90 \ 48 --reasoning-parser deepseek_r1 \ 49 --limit-mm-per-prompt '{"video": 0}' \ 50 --trust-remote-code 51 52 Run: 53 # Basic usage (generates 5 records by default) 54 uv run 07-multi-page-windowed-qa-sdg.py --vllm-endpoint http://localhost:8000/v1 --seed-path seed_data/seed_windowed.parquet 55 56 # Custom model and record count 57 uv run 07-multi-page-windowed-qa-sdg.py --vllm-endpoint http://localhost:8000/v1 --seed-path seed_data/seed_windowed.parquet --num-records 100 58 59 # For help message and available options 60 uv run 07-multi-page-windowed-qa-sdg.py --help 61 """ 62 63 from pathlib import Path 64 65 import data_designer.config as dd 66 from data_designer.interface import DataDesigner, DatasetCreationResults 67 68 DEFAULT_VLM_MODEL = "Qwen/Qwen3-VL-235B-A22B-Thinking-FP8" 69 VLLM_PROVIDER_NAME = "vllm" 70 71 72 def _inference_params(model_id: str, reasoning: bool = True) -> dd.ChatCompletionInferenceParams: 73 """Select inference parameters based on model and reasoning mode.""" 74 if "Qwen/Qwen3.5-397B-A17B" in model_id: 75 extra_body = { 76 "temperature": 0.6, 77 "top_p": 0.95, 78 "top_k": 20, 79 "min_p": 0.0, 80 "presence_penalty": 0.0, 81 "repetition_penalty": 1.0, 82 } 83 temperature = 0.6 84 top_p = 0.95 85 elif "Qwen/Qwen3.5-122B-A10B" in model_id: 86 if reasoning: 87 extra_body = { 88 "temperature": 1.0, 89 "top_p": 0.95, 90 "top_k": 20, 91 "min_p": 0.0, 92 "presence_penalty": 1.5, 93 "repetition_penalty": 1.0, 94 } 95 temperature = 1.0 96 top_p = 0.95 97 else: 98 extra_body = { 99 "temperature": 0.7, 100 "top_p": 0.8, 101 "top_k": 20, 102 "min_p": 0.0, 103 "presence_penalty": 1.5, 104 "repetition_penalty": 1.0, 105 } 106 temperature = 1.0 107 top_p = 0.95 108 else: 109 extra_body = { 110 "top_k": 20, 111 "min_p": 0.0, 112 "presence_penalty": 1.5, 113 "repetition_penalty": 1.0, 114 } 115 temperature = 1.0 116 top_p = 0.95 117 118 return dd.ChatCompletionInferenceParams( 119 timeout=1200, 120 temperature=temperature, 121 top_p=top_p, 122 max_parallel_requests=32, 123 extra_body=extra_body, 124 ) 125 126 127 # ============================================================================= 128 # Image context helper 129 # ============================================================================= 130 131 IMAGE_CONTEXT = [ 132 dd.ImageContext( 133 column_name="png_images_base64", 134 data_type=dd.ModalityDataType.BASE64, 135 image_format=dd.ImageFormat.PNG, 136 ), 137 ] 138 139 # ============================================================================= 140 # Prompt templates 141 # ============================================================================= 142 143 PROMPT_QUESTION = """ 144 <question-type> 145 {{ question_type }} 146 </question-type> 147 148 You are given images of a WINDOW of 2-16 consecutive pages from a longer PDF. Create one 149 question of the given <question-type> that can only be answered by examining these pages. 150 151 Before finalizing the question, verify that EVERY fact needed to answer it is contained inside 152 the current window. If any required fact is outside the window, reject that question idea and 153 choose another. 154 155 At training time your question is collated with ALL pages of the full document, so every 156 reference must be unambiguous within the full document (see GROUNDING below). 157 158 CORE RULES 159 - The question MUST require information from at least 2 different pages. 2-4 evidence pages 160 is the sweet spot — focus on DEPTH of reasoning (computation, comparison, lookup chain) 161 rather than breadth (touching many pages). Do not force artificial connections just to span 162 more pages. 163 - Prefer questions where the model must continue scanning AFTER the first relevant page. 164 - If multiple pages in the window share the same template/layout (repeated profiles, repeated 165 entries, repeated charts/cards/tables), prefer questions that require exhaustive scanning 166 across all matching pages. 167 - Reject questions that can be answered correctly by looking only at the first matching page. 168 - Prefer tables, charts, figures, and infographics over plain text. 169 - Do NOT include the answer in the question. ONLY output the question text. 170 - Do NOT mention <question-type> or the page window. The trainee sees the full document. 171 - At training time the trainee sees the FULL document (potentially 50-100+ pages), not your 172 window. Strong anchoring (page numbers, element titles) is critical so the trainee can 173 LOCATE the 2-3 relevant pages among many. 174 - Reject questions where a plausible but wrong shortcut answer exists on the first relevant page 175 unless the question wording explicitly forces the model to use the later page(s) as well. 176 177 WINDOW-INTERNAL REASONING CHAIN (critical) 178 Before writing the question, identify which page(s) in the current window provide: 179 - the lookup key / entity / subgroup 180 - the target value 181 - any denominator or comparison value 182 - any final filtering or aggregation criterion 183 Prefer questions where this chain is explicit and distributed across 2-4 pages. 184 Good patterns: 185 - page A identifies the entity, page B gives the metric 186 - page A gives a percentage, page B gives a sample size, answer is a count 187 - page A and page B show the same metric for different groups/years, answer is a difference 188 - page A contains the first half of a sentence/table/figure and page B contains the completion 189 190 HIGH-VALUE QUESTION TARGETS (based on model failure analysis) 191 These question types expose the biggest model weaknesses — prioritize them: 192 - VISUAL PERCEPTION: questions about icon colors, line colors in charts, small labels on maps, 193 visual groupings separated by brackets/braces (e.g., "What are the four Business Analytics 194 activities in the chart titled 'Levels of Analytics'?" where the model must read the correct 195 section divider). 196 - COUNTING across pages: "How many X on pages N through M?" where X are scattered small 197 elements (map markers, icons, figures, organizations). The model undercounts by ~2x, so these 198 questions are high-value training signal. Require objective, unambiguous counting criteria. 199 - CROSS-PAGE COMPUTATION: financial ratios, sums, or comparisons requiring values from different 200 pages/tables (e.g., inventory turnover from Income Statement + Balance Sheet). 201 - INFOGRAPHIC SPATIAL: binding numbers/labels to their correct spatial region on maps, 202 flowcharts, or diagrams (the model confuses which number belongs to which region). 203 - LOOKUP CHAIN / CROSS-PAGE BINDING: page A identifies the correct entity/series/subgroup, 204 page B provides the target value, and optionally page C provides a denominator or comparison. 205 - EXHAUSTIVE MULTI-PAGE AGGREGATION: repeated page layouts where the answer requires scanning 206 every matching page in the window (cover-page models, museum entries, FAQ cards, chart panels, 207 guidebook cards), not stopping after the first hit. 208 - PAGE-BREAK CONTINUATION: the answer requires continuing a sentence, paragraph, table row, 209 or figure explanation from one page to the next. 210 211 QUALIFIER FIDELITY (critical) 212 - If multiple nearby answers exist, the question MUST include the qualifier that makes the target unique. 213 - Prefer qualifiers like: strongly / somewhat / overall / net, displayed / shown vs listed / mentioned, 214 exact row / column / year / fiscal year / subgroup / legend item. 215 - The question must not be answerable by selecting a nearby but broader fact. 216 217 GROUNDING & ANCHORING (critical) 218 Anchor priority (use the first available): 219 1. Page number: "On page 42, ..." — read the PRINTED page number from the image. 220 2. Numbered element: "In Table 3 on page 42, ..." / "In Figure 7, ..." 221 3. Element title: "In the chart titled 'X', ..." 222 4. Named statement: "In the Consolidated Balance Sheets, ..." 223 5. Section heading: "In the section titled 'Methodology', ..." 224 6. Structural fallback: "In the bar chart with y-axis 'Revenue ($M)', ..." 225 BANNED — these are ambiguous in the full document: 226 "the document/report/paper/slides" without anchor; 227 "the table/chart/figure" without title or page number; 228 "across the pages" / "in the provided pages". 229 For charts: always use FULL TITLE + distinguishing axis/column. If a chart has a "Change" 230 column, reference it explicitly. For maps/infographics: bind numbers to their labeled regions. 231 232 QUESTION-TYPE TEMPLATES (use the pages and random number {{ range(1, 1001) | random }} in choosing) 233 234 {% if "not answerable" in question_type %} 235 Create a question relevant to the visible window whose answer is NOT present anywhere in this window. 236 IMPORTANT: 237 - The question must be rejectable using ONLY the current window. 238 - Do NOT create a "not answerable" question only because the required page is outside the window. 239 - Build near-miss negatives by changing exactly ONE required qualifier from a visible multi-page fact: 240 - wrong year/date 241 - wrong subgroup/series 242 - wrong legend item 243 - wrong row/column 244 - wrong position 245 - wrong displayed-vs-mentioned relation 246 - wrong denominator/base requirement 247 Templates: 248 - "Using Table 2 on page X and the chart on page Y, what is [METRIC] for [YEAR not shown in either page]?" 249 - "In Figure N on page X and its continuation on page Y, what is [ATTRIBUTE not labeled anywhere in the window]?" 250 - "Across pages X-Y, which [ENTITY] satisfies [CONDITION not met by any item in the visible window]?" 251 252 {% elif "numerical" in question_type %} 253 Use visible numbers from tables/charts/text. Require arithmetic or counting across pages. 254 {% if "int" in question_type %} 255 Add "Answer with an integer." to the question. 256 Templates: 257 - "How many organisations are introduced in detail (at least one paragraph) on pages X through Y?" [count across pages] 258 - "In Figure N, how many distinct icons/nodes/colors are shown?" [visual counting] 259 - "How many charts on pages X through Y have their horizontal axis set as year?" [cross-page chart counting] 260 - "What is the sum of [METRIC] in Table A on page X and Table B on page Y?" 261 - "How many rows in Table N on page X have [COLUMN] above [THRESHOLD]?" 262 - "In the map on page X, how many [MARKERS/SYMBOLS] are shown?" 263 - "How many [SMALL ELEMENTS: WC markers, logos, footnote symbols] appear on pages X through Y?" [dense scattered counting]" 264 - "Across pages X-Y that share the same template/layout, how many entries satisfy [CONDITION]?" [exhaustive repeated-layout counting] 265 - "Using the percentage on page X and the sample size on page Y, how many [GROUP] does that correspond to? Round to the nearest hundred and answer with an integer." [cross-page denominator binding] 266 - "How many total [ITEMS] are introduced across pages X-Y? Count every numbered entry exactly once." [avoid early stopping] 267 {% elif "float" in question_type %} 268 Specify rounding (e.g., "Round to two decimal places."). 269 Templates: 270 - "What is inventory turnover (Cost of Sales / Average Inventory) for FY2021? Use the Income Statement on page X and Balance Sheet on page Y. Round to two decimal places." 271 - "What is the sum of the two smallest file sizes in the table on page X?" 272 - "How much did [METRIC] change between Table A on page X and Table B on page Y?" 273 - "What is the ratio of [CELL in Table A] to [CELL in Table B]?" 274 {% elif "percentage" in question_type %} 275 Add "Answer with a % sign.". 276 Templates: 277 - "What is the percentage difference between [GROUP A] and [GROUP B] in the chart titled 'X'?" 278 - "What percentage of [ENTITY] have [ATTRIBUTE] according to Tables on pages X and Y?" 279 - "In the chart titled 'X', how much higher is [SERIES A] than [SERIES B] in [YEAR]?" 280 - "What is [METRIC A on page X] as a percentage of [METRIC B on page Y]?" 281 - "Using the count on page X and the total on page Y, what percentage does this represent? Answer with a % sign." [cross-page denominator binding] 282 - "Using the values on pages X and Y, what is the percentage-point difference between [SERIES A] and [SERIES B]? Answer with a % sign." [cross-page comparison] 283 {% endif %} 284 285 {% elif "list" in question_type %} 286 Answer should be 2-8 short items. Add "Return a JSON array of strings, e.g., ["A", "B"]." 287 Do not list the items in the question. 288 The question itself must be a natural language sentence — NEVER output a JSON array as the question. 289 Templates: 290 - "What are the uses of [SYSTEM] described in the section titled 'X' on pages N-M?" [text list] 291 - "In the chart titled 'X', what are the four [CATEGORY] activities?" [chart label list] 292 - "In Figure N's demonstration, what are the colors of the nodes that appear in more than one cluster?" [visual list] 293 - "List all items in Table N on page X that meet [CONDITION]." [filtered table list] 294 - "What are the [FIELD] values for [ENTITY] in Tables on pages X and Y?" [cross-page list] 295 - "What are the colors of the icons that perform [ACTION A] and [ACTION B] on page X?" [UI element list] 296 - "Across pages X-Y that share the same layout, which entries satisfy [CONDITION]? Return a JSON array of strings." [exhaustive repeated-layout list] 297 - "Across pages X-Y, which schools/colleges/sections use a [QUALIFIER] cover-page model? Return a JSON array of strings." [avoid early stopping after first hit] 298 299 {% elif "yes" in question_type %} 300 Templates: 301 - "In Table A on page X, is [METRIC for ENTITY A] greater than [METRIC for ENTITY B] in Table B on page Y?" 302 - "In the chart titled 'X', was [SERIES A] higher than [SERIES B] in [YEAR]?" 303 - "Does the table on page X have more rows than the table on page Y?" 304 305 {% elif "multiple choice" in question_type %} 306 Provide exactly 4 options (A-D), plausible and mutually exclusive. 307 Templates: 308 - "In the chart titled 'X', which [ENTITY] has the highest [METRIC]? A. ... B. ... C. ... D. ..." 309 - "Based on Table A (page X) and Table B (page Y), which statement is true? A. ... B. ... C. ... D. ..." 310 311 {% elif "string:" in question_type %} 312 Answer is a word, phrase, or short sentence. 313 Templates: 314 - "In the chart titled 'X', in the 'Change' column, which subgroup shows the largest increase?" [chart with derived column] 315 - "In Figure N on page X, which element has the highest/lowest [METRIC]?" [figure reading] 316 - "In the world map on page X, which region has the largest number of [ENTITY]?" [infographic spatial] 317 - "Using the definition in Section 'X' on page A and the data in Table N on page B, what is [DERIVED FACT]?" [cross-page reasoning] 318 - "In Table N, which [ENTITY] has [SUPERLATIVE] [ATTRIBUTE]?" [table lookup] 319 - "What is the paper's full title referenced in Table N on page X for the method with [ATTRIBUTE]?" [cross-reference] 320 - "Using the statement on page X and its continuation on page Y, what is the missing/full condition?" [page-break continuation] 321 - "Using the chart on page X to identify the subgroup and the table on page Y to find its count, what is the resulting rounded answer?" [lookup chain] 322 323 {% elif "layout" in question_type %} 324 Answer requires understanding visual/spatial structure. Answer is a number, word, or phrase. 325 Templates: 326 - "What range does [COLOR] represent in the legend of the chart titled 'X' on page N?" [legend reading] 327 - "What is the URL/email in the [COLOR] box on page N?" [spatial anchor] 328 - "In Figure N on page X, which nodes are connected to node [LABEL]?" [graph structure] 329 - "What text appears inside the [COLOR/POSITION] box on page N?" [spatial extraction] 330 - "In the flowchart in Figure N, what step follows [LABEL]?" [process flow] 331 - "On page X, what is the heading directly above Table/Figure N?" [structural navigation] 332 - "What are the colors of the icons for [ACTION A] and [ACTION B] on page N?" [icon color perception] 333 - "In the chart titled 'X', which group of activities is labeled [SECTION] (above/below the bracket)?" [visual grouping] 334 {% endif %} 335 336 These templates are for inspiration. Create a question specific to the actual visible content. 337 ONLY output the question text, nothing else. 338 """ 339 340 341 PROMPT_ANSWER = """\ 342 <question-type> 343 {{ question_type }} 344 </question-type> 345 346 <question> 347 {{ question }} 348 </question> 349 350 You are given images of pages extracted from a PDF document. Answer using ONLY information visible in the pages. 351 352 You MUST use this exact output structure: 353 <think> 354 [all reasoning here] 355 </think> 356 [bare final answer here — no explanation, no labels, no extra text] 357 358 In your THINKING (inside <think> tags), follow this thinking protocol. 359 360 QUALIFIER LOCK (critical) 361 Before extracting any answer, copy the restrictive qualifiers from the question and keep them fixed: 362 - page / section / table / figure / chart identity 363 - year / date / fiscal year 364 - subgroup / series / legend item 365 - exact metric (count vs percentage vs percentage-point difference) 366 - displayed / shown / visible vs listed / mentioned 367 - first / second / last / nearest / highest / lowest 368 369 Do NOT substitute a nearby year, nearby subgroup, nearby series, nearby row, or nearby fact. 370 If the question asks for a specific subgroup or metric, read exactly that one and no other. 371 372 THINKING PROTOCOL — scan-locate-synthesize (follow in order) 373 374 Use PRINTED page numbers (e.g., "page 42"), never ordinal positions ("image 1", "the first page"). 375 Your reasoning trace becomes training data for a model that sees the full document — ordinals are meaningless there. 376 377 1) PARSE: Decompose the question into concrete lookup targets. 378 2) SCAN: First, read the PRINTED page number from each page (usually at the top or bottom 379 margin) and build a mapping. Then note relevant elements: 380 "Page 31 (printed at bottom): Income Statement. Page 59 (printed at bottom): Balance Sheets." 381 If a page has no printed number, use its heading instead: "Untitled page with 'Methodology' heading." 382 Use ONLY these printed page numbers for the rest of your reasoning — never "image 1" etc. 383 3) LOCATE: State where each target was found: 384 "Target A found on page 31 in the Income Statement." 385 "Target B found on page 59 in the Balance Sheets." 386 Keep track of each target. 387 4) MATCH: Match elements by TITLE or CAPTION, not position. 388 - Charts: match TITLE + axis labels. If a chart has a "Change" column, read it directly. 389 - Tables: match by caption/heading (e.g., "Consolidated Balance Sheets"). 390 - Maps/infographics: match each number to its spatially closest labeled region. 391 - Visual groupings: charts may use brackets or dividers to separate sections 392 (e.g., "Business Analytics" vs "Business Intelligence") — read the correct group. 393 - Colors: look carefully at actual colors of icons, lines, and legends. 394 Do NOT assume a page is grayscale. Describe what you see before answering. 395 396 PAGE-BREAK CONTINUATION 397 If a sentence, paragraph, table row, caption, or figure explanation appears to continue onto the next page, 398 combine the text before deciding the answer is missing. 399 400 5) READ: Extract values from the matched element. 401 - Tables: correct column for the fiscal year; parentheses = negative; check unit scale. 402 - Charts: read the exact value from labels, not approximations. 403 404 UNIT DISCIPLINE 405 - Preserve units exactly when present or requested (%, $, million, etc.). 406 - Financial: parentheses = negative; check table header for unit scale; "how much higher" = positive. 407 408 6) SYNTHESIZE: Combine information from multiple pages. Show the cross-page chain: 409 "From page 31: Cost of Sales = 24,576. From page 59: Avg Inventory = 7,110.5. 410 Turnover = 24,576 / 7,110.5 = 3.46." 411 412 UNIT DISCIPLINE 413 - Preserve units exactly when present or requested (%, $, million, etc.). 414 - Financial reports: parentheses = negative; check table header for unit scale (e.g., "In millions"); 415 "how much higher/more" = positive number; "change" = positive for increase, negative for decrease. 416 417 COUNT / PERCENT / DENOMINATOR DISCIPLINE 418 - If one page provides a percentage and another page provides a sample size, the percentage is NOT the final answer until it is converted using the sample size. 419 - Distinguish carefully between count, percentage, percentage-point difference, and ratio. 420 - Only round after the final computation, never before. 421 422 EXHAUSTIVE REPEATED-LAYOUT SCAN 423 If multiple pages in the window share the same layout or template, scan ALL matching pages before concluding. 424 Do not stop after the first valid hit. 425 For counts/lists, maintain a running page-by-page tally or item list until the last relevant page in the window. 426 Bad: "I see items 14-29 on pages 28-31. Count = 29 - 14 + 1 = 16." (stopped early, missed pages 32-34) 427 Good: "Page 28: items 14-21 (running total: 8). Page 30: items 22-29 (running total: 16). Page 32: items 30-37 (running total: 24). Page 34: items 38-44 (running total: 31). Final count: 31." 428 429 7) VERIFY: Re-check your extraction against the pages. 430 - COUNTING (models undercount by ~2x): count PER PAGE with a running total. 431 Enumerate each item explicitly. Re-scan for missed items on each page. 432 - LISTS: re-check each page for missed items. 433 - VALUES: re-read the specific cell/label to confirm. 434 435 THINKING STABILITY (critical) 436 - Follow the protocol once from top to bottom. Do NOT restart from step 1 after you already found the relevant pages. 437 - Do at most one scan pass and one verification pass. 438 - Be concise: go directly to pages matching the question's anchor. Do NOT describe every page — 439 skip irrelevant pages silently. Only mention pages that contain evidence. 440 - If there are two plausible candidates, compare them once using the question's qualifiers, choose the best-supported one, and continue. Do NOT keep generating new alternatives. 441 - Do NOT repeat the same scan, recount, or conclusion more than once. 442 - As soon as the answer is found and verified, stop thinking and produce the final answer. 443 - Do NOT use filler loops such as repeating a phrase, title, entity name, or page reference many times. 444 - If you have a complete answer supported by the required pages and qualifiers, commit to it. Do not reopen the search. 445 446 REASONING TRACE REQUIREMENTS 447 In your reasoning, always: 448 - Use the printed page numbers from your SCAN mapping. NEVER write "image 1", "image 2", 449 "the first/second/third page", or any ordinal reference. These will be rejected. 450 - Cite element titles: "In the Income Statement on page 31" not "in the table". 451 - Quote the specific values you extracted. 452 - For multi-page synthesis, explicitly connect values from different pages. 453 - For computations, show the formula with named sources and page numbers. 454 455 REFUSAL POLICY 456 {% if "not answerable" in question_type %} 457 - Output exactly: Not answerable 458 - Only if the page(s) lack the exact requested qualifier. 459 - In your thinking, briefly name what is missing. 460 Bad: "The information is not available in the provided pages." 461 Good: "Page 31 shows the Balance Sheet for 2020 and 2021, but the question asks for 2018. Year 2018 is not present on any page." 462 Good: "Pages 10-11 list subsections A, B, C, but none match 'Generative Retrieval'. Subsection missing." 463 {% else %} 464 - NEVER output "Not answerable", "Cannot determine", "Fail to answer", or any refusal. 465 - If the question uses a term (e.g., "receive turnover") that does not appear verbatim, 466 look for the underlying data needed to compute it. The term not appearing does not mean 467 the answer is unavailable. 468 - If you found partial evidence, provide what you found — partial > refusal. 469 - If the answer requires inference or computation from visible data, do it. 470 {% endif %} 471 472 FINAL ANSWER: 473 - Put ALL reasoning inside <think>...</think>. 474 - After </think>, output ONLY the final answer. 475 - Do NOT repeat reasoning outside <think> tags. 476 - Do NOT output protocol labels, explanations, or extra text after </think>. 477 478 OUTPUT FORMAT 479 {% if "multiple choice" in question_type %} 480 - Output: "<LETTER>. <option text>", e.g., "B. 92%" 481 {% elif "yes" in question_type %} 482 - Output exactly "Yes" or "No". 483 {% elif "numerical (percentage)" in question_type %} 484 - Output a number WITH a percent sign, e.g., "29%". 485 {% elif "numerical (int)" in question_type %} 486 - Output an integer only (digits, optional commas). 487 {% elif "numerical (float)" in question_type %} 488 - Output a decimal number only, unless the question requests a unit. 489 {% elif "string:" in question_type %} 490 - Output a short phrase/sentence only. 491 {% elif "layout" in question_type %} 492 - Output only the extracted content (string/number/list). 493 {% elif "list" in question_type %} 494 - Output a JSON array on ONE line: ["gray", "red"]. 495 - NEVER use comma-separated plain text. ALWAYS use ["..."] syntax. 496 {% elif "not answerable" in question_type %} 497 - Output exactly: Not answerable 498 {% else %} 499 - Output a short, direct answer. 500 {% endif %} 501 """ 502 503 504 PROMPT_QUALITY_SCORE = """\ 505 <question-type> 506 {{ question_type }} 507 </question-type> 508 509 <question> 510 {{ question }} 511 </question> 512 513 <answer> 514 {{ answer }} 515 </answer> 516 517 <answer_reasoning_trace> 518 {{ answer__reasoning_content }} 519 </answer_reasoning_trace> 520 521 You are given images of pages extracted from a PDF document. Evaluate the QUALITY of this (question, answer) pair. 522 523 Be STRICT. Any check failure => score 0. 524 525 CHECKS 526 527 1. DOCUMENT QUALITY: Are the document pages clear, readable, and not low quality? 528 529 2. RELEVANCE: Is the question relevant to the content visible in the pages? 530 {% if "not answerable" in question_type %}For "not answerable" questions, the question should be relevant but the answer must NOT be present anywhere in the visible window. 531 Score 0 if the question is unanswerable ONLY because it refers to a page outside the window or the whole document. The question must be a near-miss negative where a specific qualifier (year, subgroup, row, region, etc.) is absent from the visible pages.{% endif %} 532 533 3. ANSWER CORRECTNESS: Is the answer correct given the pages? 534 {% if "not answerable" in question_type %}The correct answer should be "Not answerable".{% endif %} 535 536 4. QUESTION QUALITY: Is the question challenging, unambiguous, and well-formed? 537 - If the question includes the answer, give a score of 0. 538 - The question must genuinely require information from multiple pages. 539 - Score 0 if a plausible shortcut answer exists on the first relevant page and the question does not force use of the later page(s). 540 541 5. FORMAT COMPLIANCE: Does the answer match the expected format? 542 - Score 0 if the answer contains <think> tags, reasoning steps, protocol labels, explanations, or anything beyond the bare final result. 543 {% if "multiple choice" in question_type %} 544 - Answer must be exactly "<LETTER>. <option text>" (A-D). Reject digit-only ("2") or letter-only ("B"). 545 {% elif "yes" in question_type %} 546 - Answer must be exactly "Yes" or "No". 547 {% elif "numerical (percentage)" in question_type %} 548 - Answer MUST include a percent sign (e.g., "29%"). 549 {% elif "numerical (int)" in question_type %} 550 - Answer must be an integer only (digits, optional commas), no extra words. 551 {% elif "numerical (float)" in question_type %} 552 - Answer must be a decimal number only, no extra words. 553 {% elif "list" in question_type %} 554 - Answer must be a ONE-LINE JSON array (e.g., ["gray", "red"]). 555 - Score 0 if comma-separated text instead of JSON array. 556 {% elif "not answerable" in question_type %} 557 - Answer must be exactly "Not answerable". 558 {% else %} 559 - Answer MUST NOT be "Not answerable" or any refusal. 560 {% endif %} 561 562 6. ANCHOR QUALITY (critical — questions are generated from a page window but collated into 563 the FULL document at training time. Vague references become ambiguous in the full document): 564 - Score 0 if the question uses any of these vague phrases: 565 "the document", "the report", "the paper", "the slides", "the table" (unqualified), 566 "the chart" (unqualified), "across the pages", "in the provided pages". 567 - Score 0 if the <answer_reasoning_trace> does not use explicit anchors such as page numbers, chart/table titles, or section headings. 568 Ordinal positions ("image 1", "the first page") are not allowed as anchors. 569 - Score 0 if the question lacks a specific anchor. Must have at least one of: 570 page number, chart/table title, numbered element (Table 3, Figure 7), named financial 571 statement, or section heading that uniquely identifies the target in the full document. 572 - Ask: "If a reader saw the full 50-page document, would they know EXACTLY which 573 table/chart/section this question refers to?" If not, score 0. 574 575 7. MULTI-PAGE REQUIREMENT: 576 - The answer must genuinely require information from at least 2 different pages. 577 - Score 0 if the answer can be determined from a single page alone. 578 579 8. REASONING QUALITY (critical — reasoning is used as chain-of-thought training data): 580 Score 0 if any of the following hold: 581 - The reasoning does NOT use explicit anchors such as page numbers, chart/table titles, or section headings. 582 - It uses ordinal references like "image 1", "the first page", etc. 583 - It does NOT quote the specific values extracted. 584 - If computation is required, it does NOT show the operation with named sources. 585 - It repeats the same scan, recount, candidate answer, page reference, title, entity name, or conclusion without adding new evidence. 586 - It restarts the reasoning process after already finding the relevant page(s) or elements. 587 - It contains obvious loop markers such as repeated "Wait, let me", "Actually", or "Let's look again". 588 - It keeps generating new alternatives after already having enough evidence to answer. 589 - It ends in an unfinished or truncated way, or appears to stop mid-thought. 590 - For count/list questions, it does not maintain a page-by-page tally or explicit item list. 591 - For repeated-layout or repeated-entry questions, it stops after the first valid hit instead of scanning all matching pages in the visible window. 592 - For cross-page computations, it does not clearly distinguish which page provides the key, target value, denominator, or comparison value. 593 - It confuses count vs percentage vs percentage-point difference vs ratio, or rounds before the final computation rather than after it. 594 595 9. VISUAL PERCEPTION (if applicable): 596 - If the question asks about colors, icons, or small visual elements, verify the answer 597 correctly describes what is visible. Score 0 if the answer claims the page is grayscale 598 when it is in color, or misidentifies a visual grouping. 599 - If the question involves counting scattered elements (markers, icons, figures across pages), 600 verify the count is plausible. Score 0 if the count is clearly too low (e.g., 3 when many 601 more are visible) or if the reasoning doesn't enumerate items. 602 603 SCORING 604 - Score 0: Any check fails. 605 - Score 1: All checks pass. 606 - Score 2: All checks pass AND the question involves at least one high-value signal: visual 607 perception (icon colors, chart groupings), counting scattered elements, financial computation 608 across statements, chart disambiguation, infographic spatial reasoning, or cross-page 609 table operation. Must still be unambiguous. 610 611 Respond with ONLY the score as a single digit: 0, 1, or 2. 612 """ 613 614 615 # ============================================================================= 616 # Pipeline configuration 617 # ============================================================================= 618 619 620 def build_config( 621 seed_path: str = "seed.parquet", 622 model_alias: str = "vl", 623 model_id: str = DEFAULT_VLM_MODEL, 624 reasoning: bool = True, 625 ) -> dd.DataDesignerConfigBuilder: 626 model_configs = [ 627 dd.ModelConfig( 628 alias=model_alias, 629 model=model_id, 630 provider=VLLM_PROVIDER_NAME, 631 inference_parameters=_inference_params(model_id, reasoning=reasoning), 632 ), 633 ] 634 635 config_builder = dd.DataDesignerConfigBuilder(model_configs=model_configs) 636 637 config_builder.with_seed_dataset( 638 dd.LocalFileSeedSource(path=seed_path), 639 sampling_strategy=dd.SamplingStrategy.SHUFFLE, 640 ) 641 642 config_builder.add_column( 643 dd.SamplerColumnConfig( 644 name="question_type", 645 sampler_type=dd.SamplerType.CATEGORY, 646 params=dd.CategorySamplerParams( 647 values=[ 648 "multiple choice", 649 "yes or no", 650 "string: word, phrase or short sentence", 651 "layout", 652 "numerical (int)", 653 "numerical (float)", 654 "numerical (percentage)", 655 "list of items (int, string, float or mixed)", 656 "not answerable", 657 ], 658 weights=[0.025, 0.025, 2, 2, 2, 2, 2, 2, 0.2], 659 ), 660 ) 661 ) 662 663 config_builder.add_column( 664 dd.LLMTextColumnConfig( 665 name="question", 666 model_alias=model_alias, 667 prompt=PROMPT_QUESTION, 668 multi_modal_context=IMAGE_CONTEXT, 669 ) 670 ) 671 672 config_builder.add_column( 673 dd.LLMTextColumnConfig( 674 name="answer", 675 model_alias=model_alias, 676 prompt=PROMPT_ANSWER, 677 multi_modal_context=IMAGE_CONTEXT, 678 extract_reasoning_content=True, 679 ) 680 ) 681 682 config_builder.add_column( 683 dd.LLMTextColumnConfig( 684 name="quality_score", 685 model_alias=model_alias, 686 prompt=PROMPT_QUALITY_SCORE, 687 multi_modal_context=IMAGE_CONTEXT, 688 ) 689 ) 690 691 return config_builder 692 693 694 def create_dataset( 695 config_builder: dd.DataDesignerConfigBuilder, 696 num_records: int, 697 vllm_endpoint: str, 698 artifact_path: Path | str | None = None, 699 ) -> DatasetCreationResults: 700 model_providers = [ 701 dd.ModelProvider( 702 name=VLLM_PROVIDER_NAME, 703 endpoint=vllm_endpoint, 704 ), 705 ] 706 data_designer = DataDesigner( 707 artifact_path=artifact_path, 708 model_providers=model_providers, 709 ) 710 data_designer.set_run_config(dd.RunConfig(progress_bar=True, disable_early_shutdown=True)) 711 results = data_designer.create(config_builder, num_records=num_records, dataset_name="multi_page_windowed_qa") 712 return results 713 714 715 if __name__ == "__main__": 716 from argparse import ArgumentParser 717 718 parser = ArgumentParser() 719 parser.add_argument( 720 "--vllm-endpoint", 721 type=str, 722 required=True, 723 help="Base URL of the vLLM server hosting the VLM (e.g. http://localhost:8000/v1)", 724 ) 725 parser.add_argument("--seed-path", type=str, required=True, help="Path to the seed parquet file") 726 parser.add_argument("--model-alias", type=str, default="vl") 727 parser.add_argument("--model-id", type=str, default=DEFAULT_VLM_MODEL) 728 parser.add_argument("--num-records", type=int, default=5) 729 parser.add_argument( 730 "--reasoning", 731 action="store_true", 732 default=True, 733 help="Use reasoning-mode inference parameters (default: True)", 734 ) 735 parser.add_argument( 736 "--no-reasoning", 737 dest="reasoning", 738 action="store_false", 739 help="Use non-reasoning inference parameters", 740 ) 741 parser.add_argument("--artifact-path", type=str, default=None) 742 args = parser.parse_args() 743 744 config_builder = build_config( 745 seed_path=args.seed_path, 746 model_alias=args.model_alias, 747 model_id=args.model_id, 748 reasoning=args.reasoning, 749 ) 750 results = create_dataset( 751 config_builder, 752 num_records=args.num_records, 753 vllm_endpoint=args.vllm_endpoint, 754 artifact_path=args.artifact_path, 755 ) 756 757 print(f"Dataset saved to: {results.artifact_storage.final_dataset_path}") 758 759 results.load_analysis().to_report()