For AI agents: a documentation index is available at the root level at /llms.txt and /llms-full.txt. Append /llms.txt to any URL for a page-level index, or .md for the markdown version of any page.
  • Getting Started
    • Welcome
    • Contributing
  • Concepts
    • Columns
    • Seed Datasets
    • Agent Rollout Ingestion
    • Custom Columns
    • Validators
    • Processors
    • Person Sampling
    • Traces
    • Architecture & Performance
    • Deployment Options
    • Security
  • Tutorials
    • Overview
    • The Basics
    • Structured Outputs, Jinja Expressions, and Conditional Generation
    • Seeding with an External Dataset
    • Providing Images as Context
    • Generating Images
    • Image-to-Image Editing
  • Recipes
    • Recipe Cards
      • Seed Dataset Preparation
      • Nemotron Parse OCR
      • Text QA from OCR Transcripts
      • Page Classification
      • Visual QA
      • Single-Page QA
      • Multi-Page Windowed QA
      • Whole-Document QA
      • Frontier Judge QA Filter
  • Plugins
    • Overview
    • Example Plugin
    • FileSystemSeedReader Plugins
    • Discover
  • Code Reference
    • Overview
  • Dev Notes
    • Overview
    • Prompt Sensitivity
    • Retriever SDG Toolkit
    • Have It Your Way
    • VLM Long Document Understanding
    • Push Datasets to Hugging Face Hub
    • Text-to-SQL for Nemotron Super
    • Async All the Way Down
    • Owning the Model Stack
NVIDIA
Developer-friendly docs for your API
Privacy Policy | Manage My Privacy | Do Not Sell or Share My Data | Terms of Service | Accessibility | Corporate Policies | Product Security | Contact

Copyright © 2026, NVIDIA Corporation.

NeMo Data Designer
Recipes > VLM Long-Document Understanding

Multi-Page Windowed QA

View as Markdown
Previous

Single-Page QA

Next

Whole-Document QA

Download Recipe

Download the complete recipe script

1# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2# SPDX-License-Identifier: Apache-2.0
3# /// script
4# requires-python = ">=3.10"
5# dependencies = [
6# "data-designer>=0.5.6",
7# ]
8# ///
9"""Long-Document Understanding Multi-Page Windowed QA Recipe
10
11Generate multi-page question-answer pairs from a sliding window of consecutive
12PDF pages. Each question requires combining information from at least 2 pages
13within the window, with strong anchoring so it remains unambiguous when
14collated into a full-document training sample.
15
16For each seed record the pipeline:
17
18 1. Samples a question type (multiple choice, yes/no, string, layout,
19 numerical int/float/percentage, list, not answerable)
20 2. Generates a question that requires examining 2+ pages within the window
21 3. Generates an answer with chain-of-thought reasoning (captured separately)
22 4. Evaluates overall quality including multi-page requirement, anchor quality,
23 answer correctness, reasoning thoroughness, and format compliance (0/1/2)
24
25Prerequisites:
26 - A seed parquet file containing:
27 * `png_images_base64` – JSON array of base64-encoded PNGs for the
28 pages in each window (produced by 01-seed-dataset-preparation.py as
29 ``seed_windowed.parquet``).
30 - A vLLM-compatible deployment of the VLM
31 (default: Qwen/Qwen3-VL-235B-A22B-Thinking-FP8).
32 Recommended vLLM launch flags:
33 --tensor-parallel-size 4
34 --max-model-len 50000
35 --gpu-memory-utilization 0.90
36 --reasoning-parser deepseek_r1
37 --limit-mm-per-prompt '{"video": 0}'
38 --trust-remote-code
39
40 Example launch script for 4× H100:
41 docker run --gpus all \
42 -p 8000:8000 \
43 vllm/vllm-openai:latest \
44 --model Qwen/Qwen3-VL-235B-A22B-Thinking-FP8 \
45 --tensor-parallel-size 4 \
46 --max-model-len 50000 \
47 --gpu-memory-utilization 0.90 \
48 --reasoning-parser deepseek_r1 \
49 --limit-mm-per-prompt '{"video": 0}' \
50 --trust-remote-code
51
52Run:
53 # Basic usage (generates 5 records by default)
54 uv run 07-multi-page-windowed-qa-sdg.py --vllm-endpoint http://localhost:8000/v1 --seed-path seed_data/seed_windowed.parquet
55
56 # Custom record count
57 uv run 07-multi-page-windowed-qa-sdg.py --vllm-endpoint http://localhost:8000/v1 --seed-path seed_data/seed_windowed.parquet --num-records 100
58
59 # For help message and available options
60 uv run 07-multi-page-windowed-qa-sdg.py --help
61"""
62
63from pathlib import Path
64
65import data_designer.config as dd
66from data_designer.interface import DataDesigner, DatasetCreationResults
67
# Default VLM identifier; matches the deployment recommended in the module docstring.
DEFAULT_VLM_MODEL = "Qwen/Qwen3-VL-235B-A22B-Thinking-FP8"
# NOTE(review): presumably the provider name the vLLM endpoint is registered under — confirm at the call site.
VLLM_PROVIDER_NAME = "vllm"
70
71
def _inference_params(model_id: str, reasoning: bool = True) -> dd.ChatCompletionInferenceParams:
    """Select inference parameters based on model and reasoning mode.

    The model is matched by substring on ``model_id``:

    * ``Qwen/Qwen3.5-397B-A17B``: temperature 0.6, top_p 0.95, presence
      penalty 0.0. ``reasoning`` is not consulted for this model.
    * ``Qwen/Qwen3.5-122B-A10B``: temperature 1.0 / top_p 0.95 when
      ``reasoning`` is true, otherwise temperature 0.7 / top_p 0.8; presence
      penalty 1.5 in both modes.
    * anything else: temperature 1.0, top_p 0.95, presence penalty 1.5, and
      the temperature/top_p values are not repeated inside ``extra_body``.

    Args:
        model_id: Model identifier used to pick the branch above.
        reasoning: Whether the model is run in reasoning mode.

    Returns:
        A ``dd.ChatCompletionInferenceParams`` built with ``timeout=1200``,
        ``max_parallel_requests=32`` and the selected sampling settings.
    """
    if "Qwen/Qwen3.5-397B-A17B" in model_id:
        temperature, top_p, presence_penalty = 0.6, 0.95, 0.0
        mirror_in_extra_body = True
    elif "Qwen/Qwen3.5-122B-A10B" in model_id:
        # Previously the non-reasoning values (0.7 / 0.8) were only placed in
        # extra_body while the top-level fields kept the reasoning values
        # (1.0 / 0.95); both places now carry the same numbers.
        temperature, top_p = (1.0, 0.95) if reasoning else (0.7, 0.8)
        presence_penalty = 1.5
        mirror_in_extra_body = True
    else:
        temperature, top_p, presence_penalty = 1.0, 0.95, 1.5
        mirror_in_extra_body = False

    extra_body = {}
    if mirror_in_extra_body:
        # Filled from the same variables as the top-level fields so the two
        # copies of the sampling settings cannot drift apart.
        extra_body["temperature"] = temperature
        extra_body["top_p"] = top_p
    extra_body.update(
        top_k=20,
        min_p=0.0,
        presence_penalty=presence_penalty,
        repetition_penalty=1.0,
    )

    return dd.ChatCompletionInferenceParams(
        timeout=1200,
        temperature=temperature,
        top_p=top_p,
        max_parallel_requests=32,
        extra_body=extra_body,
    )
125
126
127# =============================================================================
128# Image context helper
129# =============================================================================
130
# Image context built from the ``png_images_base64`` seed column, declared as
# base64-encoded PNG data. Per the module docstring that column holds a JSON
# array with one image per page of the window.
IMAGE_CONTEXT = [
    dd.ImageContext(
        column_name="png_images_base64",
        data_type=dd.ModalityDataType.BASE64,
        image_format=dd.ImageFormat.PNG,
    ),
]
138
139# =============================================================================
140# Prompt templates
141# =============================================================================
142
143PROMPT_QUESTION = """
144<question-type>
145{{ question_type }}
146</question-type>
147
148You are given images of a WINDOW of 2-16 consecutive pages from a longer PDF. Create one
149question of the given <question-type> that can only be answered by examining these pages.
150
151Before finalizing the question, verify that EVERY fact needed to answer it is contained inside
152the current window. If any required fact is outside the window, reject that question idea and
153choose another.
154
155At training time your question is collated with ALL pages of the full document, so every
156reference must be unambiguous within the full document (see GROUNDING below).
157
158CORE RULES
159- The question MUST require information from at least 2 different pages. 2-4 evidence pages
160 is the sweet spot — focus on DEPTH of reasoning (computation, comparison, lookup chain)
161 rather than breadth (touching many pages). Do not force artificial connections just to span
162 more pages.
163- Prefer questions where the model must continue scanning AFTER the first relevant page.
164- If multiple pages in the window share the same template/layout (repeated profiles, repeated
165 entries, repeated charts/cards/tables), prefer questions that require exhaustive scanning
166 across all matching pages.
167- Reject questions that can be answered correctly by looking only at the first matching page.
168- Prefer tables, charts, figures, and infographics over plain text.
169- Do NOT include the answer in the question. ONLY output the question text.
170- Do NOT mention <question-type> or the page window. The trainee sees the full document.
171- At training time the trainee sees the FULL document (potentially 50-100+ pages), not your
172 window. Strong anchoring (page numbers, element titles) is critical so the trainee can
173 LOCATE the 2-3 relevant pages among many.
174- Reject questions where a plausible but wrong shortcut answer exists on the first relevant page
175 unless the question wording explicitly forces the model to use the later page(s) as well.
176
177WINDOW-INTERNAL REASONING CHAIN (critical)
178Before writing the question, identify which page(s) in the current window provide:
179- the lookup key / entity / subgroup
180- the target value
181- any denominator or comparison value
182- any final filtering or aggregation criterion
183Prefer questions where this chain is explicit and distributed across 2-4 pages.
184Good patterns:
185- page A identifies the entity, page B gives the metric
186- page A gives a percentage, page B gives a sample size, answer is a count
187- page A and page B show the same metric for different groups/years, answer is a difference
188- page A contains the first half of a sentence/table/figure and page B contains the completion
189
190HIGH-VALUE QUESTION TARGETS (based on model failure analysis)
191These question types expose the biggest model weaknesses — prioritize them:
192- VISUAL PERCEPTION: questions about icon colors, line colors in charts, small labels on maps,
193 visual groupings separated by brackets/braces (e.g., "What are the four Business Analytics
194 activities in the chart titled 'Levels of Analytics'?" where the model must read the correct
195 section divider).
196- COUNTING across pages: "How many X on pages N through M?" where X are scattered small
197 elements (map markers, icons, figures, organizations). The model undercounts by ~2x, so these
198 questions are high-value training signal. Require objective, unambiguous counting criteria.
199- CROSS-PAGE COMPUTATION: financial ratios, sums, or comparisons requiring values from different
200 pages/tables (e.g., inventory turnover from Income Statement + Balance Sheet).
201- INFOGRAPHIC SPATIAL: binding numbers/labels to their correct spatial region on maps,
202 flowcharts, or diagrams (the model confuses which number belongs to which region).
203- LOOKUP CHAIN / CROSS-PAGE BINDING: page A identifies the correct entity/series/subgroup,
204 page B provides the target value, and optionally page C provides a denominator or comparison.
205- EXHAUSTIVE MULTI-PAGE AGGREGATION: repeated page layouts where the answer requires scanning
206 every matching page in the window (cover-page models, museum entries, FAQ cards, chart panels,
207 guidebook cards), not stopping after the first hit.
208- PAGE-BREAK CONTINUATION: the answer requires continuing a sentence, paragraph, table row,
209 or figure explanation from one page to the next.
210
211QUALIFIER FIDELITY (critical)
212- If multiple nearby answers exist, the question MUST include the qualifier that makes the target unique.
213- Prefer qualifiers like: strongly / somewhat / overall / net, displayed / shown vs listed / mentioned,
214 exact row / column / year / fiscal year / subgroup / legend item.
215- The question must not be answerable by selecting a nearby but broader fact.
216
217GROUNDING & ANCHORING (critical)
218Anchor priority (use the first available):
219 1. Page number: "On page 42, ..." — read the PRINTED page number from the image.
220 2. Numbered element: "In Table 3 on page 42, ..." / "In Figure 7, ..."
221 3. Element title: "In the chart titled 'X', ..."
222 4. Named statement: "In the Consolidated Balance Sheets, ..."
223 5. Section heading: "In the section titled 'Methodology', ..."
224 6. Structural fallback: "In the bar chart with y-axis 'Revenue ($M)', ..."
225BANNED — these are ambiguous in the full document:
226 "the document/report/paper/slides" without anchor;
227 "the table/chart/figure" without title or page number;
228 "across the pages" / "in the provided pages".
229For charts: always use FULL TITLE + distinguishing axis/column. If a chart has a "Change"
230column, reference it explicitly. For maps/infographics: bind numbers to their labeled regions.
231
232QUESTION-TYPE TEMPLATES (use the pages and random number {{ range(1, 1001) | random }} in choosing)
233
234{% if "not answerable" in question_type %}
235Create a question relevant to the visible window whose answer is NOT present anywhere in this window.
236IMPORTANT:
237- The question must be rejectable using ONLY the current window.
238- Do NOT create a "not answerable" question only because the required page is outside the window.
239- Build near-miss negatives by changing exactly ONE required qualifier from a visible multi-page fact:
240 - wrong year/date
241 - wrong subgroup/series
242 - wrong legend item
243 - wrong row/column
244 - wrong position
245 - wrong displayed-vs-mentioned relation
246 - wrong denominator/base requirement
247Templates:
248 - "Using Table 2 on page X and the chart on page Y, what is [METRIC] for [YEAR not shown in either page]?"
249 - "In Figure N on page X and its continuation on page Y, what is [ATTRIBUTE not labeled anywhere in the window]?"
250 - "Across pages X-Y, which [ENTITY] satisfies [CONDITION not met by any item in the visible window]?"
251
252{% elif "numerical" in question_type %}
253Use visible numbers from tables/charts/text. Require arithmetic or counting across pages.
254{% if "int" in question_type %}
255Add "Answer with an integer." to the question.
256Templates:
257 - "How many organisations are introduced in detail (at least one paragraph) on pages X through Y?" [count across pages]
258 - "In Figure N, how many distinct icons/nodes/colors are shown?" [visual counting]
259 - "How many charts on pages X through Y have their horizontal axis set as year?" [cross-page chart counting]
260 - "What is the sum of [METRIC] in Table A on page X and Table B on page Y?"
261 - "How many rows in Table N on page X have [COLUMN] above [THRESHOLD]?"
262 - "In the map on page X, how many [MARKERS/SYMBOLS] are shown?"
263 - "How many [SMALL ELEMENTS: WC markers, logos, footnote symbols] appear on pages X through Y?" [dense scattered counting]
264 - "Across pages X-Y that share the same template/layout, how many entries satisfy [CONDITION]?" [exhaustive repeated-layout counting]
265 - "Using the percentage on page X and the sample size on page Y, how many [GROUP] does that correspond to? Round to the nearest hundred and answer with an integer." [cross-page denominator binding]
266 - "How many total [ITEMS] are introduced across pages X-Y? Count every numbered entry exactly once." [avoid early stopping]
267{% elif "float" in question_type %}
268Specify rounding (e.g., "Round to two decimal places.").
269Templates:
270 - "What is inventory turnover (Cost of Sales / Average Inventory) for FY2021? Use the Income Statement on page X and Balance Sheet on page Y. Round to two decimal places."
271 - "What is the sum of the two smallest file sizes in the table on page X?"
272 - "How much did [METRIC] change between Table A on page X and Table B on page Y?"
273 - "What is the ratio of [CELL in Table A] to [CELL in Table B]?"
274{% elif "percentage" in question_type %}
275Add "Answer with a % sign.".
276Templates:
277 - "What is the percentage difference between [GROUP A] and [GROUP B] in the chart titled 'X'?"
278 - "What percentage of [ENTITY] have [ATTRIBUTE] according to Tables on pages X and Y?"
279 - "In the chart titled 'X', how much higher is [SERIES A] than [SERIES B] in [YEAR]?"
280 - "What is [METRIC A on page X] as a percentage of [METRIC B on page Y]?"
281 - "Using the count on page X and the total on page Y, what percentage does this represent? Answer with a % sign." [cross-page denominator binding]
282 - "Using the values on pages X and Y, what is the percentage-point difference between [SERIES A] and [SERIES B]? Answer with a % sign." [cross-page comparison]
283{% endif %}
284
285{% elif "list" in question_type %}
286Answer should be 2-8 short items. Add "Return a JSON array of strings, e.g., ["A", "B"]."
287Do not list the items in the question.
288The question itself must be a natural language sentence — NEVER output a JSON array as the question.
289Templates:
290 - "What are the uses of [SYSTEM] described in the section titled 'X' on pages N-M?" [text list]
291 - "In the chart titled 'X', what are the four [CATEGORY] activities?" [chart label list]
292 - "In Figure N's demonstration, what are the colors of the nodes that appear in more than one cluster?" [visual list]
293 - "List all items in Table N on page X that meet [CONDITION]." [filtered table list]
294 - "What are the [FIELD] values for [ENTITY] in Tables on pages X and Y?" [cross-page list]
295 - "What are the colors of the icons that perform [ACTION A] and [ACTION B] on page X?" [UI element list]
296 - "Across pages X-Y that share the same layout, which entries satisfy [CONDITION]? Return a JSON array of strings." [exhaustive repeated-layout list]
297 - "Across pages X-Y, which schools/colleges/sections use a [QUALIFIER] cover-page model? Return a JSON array of strings." [avoid early stopping after first hit]
298
299{% elif "yes" in question_type %}
300Templates:
301 - "In Table A on page X, is [METRIC for ENTITY A] greater than [METRIC for ENTITY B] in Table B on page Y?"
302 - "In the chart titled 'X', was [SERIES A] higher than [SERIES B] in [YEAR]?"
303 - "Does the table on page X have more rows than the table on page Y?"
304
305{% elif "multiple choice" in question_type %}
306Provide exactly 4 options (A-D), plausible and mutually exclusive.
307Templates:
308 - "In the chart titled 'X', which [ENTITY] has the highest [METRIC]? A. ... B. ... C. ... D. ..."
309 - "Based on Table A (page X) and Table B (page Y), which statement is true? A. ... B. ... C. ... D. ..."
310
311{% elif "string:" in question_type %}
312Answer is a word, phrase, or short sentence.
313Templates:
314 - "In the chart titled 'X', in the 'Change' column, which subgroup shows the largest increase?" [chart with derived column]
315 - "In Figure N on page X, which element has the highest/lowest [METRIC]?" [figure reading]
316 - "In the world map on page X, which region has the largest number of [ENTITY]?" [infographic spatial]
317 - "Using the definition in Section 'X' on page A and the data in Table N on page B, what is [DERIVED FACT]?" [cross-page reasoning]
318 - "In Table N, which [ENTITY] has [SUPERLATIVE] [ATTRIBUTE]?" [table lookup]
319 - "What is the paper's full title referenced in Table N on page X for the method with [ATTRIBUTE]?" [cross-reference]
320 - "Using the statement on page X and its continuation on page Y, what is the missing/full condition?" [page-break continuation]
321 - "Using the chart on page X to identify the subgroup and the table on page Y to find its count, what is the resulting rounded answer?" [lookup chain]
322
323{% elif "layout" in question_type %}
324Answer requires understanding visual/spatial structure. Answer is a number, word, or phrase.
325Templates:
326 - "What range does [COLOR] represent in the legend of the chart titled 'X' on page N?" [legend reading]
327 - "What is the URL/email in the [COLOR] box on page N?" [spatial anchor]
328 - "In Figure N on page X, which nodes are connected to node [LABEL]?" [graph structure]
329 - "What text appears inside the [COLOR/POSITION] box on page N?" [spatial extraction]
330 - "In the flowchart in Figure N, what step follows [LABEL]?" [process flow]
331 - "On page X, what is the heading directly above Table/Figure N?" [structural navigation]
332 - "What are the colors of the icons for [ACTION A] and [ACTION B] on page N?" [icon color perception]
333 - "In the chart titled 'X', which group of activities is labeled [SECTION] (above/below the bracket)?" [visual grouping]
334{% endif %}
335
336These templates are for inspiration. Create a question specific to the actual visible content.
337ONLY output the question text, nothing else.
338"""
339
340
341PROMPT_ANSWER = """\
342<question-type>
343{{ question_type }}
344</question-type>
345
346<question>
347{{ question }}
348</question>
349
350You are given images of pages extracted from a PDF document. Answer using ONLY information visible in the pages.
351
352You MUST use this exact output structure:
353<think>
354[all reasoning here]
355</think>
356[bare final answer here — no explanation, no labels, no extra text]
357
358In your THINKING (inside <think> tags), follow this thinking protocol.
359
360QUALIFIER LOCK (critical)
361Before extracting any answer, copy the restrictive qualifiers from the question and keep them fixed:
362- page / section / table / figure / chart identity
363- year / date / fiscal year
364- subgroup / series / legend item
365- exact metric (count vs percentage vs percentage-point difference)
366- displayed / shown / visible vs listed / mentioned
367- first / second / last / nearest / highest / lowest
368
369Do NOT substitute a nearby year, nearby subgroup, nearby series, nearby row, or nearby fact.
370If the question asks for a specific subgroup or metric, read exactly that one and no other.
371
372THINKING PROTOCOL — scan-locate-synthesize (follow in order)
373
374Use PRINTED page numbers (e.g., "page 42"), never ordinal positions ("image 1", "the first page").
375Your reasoning trace becomes training data for a model that sees the full document — ordinals are meaningless there.
376
3771) PARSE: Decompose the question into concrete lookup targets.
3782) SCAN: First, read the PRINTED page number from each page (usually at the top or bottom
379 margin) and build a mapping. Then note relevant elements:
380 "Page 31 (printed at bottom): Income Statement. Page 59 (printed at bottom): Balance Sheets."
381 If a page has no printed number, use its heading instead: "Untitled page with 'Methodology' heading."
382 Use ONLY these printed page numbers for the rest of your reasoning — never "image 1" etc.
3833) LOCATE: State where each target was found:
384 "Target A found on page 31 in the Income Statement."
385 "Target B found on page 59 in the Balance Sheets."
386 Keep track of each target.
3874) MATCH: Match elements by TITLE or CAPTION, not position.
388 - Charts: match TITLE + axis labels. If a chart has a "Change" column, read it directly.
389 - Tables: match by caption/heading (e.g., "Consolidated Balance Sheets").
390 - Maps/infographics: match each number to its spatially closest labeled region.
391 - Visual groupings: charts may use brackets or dividers to separate sections
392 (e.g., "Business Analytics" vs "Business Intelligence") — read the correct group.
393 - Colors: look carefully at actual colors of icons, lines, and legends.
394 Do NOT assume a page is grayscale. Describe what you see before answering.
395
396PAGE-BREAK CONTINUATION
397If a sentence, paragraph, table row, caption, or figure explanation appears to continue onto the next page,
398combine the text before deciding the answer is missing.
399
4005) READ: Extract values from the matched element.
401 - Tables: correct column for the fiscal year; parentheses = negative; check unit scale.
402 - Charts: read the exact value from labels, not approximations.
403
404UNIT DISCIPLINE
405- Preserve units exactly when present or requested (%, $, million, etc.).
406- Financial: parentheses = negative; check table header for unit scale; "how much higher" = positive.
407
4086) SYNTHESIZE: Combine information from multiple pages. Show the cross-page chain:
409 "From page 31: Cost of Sales = 24,576. From page 59: Avg Inventory = 7,110.5.
410 Turnover = 24,576 / 7,110.5 = 3.46."
411
412UNIT DISCIPLINE
413- Preserve units exactly when present or requested (%, $, million, etc.).
414- Financial reports: parentheses = negative; check table header for unit scale (e.g., "In millions");
415 "how much higher/more" = positive number; "change" = positive for increase, negative for decrease.
416
417COUNT / PERCENT / DENOMINATOR DISCIPLINE
418- If one page provides a percentage and another page provides a sample size, the percentage is NOT the final answer until it is converted using the sample size.
419- Distinguish carefully between count, percentage, percentage-point difference, and ratio.
420- Only round after the final computation, never before.
421
422EXHAUSTIVE REPEATED-LAYOUT SCAN
423If multiple pages in the window share the same layout or template, scan ALL matching pages before concluding.
424Do not stop after the first valid hit.
425For counts/lists, maintain a running page-by-page tally or item list until the last relevant page in the window.
426Bad: "I see items 14-29 on pages 28-31. Count = 29 - 14 + 1 = 16." (stopped early, missed pages 32-34)
427Good: "Page 28: items 14-21 (running total: 8). Page 30: items 22-29 (running total: 16). Page 32: items 30-37 (running total: 24). Page 34: items 38-44 (running total: 31). Final count: 31."
428
4297) VERIFY: Re-check your extraction against the pages.
430 - COUNTING (models undercount by ~2x): count PER PAGE with a running total.
431 Enumerate each item explicitly. Re-scan for missed items on each page.
432 - LISTS: re-check each page for missed items.
433 - VALUES: re-read the specific cell/label to confirm.
434
435THINKING STABILITY (critical)
436- Follow the protocol once from top to bottom. Do NOT restart from step 1 after you already found the relevant pages.
437- Do at most one scan pass and one verification pass.
438- Be concise: go directly to pages matching the question's anchor. Do NOT describe every page —
439 skip irrelevant pages silently. Only mention pages that contain evidence.
440- If there are two plausible candidates, compare them once using the question's qualifiers, choose the best-supported one, and continue. Do NOT keep generating new alternatives.
441- Do NOT repeat the same scan, recount, or conclusion more than once.
442- As soon as the answer is found and verified, stop thinking and produce the final answer.
443- Do NOT use filler loops such as repeating a phrase, title, entity name, or page reference many times.
444- If you have a complete answer supported by the required pages and qualifiers, commit to it. Do not reopen the search.
445
446REASONING TRACE REQUIREMENTS
447In your reasoning, always:
448- Use the printed page numbers from your SCAN mapping. NEVER write "image 1", "image 2",
449 "the first/second/third page", or any ordinal reference. These will be rejected.
450- Cite element titles: "In the Income Statement on page 31" not "in the table".
451- Quote the specific values you extracted.
452- For multi-page synthesis, explicitly connect values from different pages.
453- For computations, show the formula with named sources and page numbers.
454
455REFUSAL POLICY
456{% if "not answerable" in question_type %}
457- Output exactly: Not answerable
458- Only if the page(s) lack the exact requested qualifier.
459- In your thinking, briefly name what is missing.
460 Bad: "The information is not available in the provided pages."
461 Good: "Page 31 shows the Balance Sheet for 2020 and 2021, but the question asks for 2018. Year 2018 is not present on any page."
462 Good: "Pages 10-11 list subsections A, B, C, but none match 'Generative Retrieval'. Subsection missing."
463{% else %}
464- NEVER output "Not answerable", "Cannot determine", "Fail to answer", or any refusal.
465- If the question uses a term (e.g., "receive turnover") that does not appear verbatim,
466 look for the underlying data needed to compute it. The term not appearing does not mean
467 the answer is unavailable.
468- If you found partial evidence, provide what you found — partial > refusal.
469- If the answer requires inference or computation from visible data, do it.
470{% endif %}
471
472FINAL ANSWER:
473- Put ALL reasoning inside <think>...</think>.
474- After </think>, output ONLY the final answer.
475- Do NOT repeat reasoning outside <think> tags.
476- Do NOT output protocol labels, explanations, or extra text after </think>.
477
478OUTPUT FORMAT
479{% if "multiple choice" in question_type %}
480- Output: "<LETTER>. <option text>", e.g., "B. 92%"
481{% elif "yes" in question_type %}
482- Output exactly "Yes" or "No".
483{% elif "numerical (percentage)" in question_type %}
484- Output a number WITH a percent sign, e.g., "29%".
485{% elif "numerical (int)" in question_type %}
486- Output an integer only (digits, optional commas).
487{% elif "numerical (float)" in question_type %}
488- Output a decimal number only, unless the question requests a unit.
489{% elif "string:" in question_type %}
490- Output a short phrase/sentence only.
491{% elif "layout" in question_type %}
492- Output only the extracted content (string/number/list).
493{% elif "list" in question_type %}
494- Output a JSON array on ONE line: ["gray", "red"].
495- NEVER use comma-separated plain text. ALWAYS use ["..."] syntax.
496{% elif "not answerable" in question_type %}
497- Output exactly: Not answerable
498{% else %}
499- Output a short, direct answer.
500{% endif %}
501"""
502
503
504PROMPT_QUALITY_SCORE = """\
505<question-type>
506{{ question_type }}
507</question-type>
508
509<question>
510{{ question }}
511</question>
512
513<answer>
514{{ answer }}
515</answer>
516
517<answer_reasoning_trace>
518{{ answer__reasoning_content }}
519</answer_reasoning_trace>
520
521You are given images of pages extracted from a PDF document. Evaluate the QUALITY of this (question, answer) pair.
522
523Be STRICT. Any check failure => score 0.
524
525CHECKS
526
5271. DOCUMENT QUALITY: Are the document pages clear, readable, and not low quality?
528
5292. RELEVANCE: Is the question relevant to the content visible in the pages?
530 {% if "not answerable" in question_type %}For "not answerable" questions, the question should be relevant but the answer must NOT be present anywhere in the visible window.
531 Score 0 if the question is unanswerable ONLY because it refers to a page outside the window or the whole document. The question must be a near-miss negative where a specific qualifier (year, subgroup, row, region, etc.) is absent from the visible pages.{% endif %}
532
5333. ANSWER CORRECTNESS: Is the answer correct given the pages?
534 {% if "not answerable" in question_type %}The correct answer should be "Not answerable".{% endif %}
535
5364. QUESTION QUALITY: Is the question challenging, unambiguous, and well-formed?
537 - If the question includes the answer, give a score of 0.
538 - The question must genuinely require information from multiple pages.
539 - Score 0 if a plausible shortcut answer exists on the first relevant page and the question does not force use of the later page(s).
540
5415. FORMAT COMPLIANCE: Does the answer match the expected format?
542- Score 0 if the answer contains <think> tags, reasoning steps, protocol labels, explanations, or anything beyond the bare final result.
543{% if "multiple choice" in question_type %}
544 - Answer must be exactly "<LETTER>. <option text>" (A-D). Reject digit-only ("2") or letter-only ("B").
545{% elif "yes" in question_type %}
546 - Answer must be exactly "Yes" or "No".
547{% elif "numerical (percentage)" in question_type %}
548 - Answer MUST include a percent sign (e.g., "29%").
549{% elif "numerical (int)" in question_type %}
550 - Answer must be an integer only (digits, optional commas), no extra words.
551{% elif "numerical (float)" in question_type %}
552 - Answer must be a decimal number only, no extra words.
553{% elif "list" in question_type %}
554 - Answer must be a ONE-LINE JSON array (e.g., ["gray", "red"]).
555 - Score 0 if comma-separated text instead of JSON array.
556{% elif "not answerable" in question_type %}
557 - Answer must be exactly "Not answerable".
558{% else %}
559 - Answer MUST NOT be "Not answerable" or any refusal.
560{% endif %}
561
5626. ANCHOR QUALITY (critical — questions are generated from a page window but collated into
563 the FULL document at training time. Vague references become ambiguous in the full document):
564 - Score 0 if the question uses any of these vague phrases:
565 "the document", "the report", "the paper", "the slides", "the table" (unqualified),
566 "the chart" (unqualified), "across the pages", "in the provided pages".
567 - Score 0 if the <answer_reasoning_trace> does not use explicit anchors such as page numbers, chart/table titles, or section headings.
568 Ordinal positions ("image 1", "the first page") are not allowed as anchors.
569 - Score 0 if the question lacks a specific anchor. Must have at least one of:
570 page number, chart/table title, numbered element (Table 3, Figure 7), named financial
571 statement, or section heading that uniquely identifies the target in the full document.
572 - Ask: "If a reader saw the full 50-page document, would they know EXACTLY which
573 table/chart/section this question refers to?" If not, score 0.
574
5757. MULTI-PAGE REQUIREMENT:
576 - The answer must genuinely require information from at least 2 different pages.
577 - Score 0 if the answer can be determined from a single page alone.
578
5798. REASONING QUALITY (critical — reasoning is used as chain-of-thought training data):
580 Score 0 if any of the following hold:
581 - The reasoning does NOT use explicit anchors such as page numbers, chart/table titles, or section headings.
582 - It uses ordinal references like "image 1", "the first page", etc.
583 - It does NOT quote the specific values extracted.
584 - If computation is required, it does NOT show the operation with named sources.
585 - It repeats the same scan, recount, candidate answer, page reference, title, entity name, or conclusion without adding new evidence.
586 - It restarts the reasoning process after already finding the relevant page(s) or elements.
587 - It contains obvious loop markers such as repeated "Wait, let me", "Actually", or "Let's look again".
588 - It keeps generating new alternatives after already having enough evidence to answer.
589 - It ends in an unfinished or truncated way, or appears to stop mid-thought.
590 - For count/list questions, it does not maintain a page-by-page tally or explicit item list.
591 - For repeated-layout or repeated-entry questions, it stops after the first valid hit instead of scanning all matching pages in the visible window.
592 - For cross-page computations, it does not clearly distinguish which page provides the key, target value, denominator, or comparison value.
593 - It confuses count vs percentage vs percentage-point difference vs ratio, or rounds before the final computation rather than after it.
594
5959. VISUAL PERCEPTION (if applicable):
596 - If the question asks about colors, icons, or small visual elements, verify the answer
597 correctly describes what is visible. Score 0 if the answer claims the page is grayscale
598 when it is in color, or misidentifies a visual grouping.
599 - If the question involves counting scattered elements (markers, icons, figures across pages),
600 verify the count is plausible. Score 0 if the count is clearly too low (e.g., 3 when many
601 more are visible) or if the reasoning doesn't enumerate items.
602
603SCORING
604- Score 0: Any check fails.
605- Score 1: All checks pass.
606- Score 2: All checks pass AND the question involves at least one high-value signal: visual
607 perception (icon colors, chart groupings), counting scattered elements, financial computation
608 across statements, chart disambiguation, infographic spatial reasoning, or cross-page
609 table operation. Must still be unambiguous.
610
611Respond with ONLY the score as a single digit: 0, 1, or 2.
612"""
613
614
615# =============================================================================
616# Pipeline configuration
617# =============================================================================
618
619
def build_config(
    seed_path: str = "seed.parquet",
    model_alias: str = "vl",
    model_id: str = DEFAULT_VLM_MODEL,
    reasoning: bool = True,
) -> dd.DataDesignerConfigBuilder:
    """Assemble the config builder for the multi-page windowed QA pipeline.

    The builder is populated, in order, with:
      1. a single model config registered under ``model_alias``,
      2. a local-file seed dataset read with the SHUFFLE sampling strategy,
      3. a ``question_type`` category sampler column,
      4. three LLM text columns (``question``, ``answer``, ``quality_score``)
         that all use ``model_alias`` and ``IMAGE_CONTEXT``.

    Args:
        seed_path: Path of the seed file handed to ``dd.LocalFileSeedSource``.
        model_alias: Alias shared by the model config and every LLM column.
        model_id: Model identifier for the model config; also forwarded to
            ``_inference_params``.
        reasoning: Forwarded to ``_inference_params`` to pick the inference
            parameter set.

    Returns:
        The populated ``dd.DataDesignerConfigBuilder``.
    """
    vlm_model = dd.ModelConfig(
        alias=model_alias,
        model=model_id,
        provider=VLLM_PROVIDER_NAME,
        inference_parameters=_inference_params(model_id, reasoning=reasoning),
    )
    builder = dd.DataDesignerConfigBuilder(model_configs=[vlm_model])

    seed_source = dd.LocalFileSeedSource(path=seed_path)
    builder.with_seed_dataset(
        seed_source,
        sampling_strategy=dd.SamplingStrategy.SHUFFLE,
    )

    # Question type -> sampling weight, kept side by side so the two lists
    # passed to the sampler cannot drift out of alignment.
    # NOTE(review): the weights do not sum to 1, so they are presumably treated
    # as relative weights by the sampler — confirm against the library docs.
    question_type_weights = {
        "multiple choice": 0.025,
        "yes or no": 0.025,
        "string: word, phrase or short sentence": 2,
        "layout": 2,
        "numerical (int)": 2,
        "numerical (float)": 2,
        "numerical (percentage)": 2,
        "list of items (int, string, float or mixed)": 2,
        "not answerable": 0.2,
    }
    sampler_params = dd.CategorySamplerParams(
        values=list(question_type_weights),
        weights=list(question_type_weights.values()),
    )
    builder.add_column(
        dd.SamplerColumnConfig(
            name="question_type",
            sampler_type=dd.SamplerType.CATEGORY,
            params=sampler_params,
        )
    )

    # (column name, prompt template, extra keyword arguments), in the order the
    # columns are added. Only the answer column asks for the reasoning content
    # to be extracted.
    llm_columns = (
        ("question", PROMPT_QUESTION, {}),
        ("answer", PROMPT_ANSWER, {"extract_reasoning_content": True}),
        ("quality_score", PROMPT_QUALITY_SCORE, {}),
    )
    for column_name, prompt_template, extra_kwargs in llm_columns:
        builder.add_column(
            dd.LLMTextColumnConfig(
                name=column_name,
                model_alias=model_alias,
                prompt=prompt_template,
                multi_modal_context=IMAGE_CONTEXT,
                **extra_kwargs,
            )
        )

    return builder
692
693
def create_dataset(
    config_builder: dd.DataDesignerConfigBuilder,
    num_records: int,
    vllm_endpoint: str,
    artifact_path: Path | str | None = None,
) -> DatasetCreationResults:
    """Run the pipeline described by ``config_builder`` against a vLLM endpoint.

    Args:
        config_builder: Pipeline configuration passed to ``DataDesigner.create``.
        num_records: Number of records requested from ``DataDesigner.create``.
        vllm_endpoint: Endpoint of the model provider registered under
            ``VLLM_PROVIDER_NAME``.
        artifact_path: Forwarded unchanged to the ``DataDesigner`` constructor;
            ``None`` leaves the choice to the library.

    Returns:
        The ``DatasetCreationResults`` returned by ``DataDesigner.create`` for
        the dataset named ``"multi_page_windowed_qa"``.
    """
    vllm_provider = dd.ModelProvider(name=VLLM_PROVIDER_NAME, endpoint=vllm_endpoint)
    designer = DataDesigner(
        artifact_path=artifact_path,
        model_providers=[vllm_provider],
    )

    run_config = dd.RunConfig(progress_bar=True, disable_early_shutdown=True)
    designer.set_run_config(run_config)

    return designer.create(
        config_builder,
        num_records=num_records,
        dataset_name="multi_page_windowed_qa",
    )
713
714
if __name__ == "__main__":
    # Command-line entry point: parse options, build the pipeline config,
    # generate the dataset, then print where it was saved and its analysis report.
    from argparse import ArgumentParser, BooleanOptionalAction

    parser = ArgumentParser()
    parser.add_argument(
        "--vllm-endpoint",
        type=str,
        required=True,
        help="Base URL of the vLLM server hosting the VLM (e.g. http://localhost:8000/v1)",
    )
    parser.add_argument("--seed-path", type=str, required=True, help="Path to the seed parquet file")
    parser.add_argument("--model-alias", type=str, default="vl")
    parser.add_argument("--model-id", type=str, default=DEFAULT_VLM_MODEL)
    parser.add_argument("--num-records", type=int, default=5)
    # One BooleanOptionalAction argument registers both --reasoning and
    # --no-reasoning on the same destination. The previous store_true flag with
    # default=True was a no-op, and only the separate store_false flag had any
    # effect. "%(default)s" keeps the default reported exactly once in --help.
    parser.add_argument(
        "--reasoning",
        action=BooleanOptionalAction,
        default=True,
        help=(
            "Use reasoning-mode inference parameters; pass --no-reasoning to use "
            "non-reasoning inference parameters (default: %(default)s)"
        ),
    )
    parser.add_argument("--artifact-path", type=str, default=None)
    args = parser.parse_args()

    config_builder = build_config(
        seed_path=args.seed_path,
        model_alias=args.model_alias,
        model_id=args.model_id,
        reasoning=args.reasoning,
    )
    results = create_dataset(
        config_builder,
        num_records=args.num_records,
        vllm_endpoint=args.vllm_endpoint,
        artifact_path=args.artifact_path,
    )

    print(f"Dataset saved to: {results.artifact_storage.final_dataset_path}")

    results.load_analysis().to_report()