Single-Page QA
Single-Page QA
Single-Page QA
1 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 # SPDX-License-Identifier: Apache-2.0 3 # /// script 4 # requires-python = ">=3.10" 5 # dependencies = [ 6 # "data-designer>=0.5.6", 7 # ] 8 # /// 9 """Long-Document Understanding Single-Page QA Recipe 10 11 Generate high-quality single-page question-answer pairs that improve VLM 12 long-document understanding across key categories: Text, Table, Chart, 13 Image/Figure, and Layout. MMLongBench-Doc is used to track progress. 14 15 Each question is anchored to a unique on-page element (page number, table/ 16 figure number, section title) so it remains unambiguous when collated with 17 questions from all other pages into a full-document training sample. 18 19 For each seed record the pipeline: 20 21 1. Samples a question type (multiple choice, yes/no, string, layout, 22 numerical int/float/percentage, list, not answerable) 23 2. Generates an anchored question from the page image 24 3. Generates an answer with chain-of-thought reasoning (captured separately) 25 4. Evaluates overall quality (document quality, relevance, correctness, 26 format compliance, anchor quality, and reasoning quality) as a 0/1/2 score 27 28 Prerequisites: 29 - A seed parquet file containing: 30 * `png_images_base64` – JSON array of base64-encoded PNGs (one 31 element per page; single-page seeds have a one-element array). 32 - A vLLM-compatible deployment of the VLM 33 (default: Qwen/Qwen3-VL-235B-A22B-Thinking-FP8). 
34 Recommended vLLM launch flags: 35 --tensor-parallel-size 4 36 --max-model-len 50000 37 --gpu-memory-utilization 0.90 38 --reasoning-parser deepseek_r1 39 --limit-mm-per-prompt '{"video": 0}' 40 --trust-remote-code 41 42 Example launch script for 4× H100: 43 docker run --gpus all \ 44 -p 8000:8000 \ 45 vllm/vllm-openai:latest \ 46 --model Qwen/Qwen3-VL-235B-A22B-Thinking-FP8 \ 47 --tensor-parallel-size 4 \ 48 --max-model-len 50000 \ 49 --gpu-memory-utilization 0.90 \ 50 --reasoning-parser deepseek_r1 \ 51 --limit-mm-per-prompt '{"video": 0}' \ 52 --trust-remote-code 53 54 Run: 55 # Basic usage (generates 5 records by default) 56 uv run 06-single-page-qa-sdg.py --vllm-endpoint http://localhost:8000/v1 --seed-path seed_data/seed_per_page.parquet 57 58 # Custom model and record count 59 uv run 06-single-page-qa-sdg.py --vllm-endpoint http://localhost:8000/v1 --seed-path seed_data/seed_per_page.parquet --num-records 100 60 61 # For help message and available options 62 uv run 06-single-page-qa-sdg.py --help 63 """ 64 65 from pathlib import Path 66 67 import data_designer.config as dd 68 from data_designer.interface import DataDesigner, DatasetCreationResults 69 70 DEFAULT_VLM_MODEL = "Qwen/Qwen3-VL-235B-A22B-Thinking-FP8" 71 VLLM_PROVIDER_NAME = "vllm" 72 73 # ============================================================================= 74 # Image context helper 75 # ============================================================================= 76 77 IMAGE_CONTEXT = [ 78 dd.ImageContext( 79 # Expects a single-element JSON array from the per-page seed. 
80 column_name="png_images_base64", 81 data_type=dd.ModalityDataType.BASE64, 82 image_format=dd.ImageFormat.PNG, 83 ), 84 ] 85 86 # ============================================================================= 87 # Prompt templates 88 # ============================================================================= 89 90 PROMPT_QUESTION = """\ 91 You are an expert at writing SINGLE-PAGE training questions that improve VLM performance on MMLongBench-Doc categories: 92 - Text (plain paragraphs) 93 - Table (tabular data) 94 - Chart (quantitative plots) 95 - Image/Figure (diagrams, UI screenshots, illustrations) 96 - Layout (spatial + structural anchors) 97 98 You see one page from a larger document. Your question will be collated with questions from 99 ALL other pages into a single training sample — the trainee sees the FULL document, not just 100 this page. Phrases like "on this page" or "the table" are ambiguous in that context. 101 Always anchor with a printed page number, unique title, or unique element number. 102 103 Your task: Create ONE high-quality question of type <question-type> that can be answered 104 using ONLY the visible content. 105 106 <question-type> 107 {{question_type}} 108 </question-type> 109 110 ═══════════════════════════════════════════════════════════════════════════════ 111 HARD CONSTRAINTS (must follow) 112 ═══════════════════════════════════════════════════════════════════════════════ 113 114 0) VISIBLE CONTENT ONLY 115 - The question must be answerable from the visible content alone. 116 - Do NOT ask questions that require any other page, the whole report, or cross-page aggregation. 117 - Do NOT ask for global counting/aggregation across the report/paper (e.g., "among all charts", "how many pages include…", "across the pages"). 118 - Benchmark-style phrasing is allowed (e.g., "According to the report/paper, …"), BUT the question must still be answerable from the visible content. 
119 120 1) Answerability 121 - {% if "not answerable" in question_type %}The question should still be relevant to the page, but the exact requested answer must NOT be present anywhere on this page. 122 - Build "not answerable" questions as NEAR-MISS negatives by changing exactly ONE required qualifier from a real visible fact. 123 - Good near-miss types: wrong year/date, wrong subgroup/series, wrong unit, wrong position, wrong row/column, wrong legend item, or displayed/shown vs listed/mentioned. 124 - Do NOT make a question "not answerable" only because it refers to some other page, the whole document, or a page number that is missing from the visible page. 125 - Near-miss templates by visual element: 126 - Chart: "In the chart titled '<real title>', what is <metric> for <YEAR NOT SHOWN>?" (year is present for other series but not the asked one) 127 - Table: "In Table <N>, what is the value for '<ROW NOT IN TABLE>' in column '<real column>'?" 128 - Map/infographic: "In the map on page <N>, what number is shown for <REGION NOT LABELED>?" 129 - Text: "In the section titled '<real title>', what is the <DETAIL NOT MENTIONED>?" 130 - Layout: "In the <real position> panel, what is the <ELEMENT NOT PRESENT IN THAT PANEL>?" 131 {% else %}The correct answer MUST be present and clearly visible on this page.{% endif %} 132 - Do NOT ask anything that requires external knowledge, guesswork, or reading tiny illegible text. 133 134 2) Focus 135 - Identify the PRIMARY visual element on the page (text block, table, chart, figure, map, 136 infographic, or layout structure) and ask about it. 137 - Ignore decorative headers/footers unless they are part of the primary element. 138 139 3) Difficulty (1–3 steps, all evidence on THIS page) 140 - 1-step (common): single lookup, one comparison, one count, one filter. 141 - 2-step (encouraged — targets top benchmark failures): 142 - Lookup + lookup + compare: "Which is larger, row A or row B in column X?" 
143 - Lookup + compute: "What is Gross Profit as a % of Revenues?" (two cells, divide) 144 - Scan + argmax: "Which subgroup has the largest value in the Change column?" 145 - Lookup + difference: "How much did X change from 2020 to 2021?" (two cells, subtract) 146 - Percentage → count: "If 5% of N=1500 said X, how many is that?" 147 - 3-step (allowed when all steps are trivial and on this page): 148 - Lookup + lookup + ratio + round: financial ratio questions. 149 - Scan + count + filter: "How many rows have column Y > threshold?" 150 - Avoid: questions needing external knowledge, subjective judgment, or off-page evidence. 151 152 4) Unambiguous 153 - Exactly one correct answer. 154 - Avoid subjective terms ("most consistent", "significant", "best", "optimal"). 155 156 5) No meta phrasing 157 - Do NOT say "the image" / "this image". 158 - Avoid "the document" unless it is part of an on-page title. 159 - You MAY use "According to the report/paper, …" occasionally, but never to imply cross-page evidence. 160 161 6) ANCHOR THE QUESTION (critical for per-document training) 162 - Questions are collated across ALL pages during training — anchors must be unambiguous 163 within the ENTIRE document, not just this page. 164 - Anchor priority (prefer unique; most preferred first): 165 1. Printed, explicit page number: "On page 42, ..." (usually in the header/footer) 166 2. Unique title/caption/section heading: "In the chart titled "X", ..." / "In the section titled "Y", ..." 167 3. Numbered element + local title: "In Table 3 under "X", ..." / "In Figure 7 titled "Y", ..." / "In Note 5 — "Z", ..." 168 4. Fallback: Detailed structural anchor with heading: "In the right-column table under the heading "X", ..." / "In the top-right boxed callout titled "Y", ..." 
169 - BANNED anchors (these become meaningless when collated across pages): 170 "on this page", "in the image", "in the bottom half of the page", 171 "the table" / "the chart" / "the figure" without a title or number, 172 "on the left side" / "on the right side" without a heading. 173 Always pair positional references with a title, number, or heading. 174 175 7) REQUIRE ANSWER-FORMAT HINTS IN THE QUESTION (targets strict eval failures) 176 - The question MUST explicitly tell the answer format when applicable: 177 - For list questions: add "Return a JSON array of strings on one line, e.g., ["A", "B"]." 178 - For percentages: add "Answer with a % sign." and keep % vs percentage-points unambiguous. 179 - For floats: specify rounding (e.g., "Round to three decimal places") if the question requires computation or if multiple precisions appear on the page. 180 181 8) QUALIFIER FIDELITY (critical) 182 - If multiple nearby answers exist, the question MUST include the qualifier that makes the target unique. 183 - Prefer qualifiers like: strongly / somewhat / overall / net, displayed / shown / visible vs listed / mentioned / described, first / second / last / nearest, and exact row / column / year / fiscal year / subgroup / legend item. 184 - The question must not be answerable by selecting a nearby but broader fact. 185 - Good: "Which application software interfaces are DISPLAYED in screenshots on page <N>?" 186 - Good: "What percentage of Rep/Lean Rep STRONGLY favor ...?" 187 - Bad: "Which applications are on this page?" if some are listed and some are displayed. 188 - Bad: "What percentage of Republicans favor ...?" if the page contains both net and strongly measures. 
189 190 ═══════════════════════════════════════════════════════════════════════════════ 191 WHAT TO GENERATE (driven by failure cases) 192 ═══════════════════════════════════════════════════════════════════════════════ 193 194 Look at the page and identify its primary visual element, then use the matching section: 195 - Plain text / paragraphs → A) TEXT 196 - Any table (especially financial statements)→ B) TABLE 197 - Any chart (bar, line, pie, scatter, etc.) → C) CHART 198 - Maps, infographics, diagrams, flowcharts → D) IMAGE 199 - Multi-column or spatially structured pages → E) LAYOUT 200 Skip purely decorative pages with no readable content. Pages that look like appendices, 201 regulatory compliance, financial notes, or methodology are just as valuable — don't skip them. 202 203 A) TEXT (plain paragraphs) 204 Target: false refusals on short facts, poor exact extraction. 205 - Short, high-salience answers with an anchor. Examples: 206 - "In the heading at the top of the page X, what is the full title? Answer exactly as written." 207 - "In the section titled "Support", what phone number is listed? Answer with the number only." 208 - "In the paragraph titled "About this survey", what sample size (N) is reported? Answer with an integer." 209 - "In the sentence mentioning "temperature", which value is reported as best? Answer with the number only." 210 - Avoid long copying (>20 words) and cross-page answers. 211 212 B) TABLE (tabular data — use financial patterns for income statements, balance sheets, 10-K filings) 213 Target: false refusals on obvious cells, dense headers, counting mistakes, financial unit/sign errors. 214 - Small-scope table reasoning with an explicit table anchor. Examples: 215 - Argmax/comparison: "In Table <N>, which row has the highest value in column "X"?" 216 - Difference/sum: "In Table <N>, what is the difference between rows "A" and "B" in column "X"?" 217 - Filter + count: "In Table <N>, how many rows have column "Y" > <threshold>? 
Answer with an integer." 218 - Multi-level headers: "In Table <N>, under column group "<group>", subcolumn "<sub>", what is the value for row "<label>"?" 219 - FINANCIAL TABLES (critical — 57% zero-score on financial reports): 220 - Line item lookup: "In the Consolidated Balance Sheets on page <N>, what is Total Current Assets for FY2021? Answer in millions as an integer." 221 - YoY difference: "In the balance sheet, how much did Accrued Liabilities change from 2020 to 2021? Answer in millions (positive if increased)." 222 - Notes / schedules / appendices (high false-refusal rate — do NOT skip these pages): 223 "In Note <N> — '<Title>', what is the total <line item>? Answer in millions as an integer." 224 "In the schedule of '<Title>' on page <N>, what is the value for '<row>' in '<year>'?" 225 "In Note <N>, how many categories/items are listed in the '<sub-table>'? Answer with an integer." 226 - If values include %, specify "Answer with a % sign." If table shows units, state whether the answer should include them. 227 228 C) CHART (bar, line, pie, scatter, area, heatmap) 229 Target: cross-chart confusion, misread values, wrong column/axis selection. 230 - CHART DISAMBIGUATION (critical): Always anchor to the chart TITLE + a distinguishing 231 axis/column label. If a chart has a pre-computed "Change" column, reference it explicitly. 232 For pages with multiple small charts, anchor to the specific sub-chart label. 233 - Examples: 234 - "In the chart titled "X", in the "Change '08-'15" column, which subgroup shows the largest change?" 235 - "In the chart titled "X", what percentage of "EU" is in the "More" category? Answer with a % sign." 236 - "In the chart titled "X", by how many percentage points did <category> change from <A> to <B>?" 237 - "In the chart titled "X", which groups have a value below 60? Return a JSON array, e.g., ["A", "B"]." 238 - "In the chart titled "X", how many categories exceed <threshold>? Answer with an integer." 
239 - VISUAL GROUPING (recurring failure): If a chart uses brackets, braces, or labeled dividers 240 to separate groups (e.g., "Business Analytics" vs "Business Intelligence"), ask about a 241 specific group. The model confuses which items belong to which group. 242 - ARGMAX OVER CHANGE (recurring failure): If a chart has both absolute values (bar lengths) 243 and a "Change" column, ask which category has the largest CHANGE. The model tends to pick 244 the category with the largest absolute value instead. 245 - "In the chart titled "X", which subgroup shows the largest increase from <year A> to <year B>?" 246 - Unit discipline: if the chart uses %, the answer should include "%". 247 248 D) IMAGE / FIGURE (maps, infographics, flowcharts, diagrams, schematics) 249 Target: spatial confusion on maps/infographics, misread labels, undercounting visual elements. 250 - Prefer diagrams/UI/infographics with clear labels; avoid counting small repeated natural-photo objects. 251 - SPATIAL REASONING (critical — 128 wrong-answer figure failures): 252 For maps/infographics, force the model to bind numbers to their correct spatial region. 253 The model confuses which number belongs to which region — ask specifically. 254 - For any count question, explicitly name the counting unit. 255 - Good counting units: service-line badges, screenshots displayed, QR codes, numbered steps, nodes, boxes, legend entries, labeled regions. 256 - Avoid ambiguous units like "lines", "figures", "objects", or "applications" unless the page makes the intended unit unmistakable. 257 - Examples: 258 - "In the world map on page <N>, which region has the largest number? Answer with the region name." 259 - "In the map on page <N>, what number is shown for Europe? Answer with an integer." 260 - "How many Muni service-line badges are shown at Union Square / Market Street? Answer with an integer." 261 - "Which application software interfaces are displayed in screenshots on page <N>? 
Return a JSON array of strings on one line." 262 - "In Figure <N>, how many distinct nodes/boxes are shown? Answer with an integer." 263 - "In Figure <N>, what text is inside the box labeled "Y"?" 264 - "In Figure <N>'s legend, which label corresponds to the <color> segment?" 265 - "In the flowchart in Figure <N>, what step follows the decision labeled "X"?" 266 - "In the diagram labeled "X", which component is directly connected to "Y"?" 267 - Color is allowed ONLY when it encodes meaning (legend/UI), not aesthetics. 268 - ICON / SYMBOL DISCRIMINATION (critical — model hallucinates icon presence): 269 When pages show repeated entries (attraction listings, product cards, feature grids), 270 ask about which entries have or lack a specific small icon/symbol. 271 The model tends to assume all entries share the same icons — force it to check each one. 272 - "Under the listing for '<Name>', which accessibility icons are shown? Return a JSON array of strings on one line." 273 - "Which attractions listed on page <N> do NOT have a wheelchair accessibility icon? Return a JSON array of strings on one line." 274 - "In the feature comparison grid, which products show a checkmark for '<Feature>'? Return a JSON array of strings on one line." 275 - "How many listings on page <N> display a 'Green Travel' ecolabel? Answer with an integer." 276 277 E) LAYOUT (spatial + structural anchors) 278 - Force locating content by structure/position with explicit anchors: 279 - Heading navigation: "In the section titled "X", what is the second bullet point?" 280 - Above/below: "In the heading directly above Table <N>, what is the heading text?" 281 - Two-column disambiguation (common benchmark failure): "In the right column under the heading "X", what is the first bullet point? Answer exactly as written." 282 - Counting structure: "In the procedure section, how many numbered steps are listed? Answer with an integer." 283 - Location: "In the top-right boxed callout titled "Y", what label is shown?" 
284 - Exhaustive but short: "In the right column under the heading "X", list all subheadings shown. Return a JSON array of strings on one line, e.g., ["A", "B"]." (only if 2–8 items) 285 286 ═══════════════════════════════════════════════════════════════════════════════ 287 QUESTION-TYPE SPECIFIC RULES 288 ═══════════════════════════════════════════════════════════════════════════════ 289 {% if question_type == "layout" %} 290 - Include an explicit spatial/structural anchor (e.g., top-left, right column, below the chart, under heading X, second bullet, last row). 291 {% elif "numerical (percentage)" in question_type %} 292 - Ensure the answer is naturally a percent and add: "Answer with a % sign." 293 {% elif "numerical (int)" in question_type %} 294 - Ensure the answer is an integer and add: "Answer with an integer." 295 {% elif "numerical (float)" in question_type %} 296 - Ensure the answer is a decimal number visible on the page (or computable in one step). 297 {% elif "list" in question_type %} 298 - The question must require ALL items (complete list), short (2–8 items), visible on THIS page. 299 - Add: "Return a JSON array of strings on one line, e.g., ["gray", "red"]." 300 {% elif question_type == "yes or no" %} 301 - The question must be decidable from the page content and not rely on interpretation. 302 {% elif question_type == "multiple choice" %} 303 - Provide exactly 4 options (A–D), plausible and mutually exclusive. 304 - Do NOT use "All of the above" / "None of the above". 305 {% endif %} 306 307 ═══════════════════════════════════════════════════════════════════════════════ 308 OUTPUT FORMAT 309 ═══════════════════════════════════════════════════════════════════════════════ 310 311 Return ONLY the question text. 312 {% if question_type == "multiple choice" %} 313 Format: first line is the question, then A., B., C., D. choices on separate lines. 
314 {% endif %} 315 Do not include explanations or reasoning.\ 316 """ 317 318 319 PROMPT_ANSWER = """\ 320 <question-type> 321 {{ question_type }} 322 </question-type> 323 324 <question> 325 {{ question }} 326 </question> 327 328 You are given EXACTLY ONE page image from a PDF document. 329 All evidence must come from this one page only. 330 Never imagine other pages. Never search the rest of the document. 331 Never refer to "Image 1", "Image 2", or other unseen pages. 332 Answer using ONLY information visible on this page. 333 334 In your THINKING (inside <think> tags), follow this protocol. Do NOT echo these steps in 335 your final answer — the answer must be ONLY the bare result (number, phrase, list, etc.). 336 337 QUALIFIER LOCK (critical) 338 Before extracting any answer, copy the restrictive qualifiers from the question and keep them fixed: 339 - page / section / table / figure / chart identity 340 - year / date / fiscal year 341 - subgroup / series / legend item 342 - strongly / somewhat / overall / net / change 343 - displayed / shown / visible vs listed / mentioned / described 344 - first / second / last / nearest / left / right / top / bottom 345 - exact number vs ratio vs percentage vs percentage points 346 347 Do NOT substitute a nearby year, nearby subgroup, nearby chart, nearby row, or nearby fact. 348 If the question asks what is DISPLAYED, ignore items that are only named in nearby text. 349 If the question asks for STRONGLY / NET / OVERALL / CHANGE, read exactly that quantity and no other. 350 351 THINKING PROTOCOL: 352 1) PARSE: Decompose the question into concrete lookup targets. 353 E.g., "What is Gross Profit as a percentage of Revenues?" -> 354 target A = "Gross Profit value", target B = "Revenues value". 355 2) LOCATE: Scan the page to find the element the question references. Use the question's 356 anchor (page number, table/figure/section title, heading) to match. 
357 3) MATCH: Identify the exact visual element by its TITLE or CAPTION before reading values. 358 - Charts: match title + axis labels (a page may have multiple charts). Identify sub-chart, series, x-axis category, unit, etc. 359 - Tables: match caption/heading (e.g., "Consolidated Balance Sheets", "Note 5"). Identify row/column labels, unit, scale, etc. 360 - Figures/Maps/infographics/UI: match the target object and counting unit. 361 - CONFIRM ANCHOR: If the page has multiple charts/tables/figures, state which one you 362 matched and confirm its title matches the question's anchor before reading any values. 363 If the title does not match, scan the page for the correct element. 364 4) READ: Extract the specific value from the matched element. 365 - Tables: correct column for the fiscal year; parentheses = negative; check unit scale header. 366 - Charts with a "Change" column: read that column directly, don't recompute from bars. 367 - If multiple nearby values exist, do not switch to a broader or more convenient one. 368 5) VERIFY: Double-check your extraction against the same bound target. 369 After this verification pass, stop thinking. 370 371 THINKING STABILITY (critical) 372 - Follow the protocol once from top to bottom. Do NOT restart from step 1 after you already found the relevant element. 373 - Do at most one locate pass and one verification pass. 374 - Be concise: go directly to the element matching the question's anchor. Do NOT describe 375 irrelevant elements on the page. Only mention what contributes to the answer. 376 - If there are two plausible candidates, compare them once using the question's qualifiers, choose the best-supported one, and continue. Do NOT keep generating new alternatives. 377 - Do NOT repeat the same scan, recount, or conclusion more than once. 378 - As soon as the answer is found and verified, stop thinking and produce the final answer. 
379 - Do NOT use filler loops such as repeating a phrase, title, entity name, or page reference many times. 380 381 REASONING TRACE REQUIREMENTS (critical for training) 382 Your reasoning will be used as chain-of-thought training data. In your thinking, you MUST: 383 - Cite the question's anchor explicitly: "In Table 3 on page 42, ..." not "In the table, ..." 384 - State which element you matched and its exact title/caption. 385 - Quote the specific value(s) you read: "the row 'Accrued Liabilities' shows 6,063 for 2021." 386 - If computing, show the computation with named references: 387 "Gross Profit (19,962) / Revenues (44,538) = 0.448 = 44.8%" 388 Bad: "Looking at the page, I see a value of 6,063." 389 Good: "In the Consolidated Balance Sheets on page 59, the row 'Total Accrued Liabilities' 390 under the 2021 column shows $6,063 (in millions)." 391 392 COUNTING DISCIPLINE 393 - For any count question, state the counting unit first in your reasoning. 394 - Count once in a consistent order: left-to-right, top-to-bottom. 395 - Recount at most once. 396 - Do NOT switch counting units mid-reasoning. 397 - If both labels and visual markers are present, decide which is being counted before counting. 398 399 UNIT DISCIPLINE 400 - Preserve units exactly when present or requested (%, $, million, km, etc.). 401 - If the question asks for a percentage or percentage points, include the "%". 402 - Financial reports: 403 - Check the table header for unit scale (e.g., "In millions", "Rupees in lacs") and apply it. 404 - Parentheses in financial tables mean NEGATIVE values: (380) = -380. 405 - When the question asks "how much higher/more", answer with a POSITIVE number representing 406 the magnitude of the difference. When it asks "change", use positive for increase, 407 negative for decrease. 408 - If the question says "Answer in millions", output the number as shown in the table 409 (the table is already in millions). 
410 411 REFUSAL POLICY (critical) 412 {% if "not answerable" in question_type %} 413 Only say "Not answerable" if the page lacks the exact requested qualifier. 414 In your thinking, briefly name what is missing. 415 Bad: "The information is not available on this page." 416 Good: "The chart shows data for 2015 and 2020, but the question asks for 2018. Year 2018 is not present." 417 Good: "The table lists rows for Revenue, COGS, and Gross Profit, but not 'Operating Lease Liability'. Row missing." 418 {% else %} 419 - NEVER output "Not answerable", "Cannot determine", or any refusal. 420 - If the answer is directly printed on the page, copy it and stop. 421 - If the answer is computable from visible values, compute it and stop. 422 - Do NOT refuse only because the page number is not visible, a nearby title/anchor differs slightly, or the answer is implied by one local relation rather than explicitly restated. 423 {% endif %} 424 425 FINAL ANSWER: 426 - Put ALL reasoning inside <think>...</think>. 427 - After </think>, output ONLY the final answer. 428 - Do NOT repeat reasoning outside <think> tags. 429 - Do NOT output protocol labels, explanations, or extra text after </think>. 430 431 OUTPUT FORMAT (critical) 432 {% if question_type == "multiple choice" %} 433 - Output exactly ONE line: "<LETTER>. <option text>", e.g., "B. 92%" 434 - Do NOT output only a digit ("2") or only a letter ("B"). 435 {% elif question_type == "yes or no" %} 436 - Output exactly "Yes" or "No" (no punctuation, no explanation). 437 {% elif "numerical (percentage)" in question_type %} 438 - Output a number WITH a percent sign, e.g., "29%". Do NOT omit the "%". 439 {% elif "numerical (int)" in question_type %} 440 - Output an integer only (digits, optional commas). No words. 441 {% elif "numerical (float)" in question_type %} 442 - Output a decimal number only (no extra words), unless the question explicitly requests a unit. 
443 {% elif question_type is string and question_type.startswith("string") %} 444 - Output a short phrase/sentence only. No preamble. 445 {% elif question_type == "layout" %} 446 - Use the spatial/structural anchor to select the correct location. 447 - Output only the extracted content (string/number/list), no preamble. 448 {% elif "list" in question_type %} 449 - Output a JSON array on ONE line: ["gray", "red"]. Must be complete. 450 - NEVER use comma-separated plain text. ALWAYS use JSON array syntax with ["..."]. 451 - Each element must be individual, not a compound range. 452 Wrong: ["1981-82"] Correct: ["1981", "1982"] 453 Wrong: ["Training and Sportswear"] Correct: ["Training", "Sportswear"] 454 {% elif "not answerable" in question_type %} 455 - Output exactly: Not answerable 456 {% else %} 457 - Output a short, direct answer. No preamble or explanation. 458 {% endif %}\ 459 """ 460 461 462 PROMPT_QUALITY_SCORE = """\ 463 <question-type> 464 {{ question_type }} 465 </question-type> 466 467 <question> 468 {{ question }} 469 </question> 470 471 <answer> 472 {{ answer }} 473 </answer> 474 475 <answer_reasoning> 476 {{ answer__reasoning_content }} 477 </answer_reasoning> 478 479 You are given EXACTLY ONE page image extracted from a PDF document. Evaluate the QUALITY of this (question, answer, reasoning content). 480 481 Be STRICT. Any check failure => score 0. 482 483 CHECKS 484 485 1) DOCUMENT/PAGE QUALITY 486 - Pages must be readable (not too blurry/pixelated/cut off). Page must be high quality, and not be empty or nearly empty. 487 - The required evidence must be visible without guessing. 488 489 2) RELEVANCE + ANSWERABILITY 490 - The question must be grounded in visible content. 491 - {% if "not answerable" in question_type %}For "not answerable" questions: the question should be relevant, but the requested information must NOT be present anywhere on the page. 
492 - Score 0 if the question is unanswerable ONLY because it refers to another page, the whole document, or a page range not shown. The question must be a near-miss negative where a specific qualifier (year, subgroup, row, region, etc.) is absent from the visible page.{% endif %} 493 494 3) ANSWER CORRECTNESS 495 - The answer must be correct given the visible page content. 496 497 4) OUTPUT FORMAT COMPLIANCE (critical) 498 - Score 0 if the answer contains <think> tags, reasoning steps, protocol labels, or anything beyond the bare final result. 499 {% if question_type == "multiple choice" %} 500 - Answer must be exactly "<LETTER>. <option text>" (A–D). Reject digit-only ("2") or letter-only ("B"). 501 {% elif question_type == "yes or no" %} 502 - Answer must be exactly "Yes" or "No". 503 {% elif "numerical (percentage)" in question_type %} 504 - Answer MUST include a percent sign (e.g., "29%"). 505 {% elif "numerical (int)" in question_type %} 506 - Answer must be an integer only (digits, optional commas), no extra words. 507 {% elif "numerical (float)" in question_type %} 508 - Answer must be a decimal number only, no extra words. 509 {% elif question_type is string and question_type.startswith("string") %} 510 - Answer must be a short phrase/sentence only (no preamble). 511 {% elif "list" in question_type %} 512 - Answer must be a ONE-LINE JSON array (e.g., ["gray", "red"]). 513 - Score 0 if comma-separated text or compound ranges (["1981-82"] should be ["1981", "1982"]). 514 {% elif "not answerable" in question_type %} 515 - Answer must be exactly "Not answerable". 516 {% else %} 517 - Answer MUST NOT be "Not answerable" or any refusal. 518 {% endif %} 519 520 5) QUESTION QUALITY 521 - Unambiguous: exactly one correct answer. 522 - Verifiable: a judge can confirm correctness from the pages. 523 - Locally bounded: 1-2 steps preferred; 3 steps allowed only if all evidence is on this page and each step is trivial. 
524 - Not overly tedious: avoid long enumerations (e.g., 20+ items) unless clearly short and visible. 525 - If the question leaks the answer (contains the answer), score 0. 526 527 6) ANCHOR QUALITY (critical — questions are collated across all pages) 528 - Score 0 if the question uses a generic anchor that could match multiple elements across the 529 document (e.g., "In the table, ..." or "What is the total revenue?" without specifying which). 530 - Score 0 if the question uses any of these vague phrases: 531 "on this page", "in the image", "in the bottom/top half of the page", 532 "the table" / "the chart" / "the figure" without a title or number, 533 "on the left/right side" without a heading. 534 - Must have at least one of: exact page number, numbered element (Table 3, Figure 7, Note 5), 535 or unique section/subsection/chart/table title. 536 - If there is a chance the anchor is not unique across the full document, score 0. 537 538 7) REASONING QUALITY (critical — reasoning is used as chain-of-thought training data) 539 - The reasoning in <answer_reasoning> must be specific, stable, and finite. Score 0 if any of the following hold: 540 - It does NOT cite the question's anchor (page number, table/figure number, section title, 541 chart title, or named financial statement). 542 - It uses only generic references like "the table", "the chart", "the page", or "the image". 543 - It does NOT quote or reference the specific value(s) extracted. 544 - If the question involves computation, it does NOT show the operation with named sources 545 (e.g., "Gross Profit (19,962) / Revenues (44,538) = 44.8%"). 546 - It repeats the same scan, recount, or conclusion without adding new evidence, or restarts 547 / generates new alternatives after already having enough evidence to answer. 548 - It ends in an unfinished or truncated way, or appears to stop mid-thought. 549 - It contains filler repetition (for example, the same entity name or phrase copied many times). 
550 - It references images by ordinal position ("Image 1", "Image 2", "the first image") 551 rather than by printed page numbers or element titles. 552 - For count questions, it switches counting units mid-reasoning. 553 - For displayed-vs-mentioned questions, it mixes displayed visual instances with nearby 554 listed/mentioned items. 555 556 8) FINANCIAL CHECKS (if applicable) 557 - Parentheses = negative. "How much higher" = positive. Units must match table header scale. 558 Fiscal year must match the correct column. 559 560 SCORING 561 - Score 0: Any check fails. 562 - Score 1: All checks pass. 563 - Score 2: All checks pass AND involves at least one high-value signal: financial computation, 564 chart disambiguation (title + axis label), infographic spatial reasoning, or cross-cell 565 table operation. Must still be unambiguous. 566 567 Respond with ONLY the score as a single digit: 0, 1, or 2.\ 568 """ 569 570 571 # ============================================================================= 572 # Pipeline configuration 573 # ============================================================================= 574 575 576 def build_config( 577 seed_path: str = "seed.parquet", 578 model_alias: str = "vl", 579 model_id: str = DEFAULT_VLM_MODEL, 580 ) -> dd.DataDesignerConfigBuilder: 581 model_configs = [ 582 dd.ModelConfig( 583 alias=model_alias, 584 model=model_id, 585 provider=VLLM_PROVIDER_NAME, 586 inference_parameters=dd.ChatCompletionInferenceParams( 587 timeout=1200, 588 temperature=1.0, 589 top_p=0.95, 590 max_parallel_requests=32, 591 extra_body={ 592 "top_k": 20, 593 "min_p": 0.0, 594 "presence_penalty": 1.5, 595 "repetition_penalty": 1.0, 596 }, 597 ), 598 ), 599 ] 600 601 config_builder = dd.DataDesignerConfigBuilder(model_configs=model_configs) 602 603 config_builder.with_seed_dataset( 604 dd.LocalFileSeedSource(path=seed_path), 605 sampling_strategy=dd.SamplingStrategy.SHUFFLE, 606 ) 607 608 config_builder.add_column( 609 dd.SamplerColumnConfig( 610 
name="question_type", 611 sampler_type=dd.SamplerType.CATEGORY, 612 params=dd.CategorySamplerParams( 613 values=[ 614 "multiple choice", 615 "yes or no", 616 "string: word, phrase or short sentence", 617 "layout", 618 "numerical (int)", 619 "numerical (float)", 620 "numerical (percentage)", 621 "list of items (int, string, float or mixed)", 622 "not answerable", 623 ], 624 weights=[0.025, 0.025, 1, 2, 2, 2, 2, 2, 0.2], 625 ), 626 ) 627 ) 628 629 config_builder.add_column( 630 dd.LLMTextColumnConfig( 631 name="question", 632 model_alias=model_alias, 633 prompt=PROMPT_QUESTION, 634 multi_modal_context=IMAGE_CONTEXT, 635 ) 636 ) 637 638 config_builder.add_column( 639 dd.LLMTextColumnConfig( 640 name="answer", 641 model_alias=model_alias, 642 prompt=PROMPT_ANSWER, 643 multi_modal_context=IMAGE_CONTEXT, 644 extract_reasoning_content=True, 645 ) 646 ) 647 648 config_builder.add_column( 649 dd.LLMTextColumnConfig( 650 name="quality_score", 651 model_alias=model_alias, 652 prompt=PROMPT_QUALITY_SCORE, 653 multi_modal_context=IMAGE_CONTEXT, 654 ) 655 ) 656 657 return config_builder 658 659 660 def create_dataset( 661 config_builder: dd.DataDesignerConfigBuilder, 662 num_records: int, 663 vllm_endpoint: str, 664 artifact_path: Path | str | None = None, 665 ) -> DatasetCreationResults: 666 model_providers = [ 667 dd.ModelProvider( 668 name=VLLM_PROVIDER_NAME, 669 endpoint=vllm_endpoint, 670 ), 671 ] 672 data_designer = DataDesigner( 673 artifact_path=artifact_path, 674 model_providers=model_providers, 675 ) 676 data_designer.set_run_config(dd.RunConfig(progress_bar=True, disable_early_shutdown=True)) 677 results = data_designer.create(config_builder, num_records=num_records, dataset_name="single_page_qa") 678 return results 679 680 681 if __name__ == "__main__": 682 from argparse import ArgumentParser 683 684 parser = ArgumentParser() 685 parser.add_argument( 686 "--vllm-endpoint", 687 type=str, 688 required=True, 689 help="Base URL of the vLLM server hosting the VLM (e.g. 
http://localhost:8000/v1)", 690 ) 691 parser.add_argument("--seed-path", type=str, required=True, help="Path to the seed parquet file") 692 parser.add_argument("--model-alias", type=str, default="vl") 693 parser.add_argument("--model-id", type=str, default=DEFAULT_VLM_MODEL) 694 parser.add_argument("--num-records", type=int, default=5) 695 parser.add_argument("--artifact-path", type=str, default=None) 696 args = parser.parse_args() 697 698 config_builder = build_config( 699 seed_path=args.seed_path, 700 model_alias=args.model_alias, 701 model_id=args.model_id, 702 ) 703 results = create_dataset( 704 config_builder, 705 num_records=args.num_records, 706 vllm_endpoint=args.vllm_endpoint, 707 artifact_path=args.artifact_path, 708 ) 709 710 print(f"Dataset saved to: {results.artifact_storage.final_dataset_path}") 711 712 results.load_analysis().to_report()