| 1 | # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. |
| 2 | # SPDX-License-Identifier: Apache-2.0 |
| 3 | # /// script |
| 4 | # requires-python = ">=3.10" |
| 5 | # dependencies = [ |
| 6 | # "data-designer>=0.5.6", |
| 7 | # ] |
| 8 | # /// |
| 9 | """Long-Document Understanding Page Classification Recipe |
| 10 | |
| 11 | Classify document page images by their visual element types and reasoning |
| 12 | complexity using a vision-language model. For each seed record the pipeline |
| 13 | produces a structured `page_classification` column containing: |
| 14 | |
| 15 | - `contains_reasoning_content` – whether the page has visual elements |
| 16 | suitable for reasoning QA |
| 17 | - `primary_categories` – ordered list of visual element categories |
| 18 | (QUANTITATIVE, TABULAR, LOGIC_DIAGRAMS, HIERARCHICAL, etc.) |
| 19 | - `subcategories` – specific element types (BAR_CHART, FLOWCHART, …) |
| 20 | - `reasoning_complexity_score` – 1-10 cognitive demand rating |
| 21 | - `justification` – brief explanation of the classification |
| 22 | |
| 23 | Prerequisites: |
| 24 | - A seed parquet file containing a `png_images_base64` column with a JSON |
| 25 | array of base64-encoded PNG images (one element per page; single-page |
| 26 | seeds have a one-element array). |
| 27 | - A vLLM-compatible deployment of the VLM |
| 28 | (default: Qwen/Qwen3-VL-30B-A3B-Instruct). |
| 29 | Recommended vLLM launch flags: |
| 30 | --tensor-parallel-size 2 |
| 31 | --max-model-len 128000 |
| 32 | --gpu-memory-utilization 0.95 |
| 33 | --trust-remote-code |
| 34 | |
| 35 | Example launch script for 2× H100: |
| 36 | docker run --gpus all \ |
| 37 | -p 8000:8000 \ |
| 38 | vllm/vllm-openai:latest \ |
| 39 | --model Qwen/Qwen3-VL-30B-A3B-Instruct \ |
| 40 | --tensor-parallel-size 2 \ |
| 41 | --max-model-len 128000 \ |
| 42 | --gpu-memory-utilization 0.95 \ |
| 43 | --trust-remote-code |
| 44 | |
| 45 | Run: |
| 46 | # Basic usage (classifies 5 pages by default) |
| 47 | uv run 04-page-classification-sdg.py --vllm-endpoint http://localhost:8000/v1 --seed-path seed_data/seed_per_page.parquet |
| 48 | |
| 49 | # Custom record count |
| 50 | uv run 04-page-classification-sdg.py --vllm-endpoint http://localhost:8000/v1 --seed-path seed_data/seed_per_page.parquet --num-records 100 |
| 51 | |
| 52 | # For help message and available options |
| 53 | uv run 04-page-classification-sdg.py --help |
| 54 | """ |
| 55 | |
| 56 | from enum import Enum |
| 57 | from pathlib import Path |
| 58 | |
| 59 | from pydantic import BaseModel, Field |
| 60 | |
| 61 | import data_designer.config as dd |
| 62 | from data_designer.interface import DataDesigner, DatasetCreationResults |
| 63 | |
# Model identifier used as the default for build_config(model_id=...) and for
# the --model-id command-line option.
DEFAULT_VLM_MODEL = "Qwen/Qwen3-VL-30B-A3B-Instruct"
# Provider name used both as ModelConfig.provider in build_config and as
# ModelProvider.name in create_dataset; the two must stay in sync.
VLLM_PROVIDER_NAME = "vllm"
| 66 | |
| 67 | # ============================================================================= |
| 68 | # Structured output schema |
| 69 | # ============================================================================= |
| 70 | |
| 71 | |
# Top-level visual element categories a page can be assigned.
#
# Mixes in `str`, so each member is also a plain string equal to its value.
# NONE marks pages without reasoning content; the prompt and the
# PageClassification field descriptions require it to appear alone.
#
# Documented with comments rather than a class docstring so that the enum's
# `__doc__` (and anything derived from it) is left unchanged.
class VisualElementCategory(str, Enum):
    QUANTITATIVE = "QUANTITATIVE"
    LOGIC_DIAGRAMS = "LOGIC_DIAGRAMS"
    HIERARCHICAL = "HIERARCHICAL"
    SPATIAL_RELATIONAL = "SPATIAL_RELATIONAL"
    SCHEMATIC = "SCHEMATIC"
    TABULAR = "TABULAR"
    INFOGRAPHIC = "INFOGRAPHIC"
    NONE = "NONE"
| 81 | |
| 82 | |
# Fine-grained visual element types used in PageClassification.subcategories.
#
# Members are grouped below by the VisualElementCategory they relate to. That
# grouping exists only in the section comments: nothing in this enum ties a
# subcategory to its parent category, so mismatched pairs are not rejected here.
#
# Documented with comments rather than a class docstring so that the enum's
# `__doc__` (and anything derived from it) is left unchanged.
class VisualElementSubcategory(str, Enum):
    # QUANTITATIVE
    BAR_CHART = "BAR_CHART"
    LINE_GRAPH = "LINE_GRAPH"
    SCATTER_PLOT = "SCATTER_PLOT"
    PIE_CHART = "PIE_CHART"
    AREA_GRAPH = "AREA_GRAPH"
    HISTOGRAM = "HISTOGRAM"
    BOX_PLOT = "BOX_PLOT"
    HEATMAP = "HEATMAP"
    BUBBLE_CHART = "BUBBLE_CHART"
    # LOGIC_DIAGRAMS
    FLOWCHART = "FLOWCHART"
    DECISION_TREE = "DECISION_TREE"
    PROCESS_MAP = "PROCESS_MAP"
    ALGORITHM_DIAGRAM = "ALGORITHM_DIAGRAM"
    STATE_DIAGRAM = "STATE_DIAGRAM"
    SEQUENCE_DIAGRAM = "SEQUENCE_DIAGRAM"
    # HIERARCHICAL
    ORG_CHART = "ORG_CHART"
    MIND_MAP = "MIND_MAP"
    TREE_STRUCTURE = "TREE_STRUCTURE"
    TAXONOMY = "TAXONOMY"
    DENDROGRAM = "DENDROGRAM"
    # SPATIAL_RELATIONAL
    FLOOR_PLAN = "FLOOR_PLAN"
    BLUEPRINT = "BLUEPRINT"
    CHOROPLETH_MAP = "CHOROPLETH_MAP"
    POINT_MAP = "POINT_MAP"
    TOPOGRAPHIC_MAP = "TOPOGRAPHIC_MAP"
    NETWORK_DIAGRAM = "NETWORK_DIAGRAM"
    # SCHEMATIC
    CIRCUIT_DIAGRAM = "CIRCUIT_DIAGRAM"
    MECHANICAL_DIAGRAM = "MECHANICAL_DIAGRAM"
    ANATOMICAL_DIAGRAM = "ANATOMICAL_DIAGRAM"
    WIRING_DIAGRAM = "WIRING_DIAGRAM"
    PLUMBING_DIAGRAM = "PLUMBING_DIAGRAM"
    # TABULAR
    SIMPLE_TABLE = "SIMPLE_TABLE"
    NESTED_TABLE = "NESTED_TABLE"
    PIVOT_TABLE = "PIVOT_TABLE"
    COMPARISON_TABLE = "COMPARISON_TABLE"
    FINANCIAL_TABLE = "FINANCIAL_TABLE"
    # INFOGRAPHIC
    TIMELINE = "TIMELINE"
    STATISTICAL_INFOGRAPHIC = "STATISTICAL_INFOGRAPHIC"
    PROCESS_INFOGRAPHIC = "PROCESS_INFOGRAPHIC"
    COMPARISON_INFOGRAPHIC = "COMPARISON_INFOGRAPHIC"
    # NONE
    DECORATIVE_IMAGE = "DECORATIVE_IMAGE"
    PHOTOGRAPH = "PHOTOGRAPH"
    PLAIN_TEXT = "PLAIN_TEXT"
    GENERIC_ICON = "GENERIC_ICON"
    OTHER = "OTHER"
| 137 | |
| 138 | |
# Structured output schema passed as `output_format` to the page_classification
# column in build_config.
#
# NOTE: the cross-field rules (NONE must be the only category;
# contains_reasoning_content must agree with the presence of NONE) are stated
# only in the field descriptions and in the prompt. No validator in this class
# enforces them, so consumers should not assume they always hold.
class PageClassification(BaseModel):
    """Classification result for a document page's reasoning potential."""

    # All fields are required (`...`): no defaults are supplied.
    contains_reasoning_content: bool = Field(
        ...,
        description=(
            "Whether the page contains visual elements suitable for reasoning QA pairs. "
            "Must be False if primary_categories contains NONE. "
            "Must be True if primary_categories does NOT contain NONE."
        ),
    )
    # Ordering is meaningful: most prominent category first. No length limit is
    # declared here, although the prompt asks for at most three categories.
    primary_categories: list[VisualElementCategory] = Field(
        ...,
        description=(
            "List of visual element categories found in the page, ordered by prominence. "
            "IMPORTANT: If NONE is present, it must be the ONLY category in this list."
        ),
    )
    subcategories: list[VisualElementSubcategory] = Field(
        ...,
        description="Specific types of visual elements identified (e.g., BAR_CHART, FLOWCHART).",
    )
    # The only constraint validated by the schema itself: an integer in [1, 10].
    reasoning_complexity_score: int = Field(
        ...,
        ge=1,
        le=10,
        description="Complexity score from 1-10 indicating the depth of reasoning required.",
    )
    justification: str = Field(
        ...,
        description="Brief explanation of why this page is or isn't suitable for reasoning QA generation.",
    )
| 171 | |
| 172 | |
| 173 | # ============================================================================= |
| 174 | # Prompt template |
| 175 | # ============================================================================= |
| 176 | |
# Instruction text passed as `prompt` to the page_classification column in
# build_config. The category names and the 1-10 complexity scale used here
# mirror VisualElementCategory and PageClassification.reasoning_complexity_score,
# so keep them in sync when editing either side.
# The backslash after the opening quotes and the one ending the last line keep
# the string from starting or ending with a newline.
PROMPT_PAGE_CLASSIFICATION = """\
# ROLE AND OBJECTIVE
You are a document intelligence analyst specializing in visual reasoning assessment. Your task is to analyze document page images and determine their suitability for generating high-quality reasoning-based Question-Answer (QA) pairs.

# CLASSIFICATION TAXONOMY
Identify and classify ALL visual elements present in the image using these categories:

**QUANTITATIVE** - Data visualizations requiring numerical analysis
• Bar charts, line graphs, scatter plots, pie charts, area graphs
• Requires: trend analysis, value comparison, rate calculations

**LOGIC_DIAGRAMS** - Process and decision flows
• Flowcharts, decision trees, process maps, algorithmic diagrams
• Requires: conditional reasoning, path tracing, outcome prediction

**HIERARCHICAL** - Organizational and structural relationships
• Organizational charts, mind maps, tree structures, taxonomies
• Requires: understanding parent-child relationships, levels, dependencies

**SPATIAL_RELATIONAL** - Geographic and spatial layouts
• Floor plans, blueprints, maps (choropleth, point, topographic)
• Requires: distance estimation, position inference, spatial reasoning

**SCHEMATIC** - Technical diagrams with component relationships
• Circuit diagrams, mechanical cross-sections, anatomical diagrams with labels
• Requires: understanding connections, tracing signal/flow paths, component identification

**TABULAR** - Structured data in rows and columns
• Tables with nested headers, merged cells, subtotals, calculated rows
• Requires: cross-referencing values, performing calculations, identifying patterns

**INFOGRAPHIC** - Multi-modal composite narratives
• Mixed visuals combining charts, text, icons, and data into cohesive stories
• Requires: synthesizing information across multiple elements

**NONE** - Content without reasoning potential
• Decorative images, simple photographs, plain text blocks, generic icons
• Presentation slides with only text or bullet points (no visual elements)
• No data relationships, calculations, or logical deductions possible

**Note on Presentation Content**: The format (e.g., presentation slide, document page) doesn't matter.
Classify based on the actual visual elements present:
• Slide with bar chart → QUANTITATIVE
• Slide with flowchart → LOGIC_DIAGRAMS
• Slide with only text/bullets → NONE

# REASONING COMPLEXITY ASSESSMENT
Score pages 1-10 based on cognitive demand:

**High Complexity (8-10)**: Requires multi-step inference
• Cross-referencing multiple data sources
• Mathematical derivation (growth rates, percentages, trends)
• Conditional logic chains (if-then-else reasoning)
• Spatial or temporal reasoning across disconnected components

**Medium Complexity (4-7)**: Requires single-step analysis
• Direct comparisons between values
• Simple calculations from visible data
• Following a single logical path
• Identifying explicit patterns or relationships

**Low Complexity (1-3)**: Minimal reasoning
• Direct lookup of visible information
• Simple identification tasks
• No relationships or calculations needed

# EVALUATION PROCESS
1. **Scan for visual elements**: Identify all charts, diagrams, tables, or structured content
2. **Classify elements**: Assign primary categories (up to 3, ordered by prominence)
3. **Identify subcategories**: Determine specific visual element types (e.g., BAR_CHART, FLOWCHART)
4. **Assess reasoning depth**: Determine if multi-step thinking is necessary
5. **Score complexity**: Rate 1-10 based on cognitive requirements
6. **Justify classification**: Explain why this page is or isn't suitable for reasoning QA

# DECISION CRITERIA

**CRITICAL RULES**:
1. If `primary_categories` contains NONE, it must be the ONLY category (do NOT mix NONE with other categories)
2. `contains_reasoning_content` must be **False** if NONE is present
3. If content has reasoning elements, do NOT include NONE at all
4. Ignore presentation format - classify by actual visual content (slide with chart = QUANTITATIVE, not PRESENTATION)

Mark `contains_reasoning_content: true` ONLY if:
✓ Primary categories does NOT contain NONE, AND
✓ At least one of these reasoning elements is present:
- Quantitative comparisons possible (e.g., "Which region had highest growth?")
- Logical paths to trace (e.g., "What happens if condition X fails?")
- Mathematical derivations needed (e.g., "Calculate percentage change")
- Spatial/temporal relationships to deduce
- Complex table requiring cross-referencing

Mark `contains_reasoning_content: false` if:
✗ Only decorative or generic imagery → set primary_categories: ["NONE"]
✗ Plain text with no visual structure → set primary_categories: ["NONE"]
✗ Simple lists or single-column tables → set primary_categories: ["NONE"]
✗ Slides with only text/bullet points (no charts/diagrams) → set primary_categories: ["NONE"]
✗ No data relationships to explore → set primary_categories: ["NONE"]

**Classification Logic**:
- Either the page has reasoning content (assign specific categories like QUANTITATIVE, TABULAR, etc.)
- OR it doesn't (assign only ["NONE"])
- NEVER mix NONE with other categories

# SUBCATEGORIES
For each visual element found, identify the specific subcategory (e.g., BAR_CHART, FLOWCHART, FLOOR_PLAN).
Include the most prominent subcategories in the `subcategories` list, ordered by importance.\
"""
| 284 | |
| 285 | |
| 286 | # ============================================================================= |
| 287 | # Pipeline configuration |
| 288 | # ============================================================================= |
| 289 | |
| 290 | |
def build_config(
    seed_path: str = "seed.parquet",
    model_alias: str = "qwen-vl",
    model_id: str = DEFAULT_VLM_MODEL,
    *,
    timeout: int = 1200,
    max_tokens: int = 100000,
    max_parallel_requests: int = 32,
) -> dd.DataDesignerConfigBuilder:
    """Build the Data Designer configuration for page classification.

    The configuration registers one model, attaches the seed dataset, and adds
    a single structured column named ``page_classification`` whose output is
    shaped by ``PageClassification``.

    Args:
        seed_path: Path handed to ``dd.LocalFileSeedSource``. The seed must
            provide a ``png_images_base64`` column, which is attached to the
            request as PNG image context.
        model_alias: Alias used for the model config and referenced by the
            classification column.
        model_id: Model identifier placed in the model config.
        timeout: Forwarded to ``ChatCompletionInferenceParams.timeout``
            (presumably seconds -- confirm against the Data Designer docs).
        max_tokens: Forwarded to ``ChatCompletionInferenceParams.max_tokens``.
        max_parallel_requests: Forwarded to
            ``ChatCompletionInferenceParams.max_parallel_requests``.

    Returns:
        The populated ``dd.DataDesignerConfigBuilder``.
    """
    # NOTE(review): with the recommended --max-model-len 128000, the default
    # max_tokens leaves roughly 28k tokens for the prompt plus page image.
    # Lower max_tokens if the server rejects requests -- confirm how the
    # serving stack accounts for the completion budget.
    inference_parameters = dd.ChatCompletionInferenceParams(
        timeout=timeout,
        max_tokens=max_tokens,
        max_parallel_requests=max_parallel_requests,
    )
    model_configs = [
        dd.ModelConfig(
            alias=model_alias,
            model=model_id,
            # Must match the ModelProvider name registered in create_dataset.
            provider=VLLM_PROVIDER_NAME,
            inference_parameters=inference_parameters,
        ),
    ]

    config_builder = dd.DataDesignerConfigBuilder(model_configs=model_configs)

    config_builder.with_seed_dataset(
        dd.LocalFileSeedSource(path=seed_path),
        sampling_strategy=dd.SamplingStrategy.ORDERED,
    )

    config_builder.add_column(
        dd.LLMStructuredColumnConfig(
            name="page_classification",
            model_alias=model_alias,
            prompt=PROMPT_PAGE_CLASSIFICATION,
            output_format=PageClassification,
            multi_modal_context=[
                dd.ImageContext(
                    # Expects a single-element JSON array from the per-page seed.
                    column_name="png_images_base64",
                    data_type=dd.ModalityDataType.BASE64,
                    image_format=dd.ImageFormat.PNG,
                ),
            ],
        )
    )

    return config_builder
| 334 | |
| 335 | |
def create_dataset(
    config_builder: dd.DataDesignerConfigBuilder,
    num_records: int,
    vllm_endpoint: str,
    artifact_path: Path | str | None = None,
) -> DatasetCreationResults:
    """Run the configured pipeline against a vLLM endpoint and return its results.

    Args:
        config_builder: Pipeline configuration, e.g. the one returned by
            ``build_config``.
        num_records: Number of records requested from ``DataDesigner.create``.
        vllm_endpoint: Endpoint URL registered for the ``VLLM_PROVIDER_NAME``
            model provider.
        artifact_path: Passed through to ``DataDesigner``; ``None`` is
            forwarded unchanged.

    Returns:
        The ``DatasetCreationResults`` produced by ``DataDesigner.create`` for
        the dataset named ``page_classification``.
    """
    # The provider name must match the one referenced by the model config.
    vllm_provider = dd.ModelProvider(name=VLLM_PROVIDER_NAME, endpoint=vllm_endpoint)
    designer = DataDesigner(
        artifact_path=artifact_path,
        model_providers=[vllm_provider],
    )

    run_config = dd.RunConfig(progress_bar=True, disable_early_shutdown=True)
    designer.set_run_config(run_config)

    return designer.create(
        config_builder,
        num_records=num_records,
        dataset_name="page_classification",
    )
| 355 | |
| 356 | |
def main(argv: list[str] | None = None) -> None:
    """Parse command-line options, run the classification pipeline, and report.

    Args:
        argv: Argument strings to parse. When ``None``, argparse falls back to
            ``sys.argv[1:]``, which is what script execution uses.
    """
    # Imported lazily so importing this module as a library does not pull in
    # the CLI machinery.
    from argparse import ArgumentParser

    parser = ArgumentParser(
        description="Classify document page images with a vision-language model served by vLLM.",
    )
    parser.add_argument(
        "--vllm-endpoint",
        type=str,
        required=True,
        help="Base URL of the vLLM server hosting the VLM (e.g. http://localhost:8000/v1)",
    )
    parser.add_argument("--seed-path", type=str, required=True, help="Path to the seed parquet file")
    parser.add_argument(
        "--model-alias",
        type=str,
        default="qwen-vl",
        help="Alias under which the model is registered in the pipeline config (default: %(default)s)",
    )
    parser.add_argument(
        "--model-id",
        type=str,
        default=DEFAULT_VLM_MODEL,
        help="Identifier of the model to request from the endpoint (default: %(default)s)",
    )
    parser.add_argument(
        "--num-records",
        type=int,
        default=5,
        help="Number of records to generate (default: %(default)s)",
    )
    parser.add_argument(
        "--artifact-path",
        type=str,
        default=None,
        help="Artifact location passed to DataDesigner; when omitted, None is passed through",
    )
    args = parser.parse_args(argv)

    config_builder = build_config(
        seed_path=args.seed_path,
        model_alias=args.model_alias,
        model_id=args.model_id,
    )
    results = create_dataset(
        config_builder,
        num_records=args.num_records,
        vllm_endpoint=args.vllm_endpoint,
        artifact_path=args.artifact_path,
    )

    print(f"Dataset saved to: {results.artifact_storage.final_dataset_path}")

    results.load_analysis().to_report()


if __name__ == "__main__":
    main()