For AI agents: a documentation index is available at the root level at /llms.txt and /llms-full.txt. Append /llms.txt to any URL for a page-level index, or .md for the markdown version of any page.
  • Getting Started
    • Welcome
    • Contributing
  • Concepts
    • Columns
    • Seed Datasets
    • Agent Rollout Ingestion
    • Custom Columns
    • Validators
    • Processors
    • Person Sampling
    • Traces
    • Architecture & Performance
    • Deployment Options
    • Security
  • Tutorials
    • Overview
    • The Basics
    • Structured Outputs, Jinja Expressions, and Conditional Generation
    • Seeding with an External Dataset
    • Providing Images as Context
    • Generating Images
    • Image-to-Image Editing
  • Recipes
    • Recipe Cards
      • Seed Dataset Preparation
      • Nemotron Parse OCR
      • Text QA from OCR Transcripts
      • Page Classification
      • Visual QA
      • Single-Page QA
      • Multi-Page Windowed QA
      • Whole-Document QA
      • Frontier Judge QA Filter
  • Plugins
    • Overview
    • Example Plugin
    • FileSystemSeedReader Plugins
    • Discover
  • Code Reference
    • Overview
  • Dev Notes
    • Overview
    • Have It Your Way
    • VLM Long Document Understanding
    • Push Datasets to Hugging Face Hub
    • Text-to-SQL for Nemotron Super
    • Async All the Way Down
    • Owning the Model Stack
NVIDIA
Developer-friendly docs for your API
Privacy Policy | Your Privacy Choices | Terms of Service | Accessibility | Corporate Policies | Product Security | Contact

Copyright © 2026, NVIDIA Corporation.

NeMo Data Designer
Recipes > VLM Long-Document Understanding

Page Classification

View as Markdown
Previous

Text QA from OCR Transcripts

Next

Visual QA

Download Recipe

Download the complete recipe script

1# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2# SPDX-License-Identifier: Apache-2.0
3# /// script
4# requires-python = ">=3.10"
5# dependencies = [
6# "data-designer>=0.5.6",
7# ]
8# ///
9"""Long-Document Understanding Page Classification Recipe
10
11Classify document page images by their visual element types and reasoning
12complexity using a vision-language model. For each seed record the pipeline
13produces a structured `page_classification` column containing:
14
15 - `contains_reasoning_content` – whether the page has visual elements
16 suitable for reasoning QA
17 - `primary_categories` – ordered list of visual element categories
18 (QUANTITATIVE, TABULAR, LOGIC_DIAGRAMS, HIERARCHICAL, etc.)
19 - `subcategories` – specific element types (BAR_CHART, FLOWCHART, …)
20 - `reasoning_complexity_score` – 1-10 cognitive demand rating
21 - `justification` – brief explanation of the classification
22
23Prerequisites:
24 - A seed parquet file containing a `png_images_base64` column with a JSON
25 array of base64-encoded PNG images (one element per page; single-page
26 seeds have a one-element array).
27 - A vLLM-compatible deployment of the VLM
28 (default: Qwen/Qwen3-VL-30B-A3B-Instruct).
29 Recommended vLLM launch flags:
30 --tensor-parallel-size 2
31 --max-model-len 128000
32 --gpu-memory-utilization 0.95
33 --trust-remote-code
34
35 Example launch script for 2× H100:
36 docker run --gpus all \
37 -p 8000:8000 \
38 vllm/vllm-openai:latest \
39 --model Qwen/Qwen3-VL-30B-A3B-Instruct \
40 --tensor-parallel-size 2 \
41 --max-model-len 128000 \
42 --gpu-memory-utilization 0.95 \
43 --trust-remote-code
44
45Run:
46 # Basic usage (classifies 5 pages by default)
47 uv run 04-page-classification-sdg.py --vllm-endpoint http://localhost:8000/v1 --seed-path seed_data/seed_per_page.parquet
48
49 # Custom record count
50 uv run 04-page-classification-sdg.py --vllm-endpoint http://localhost:8000/v1 --seed-path seed_data/seed_per_page.parquet --num-records 100
51
52 # For help message and available options
53 uv run 04-page-classification-sdg.py --help
54"""
55
56from enum import Enum
57from pathlib import Path
58
59from pydantic import BaseModel, Field
60
61import data_designer.config as dd
62from data_designer.interface import DataDesigner, DatasetCreationResults
63
# Model identifier used when the caller does not pass one; it is the default
# for both build_config(model_id=...) and the --model-id CLI option.
DEFAULT_VLM_MODEL = "Qwen/Qwen3-VL-30B-A3B-Instruct"
# Provider name shared by the ModelConfig built in build_config() and the
# ModelProvider registered in create_dataset().
# NOTE(review): presumably the two are linked by this name -- confirm.
VLLM_PROVIDER_NAME = "vllm"
66
67# =============================================================================
68# Structured output schema
69# =============================================================================
70
71
class VisualElementCategory(str, Enum):
    """Top-level category of the visual elements found on a page.

    Each member's value is its own name, and the ``str`` mixin makes members
    compare equal to that plain string. The one-line glosses below mirror the
    taxonomy spelled out in the classification prompt.
    """

    # Data visualizations requiring numerical analysis.
    QUANTITATIVE = "QUANTITATIVE"
    # Process and decision flows.
    LOGIC_DIAGRAMS = "LOGIC_DIAGRAMS"
    # Organizational and structural relationships.
    HIERARCHICAL = "HIERARCHICAL"
    # Geographic and spatial layouts.
    SPATIAL_RELATIONAL = "SPATIAL_RELATIONAL"
    # Technical diagrams with component relationships.
    SCHEMATIC = "SCHEMATIC"
    # Structured data in rows and columns.
    TABULAR = "TABULAR"
    # Multi-modal composite narratives.
    INFOGRAPHIC = "INFOGRAPHIC"
    # Content without reasoning potential.
    NONE = "NONE"
81
82
class VisualElementSubcategory(str, Enum):
    """Specific visual element type identified on a page.

    Members are listed in groups, one group per ``VisualElementCategory``
    value; the grouping exists only in the comments and is not encoded in the
    enum itself. Each member's value is its own name.
    """

    # --- QUANTITATIVE ---
    BAR_CHART = "BAR_CHART"
    LINE_GRAPH = "LINE_GRAPH"
    SCATTER_PLOT = "SCATTER_PLOT"
    PIE_CHART = "PIE_CHART"
    AREA_GRAPH = "AREA_GRAPH"
    HISTOGRAM = "HISTOGRAM"
    BOX_PLOT = "BOX_PLOT"
    HEATMAP = "HEATMAP"
    BUBBLE_CHART = "BUBBLE_CHART"

    # --- LOGIC_DIAGRAMS ---
    FLOWCHART = "FLOWCHART"
    DECISION_TREE = "DECISION_TREE"
    PROCESS_MAP = "PROCESS_MAP"
    ALGORITHM_DIAGRAM = "ALGORITHM_DIAGRAM"
    STATE_DIAGRAM = "STATE_DIAGRAM"
    SEQUENCE_DIAGRAM = "SEQUENCE_DIAGRAM"

    # --- HIERARCHICAL ---
    ORG_CHART = "ORG_CHART"
    MIND_MAP = "MIND_MAP"
    TREE_STRUCTURE = "TREE_STRUCTURE"
    TAXONOMY = "TAXONOMY"
    DENDROGRAM = "DENDROGRAM"

    # --- SPATIAL_RELATIONAL ---
    FLOOR_PLAN = "FLOOR_PLAN"
    BLUEPRINT = "BLUEPRINT"
    CHOROPLETH_MAP = "CHOROPLETH_MAP"
    POINT_MAP = "POINT_MAP"
    TOPOGRAPHIC_MAP = "TOPOGRAPHIC_MAP"
    NETWORK_DIAGRAM = "NETWORK_DIAGRAM"

    # --- SCHEMATIC ---
    CIRCUIT_DIAGRAM = "CIRCUIT_DIAGRAM"
    MECHANICAL_DIAGRAM = "MECHANICAL_DIAGRAM"
    ANATOMICAL_DIAGRAM = "ANATOMICAL_DIAGRAM"
    WIRING_DIAGRAM = "WIRING_DIAGRAM"
    PLUMBING_DIAGRAM = "PLUMBING_DIAGRAM"

    # --- TABULAR ---
    SIMPLE_TABLE = "SIMPLE_TABLE"
    NESTED_TABLE = "NESTED_TABLE"
    PIVOT_TABLE = "PIVOT_TABLE"
    COMPARISON_TABLE = "COMPARISON_TABLE"
    FINANCIAL_TABLE = "FINANCIAL_TABLE"

    # --- INFOGRAPHIC ---
    TIMELINE = "TIMELINE"
    STATISTICAL_INFOGRAPHIC = "STATISTICAL_INFOGRAPHIC"
    PROCESS_INFOGRAPHIC = "PROCESS_INFOGRAPHIC"
    COMPARISON_INFOGRAPHIC = "COMPARISON_INFOGRAPHIC"

    # --- NONE ---
    DECORATIVE_IMAGE = "DECORATIVE_IMAGE"
    PHOTOGRAPH = "PHOTOGRAPH"
    PLAIN_TEXT = "PLAIN_TEXT"
    GENERIC_ICON = "GENERIC_ICON"
    OTHER = "OTHER"
137
138
class PageClassification(BaseModel):
    """Classification result for a document page's reasoning potential."""

    # The docstring above is deliberately left untouched: pydantic exposes a
    # model's docstring as the schema description, so it is runtime text.
    #
    # NOTE(review): the cross-field rules stated in the descriptions below
    # (NONE must be the only category; contains_reasoning_content must agree
    # with the presence of NONE) are communicated as text only. This class
    # defines no validator that enforces them.

    # Overall verdict: does the page carry content worth generating QA for?
    contains_reasoning_content: bool = Field(
        ...,
        description=(
            "Whether the page contains visual elements suitable for reasoning QA pairs. "
            "Must be False if primary_categories contains NONE. "
            "Must be True if primary_categories does NOT contain NONE."
        ),
    )

    # Coarse categories, most prominent first.
    primary_categories: list[VisualElementCategory] = Field(
        ...,
        description=(
            "List of visual element categories found in the page, ordered by prominence. "
            "IMPORTANT: If NONE is present, it must be the ONLY category in this list."
        ),
    )

    # Fine-grained element types.
    subcategories: list[VisualElementSubcategory] = Field(
        ...,
        description="Specific types of visual elements identified (e.g., BAR_CHART, FLOWCHART).",
    )

    # Integer rating; the ge/le bounds restrict it to 1..10 inclusive.
    reasoning_complexity_score: int = Field(
        ...,
        ge=1,
        le=10,
        description="Complexity score from 1-10 indicating the depth of reasoning required.",
    )

    # Free-text rationale for the verdict.
    justification: str = Field(
        ...,
        description="Brief explanation of why this page is or isn't suitable for reasoning QA generation.",
    )
171
172
173# =============================================================================
174# Prompt template
175# =============================================================================
176
# Instruction text passed as `prompt=` to the `page_classification` structured
# column in build_config(). The backslash right after the opening quotes and
# the one before the closing quotes are line continuations, so the string has
# no leading or trailing newline. The field names quoted in the text
# (`primary_categories`, `contains_reasoning_content`, `subcategories`) match
# the fields of PageClassification.
# NOTE(review): the text asks for "up to 3" primary categories, but the
# PageClassification schema places no length limit on that list.
PROMPT_PAGE_CLASSIFICATION = """\
# ROLE AND OBJECTIVE
You are a document intelligence analyst specializing in visual reasoning assessment. Your task is to analyze document page images and determine their suitability for generating high-quality reasoning-based Question-Answer (QA) pairs.

# CLASSIFICATION TAXONOMY
Identify and classify ALL visual elements present in the image using these categories:

**QUANTITATIVE** - Data visualizations requiring numerical analysis
 • Bar charts, line graphs, scatter plots, pie charts, area graphs
 • Requires: trend analysis, value comparison, rate calculations

**LOGIC_DIAGRAMS** - Process and decision flows
 • Flowcharts, decision trees, process maps, algorithmic diagrams
 • Requires: conditional reasoning, path tracing, outcome prediction

**HIERARCHICAL** - Organizational and structural relationships
 • Organizational charts, mind maps, tree structures, taxonomies
 • Requires: understanding parent-child relationships, levels, dependencies

**SPATIAL_RELATIONAL** - Geographic and spatial layouts
 • Floor plans, blueprints, maps (choropleth, point, topographic)
 • Requires: distance estimation, position inference, spatial reasoning

**SCHEMATIC** - Technical diagrams with component relationships
 • Circuit diagrams, mechanical cross-sections, anatomical diagrams with labels
 • Requires: understanding connections, tracing signal/flow paths, component identification

**TABULAR** - Structured data in rows and columns
 • Tables with nested headers, merged cells, subtotals, calculated rows
 • Requires: cross-referencing values, performing calculations, identifying patterns

**INFOGRAPHIC** - Multi-modal composite narratives
 • Mixed visuals combining charts, text, icons, and data into cohesive stories
 • Requires: synthesizing information across multiple elements

**NONE** - Content without reasoning potential
 • Decorative images, simple photographs, plain text blocks, generic icons
 • Presentation slides with only text or bullet points (no visual elements)
 • No data relationships, calculations, or logical deductions possible

**Note on Presentation Content**: The format (e.g., presentation slide, document page) doesn't matter.
Classify based on the actual visual elements present:
 • Slide with bar chart → QUANTITATIVE
 • Slide with flowchart → LOGIC_DIAGRAMS
 • Slide with only text/bullets → NONE

# REASONING COMPLEXITY ASSESSMENT
Score pages 1-10 based on cognitive demand:

**High Complexity (8-10)**: Requires multi-step inference
 • Cross-referencing multiple data sources
 • Mathematical derivation (growth rates, percentages, trends)
 • Conditional logic chains (if-then-else reasoning)
 • Spatial or temporal reasoning across disconnected components

**Medium Complexity (4-7)**: Requires single-step analysis
 • Direct comparisons between values
 • Simple calculations from visible data
 • Following a single logical path
 • Identifying explicit patterns or relationships

**Low Complexity (1-3)**: Minimal reasoning
 • Direct lookup of visible information
 • Simple identification tasks
 • No relationships or calculations needed

# EVALUATION PROCESS
1. **Scan for visual elements**: Identify all charts, diagrams, tables, or structured content
2. **Classify elements**: Assign primary categories (up to 3, ordered by prominence)
3. **Identify subcategories**: Determine specific visual element types (e.g., BAR_CHART, FLOWCHART)
4. **Assess reasoning depth**: Determine if multi-step thinking is necessary
5. **Score complexity**: Rate 1-10 based on cognitive requirements
6. **Justify classification**: Explain why this page is or isn't suitable for reasoning QA

# DECISION CRITERIA

**CRITICAL RULES**:
1. If `primary_categories` contains NONE, it must be the ONLY category (do NOT mix NONE with other categories)
2. `contains_reasoning_content` must be **False** if NONE is present
3. If content has reasoning elements, do NOT include NONE at all
4. Ignore presentation format - classify by actual visual content (slide with chart = QUANTITATIVE, not PRESENTATION)

Mark `contains_reasoning_content: true` ONLY if:
 ✓ Primary categories does NOT contain NONE, AND
 ✓ At least one of these reasoning elements is present:
 - Quantitative comparisons possible (e.g., "Which region had highest growth?")
 - Logical paths to trace (e.g., "What happens if condition X fails?")
 - Mathematical derivations needed (e.g., "Calculate percentage change")
 - Spatial/temporal relationships to deduce
 - Complex table requiring cross-referencing

Mark `contains_reasoning_content: false` if:
 ✗ Only decorative or generic imagery → set primary_categories: ["NONE"]
 ✗ Plain text with no visual structure → set primary_categories: ["NONE"]
 ✗ Simple lists or single-column tables → set primary_categories: ["NONE"]
 ✗ Slides with only text/bullet points (no charts/diagrams) → set primary_categories: ["NONE"]
 ✗ No data relationships to explore → set primary_categories: ["NONE"]

**Classification Logic**:
- Either the page has reasoning content (assign specific categories like QUANTITATIVE, TABULAR, etc.)
- OR it doesn't (assign only ["NONE"])
- NEVER mix NONE with other categories

# SUBCATEGORIES
For each visual element found, identify the specific subcategory (e.g., BAR_CHART, FLOWCHART, FLOOR_PLAN).
Include the most prominent subcategories in the `subcategories` list, ordered by importance.\
"""
284
285
286# =============================================================================
287# Pipeline configuration
288# =============================================================================
289
290
def build_config(
    seed_path: str = "seed.parquet",
    model_alias: str = "qwen-vl",
    model_id: str = DEFAULT_VLM_MODEL,
    *,
    timeout: int = 1200,
    max_tokens: int = 100000,
    max_parallel_requests: int = 32,
) -> dd.DataDesignerConfigBuilder:
    """Build the Data Designer configuration for page classification.

    The configuration registers one model, attaches the seed file, and adds a
    single structured column named ``page_classification`` whose output
    format is ``PageClassification``.

    Args:
        seed_path: Path to the seed file; it must provide the
            ``png_images_base64`` column read by the image context.
        model_alias: Alias under which the model is registered and by which
            the column refers to it.
        model_id: Model identifier forwarded to ``dd.ModelConfig``.
        timeout: Forwarded to ``ChatCompletionInferenceParams(timeout=...)``.
            Presumably seconds -- confirm against the library docs.
        max_tokens: Forwarded to
            ``ChatCompletionInferenceParams(max_tokens=...)``.
        max_parallel_requests: Forwarded to
            ``ChatCompletionInferenceParams(max_parallel_requests=...)``.

    Returns:
        The populated ``dd.DataDesignerConfigBuilder``.
    """
    # The three inference settings used to be hard-coded; they are keyword-only
    # with the former values as defaults, so existing calls behave the same.
    inference_parameters = dd.ChatCompletionInferenceParams(
        timeout=timeout,
        max_tokens=max_tokens,
        max_parallel_requests=max_parallel_requests,
    )
    model_configs = [
        dd.ModelConfig(
            alias=model_alias,
            model=model_id,
            provider=VLLM_PROVIDER_NAME,
            inference_parameters=inference_parameters,
        ),
    ]

    config_builder = dd.DataDesignerConfigBuilder(model_configs=model_configs)

    config_builder.with_seed_dataset(
        dd.LocalFileSeedSource(path=seed_path),
        sampling_strategy=dd.SamplingStrategy.ORDERED,
    )

    config_builder.add_column(
        dd.LLMStructuredColumnConfig(
            name="page_classification",
            model_alias=model_alias,
            prompt=PROMPT_PAGE_CLASSIFICATION,
            output_format=PageClassification,
            multi_modal_context=[
                dd.ImageContext(
                    # Expects a single-element JSON array from the per-page seed.
                    column_name="png_images_base64",
                    data_type=dd.ModalityDataType.BASE64,
                    image_format=dd.ImageFormat.PNG,
                ),
            ],
        )
    )

    return config_builder
334
335
def create_dataset(
    config_builder: dd.DataDesignerConfigBuilder,
    num_records: int,
    vllm_endpoint: str,
    artifact_path: Path | str | None = None,
) -> DatasetCreationResults:
    """Run the pipeline described by ``config_builder`` against a vLLM endpoint.

    Args:
        config_builder: Configuration to execute.
        num_records: Number of records to request from ``DataDesigner.create``.
        vllm_endpoint: Endpoint URL registered for the ``VLLM_PROVIDER_NAME``
            provider.
        artifact_path: Forwarded unchanged to ``DataDesigner``; may be ``None``.

    Returns:
        Whatever ``DataDesigner.create`` returns for the dataset named
        ``page_classification``.
    """
    provider = dd.ModelProvider(name=VLLM_PROVIDER_NAME, endpoint=vllm_endpoint)
    designer = DataDesigner(artifact_path=artifact_path, model_providers=[provider])

    # Show a progress bar and turn off the library's early-shutdown behaviour.
    run_config = dd.RunConfig(progress_bar=True, disable_early_shutdown=True)
    designer.set_run_config(run_config)

    return designer.create(
        config_builder,
        num_records=num_records,
        dataset_name="page_classification",
    )
355
356
if __name__ == "__main__":
    # Script entry point: parse the CLI, build the config, generate the
    # dataset, then print where it was written and emit the analysis report.
    from argparse import ArgumentParser

    cli = ArgumentParser()

    # Required options.
    cli.add_argument(
        "--vllm-endpoint",
        type=str,
        required=True,
        help="Base URL of the vLLM server hosting the VLM (e.g. http://localhost:8000/v1)",
    )
    cli.add_argument("--seed-path", type=str, required=True, help="Path to the seed parquet file")

    # Optional options, registered in the same order as before so the --help
    # listing is unchanged.
    for flag, fallback in (("--model-alias", "qwen-vl"), ("--model-id", DEFAULT_VLM_MODEL)):
        cli.add_argument(flag, type=str, default=fallback)
    cli.add_argument("--num-records", type=int, default=5)
    cli.add_argument("--artifact-path", type=str, default=None)

    opts = cli.parse_args()

    builder = build_config(
        seed_path=opts.seed_path,
        model_alias=opts.model_alias,
        model_id=opts.model_id,
    )
    outcome = create_dataset(
        builder,
        num_records=opts.num_records,
        vllm_endpoint=opts.vllm_endpoint,
        artifact_path=opts.artifact_path,
    )

    print(f"Dataset saved to: {outcome.artifact_storage.final_dataset_path}")

    outcome.load_analysis().to_report()