Page Classification | NVIDIA NeMo Data Designer

Download Recipe

1 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2 # SPDX-License-Identifier: Apache-2.0
3 # /// script
4 # requires-python = ">=3.10"
5 # dependencies = [
6 #     "data-designer>=0.5.6",
7 # ]
8 # ///
9 """Long-Document Understanding Page Classification Recipe
10 
11 Classify document page images by their visual element types and reasoning
12 complexity using a vision-language model. For each seed record the pipeline
13 produces a structured `page_classification` column containing:
14 
15   - `contains_reasoning_content` – whether the page has visual elements
16     suitable for reasoning QA
17   - `primary_categories` – ordered list of visual element categories
18     (QUANTITATIVE, TABULAR, LOGIC_DIAGRAMS, HIERARCHICAL, etc.)
19   - `subcategories` – specific element types (BAR_CHART, FLOWCHART, …)
20   - `reasoning_complexity_score` – 1-10 cognitive demand rating
21   - `justification` – brief explanation of the classification
22 
23 Prerequisites:
24     - A seed parquet file containing a `png_images_base64` column with a JSON
25       array of base64-encoded PNG images (one element per page; single-page
26       seeds have a one-element array).
27     - A vLLM-compatible deployment of the VLM
28       (default: Qwen/Qwen3-VL-30B-A3B-Instruct).
29       Recommended vLLM launch flags:
30         --tensor-parallel-size 2
31         --max-model-len 128000
32         --gpu-memory-utilization 0.95
33         --trust-remote-code
34 
35       Example launch script for 2× H100:
36         docker run --gpus all \
37             -p 8000:8000 \
38             vllm/vllm-openai:latest \
39             --model Qwen/Qwen3-VL-30B-A3B-Instruct \
40             --tensor-parallel-size 2 \
41             --max-model-len 128000 \
42             --gpu-memory-utilization 0.95 \
43             --trust-remote-code
44 
45 Run:
46     # Basic usage (classifies 5 pages by default)
47     uv run 04-page-classification-sdg.py --vllm-endpoint http://localhost:8000/v1 --seed-path seed_data/seed_per_page.parquet
48 
49     # Custom record count
50     uv run 04-page-classification-sdg.py --vllm-endpoint http://localhost:8000/v1 --seed-path seed_data/seed_per_page.parquet --num-records 100
51 
52     # For help message and available options
53     uv run 04-page-classification-sdg.py --help
54 """
55 
56 from enum import Enum
57 from pathlib import Path
58 
59 from pydantic import BaseModel, Field
60 
61 import data_designer.config as dd
62 from data_designer.interface import DataDesigner, DatasetCreationResults
63 
# Model identifier used as the default for `model_id` in build_config() and for
# the --model-id CLI option; it is the same value the module docstring passes
# to vLLM's --model flag.
DEFAULT_VLM_MODEL = "Qwen/Qwen3-VL-30B-A3B-Instruct"
# Provider name read by both the ModelConfig in build_config() and the
# ModelProvider in create_dataset(), so the two always carry the same string.
VLLM_PROVIDER_NAME = "vllm"
66 
67 # =============================================================================
68 # Structured output schema
69 # =============================================================================
70 
71 
# Top-level visual element categories; this is the element type of
# `PageClassification.primary_categories`.
# Each member's value is identical to its name, and the names are the same ones
# the taxonomy section of PROMPT_PAGE_CLASSIFICATION asks the model to use.
# The `str` mixin makes every member a string as well as an Enum member.
class VisualElementCategory(str, Enum):
    QUANTITATIVE = "QUANTITATIVE"
    LOGIC_DIAGRAMS = "LOGIC_DIAGRAMS"
    HIERARCHICAL = "HIERARCHICAL"
    SPATIAL_RELATIONAL = "SPATIAL_RELATIONAL"
    SCHEMATIC = "SCHEMATIC"
    TABULAR = "TABULAR"
    INFOGRAPHIC = "INFOGRAPHIC"
    # Marks a page without reasoning content; the prompt and the field
    # descriptions on PageClassification require it to appear alone.
    NONE = "NONE"
81 
82 
# Specific visual element types; this is the element type of
# `PageClassification.subcategories`.
# Each member's value is identical to its name. The upper-case comments inside
# the class name the VisualElementCategory each run of members belongs to; that
# grouping is documentation only and is not encoded in the enum itself.
class VisualElementSubcategory(str, Enum):
    # QUANTITATIVE
    BAR_CHART = "BAR_CHART"
    LINE_GRAPH = "LINE_GRAPH"
    SCATTER_PLOT = "SCATTER_PLOT"
    PIE_CHART = "PIE_CHART"
    AREA_GRAPH = "AREA_GRAPH"
    HISTOGRAM = "HISTOGRAM"
    BOX_PLOT = "BOX_PLOT"
    HEATMAP = "HEATMAP"
    BUBBLE_CHART = "BUBBLE_CHART"
    # LOGIC_DIAGRAMS
    FLOWCHART = "FLOWCHART"
    DECISION_TREE = "DECISION_TREE"
    PROCESS_MAP = "PROCESS_MAP"
    ALGORITHM_DIAGRAM = "ALGORITHM_DIAGRAM"
    STATE_DIAGRAM = "STATE_DIAGRAM"
    SEQUENCE_DIAGRAM = "SEQUENCE_DIAGRAM"
    # HIERARCHICAL
    ORG_CHART = "ORG_CHART"
    MIND_MAP = "MIND_MAP"
    TREE_STRUCTURE = "TREE_STRUCTURE"
    TAXONOMY = "TAXONOMY"
    DENDROGRAM = "DENDROGRAM"
    # SPATIAL_RELATIONAL
    FLOOR_PLAN = "FLOOR_PLAN"
    BLUEPRINT = "BLUEPRINT"
    CHOROPLETH_MAP = "CHOROPLETH_MAP"
    POINT_MAP = "POINT_MAP"
    TOPOGRAPHIC_MAP = "TOPOGRAPHIC_MAP"
    NETWORK_DIAGRAM = "NETWORK_DIAGRAM"
    # SCHEMATIC
    CIRCUIT_DIAGRAM = "CIRCUIT_DIAGRAM"
    MECHANICAL_DIAGRAM = "MECHANICAL_DIAGRAM"
    ANATOMICAL_DIAGRAM = "ANATOMICAL_DIAGRAM"
    WIRING_DIAGRAM = "WIRING_DIAGRAM"
    PLUMBING_DIAGRAM = "PLUMBING_DIAGRAM"
    # TABULAR
    SIMPLE_TABLE = "SIMPLE_TABLE"
    NESTED_TABLE = "NESTED_TABLE"
    PIVOT_TABLE = "PIVOT_TABLE"
    COMPARISON_TABLE = "COMPARISON_TABLE"
    FINANCIAL_TABLE = "FINANCIAL_TABLE"
    # INFOGRAPHIC
    TIMELINE = "TIMELINE"
    STATISTICAL_INFOGRAPHIC = "STATISTICAL_INFOGRAPHIC"
    PROCESS_INFOGRAPHIC = "PROCESS_INFOGRAPHIC"
    COMPARISON_INFOGRAPHIC = "COMPARISON_INFOGRAPHIC"
    # NONE
    DECORATIVE_IMAGE = "DECORATIVE_IMAGE"
    PHOTOGRAPH = "PHOTOGRAPH"
    PLAIN_TEXT = "PLAIN_TEXT"
    GENERIC_ICON = "GENERIC_ICON"
    OTHER = "OTHER"
137 
138 
class PageClassification(BaseModel):
    """Classification result for a document page's reasoning potential."""

    # Structured-output schema for the `page_classification` column; it is
    # passed as `output_format` in build_config(). Every field uses `...` as
    # its default, i.e. all five are required.
    # NOTE(review): the NONE-related rules written in the descriptions below
    # are plain text only; this class defines no validator that enforces them,
    # nor the "up to 3" category limit mentioned in the prompt.

    # Overall verdict: does the page hold content usable for reasoning QA?
    contains_reasoning_content: bool = Field(
        ...,
        description=(
            "Whether the page contains visual elements suitable for reasoning QA pairs. "
            "Must be False if primary_categories contains NONE. "
            "Must be True if primary_categories does NOT contain NONE."
        ),
    )
    # Categories found on the page, most prominent first.
    primary_categories: list[VisualElementCategory] = Field(
        ...,
        description=(
            "List of visual element categories found in the page, ordered by prominence. "
            "IMPORTANT: If NONE is present, it must be the ONLY category in this list."
        ),
    )
    # Finer-grained element types drawn from VisualElementSubcategory.
    subcategories: list[VisualElementSubcategory] = Field(
        ...,
        description="Specific types of visual elements identified (e.g., BAR_CHART, FLOWCHART).",
    )
    # Integer rating; `ge=1` / `le=10` bound it to the inclusive range 1..10.
    reasoning_complexity_score: int = Field(
        ...,
        ge=1,
        le=10,
        description="Complexity score from 1-10 indicating the depth of reasoning required.",
    )
    # Free-text rationale for the classification.
    justification: str = Field(
        ...,
        description="Brief explanation of why this page is or isn't suitable for reasoning QA generation.",
    )
171 
172 
173 # =============================================================================
174 # Prompt template
175 # =============================================================================
176 
# Instruction text used as the `prompt` of the `page_classification` column in
# build_config(). It spells out the category taxonomy (the same names as the
# VisualElementCategory members), the 1-10 complexity scale, and the rule that
# NONE may never be combined with another category.
# The text contains no column placeholders; the page image is attached
# separately through `multi_modal_context` in build_config().
# The backslash right after the opening quotes and the one right before the
# closing quotes are line continuations, so the literal has no leading or
# trailing newline.
PROMPT_PAGE_CLASSIFICATION = """\
# ROLE AND OBJECTIVE
You are a document intelligence analyst specializing in visual reasoning assessment. Your task is to analyze document page images and determine their suitability for generating high-quality reasoning-based Question-Answer (QA) pairs.

# CLASSIFICATION TAXONOMY
Identify and classify ALL visual elements present in the image using these categories:

**QUANTITATIVE** - Data visualizations requiring numerical analysis
  • Bar charts, line graphs, scatter plots, pie charts, area graphs
  • Requires: trend analysis, value comparison, rate calculations

**LOGIC_DIAGRAMS** - Process and decision flows
  • Flowcharts, decision trees, process maps, algorithmic diagrams
  • Requires: conditional reasoning, path tracing, outcome prediction

**HIERARCHICAL** - Organizational and structural relationships
  • Organizational charts, mind maps, tree structures, taxonomies
  • Requires: understanding parent-child relationships, levels, dependencies

**SPATIAL_RELATIONAL** - Geographic and spatial layouts
  • Floor plans, blueprints, maps (choropleth, point, topographic)
  • Requires: distance estimation, position inference, spatial reasoning

**SCHEMATIC** - Technical diagrams with component relationships
  • Circuit diagrams, mechanical cross-sections, anatomical diagrams with labels
  • Requires: understanding connections, tracing signal/flow paths, component identification

**TABULAR** - Structured data in rows and columns
  • Tables with nested headers, merged cells, subtotals, calculated rows
  • Requires: cross-referencing values, performing calculations, identifying patterns

**INFOGRAPHIC** - Multi-modal composite narratives
  • Mixed visuals combining charts, text, icons, and data into cohesive stories
  • Requires: synthesizing information across multiple elements

**NONE** - Content without reasoning potential
  • Decorative images, simple photographs, plain text blocks, generic icons
  • Presentation slides with only text or bullet points (no visual elements)
  • No data relationships, calculations, or logical deductions possible

**Note on Presentation Content**: The format (e.g., presentation slide, document page) doesn't matter.
Classify based on the actual visual elements present:
  • Slide with bar chart → QUANTITATIVE
  • Slide with flowchart → LOGIC_DIAGRAMS
  • Slide with only text/bullets → NONE

# REASONING COMPLEXITY ASSESSMENT
Score pages 1-10 based on cognitive demand:

**High Complexity (8-10)**: Requires multi-step inference
  • Cross-referencing multiple data sources
  • Mathematical derivation (growth rates, percentages, trends)
  • Conditional logic chains (if-then-else reasoning)
  • Spatial or temporal reasoning across disconnected components

**Medium Complexity (4-7)**: Requires single-step analysis
  • Direct comparisons between values
  • Simple calculations from visible data
  • Following a single logical path
  • Identifying explicit patterns or relationships

**Low Complexity (1-3)**: Minimal reasoning
  • Direct lookup of visible information
  • Simple identification tasks
  • No relationships or calculations needed

# EVALUATION PROCESS
1. **Scan for visual elements**: Identify all charts, diagrams, tables, or structured content
2. **Classify elements**: Assign primary categories (up to 3, ordered by prominence)
3. **Identify subcategories**: Determine specific visual element types (e.g., BAR_CHART, FLOWCHART)
4. **Assess reasoning depth**: Determine if multi-step thinking is necessary
5. **Score complexity**: Rate 1-10 based on cognitive requirements
6. **Justify classification**: Explain why this page is or isn't suitable for reasoning QA

# DECISION CRITERIA

**CRITICAL RULES**:
1. If `primary_categories` contains NONE, it must be the ONLY category (do NOT mix NONE with other categories)
2. `contains_reasoning_content` must be **False** if NONE is present
3. If content has reasoning elements, do NOT include NONE at all
4. Ignore presentation format - classify by actual visual content (slide with chart = QUANTITATIVE, not PRESENTATION)

Mark `contains_reasoning_content: true` ONLY if:
  ✓ Primary categories does NOT contain NONE, AND
  ✓ At least one of these reasoning elements is present:
    - Quantitative comparisons possible (e.g., "Which region had highest growth?")
    - Logical paths to trace (e.g., "What happens if condition X fails?")
    - Mathematical derivations needed (e.g., "Calculate percentage change")
    - Spatial/temporal relationships to deduce
    - Complex table requiring cross-referencing

Mark `contains_reasoning_content: false` if:
  ✗ Only decorative or generic imagery → set primary_categories: ["NONE"]
  ✗ Plain text with no visual structure → set primary_categories: ["NONE"]
  ✗ Simple lists or single-column tables → set primary_categories: ["NONE"]
  ✗ Slides with only text/bullet points (no charts/diagrams) → set primary_categories: ["NONE"]
  ✗ No data relationships to explore → set primary_categories: ["NONE"]

**Classification Logic**:
- Either the page has reasoning content (assign specific categories like QUANTITATIVE, TABULAR, etc.)
- OR it doesn't (assign only ["NONE"])
- NEVER mix NONE with other categories

# SUBCATEGORIES
For each visual element found, identify the specific subcategory (e.g., BAR_CHART, FLOWCHART, FLOOR_PLAN).
Include the most prominent subcategories in the `subcategories` list, ordered by importance.\
"""
284 
285 
286 # =============================================================================
287 # Pipeline configuration
288 # =============================================================================
289 
290 
def build_config(
    seed_path: str = "seed.parquet",
    model_alias: str = "qwen-vl",
    model_id: str = DEFAULT_VLM_MODEL,
) -> dd.DataDesignerConfigBuilder:
    """Assemble the Data Designer configuration for page classification.

    The configuration registers one model under ``model_alias``, attaches the
    seed file, and declares a single structured column named
    ``page_classification`` whose output follows ``PageClassification``.

    Args:
        seed_path: Path of the seed file handed to ``LocalFileSeedSource``.
        model_alias: Alias that links the model config to the column.
        model_id: Model identifier stored in the model config.

    Returns:
        The populated config builder; no dataset is generated here.
    """
    inference_params = dd.ChatCompletionInferenceParams(
        timeout=1200,
        max_tokens=100000,
        max_parallel_requests=32,
    )
    vlm_config = dd.ModelConfig(
        alias=model_alias,
        model=model_id,
        provider=VLLM_PROVIDER_NAME,
        inference_parameters=inference_params,
    )
    builder = dd.DataDesignerConfigBuilder(model_configs=[vlm_config])

    # NOTE(review): ORDERED presumably walks the seed rows in file order rather
    # than sampling them randomly -- confirm against the Data Designer docs.
    seed_source = dd.LocalFileSeedSource(path=seed_path)
    builder.with_seed_dataset(seed_source, sampling_strategy=dd.SamplingStrategy.ORDERED)

    # Expects a single-element JSON array from the per-page seed.
    page_image = dd.ImageContext(
        column_name="png_images_base64",
        data_type=dd.ModalityDataType.BASE64,
        image_format=dd.ImageFormat.PNG,
    )
    classification_column = dd.LLMStructuredColumnConfig(
        name="page_classification",
        model_alias=model_alias,
        prompt=PROMPT_PAGE_CLASSIFICATION,
        output_format=PageClassification,
        multi_modal_context=[page_image],
    )
    builder.add_column(classification_column)

    return builder
334 
335 
def create_dataset(
    config_builder: dd.DataDesignerConfigBuilder,
    num_records: int,
    vllm_endpoint: str,
    artifact_path: Path | str | None = None,
) -> DatasetCreationResults:
    """Run the pipeline described by ``config_builder`` against a vLLM endpoint.

    Args:
        config_builder: Configuration to execute, e.g. from ``build_config``.
        num_records: Number of records requested from ``DataDesigner.create``.
        vllm_endpoint: Endpoint URL registered for the ``VLLM_PROVIDER_NAME``
            provider.
        artifact_path: Forwarded unchanged to ``DataDesigner``; may be ``None``.

    Returns:
        The results object returned by ``DataDesigner.create`` for the
        dataset named ``page_classification``.
    """
    vllm_provider = dd.ModelProvider(name=VLLM_PROVIDER_NAME, endpoint=vllm_endpoint)
    designer = DataDesigner(artifact_path=artifact_path, model_providers=[vllm_provider])

    # Show a progress bar and turn early shutdown off for this run.
    run_config = dd.RunConfig(progress_bar=True, disable_early_shutdown=True)
    designer.set_run_config(run_config)

    return designer.create(config_builder, num_records=num_records, dataset_name="page_classification")
355 
356 
if __name__ == "__main__":
    # Command-line entry point: parse the options, build the pipeline config,
    # generate the dataset, print where it was written, and finally call
    # `to_report()` on the loaded analysis.
    from argparse import ArgumentParser

    parser = ArgumentParser(
        description="Classify document page images with a vision-language model served by vLLM.",
    )
    parser.add_argument(
        "--vllm-endpoint",
        type=str,
        required=True,
        help="Base URL of the vLLM server hosting the VLM (e.g. http://localhost:8000/v1)",
    )
    parser.add_argument("--seed-path", type=str, required=True, help="Path to the seed parquet file")
    # `%(default)s` is expanded by argparse, so --help always shows the value
    # that is actually in effect.
    parser.add_argument(
        "--model-alias",
        type=str,
        default="qwen-vl",
        help="Alias used to register the model in the config (default: %(default)s)",
    )
    parser.add_argument(
        "--model-id",
        type=str,
        default=DEFAULT_VLM_MODEL,
        help="Model identifier requested from the vLLM server (default: %(default)s)",
    )
    parser.add_argument(
        "--num-records",
        type=int,
        default=5,
        help="Number of records to generate (default: %(default)s)",
    )
    parser.add_argument(
        "--artifact-path",
        type=str,
        default=None,
        help="Artifact location passed to DataDesigner; when omitted, None is passed through",
    )
    args = parser.parse_args()

    config_builder = build_config(
        seed_path=args.seed_path,
        model_alias=args.model_alias,
        model_id=args.model_id,
    )
    results = create_dataset(
        config_builder,
        num_records=args.num_records,
        vllm_endpoint=args.vllm_endpoint,
        artifact_path=args.artifact_path,
    )

    print(f"Dataset saved to: {results.artifact_storage.final_dataset_path}")

    results.load_analysis().to_report()