For AI agents: a documentation index is available at the root level at /llms.txt and /llms-full.txt. Append /llms.txt to any URL for a page-level index, or .md for the markdown version of any page.
  • Getting Started
    • Welcome
    • Contributing
  • Concepts
    • Columns
    • Seed Datasets
    • Agent Rollout Ingestion
    • Custom Columns
    • Validators
    • Processors
    • Person Sampling
    • Traces
    • Architecture & Performance
    • Deployment Options
    • Security
  • Tutorials
    • Overview
    • The Basics
    • Structured Outputs, Jinja Expressions, and Conditional Generation
    • Seeding with an External Dataset
    • Providing Images as Context
    • Generating Images
    • Image-to-Image Editing
  • Recipes
    • Recipe Cards
      • Seed Dataset Preparation
      • Nemotron Parse OCR
      • Text QA from OCR Transcripts
      • Page Classification
      • Visual QA
      • Single-Page QA
      • Multi-Page Windowed QA
      • Whole-Document QA
      • Frontier Judge QA Filter
  • Plugins
    • Overview
    • Example Plugin
    • FileSystemSeedReader Plugins
    • Discover
  • Code Reference
    • Overview
  • Dev Notes
    • Overview
    • Prompt Sensitivity
    • Retriever SDG Toolkit
    • Have It Your Way
    • VLM Long Document Understanding
    • Push Datasets to Hugging Face Hub
    • Text-to-SQL for Nemotron Super
    • Async All the Way Down
    • Owning the Model Stack
NVIDIA
Developer-friendly docs for your API
Privacy Policy | Manage My Privacy | Do Not Sell or Share My Data | Terms of Service | Accessibility | Corporate Policies | Product Security | Contact

Copyright © 2026, NVIDIA Corporation.

NeMo Data Designer
Recipes › VLM Long-Document Understanding

Nemotron Parse OCR

View as Markdown
Previous

Seed Dataset Preparation

Next

Text QA from OCR Transcripts

Download Recipe

Download the complete recipe script

1# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2# SPDX-License-Identifier: Apache-2.0
3# /// script
4# requires-python = ">=3.10"
5# dependencies = [
6# "data-designer>=0.5.6",
7# ]
8# ///
9"""Long-Document Understanding Nemotron-Parse OCR Recipe
10
11Run Nemotron-Parse v1.1 OCR over document images from a seed parquet file.
12Each record produces:
13 - `transcribed_texts`: clean text extracted from the OCR output
14 - `transcribed_texts__metadata`: bounding-box coordinates and class labels
15
16Prerequisites:
17 - A seed parquet file containing a `png_images_base64` column with a JSON
18 array of base64-encoded PNG images (one element per page; single-page
19 seeds have a one-element array).
20 - A vLLM-compatible deployment of nvidia/NVIDIA-Nemotron-Parse-v1.1.
21 The vLLM server must be launched with a chat template that injects the
22 Nemotron-Parse special tokens. Save the following as a .jinja file and
23 pass it via --chat-template:
24
25 {% for message in messages %}{% if message["role"] == "user" %}{{ "</s><s><predict_bbox><predict_classes><output_markdown>" }}{% endif %}{% endfor %}
26
27 Example launch script for 1× H100:
28 docker run -d --gpus all \
29 -p 8000:8000 -v $(pwd)/chat_template.jinja:/chat_template.jinja:ro \
30 --entrypoint bash \
31 vllm/vllm-openai:v0.14.1 \
32 -c "pip install open-clip-torch albumentations timm && vllm serve nvidia/NVIDIA-Nemotron-Parse-v1.1 \
33 --tensor-parallel-size 1 \
34 --max-model-len 9000 \
35 --gpu-memory-utilization 0.85 \
36 --max-num-seqs 128 \
37 --chat-template /chat_template.jinja \
38 --trust-remote-code"
39
40Run:
41 # Basic usage (processes 5 records by default)
42 uv run 02-nemotron-parse-ocr-sdg.py --vllm-endpoint http://localhost:8000/v1 --seed-path seed_data/seed_per_page.parquet
43
44 # Custom record count
45 uv run 02-nemotron-parse-ocr-sdg.py --vllm-endpoint http://localhost:8000/v1 --seed-path seed_data/seed_per_page.parquet --num-records 100
46
47 # For help message and available options
48 uv run 02-nemotron-parse-ocr-sdg.py --help
49"""
50
51import re
52from pathlib import Path
53
54import data_designer.config as dd
55from data_designer.interface import DataDesigner, DatasetCreationResults
56
57NEMOTRON_PARSE_MODEL = "nvidia/NVIDIA-Nemotron-Parse-v1.1"
58VLLM_PROVIDER_NAME = "vllm"
59
60_STRUCTURED_ELEMENT_PATTERN = re.compile(
61 r"<x_([\d.]+)><y_([\d.]+)>(.*?)<x_([\d.]+)><y_([\d.]+)><class_([^>]+)>",
62 re.DOTALL,
63)
64
65
66def _extract_structured_elements(text: str) -> list[dict]:
67 """Parse Nemotron-Parse bbox markup into structured dicts.
68
69 Input format: <x_START><y_START>TEXT<x_END><y_END><class_LABEL>
70
71 Returns list of dicts with keys: bbox ({x1,y1,x2,y2}), class_label, text.
72 """
73 elements = []
74 for match in _STRUCTURED_ELEMENT_PATTERN.finditer(text):
75 x1, y1, content, x2, y2, class_label = match.groups()
76 elements.append(
77 {
78 "bbox": {
79 "x1": float(x1),
80 "y1": float(y1),
81 "x2": float(x2),
82 "y2": float(y2),
83 },
84 "class_label": class_label,
85 "text": content.strip(),
86 }
87 )
88 return elements
89
90
@dd.custom_column_generator(
    required_columns=["raw_ocr_output"],
    side_effect_columns=["transcribed_texts__metadata"],
)
def parse_ocr_output(row: dict) -> dict:
    """Split raw Nemotron-Parse output into plain text and layout metadata.

    Reads ``row["raw_ocr_output"]`` and writes two keys onto the same dict:

    - ``transcribed_texts``: the text of every parsed element, joined with
      newlines.
    - ``transcribed_texts__metadata``: one ``{"bbox", "class_label"}`` dict
      per parsed element, in the same order as the text lines.

    The input ``row`` is mutated in place and returned.
    """
    parsed = _extract_structured_elements(row["raw_ocr_output"])

    # Build both outputs in a single pass so text lines and metadata entries
    # stay index-aligned.
    text_lines = []
    layout = []
    for item in parsed:
        text_lines.append(item["text"])
        layout.append({"bbox": item["bbox"], "class_label": item["class_label"]})

    row["transcribed_texts"] = "\n".join(text_lines)
    row["transcribed_texts__metadata"] = layout
    return row
102
103
def build_config(
    seed_path: str = "seed.parquet",
    model_alias: str = "ocr",
) -> dd.DataDesignerConfigBuilder:
    """Assemble the Data Designer configuration for the OCR recipe.

    Args:
        seed_path: Path to the seed file providing the ``png_images_base64``
            column.
        model_alias: Alias under which the Nemotron-Parse model is registered
            and by which the OCR column refers to it.

    Returns:
        A config builder with the seed dataset attached and two columns
        added: ``raw_ocr_output`` (LLM call, flagged ``drop=True``) and
        ``transcribed_texts`` (custom column produced by ``parse_ocr_output``).
    """
    ocr_inference_params = dd.ChatCompletionInferenceParams(
        temperature=0,
        timeout=60,
        max_parallel_requests=32,
        extra_body={
            "skip_special_tokens": False,
            "top_k": 1,
            "repetition_penalty": 1.1,
        },
    )
    ocr_model = dd.ModelConfig(
        alias=model_alias,
        model=NEMOTRON_PARSE_MODEL,
        provider=VLLM_PROVIDER_NAME,
        # Health check sends a text-only probe; this model requires image
        # input, so the check would fail. Skip it.
        skip_health_check=True,
        inference_parameters=ocr_inference_params,
    )

    config_builder = dd.DataDesignerConfigBuilder(model_configs=[ocr_model])

    seed_source = dd.LocalFileSeedSource(path=seed_path)
    config_builder.with_seed_dataset(
        seed_source,
        sampling_strategy=dd.SamplingStrategy.ORDERED,
    )

    # Expects a single-element JSON array from the per-page seed.
    page_image = dd.ImageContext(
        column_name="png_images_base64",
        data_type=dd.ModalityDataType.BASE64,
        image_format=dd.ImageFormat.PNG,
    )
    raw_ocr_column = dd.LLMTextColumnConfig(
        name="raw_ocr_output",
        model_alias=model_alias,
        prompt="",
        multi_modal_context=[page_image],
        drop=True,
    )
    config_builder.add_column(raw_ocr_column)

    parsed_text_column = dd.CustomColumnConfig(
        name="transcribed_texts",
        generator_function=parse_ocr_output,
    )
    config_builder.add_column(parsed_text_column)

    return config_builder
161
162
def create_dataset(
    config_builder: dd.DataDesignerConfigBuilder,
    num_records: int,
    vllm_endpoint: str,
    artifact_path: Path | str | None = None,
) -> DatasetCreationResults:
    """Run the OCR pipeline against a vLLM endpoint and return the results.

    Args:
        config_builder: Configuration to execute (see ``build_config``).
        num_records: Number of records to request from ``DataDesigner.create``.
        vllm_endpoint: Endpoint URL registered for the ``vllm`` provider.
        artifact_path: Forwarded to ``DataDesigner`` unchanged; may be ``None``.

    Returns:
        The object returned by ``DataDesigner.create`` for the dataset named
        ``nemotron_parse_ocr``.
    """
    vllm_provider = dd.ModelProvider(
        name=VLLM_PROVIDER_NAME,
        endpoint=vllm_endpoint,
    )
    designer = DataDesigner(
        artifact_path=artifact_path,
        model_providers=[vllm_provider],
    )

    run_config = dd.RunConfig(progress_bar=True, disable_early_shutdown=True)
    designer.set_run_config(run_config)

    return designer.create(
        config_builder,
        num_records=num_records,
        dataset_name="nemotron_parse_ocr",
    )
182
183
if __name__ == "__main__":
    from argparse import ArgumentParser

    # Command-line interface: endpoint and seed path are mandatory, the rest
    # fall back to defaults.
    cli = ArgumentParser()
    cli.add_argument(
        "--vllm-endpoint",
        type=str,
        required=True,
        help="Base URL of the vLLM server hosting nemotron-parse (e.g. http://localhost:8000/v1)",
    )
    cli.add_argument(
        "--seed-path",
        type=str,
        required=True,
        help="Path to the seed parquet file",
    )
    cli.add_argument("--model-alias", type=str, default="ocr")
    cli.add_argument("--num-records", type=int, default=5)
    cli.add_argument("--artifact-path", type=str, default=None)
    opts = cli.parse_args()

    # Build the configuration, then run it against the given endpoint.
    ocr_config = build_config(seed_path=opts.seed_path, model_alias=opts.model_alias)
    run_results = create_dataset(
        ocr_config,
        num_records=opts.num_records,
        vllm_endpoint=opts.vllm_endpoint,
        artifact_path=opts.artifact_path,
    )

    print(f"Dataset saved to: {run_results.artifact_storage.final_dataset_path}")

    run_results.load_analysis().to_report()