# Nemotron Parse OCR | NVIDIA NeMo Data Designer
#
# Download Recipe

# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "data-designer>=0.5.6",
# ]
# ///
"""Long-Document Understanding Nemotron-Parse OCR Recipe

Run Nemotron-Parse v1.1 OCR over document images from a seed parquet file.
Each record produces:
  - `transcribed_texts`: clean text extracted from the OCR output
  - `transcribed_texts__metadata`: bounding-box coordinates and class labels

Prerequisites:
    - A seed parquet file containing a `png_images_base64` column with a JSON
      array of base64-encoded PNG images (one element per page; single-page
      seeds have a one-element array).
    - A vLLM-compatible deployment of nvidia/NVIDIA-Nemotron-Parse-v1.1.
      The vLLM server must be launched with a chat template that injects the
      Nemotron-Parse special tokens. Save the following as a .jinja file and
      pass it via --chat-template:

        {% for message in messages %}{% if message["role"] == "user" %}{{ "</s><s><predict_bbox><predict_classes><output_markdown>" }}{% endif %}{% endfor %}

      Example launch script for 1× H100:
        docker run -d --gpus all \
            -p 8000:8000 \
            --entrypoint bash \
            vllm/vllm-openai:v0.14.1 \
            -c "pip install open-clip-torch albumentations timm && vllm serve nvidia/NVIDIA-Nemotron-Parse-v1.1 \
            --tensor-parallel-size 1 \
            --max-model-len 9000 \
            --gpu-memory-utilization 0.85 \
            --max-num-seqs 128 \
            --chat-template /chat_template.jinja \
            --trust-remote-code"

Run:
    # Basic usage (processes 5 records by default)
    uv run 02-nemotron-parse-ocr-sdg.py --vllm-endpoint http://localhost:8000/v1 --seed-path seed_data/seed_per_page.parquet

    # Custom record count
    uv run 02-nemotron-parse-ocr-sdg.py --vllm-endpoint http://localhost:8000/v1 --seed-path seed_data/seed_per_page.parquet --num-records 100

    # For help message and available options
    uv run 02-nemotron-parse-ocr-sdg.py --help
"""
50 
import re
from pathlib import Path
from typing import Any

import data_designer.config as dd
from data_designer.interface import DataDesigner, DatasetCreationResults
56 
# Model identifier passed to dd.ModelConfig; it is the same name the vLLM
# launch command in the module docstring serves.
NEMOTRON_PARSE_MODEL = "nvidia/NVIDIA-Nemotron-Parse-v1.1"
# Provider name shared by dd.ModelConfig (build_config) and dd.ModelProvider
# (create_dataset) so the model resolves to the vLLM endpoint.
VLLM_PROVIDER_NAME = "vllm"
59 
60 _STRUCTURED_ELEMENT_PATTERN = re.compile(
61     r"<x_([\d.]+)><y_([\d.]+)>(.*?)<x_([\d.]+)><y_([\d.]+)><class_([^>]+)>",
62     re.DOTALL,
63 )
64 
65 
66 def _extract_structured_elements(text: str) -> list[dict]:
67     """Parse Nemotron-Parse bbox markup into structured dicts.
68 
69     Input format: <x_START><y_START>TEXT<x_END><y_END><class_LABEL>
70 
71     Returns list of dicts with keys: bbox ({x1,y1,x2,y2}), class_label, text.
72     """
73     elements = []
74     for match in _STRUCTURED_ELEMENT_PATTERN.finditer(text):
75         x1, y1, content, x2, y2, class_label = match.groups()
76         elements.append(
77             {
78                 "bbox": {
79                     "x1": float(x1),
80                     "y1": float(y1),
81                     "x2": float(x2),
82                     "y2": float(y2),
83                 },
84                 "class_label": class_label,
85                 "text": content.strip(),
86             }
87         )
88     return elements
89 
90 
@dd.custom_column_generator(
    required_columns=["raw_ocr_output"],
    side_effect_columns=["transcribed_texts__metadata"],
)
def parse_ocr_output(row: dict) -> dict:
    """Extract clean text and bbox metadata from raw Nemotron-Parse output.

    Reads ``row["raw_ocr_output"]`` and writes two keys back onto the same
    row dict:

    - ``transcribed_texts``: the text of every parsed element, joined with
      newlines in order of appearance.
    - ``transcribed_texts__metadata``: a list with one
      ``{"bbox": ..., "class_label": ...}`` dict per element, in the same
      order as the joined text.

    Args:
        row: Record containing the ``raw_ocr_output`` markup string.

    Returns:
        The same ``row`` object, mutated in place with the two keys above.
    """
    elements = _extract_structured_elements(row["raw_ocr_output"])
    row["transcribed_texts"] = "\n".join(element["text"] for element in elements)
    # The element text is already carried by "transcribed_texts", so the
    # metadata entries hold only the geometry and the class label.
    row["transcribed_texts__metadata"] = [
        {"bbox": element["bbox"], "class_label": element["class_label"]} for element in elements
    ]
    return row
102 
103 
def build_config(
    seed_path: str = "seed.parquet",
    model_alias: str = "ocr",
) -> dd.DataDesignerConfigBuilder:
    """Assemble the Data Designer configuration for the OCR recipe.

    The configuration registers a single Nemotron-Parse model, attaches the
    seed dataset with the ``ORDERED`` sampling strategy, adds a
    ``raw_ocr_output`` LLM column that sends the ``png_images_base64`` image
    to the model, and adds a ``transcribed_texts`` custom column produced by
    ``parse_ocr_output``.

    Args:
        seed_path: Path of the seed file handed to ``dd.LocalFileSeedSource``.
        model_alias: Alias under which the model is registered and by which
            the ``raw_ocr_output`` column refers to it.

    Returns:
        The populated ``dd.DataDesignerConfigBuilder``.
    """
    model_configs = [
        dd.ModelConfig(
            alias=model_alias,
            model=NEMOTRON_PARSE_MODEL,
            provider=VLLM_PROVIDER_NAME,
            # Health check sends a text-only probe; this model requires image
            # input, so the check would fail. Skip it.
            skip_health_check=True,
            inference_parameters=dd.ChatCompletionInferenceParams(
                temperature=0,
                timeout=60,
                max_parallel_requests=32,
                extra_body={
                    # NOTE(review): presumably keeps the <x_…>/<y_…>/<class_…>
                    # tags in the response, which parse_ocr_output needs —
                    # confirm against the vLLM sampling parameter docs.
                    "skip_special_tokens": False,
                    "top_k": 1,
                    "repetition_penalty": 1.1,
                },
            ),
        ),
    ]

    config_builder = dd.DataDesignerConfigBuilder(model_configs=model_configs)

    config_builder.with_seed_dataset(
        dd.LocalFileSeedSource(path=seed_path),
        sampling_strategy=dd.SamplingStrategy.ORDERED,
    )

    config_builder.add_column(
        dd.LLMTextColumnConfig(
            name="raw_ocr_output",
            model_alias=model_alias,
            # The chat template shown in the module docstring emits a fixed
            # token sequence per user message and never renders message text,
            # so no textual prompt is supplied.
            prompt="",
            multi_modal_context=[
                dd.ImageContext(
                    # Expects a single-element JSON array from the per-page seed.
                    column_name="png_images_base64",
                    data_type=dd.ModalityDataType.BASE64,
                    image_format=dd.ImageFormat.PNG,
                ),
            ],
            # NOTE(review): presumably keeps the raw markup out of the final
            # dataset once the parsed columns exist — confirm.
            drop=True,
        )
    )

    config_builder.add_column(
        dd.CustomColumnConfig(
            name="transcribed_texts",
            generator_function=parse_ocr_output,
        )
    )

    return config_builder
161 
162 
def create_dataset(
    config_builder: dd.DataDesignerConfigBuilder,
    num_records: int,
    vllm_endpoint: str,
    artifact_path: Path | str | None = None,
) -> DatasetCreationResults:
    """Run the OCR recipe against a vLLM endpoint and return the results.

    Registers ``vllm_endpoint`` under the provider name that ``build_config``
    assigns to the model, then asks Data Designer to create a dataset named
    ``nemotron_parse_ocr``.

    Args:
        config_builder: Configuration to execute, e.g. from ``build_config``.
        num_records: Number of records requested from ``DataDesigner.create``.
        vllm_endpoint: Base URL of the vLLM server, e.g.
            ``http://localhost:8000/v1``.
        artifact_path: Forwarded unchanged to ``DataDesigner``; ``None`` is
            passed through as-is.

    Returns:
        The ``DatasetCreationResults`` returned by ``DataDesigner.create``.
    """
    vllm_provider = dd.ModelProvider(
        name=VLLM_PROVIDER_NAME,
        endpoint=vllm_endpoint,
    )
    data_designer = DataDesigner(
        artifact_path=artifact_path,
        model_providers=[vllm_provider],
    )
    data_designer.set_run_config(dd.RunConfig(progress_bar=True, disable_early_shutdown=True))
    return data_designer.create(config_builder, num_records=num_records, dataset_name="nemotron_parse_ocr")
182 
183 
# Command-line entry point: parse options, build the config, generate the
# dataset, then print where it was written and emit the analysis report.
if __name__ == "__main__":
    from argparse import ArgumentParser

    parser = ArgumentParser()
    parser.add_argument(
        "--vllm-endpoint",
        type=str,
        required=True,
        help="Base URL of the vLLM server hosting nemotron-parse (e.g. http://localhost:8000/v1)",
    )
    parser.add_argument("--seed-path", type=str, required=True, help="Path to the seed parquet file")
    parser.add_argument(
        "--model-alias",
        type=str,
        default="ocr",
        help="Alias used to register and reference the OCR model (default: %(default)s)",
    )
    parser.add_argument(
        "--num-records",
        type=int,
        default=5,
        help="Number of records to generate (default: %(default)s)",
    )
    parser.add_argument(
        "--artifact-path",
        type=str,
        default=None,
        help="Artifact path passed to DataDesigner (default: unset)",
    )
    args = parser.parse_args()

    config_builder = build_config(
        seed_path=args.seed_path,
        model_alias=args.model_alias,
    )
    results = create_dataset(
        config_builder,
        num_records=args.num_records,
        vllm_endpoint=args.vllm_endpoint,
        artifact_path=args.artifact_path,
    )

    print(f"Dataset saved to: {results.artifact_storage.final_dataset_path}")

    results.load_analysis().to_report()