| 1 | # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. |
| 2 | # SPDX-License-Identifier: Apache-2.0 |
| 3 | # /// script |
| 4 | # requires-python = ">=3.10" |
| 5 | # dependencies = [ |
| 6 | # "data-designer>=0.5.6", |
| 7 | # ] |
| 8 | # /// |
| 9 | """Long-Document Understanding Nemotron-Parse OCR Recipe |
| 10 | |
| 11 | Run Nemotron-Parse v1.1 OCR over document images from a seed parquet file. |
| 12 | Each record produces: |
| 13 | - `transcribed_texts`: clean text extracted from the OCR output |
| 14 | - `transcribed_texts__metadata`: bounding-box coordinates and class labels |
| 15 | |
| 16 | Prerequisites: |
| 17 | - A seed parquet file containing a `png_images_base64` column with a JSON |
| 18 | array of base64-encoded PNG images (one element per page; single-page |
| 19 | seeds have a one-element array). |
| 20 | - A vLLM-compatible deployment of nvidia/NVIDIA-Nemotron-Parse-v1.1. |
| 21 | The vLLM server must be launched with a chat template that injects the |
| 22 | Nemotron-Parse special tokens. Save the following as a .jinja file and |
| 23 | pass it via --chat-template: |
| 24 | |
| 25 | {% for message in messages %}{% if message["role"] == "user" %}{{ "</s><s><predict_bbox><predict_classes><output_markdown>" }}{% endif %}{% endfor %} |
| 26 | |
| 27 | Example launch script for 1× H100: |
| 28 | docker run -d --gpus all \ |
| 29 | -p 8000:8000 -v "$(pwd)/chat_template.jinja:/chat_template.jinja" \ |
| 30 | --entrypoint bash \ |
| 31 | vllm/vllm-openai:v0.14.1 \ |
| 32 | -c "pip install open-clip-torch albumentations timm && vllm serve nvidia/NVIDIA-Nemotron-Parse-v1.1 \ |
| 33 | --tensor-parallel-size 1 \ |
| 34 | --max-model-len 9000 \ |
| 35 | --gpu-memory-utilization 0.85 \ |
| 36 | --max-num-seqs 128 \ |
| 37 | --chat-template /chat_template.jinja \ |
| 38 | --trust-remote-code" |
| 39 | |
| 40 | Run: |
| 41 | # Basic usage (processes 5 records by default) |
| 42 | uv run 02-nemotron-parse-ocr-sdg.py --vllm-endpoint http://localhost:8000/v1 --seed-path seed_data/seed_per_page.parquet |
| 43 | |
| 44 | # Custom record count |
| 45 | uv run 02-nemotron-parse-ocr-sdg.py --vllm-endpoint http://localhost:8000/v1 --seed-path seed_data/seed_per_page.parquet --num-records 100 |
| 46 | |
| 47 | # For help message and available options |
| 48 | uv run 02-nemotron-parse-ocr-sdg.py --help |
| 49 | """ |
| 50 | |
| 51 | import re |
| 52 | from pathlib import Path |
| 53 | |
| 54 | import data_designer.config as dd |
| 55 | from data_designer.interface import DataDesigner, DatasetCreationResults |
| 56 | |
# Model identifier passed as `model=` to the OCR ModelConfig in build_config();
# it is the same name used in the `vllm serve` example in the module docstring.
NEMOTRON_PARSE_MODEL = "nvidia/NVIDIA-Nemotron-Parse-v1.1"
# Provider name used both as ModelConfig.provider in build_config() and as
# ModelProvider.name in create_dataset().
VLLM_PROVIDER_NAME = "vllm"
| 59 | |
| 60 | _STRUCTURED_ELEMENT_PATTERN = re.compile( |
| 61 | r"<x_([\d.]+)><y_([\d.]+)>(.*?)<x_([\d.]+)><y_([\d.]+)><class_([^>]+)>", |
| 62 | re.DOTALL, |
| 63 | ) |
| 64 | |
| 65 | |
| 66 | def _extract_structured_elements(text: str) -> list[dict]: |
| 67 | """Parse Nemotron-Parse bbox markup into structured dicts. |
| 68 | |
| 69 | Input format: <x_START><y_START>TEXT<x_END><y_END><class_LABEL> |
| 70 | |
| 71 | Returns list of dicts with keys: bbox ({x1,y1,x2,y2}), class_label, text. |
| 72 | """ |
| 73 | elements = [] |
| 74 | for match in _STRUCTURED_ELEMENT_PATTERN.finditer(text): |
| 75 | x1, y1, content, x2, y2, class_label = match.groups() |
| 76 | elements.append( |
| 77 | { |
| 78 | "bbox": { |
| 79 | "x1": float(x1), |
| 80 | "y1": float(y1), |
| 81 | "x2": float(x2), |
| 82 | "y2": float(y2), |
| 83 | }, |
| 84 | "class_label": class_label, |
| 85 | "text": content.strip(), |
| 86 | } |
| 87 | ) |
| 88 | return elements |
| 89 | |
| 90 | |
@dd.custom_column_generator(
    required_columns=["raw_ocr_output"],
    side_effect_columns=["transcribed_texts__metadata"],
)
def parse_ocr_output(row: dict) -> dict:
    """Derive the transcription and its layout metadata from the raw OCR text.

    Reads `row["raw_ocr_output"]`, parses it with
    `_extract_structured_elements`, and writes two keys onto the same row:
    `transcribed_texts` (element texts joined with newlines) and
    `transcribed_texts__metadata` (one `{bbox, class_label}` dict per element,
    in the same order).

    Args:
        row: Record dict containing `raw_ocr_output`; it is updated in place.

    Returns:
        The same `row` object with the two keys set.
    """
    parsed = _extract_structured_elements(row["raw_ocr_output"])

    # Single pass keeps the text lines and their metadata index-aligned.
    text_lines = []
    layout = []
    for item in parsed:
        text_lines.append(item["text"])
        layout.append({"bbox": item["bbox"], "class_label": item["class_label"]})

    row["transcribed_texts"] = "\n".join(text_lines)
    row["transcribed_texts__metadata"] = layout
    return row
| 102 | |
| 103 | |
def build_config(
    seed_path: str = "seed.parquet",
    model_alias: str = "ocr",
) -> dd.DataDesignerConfigBuilder:
    """Assemble the Data Designer config for the Nemotron-Parse OCR recipe.

    The builder is given, in order: one model config for the OCR model, the
    seed dataset with ordered sampling, an LLM text column `raw_ocr_output`
    (marked `drop=True`) that receives the page image, and the custom column
    `transcribed_texts` generated by `parse_ocr_output`.

    Args:
        seed_path: Path to the local seed file with a `png_images_base64` column.
        model_alias: Alias linking the OCR column to its model config.

    Returns:
        The populated config builder.
    """
    ocr_params = dd.ChatCompletionInferenceParams(
        temperature=0,
        timeout=60,
        max_parallel_requests=32,
        # NOTE(review): special tokens are kept in the response, presumably so
        # the <x_..>/<y_..>/<class_..> tags survive for parsing; confirm.
        extra_body={
            "skip_special_tokens": False,
            "top_k": 1,
            "repetition_penalty": 1.1,
        },
    )
    ocr_model = dd.ModelConfig(
        alias=model_alias,
        model=NEMOTRON_PARSE_MODEL,
        provider=VLLM_PROVIDER_NAME,
        # The health check sends a text-only probe, which fails for a model
        # that requires image input, so it is skipped.
        skip_health_check=True,
        inference_parameters=ocr_params,
    )
    config_builder = dd.DataDesignerConfigBuilder(model_configs=[ocr_model])

    seed_source = dd.LocalFileSeedSource(path=seed_path)
    config_builder.with_seed_dataset(
        seed_source,
        sampling_strategy=dd.SamplingStrategy.ORDERED,
    )

    # Expects a single-element JSON array from the per-page seed.
    page_image = dd.ImageContext(
        column_name="png_images_base64",
        data_type=dd.ModalityDataType.BASE64,
        image_format=dd.ImageFormat.PNG,
    )
    raw_ocr_column = dd.LLMTextColumnConfig(
        name="raw_ocr_output",
        model_alias=model_alias,
        prompt="",
        multi_modal_context=[page_image],
        drop=True,
    )
    config_builder.add_column(raw_ocr_column)

    parsed_column = dd.CustomColumnConfig(
        name="transcribed_texts",
        generator_function=parse_ocr_output,
    )
    config_builder.add_column(parsed_column)

    return config_builder
| 161 | |
| 162 | |
def create_dataset(
    config_builder: dd.DataDesignerConfigBuilder,
    num_records: int,
    vllm_endpoint: str,
    artifact_path: Path | str | None = None,
) -> DatasetCreationResults:
    """Run dataset generation against a vLLM endpoint.

    Registers a single model provider named `VLLM_PROVIDER_NAME` at
    `vllm_endpoint`, applies a run config with the progress bar enabled and
    early shutdown disabled, then creates the `nemotron_parse_ocr` dataset.

    Args:
        config_builder: Config describing the seed data and columns.
        num_records: Number of records to request.
        vllm_endpoint: Base URL of the vLLM server.
        artifact_path: Optional artifact location forwarded to `DataDesigner`.

    Returns:
        The results object returned by `DataDesigner.create`.
    """
    vllm_provider = dd.ModelProvider(name=VLLM_PROVIDER_NAME, endpoint=vllm_endpoint)
    designer = DataDesigner(artifact_path=artifact_path, model_providers=[vllm_provider])

    run_config = dd.RunConfig(progress_bar=True, disable_early_shutdown=True)
    designer.set_run_config(run_config)

    return designer.create(config_builder, num_records=num_records, dataset_name="nemotron_parse_ocr")
| 182 | |
| 183 | |
if __name__ == "__main__":
    from argparse import ArgumentParser

    # Command-line entry point: parse flags, build the config, generate the
    # dataset, then print where it was saved and emit the analysis report.
    parser = ArgumentParser()
    # (flag, add_argument keyword arguments), registered in this order.
    cli_options = (
        (
            "--vllm-endpoint",
            {
                "type": str,
                "required": True,
                "help": "Base URL of the vLLM server hosting nemotron-parse (e.g. http://localhost:8000/v1)",
            },
        ),
        ("--seed-path", {"type": str, "required": True, "help": "Path to the seed parquet file"}),
        ("--model-alias", {"type": str, "default": "ocr"}),
        ("--num-records", {"type": int, "default": 5}),
        ("--artifact-path", {"type": str, "default": None}),
    )
    for flag, flag_kwargs in cli_options:
        parser.add_argument(flag, **flag_kwargs)
    args = parser.parse_args()

    config_builder = build_config(seed_path=args.seed_path, model_alias=args.model_alias)
    results = create_dataset(
        config_builder,
        num_records=args.num_records,
        vllm_endpoint=args.vllm_endpoint,
        artifact_path=args.artifact_path,
    )

    print(f"Dataset saved to: {results.artifact_storage.final_dataset_path}")

    results.load_analysis().to_report()