For AI agents: a documentation index is available at the root level at /llms.txt and /llms-full.txt. Append /llms.txt to any URL for a page-level index, or .md for the markdown version of any page.
  • Getting Started
    • Welcome
    • Contributing
  • Concepts
    • Columns
    • Seed Datasets
    • Agent Rollout Ingestion
    • Custom Columns
    • Validators
    • Processors
    • Person Sampling
    • Traces
    • Architecture & Performance
    • Deployment Options
    • Security
  • Tutorials
    • Overview
    • The Basics
    • Structured Outputs, Jinja Expressions, and Conditional Generation
    • Seeding with an External Dataset
    • Providing Images as Context
    • Generating Images
    • Image-to-Image Editing
  • Recipes
    • Recipe Cards
      • Seed Dataset Preparation
      • Nemotron Parse OCR
      • Text QA from OCR Transcripts
      • Page Classification
      • Visual QA
      • Single-Page QA
      • Multi-Page Windowed QA
      • Whole-Document QA
      • Frontier Judge QA Filter
  • Plugins
    • Overview
    • Example Plugin
    • FileSystemSeedReader Plugins
    • Discover
  • Code Reference
    • Overview
  • Dev Notes
    • Overview
    • Have It Your Way
    • VLM Long Document Understanding
    • Push Datasets to Hugging Face Hub
    • Text-to-SQL for Nemotron Super
    • Async All the Way Down
    • Owning the Model Stack
NVIDIA
Developer-friendly docs for your API
Privacy Policy | Your Privacy Choices | Terms of Service | Accessibility | Corporate Policies | Product Security | Contact

Copyright © 2026, NVIDIA Corporation.

NeMo Data Designer
Recipes › VLM Long-Document Understanding

Seed Dataset Preparation

View as Markdown
Previous

Nemotron Super Search Agent

Next

Nemotron Parse OCR

Download Recipe

Download the complete recipe script

1# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2# SPDX-License-Identifier: Apache-2.0
3# /// script
4# requires-python = ">=3.10"
5# dependencies = [
6# "datasets",
7# "pymupdf",
8# "pandas",
9# "pyarrow",
10# ]
11# ///
12"""Long-Document Understanding Seed Dataset Preparation
13
14This script uses HuggingFace's FinePDFs dataset (HuggingFaceFW/finepdfs) as
15an example data source to demonstrate how to prepare seed data for the rest
16of the recipes. It downloads the original PDFs, renders each page to a PNG
17image, and produces three seed parquet files:
18
19 1. **per-page seed** – one row per page, with a ``png_images_base64``
20 column containing a JSON array with a single base64-encoded PNG of
21 that page. Suitable for single-page recipes (02 through 06).
22 2. **windowed seed** – one row per window of consecutive pages, with a
23 ``png_images_base64`` column containing a JSON array of base64-encoded
24 PNGs for the pages in that window. The window size adapts to document
25 length (2 pages for short documents up to 8 for long ones).
26 Suitable for the multi-page windowed recipe (07).
27 3. **whole-document seed** – one row per document, with a
28 ``png_images_base64`` column containing a JSON array of base64-encoded
29 PNGs for all pages. Suitable for the whole-document recipe (08).
30
31Prerequisites:
32 - Internet access to download PDFs from their original URLs.
33
34Run:
35 # Prepare seeds from 10 English PDFs (default)
36 uv run 01-seed-dataset-preparation.py --output-dir ./seed_data
37
38 # Prepare seeds from 50 PDFs
39 uv run 01-seed-dataset-preparation.py --output-dir ./seed_data --num-docs 50
40
41 # Use a different language subset
42 uv run 01-seed-dataset-preparation.py --output-dir ./seed_data --subset fra_Latn
43
44 # Skip documents that fail to download (default behaviour) and set
45 # a custom timeout
46 uv run 01-seed-dataset-preparation.py --output-dir ./seed_data --timeout 30
47
48 # For help
49 uv run 01-seed-dataset-preparation.py --help
50"""
51
52from __future__ import annotations
53
54import base64
55import json
56import logging
57import os
58import urllib.request
59from argparse import ArgumentParser
60from pathlib import Path
61
62import fitz # pymupdf
63import pandas as pd
64from datasets import load_dataset
65
# Root logging configuration: timestamped records at INFO level and above.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
log = logging.getLogger(__name__)

# Raise the threshold of these third-party loggers to WARNING so that their
# informational records do not interleave with this script's progress output.
for _quiet_name in ("httpx", "httpcore", "huggingface_hub", "datasets", "fsspec"):
    logging.getLogger(_quiet_name).setLevel(logging.WARNING)
del _quiet_name  # keep the module namespace free of the loop variable

# Default render resolution in dots per inch (also the default for --dpi).
DPI = 144
# Hugging Face Hub dataset repository that the script streams documents from.
FINEPDFS_REPO = "HuggingFaceFW/finepdfs"
77
78
def download_pdf(url: str, timeout: int = 20) -> bytes | None:
    """Download a PDF from *url*, returning raw bytes or None on failure.

    Args:
        url: Absolute ``http://`` or ``https://`` URL of the document.
        timeout: Timeout in seconds passed to ``urllib.request.urlopen``.

    Returns:
        The full response body as bytes, or ``None`` when the URL does not use
        an HTTP(S) scheme or the request fails for any reason. A warning is
        logged in both cases.
    """
    # URLs originate from dataset rows, i.e. untrusted input. ``urlopen`` also
    # handles ``file:`` and ``ftp:`` URLs, so restrict to web schemes to avoid
    # reading local files or speaking unexpected protocols.
    if not isinstance(url, str) or not url.lower().startswith(("http://", "https://")):
        log.warning("Failed to download %s: %s", url, "unsupported URL scheme")
        return None

    try:
        req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return resp.read()
    except Exception as exc:  # deliberate best-effort: any failure skips the document
        log.warning("Failed to download %s: %s", url, exc)
        return None
88
89
def render_pages(pdf_bytes: bytes, dpi: int = DPI) -> list[bytes]:
    """Render every page of *pdf_bytes* to PNG, returning a list of raw PNG bytes.

    Args:
        pdf_bytes: Raw contents of a PDF file.
        dpi: Render resolution passed to ``page.get_pixmap``.

    Returns:
        One PNG-encoded ``bytes`` object per page, in document order.

    Raises:
        Exception: Any error raised by PyMuPDF while opening the document or
            rendering a page is propagated to the caller.
    """
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        return [page.get_pixmap(dpi=dpi).tobytes("png") for page in doc]
    finally:
        # Close the document even if a page fails to render, so the handle is
        # not leaked when the caller catches the error and moves on.
        doc.close()
99
100
def png_to_base64(png_bytes: bytes) -> str:
    """Return the base64 text representation of raw PNG bytes."""
    encoded = base64.b64encode(png_bytes)
    # Base64 output is pure ASCII, so decoding with that codec cannot fail.
    return str(encoded, "ascii")
104
105
def adaptive_window_size(n_pages: int) -> int:
    """Choose a window size that scales with document length.

    Short documents get small windows (2 pages) so multi-page questions
    remain feasible; longer documents get larger windows (up to 8) to
    cover more context per seed row.

    The mapping is monotonic: up to 10 pages -> 2, 11-20 -> 3, 21-30 -> 4,
    31-40 -> 5, 41-50 -> 6, 51-60 -> 7, more than 60 -> 8.

    Args:
        n_pages: Number of pages in the document.

    Returns:
        The window size in pages, between 2 and 8 inclusive.
    """
    # Every threshold that is strictly exceeded adds one page to the base
    # window of 2. Counting thresholds (instead of matching open intervals)
    # ensures exact multiples of ten such as 20 or 60 land in a bucket rather
    # than falling through to the smallest window.
    thresholds = (10, 20, 30, 40, 50, 60)
    return 2 + sum(n_pages > limit for limit in thresholds)
126
127
def _build_arg_parser() -> ArgumentParser:
    """Create the command-line parser for the seed preparation script."""
    parser = ArgumentParser(description="Prepare seed parquets from FinePDFs")
    parser.add_argument(
        "--output-dir",
        type=str,
        required=True,
        help="Directory for output parquet files",
    )
    parser.add_argument(
        "--num-docs",
        type=int,
        default=10,
        help="Number of PDF documents to process (default: 10)",
    )
    parser.add_argument(
        "--subset",
        type=str,
        default="eng_Latn",
        help="FinePDFs language subset (default: eng_Latn)",
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=20,
        help="HTTP download timeout in seconds (default: 20)",
    )
    parser.add_argument(
        "--dpi",
        type=int,
        default=DPI,
        help=f"Render resolution in DPI (default: {DPI})",
    )
    parser.add_argument(
        "--max-pages",
        type=int,
        default=50,
        help="Skip documents with more pages than this (default: 50)",
    )
    parser.add_argument(
        "--min-window-pages",
        type=int,
        default=2,
        help="Minimum pages in a window; documents shorter than this are skipped for windowed output (default: 2)",
    )
    return parser


def main() -> None:
    """Stream documents, render their pages, and write the seed parquet files.

    For each streamed row the PDF at ``row["url"]`` is downloaded and rendered.
    Documents that fail to download or render, have no pages, or exceed
    ``--max-pages`` are skipped and do not count towards ``--num-docs``.

    Three row sets are accumulated and written under ``--output-dir``:

    * ``seed_per_page.parquet``: one row per page (``page_number`` is 0-based).
    * ``seed_windowed.parquet``: one row per full, non-overlapping window of
      pages; ``start_page`` is 0-based and ``end_page`` is exclusive. Written
      only when at least one window was produced.
    * ``seed_whole_document.parquet``: one row per document.

    If no document is processed successfully an error is logged and nothing is
    written.
    """
    args = _build_arg_parser().parse_args()

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    log.info(
        "Streaming %d documents from %s (subset=%s)",
        args.num_docs,
        FINEPDFS_REPO,
        args.subset,
    )

    ds = load_dataset(
        FINEPDFS_REPO,
        name=args.subset,
        split="train",
        streaming=True,
    )

    per_page_rows: list[dict] = []
    windowed_rows: list[dict] = []
    whole_doc_rows: list[dict] = []

    docs_processed = 0
    for row in ds:
        if docs_processed >= args.num_docs:
            break

        doc_id = row.get("id", f"doc_{docs_processed:06d}")
        url = row["url"]
        date = row.get("date", "")

        pdf_bytes = download_pdf(url, timeout=args.timeout)
        if pdf_bytes is None:
            continue

        try:
            page_pngs = render_pages(pdf_bytes, dpi=args.dpi)
        except Exception as exc:  # best-effort: an unreadable PDF skips the document
            log.warning("Failed to render %s: %s", url, exc)
            continue

        if not page_pngs:
            log.warning("No pages rendered for %s, skipping", url)
            continue

        n_pages = len(page_pngs)
        if n_pages > args.max_pages:
            log.info(
                "Skipping %s (%d pages > --max-pages %d)",
                url,
                n_pages,
                args.max_pages,
            )
            continue

        # Per-page rows: each row carries a JSON array with a single image.
        page_b64s: list[str] = []
        for page_idx, png_bytes in enumerate(page_pngs):
            b64 = png_to_base64(png_bytes)
            page_b64s.append(b64)
            per_page_rows.append(
                {
                    "id": doc_id,
                    "url": url,
                    "date": date,
                    "page_number": page_idx,
                    "total_pages": n_pages,
                    "png_images_base64": json.dumps([b64]),
                }
            )

        # Whole-document row: all pages in one JSON array.
        whole_doc_rows.append(
            {
                "id": doc_id,
                "url": url,
                "date": date,
                "total_pages": n_pages,
                "png_images_base64": json.dumps(page_b64s),
            }
        )

        # Windowed rows: only full, non-overlapping windows are emitted, so
        # every window holds exactly ``win_size`` pages and the minimum-size
        # check can be made once per document. Trailing pages that do not fill
        # a whole window are not included in any windowed row.
        win_size = adaptive_window_size(n_pages)
        if win_size >= args.min_window_pages:
            for win_start in range(0, n_pages - win_size + 1, win_size):
                win_end = win_start + win_size
                windowed_rows.append(
                    {
                        "id": doc_id,
                        "url": url,
                        "date": date,
                        "total_pages": n_pages,
                        "start_page": win_start,
                        "end_page": win_end,
                        "window_size": win_size,
                        "png_images_base64": json.dumps(page_b64s[win_start:win_end]),
                    }
                )

        docs_processed += 1
        log.info(
            "[%d/%d] %s — %d pages",
            docs_processed,
            args.num_docs,
            url,
            n_pages,
        )

    if not per_page_rows:
        log.error("No documents were successfully processed. Exiting.")
        return

    per_page_path = output_dir / "seed_per_page.parquet"
    windowed_path = output_dir / "seed_windowed.parquet"
    whole_doc_path = output_dir / "seed_whole_document.parquet"

    pd.DataFrame(per_page_rows).to_parquet(per_page_path, index=False)
    if windowed_rows:
        pd.DataFrame(windowed_rows).to_parquet(windowed_path, index=False)
    pd.DataFrame(whole_doc_rows).to_parquet(whole_doc_path, index=False)

    log.info("Per-page seed: %s (%d rows)", per_page_path, len(per_page_rows))
    if windowed_rows:
        log.info("Windowed seed: %s (%d rows)", windowed_path, len(windowed_rows))
    else:
        # Do not report a path as output when nothing was written to it; any
        # file already present at that location is from an earlier run.
        log.warning("Windowed seed: no windows produced; %s was not written", windowed_path)
    log.info("Whole-document seed: %s (%d rows)", whole_doc_path, len(whole_doc_rows))
300
301
if __name__ == "__main__":
    main()
    # os._exit() terminates without running interpreter cleanup, so flush and
    # close the logging handlers explicitly before exiting.
    logging.shutdown()
    # Force-exit to avoid hanging on background threads from datasets/fsspec.
    os._exit(0)