Seed Dataset Preparation | NVIDIA NeMo Data Designer

Download Recipe

1 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2 # SPDX-License-Identifier: Apache-2.0
3 # /// script
4 # requires-python = ">=3.10"
5 # dependencies = [
6 #     "datasets",
7 #     "pymupdf",
8 #     "pandas",
9 #     "pyarrow",
10 # ]
11 # ///
12 """Long-Document Understanding Seed Dataset Preparation
13 
14 This script uses HuggingFace's FinePDFs dataset (HuggingFaceFW/finepdfs) as
15 an example data source to demonstrate how to prepare seed data for the rest
16 of the recipes. It downloads the original PDFs, renders each page to a PNG
17 image, and produces three seed parquet files:
18 
19   1. **per-page seed** – one row per page, with a ``png_images_base64``
20      column containing a JSON array with a single base64-encoded PNG of
21      that page. Suitable for single-page recipes (02 through 06).
22   2. **windowed seed** – one row per window of consecutive pages, with a
23      ``png_images_base64`` column containing a JSON array of base64-encoded
24      PNGs for the pages in that window. The window size adapts to document
25      length (2 pages for short documents up to 8 for long ones).
26      Suitable for the multi-page windowed recipe (07).
27   3. **whole-document seed** – one row per document, with a
28      ``png_images_base64`` column containing a JSON array of base64-encoded
29      PNGs for all pages. Suitable for the whole-document recipe (08).
30 
31 Prerequisites:
32     - Internet access to download PDFs from their original URLs.
33 
34 Run:
35     # Prepare seeds from 10 English PDFs (default)
36     uv run 01-seed-dataset-preparation.py --output-dir ./seed_data
37 
38     # Prepare seeds from 50 PDFs
39     uv run 01-seed-dataset-preparation.py --output-dir ./seed_data --num-docs 50
40 
41     # Use a different language subset
42     uv run 01-seed-dataset-preparation.py --output-dir ./seed_data --subset fra_Latn
43 
44     # Skip documents that fail to download (default behaviour) and set
45     # a custom timeout
46     uv run 01-seed-dataset-preparation.py --output-dir ./seed_data --timeout 30
47 
48     # For help
49     uv run 01-seed-dataset-preparation.py --help
50 """
51 
52 from __future__ import annotations
53 
54 import base64
55 import json
56 import logging
57 import os
58 import urllib.request
59 from argparse import ArgumentParser
60 from pathlib import Path
61 
62 import fitz  # pymupdf
63 import pandas as pd
64 from datasets import load_dataset
65 
# Root logging: timestamped records at INFO level and above.
logging.basicConfig(format="%(asctime)s %(levelname)s %(message)s", level=logging.INFO)
log = logging.getLogger(__name__)

# Raise the threshold of these third-party loggers to WARNING so that their
# INFO/DEBUG records do not appear in this script's output.
for _quiet_logger in ("httpx", "httpcore", "huggingface_hub", "datasets", "fsspec"):
    logging.getLogger(_quiet_logger).setLevel(logging.WARNING)

# Default resolution (dots per inch) used when rendering PDF pages.
DPI = 144
# Dataset repository passed to ``load_dataset``.
FINEPDFS_REPO = "HuggingFaceFW/finepdfs"
77 
78 
def download_pdf(url: str, timeout: int = 20) -> bytes | None:
    """Fetch the resource at *url* and return its body as raw bytes.

    The request carries a browser-like ``User-Agent`` header, and *timeout*
    is handed to ``urlopen`` as its timeout in seconds. This is a best-effort
    helper: any exception raised while building or performing the request is
    logged as a warning and reported to the caller as ``None``.
    """
    browser_headers = {"User-Agent": "Mozilla/5.0"}
    try:
        # Request() itself can raise (e.g. on a URL without a scheme), so it
        # stays inside the guarded region.
        request = urllib.request.Request(url, headers=browser_headers)
        response = urllib.request.urlopen(request, timeout=timeout)
        with response:
            payload = response.read()
    except Exception as err:
        log.warning("Failed to download %s: %s", url, err)
        return None
    return payload
88 
89 
def render_pages(pdf_bytes: bytes, dpi: int = DPI) -> list[bytes]:
    """Render every page of *pdf_bytes* to PNG.

    Args:
        pdf_bytes: Raw contents of a PDF file.
        dpi: Render resolution passed to ``Page.get_pixmap``.

    Returns:
        One PNG-encoded ``bytes`` object per page, in document order.

    Raises:
        Exception: Whatever PyMuPDF raises when the document cannot be opened
            or a page cannot be rendered; callers are expected to handle it.
    """
    # The context manager closes the document even when rendering a page
    # fails part-way through, so a bad PDF does not leak the open handle.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        return [page.get_pixmap(dpi=dpi).tobytes("png") for page in doc]
99 
100 
def png_to_base64(png_bytes: bytes) -> str:
    """Return the standard base64 encoding of *png_bytes* as an ASCII ``str``."""
    encoded = base64.b64encode(png_bytes)
    return str(encoded, "ascii")
104 
105 
def adaptive_window_size(n_pages: int) -> int:
    """Choose a window size that scales with document length.

    Short documents get small windows (2 pages) so multi-page questions
    remain feasible; longer documents get larger windows (up to 8) to
    cover more context per seed row.

    The mapping uses inclusive upper bounds, so every page count belongs to
    exactly one bucket::

        n_pages <= 10 -> 2      31..40 -> 5
        11..20        -> 3      41..50 -> 6
        21..30        -> 4      51..60 -> 7
                                > 60   -> 8

    Args:
        n_pages: Number of pages in the document.

    Returns:
        The window size in pages, between 2 and 8 inclusive.
    """
    # (inclusive upper page bound, window size), in ascending order. Using
    # "<=" avoids the gaps a strict "lo < n < hi" chain leaves at exact
    # multiples of ten, which would otherwise fall back to the smallest window.
    buckets = ((10, 2), (20, 3), (30, 4), (40, 5), (50, 6), (60, 7))
    for upper_bound, window in buckets:
        if n_pages <= upper_bound:
            return window
    return 8
126 
127 
def _window_bounds(n_pages: int, win_size: int, min_pages: int) -> list[tuple[int, int]]:
    """Return ``(start, end)`` page ranges (end exclusive) that tile a document.

    Windows are consecutive and non-overlapping. The last window is clamped
    to the document length, so it may be shorter than *win_size*; a window
    with fewer than *min_pages* pages ends the sequence and is not returned.
    """
    bounds: list[tuple[int, int]] = []
    for start in range(0, n_pages, win_size):
        end = min(start + win_size, n_pages)
        if end - start < min_pages:
            break
        bounds.append((start, end))
    return bounds


def main() -> None:
    """Build the per-page, windowed and whole-document seed parquet files.

    Parses the command line, streams rows from the FinePDFs dataset, downloads
    and renders each referenced PDF, and accumulates three row lists that are
    written to ``--output-dir`` as ``seed_per_page.parquet``,
    ``seed_windowed.parquet`` and ``seed_whole_document.parquet``.

    Documents that fail to download or render, have no pages, or exceed
    ``--max-pages`` are skipped and do not count towards ``--num-docs``.
    Page indices in the output (``page_number``, ``start_page``) are
    zero-based and ``end_page`` is exclusive.

    If no document could be processed, an error is logged and nothing is
    written.
    """
    parser = ArgumentParser(description="Prepare seed parquets from FinePDFs")
    parser.add_argument(
        "--output-dir",
        type=str,
        required=True,
        help="Directory for output parquet files",
    )
    parser.add_argument(
        "--num-docs",
        type=int,
        default=10,
        help="Number of PDF documents to process (default: 10)",
    )
    parser.add_argument(
        "--subset",
        type=str,
        default="eng_Latn",
        help="FinePDFs language subset (default: eng_Latn)",
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=20,
        help="HTTP download timeout in seconds (default: 20)",
    )
    parser.add_argument(
        "--dpi",
        type=int,
        default=DPI,
        help=f"Render resolution in DPI (default: {DPI})",
    )
    parser.add_argument(
        "--max-pages",
        type=int,
        default=50,
        help="Skip documents with more pages than this (default: 50)",
    )
    parser.add_argument(
        "--min-window-pages",
        type=int,
        default=2,
        help="Minimum pages in a window; documents shorter than this are skipped for windowed output (default: 2)",
    )
    args = parser.parse_args()

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    log.info(
        "Streaming %d documents from %s (subset=%s)",
        args.num_docs,
        FINEPDFS_REPO,
        args.subset,
    )

    # Streaming mode iterates rows lazily instead of downloading the dataset.
    ds = load_dataset(
        FINEPDFS_REPO,
        name=args.subset,
        split="train",
        streaming=True,
    )

    per_page_rows: list[dict] = []
    windowed_rows: list[dict] = []
    whole_doc_rows: list[dict] = []

    # Counts only documents that made it into the output; skipped documents
    # do not consume the --num-docs budget.
    docs_processed = 0
    for row in ds:
        if docs_processed >= args.num_docs:
            break

        doc_id = row.get("id", f"doc_{docs_processed:06d}")
        url = row["url"]
        date = row.get("date", "")

        pdf_bytes = download_pdf(url, timeout=args.timeout)
        if pdf_bytes is None:
            continue

        try:
            page_pngs = render_pages(pdf_bytes, dpi=args.dpi)
        except Exception as exc:
            # Best-effort: a corrupt or unsupported PDF must not stop the run.
            log.warning("Failed to render %s: %s", url, exc)
            continue

        n_pages = len(page_pngs)
        if n_pages == 0:
            log.warning("No pages rendered for %s, skipping", url)
            continue

        if n_pages > args.max_pages:
            log.info(
                "Skipping %s (%d pages > --max-pages %d)",
                url,
                n_pages,
                args.max_pages,
            )
            continue

        page_b64s = [png_to_base64(png_bytes) for png_bytes in page_pngs]

        # Per-page seed: one row per page, image column holds a 1-element array.
        per_page_rows.extend(
            {
                "id": doc_id,
                "url": url,
                "date": date,
                "page_number": page_idx,
                "total_pages": n_pages,
                "png_images_base64": json.dumps([b64]),
            }
            for page_idx, b64 in enumerate(page_b64s)
        )

        # Whole-document seed: one row per document with every page image.
        whole_doc_rows.append(
            {
                "id": doc_id,
                "url": url,
                "date": date,
                "total_pages": n_pages,
                "png_images_base64": json.dumps(page_b64s),
            }
        )

        # Windowed seed: tile the document with adaptive-size windows. The
        # trailing window is clamped to the document length rather than
        # dropped, and is kept only if it still has --min-window-pages pages.
        win_size = adaptive_window_size(n_pages)
        for win_start, win_end in _window_bounds(n_pages, win_size, args.min_window_pages):
            windowed_rows.append(
                {
                    "id": doc_id,
                    "url": url,
                    "date": date,
                    "total_pages": n_pages,
                    "start_page": win_start,
                    "end_page": win_end,
                    "window_size": win_end - win_start,
                    "png_images_base64": json.dumps(page_b64s[win_start:win_end]),
                }
            )

        docs_processed += 1
        log.info(
            "[%d/%d] %s — %d pages",
            docs_processed,
            args.num_docs,
            url,
            n_pages,
        )

    if not per_page_rows:
        log.error("No documents were successfully processed. Exiting.")
        return

    per_page_path = output_dir / "seed_per_page.parquet"
    windowed_path = output_dir / "seed_windowed.parquet"
    whole_doc_path = output_dir / "seed_whole_document.parquet"

    pd.DataFrame(per_page_rows).to_parquet(per_page_path, index=False)
    if windowed_rows:
        pd.DataFrame(windowed_rows).to_parquet(windowed_path, index=False)
    pd.DataFrame(whole_doc_rows).to_parquet(whole_doc_path, index=False)

    log.info("Per-page seed:       %s (%d rows)", per_page_path, len(per_page_rows))
    if windowed_rows:
        log.info("Windowed seed:       %s (%d rows)", windowed_path, len(windowed_rows))
    else:
        # Do not report a path as written when no file was produced.
        log.warning("Windowed seed:       no windows produced; %s was not written", windowed_path)
    log.info("Whole-document seed: %s (%d rows)", whole_doc_path, len(whole_doc_rows))
300 
301 
if __name__ == "__main__":
    main()
    # Force-exit to avoid hanging on background threads from datasets/fsspec.
    # os._exit() ends the process immediately without running the normal
    # interpreter shutdown, and the status is always 0 here, including when
    # main() returned early after logging an error. This line is reached only
    # when main() returns; an exception or SystemExit raised inside main()
    # (e.g. from --help) bypasses it.
    os._exit(0)