| 1 | # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. |
| 2 | # SPDX-License-Identifier: Apache-2.0 |
| 3 | # /// script |
| 4 | # requires-python = ">=3.10" |
| 5 | # dependencies = [ |
| 6 | # "datasets", |
| 7 | # "pymupdf", |
| 8 | # "pandas", |
| 9 | # "pyarrow", |
| 10 | # ] |
| 11 | # /// |
| 12 | """Long-Document Understanding Seed Dataset Preparation |
| 13 | |
| 14 | This script uses HuggingFace's FinePDFs dataset (HuggingFaceFW/finepdfs) as |
| 15 | an example data source to demonstrate how to prepare seed data for the rest |
| 16 | of the recipes. It downloads the original PDFs, renders each page to a PNG |
| 17 | image, and produces three seed parquet files: |
| 18 | |
| 19 | 1. **per-page seed** – one row per page, with a ``png_images_base64`` |
| 20 | column containing a JSON array with a single base64-encoded PNG of |
| 21 | that page. Suitable for single-page recipes (02 through 06). |
| 22 | 2. **windowed seed** – one row per window of consecutive pages, with a |
| 23 | ``png_images_base64`` column containing a JSON array of base64-encoded |
| 24 | PNGs for the pages in that window. The window size adapts to document |
| 25 | length (2 pages for short documents up to 8 for long ones). |
| 26 | Suitable for the multi-page windowed recipe (07). |
| 27 | 3. **whole-document seed** – one row per document, with a |
| 28 | ``png_images_base64`` column containing a JSON array of base64-encoded |
| 29 | PNGs for all pages. Suitable for the whole-document recipe (08). |
| 30 | |
| 31 | Prerequisites: |
| 32 | - Internet access to download PDFs from their original URLs. |
| 33 | |
| 34 | Run: |
| 35 | # Prepare seeds from 10 English PDFs (default) |
| 36 | uv run 01-seed-dataset-preparation.py --output-dir ./seed_data |
| 37 | |
| 38 | # Prepare seeds from 50 PDFs |
| 39 | uv run 01-seed-dataset-preparation.py --output-dir ./seed_data --num-docs 50 |
| 40 | |
| 41 | # Use a different language subset |
| 42 | uv run 01-seed-dataset-preparation.py --output-dir ./seed_data --subset fra_Latn |
| 43 | |
| 44 | # Skip documents that fail to download (default behaviour) and set |
| 45 | # a custom timeout |
| 46 | uv run 01-seed-dataset-preparation.py --output-dir ./seed_data --timeout 30 |
| 47 | |
| 48 | # For help |
| 49 | uv run 01-seed-dataset-preparation.py --help |
| 50 | """ |
| 51 | |
| 52 | from __future__ import annotations |
| 53 | |
| 54 | import base64 |
| 55 | import json |
| 56 | import logging |
| 57 | import os |
| 58 | import urllib.request |
| 59 | from argparse import ArgumentParser |
| 60 | from pathlib import Path |
| 61 | |
| 62 | import fitz # pymupdf |
| 63 | import pandas as pd |
| 64 | from datasets import load_dataset |
| 65 | |
# Root logging setup: timestamped records at INFO level and above.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)
log = logging.getLogger(__name__)

# Third-party loggers are raised to WARNING so that only their warnings and
# errors appear next to this script's own progress messages.
for _noisy_logger in ("httpx", "httpcore", "huggingface_hub", "datasets", "fsspec"):
    logging.getLogger(_noisy_logger).setLevel(logging.WARNING)

# Default rendering resolution (dots per inch) used when pages become PNGs.
DPI = 144
# HuggingFace Hub repository id of the FinePDFs dataset.
FINEPDFS_REPO = "HuggingFaceFW/finepdfs"
| 77 | |
| 78 | |
def download_pdf(url: str, timeout: int = 20) -> bytes | None:
    """Download a PDF from *url*, returning raw bytes or None on failure.

    Args:
        url: Location of the PDF. Only ``http://`` and ``https://`` URLs are
            fetched; anything else is rejected without issuing a request.
        timeout: Timeout in seconds passed to ``urllib.request.urlopen``.

    Returns:
        The full response body as bytes, or ``None`` when the URL is rejected
        or the download fails for any reason. Failures are logged as
        warnings and never raised.
    """
    # ``urlopen`` also understands non-web schemes such as ``file://``.
    # Restricting fetches to web URLs ensures an unexpected URL value can
    # never make the script read local files.
    if not isinstance(url, str) or not url.lower().startswith(("http://", "https://")):
        log.warning("Failed to download %s: unsupported URL scheme", url)
        return None

    try:
        req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return resp.read()
    except Exception as exc:
        # Deliberately broad: one unreachable or malformed URL must not abort
        # the run, so every failure is reported and turned into ``None``.
        log.warning("Failed to download %s: %s", url, exc)
        return None
| 88 | |
| 89 | |
def render_pages(pdf_bytes: bytes, dpi: int = DPI) -> list[bytes]:
    """Render every page of *pdf_bytes* to PNG, returning a list of raw PNG bytes.

    Args:
        pdf_bytes: Contents of a PDF file held in memory.
        dpi: Resolution in dots per inch passed to ``page.get_pixmap``.

    Returns:
        One PNG-encoded ``bytes`` object per page, in document order.

    Raises:
        Exception: Whatever PyMuPDF raises when the data cannot be opened or
            a page cannot be rendered; errors are not handled here.
    """
    # The context manager closes the document even when rendering a page
    # raises, so a failed render does not leave the document open.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        return [page.get_pixmap(dpi=dpi).tobytes("png") for page in doc]
| 99 | |
| 100 | |
def png_to_base64(png_bytes: bytes) -> str:
    """Return the standard base64 encoding of *png_bytes* as text."""
    encoded = base64.b64encode(png_bytes)
    # Base64 output consists solely of ASCII characters.
    return str(encoded, "ascii")
| 104 | |
| 105 | |
def adaptive_window_size(n_pages: int) -> int:
    """Choose a window size that scales with document length.

    Short documents get small windows (2 pages) so multi-page questions
    remain feasible; longer documents get larger windows (up to 8) to
    cover more context per seed row.

    The size grows by one for every additional ten pages:

    ========  ===========
    n_pages   window size
    ========  ===========
    <= 10     2
    11 - 20   3
    21 - 30   4
    31 - 40   5
    41 - 50   6
    51 - 60   7
    > 60      8
    ========  ===========

    Args:
        n_pages: Total number of pages in the document.

    Returns:
        The number of consecutive pages to place in each window.
    """
    # Upper bounds are inclusive so that page counts lying exactly on a tier
    # boundary (20, 30, ...) stay in a tier instead of falling back to the
    # smallest window, which keeps the result monotonic in ``n_pages``.
    tiers = ((10, 2), (20, 3), (30, 4), (40, 5), (50, 6), (60, 7))
    for max_pages, window in tiers:
        if n_pages <= max_pages:
            return window
    return 8
| 126 | |
| 127 | |
def main() -> None:
    """Build the per-page, windowed and whole-document seed parquet files.

    Parses the command-line options, streams rows from the FinePDFs dataset,
    downloads and renders each referenced PDF, and collects three row sets
    that are written to ``--output-dir``:

    * ``seed_per_page.parquet`` - one row per page; ``page_number`` is
      zero-based.
    * ``seed_windowed.parquet`` - one row per non-overlapping window of
      ``adaptive_window_size(n_pages)`` consecutive pages. ``start_page`` is
      inclusive and ``end_page`` exclusive (both zero-based). Trailing pages
      that do not fill a complete window are omitted. The file is only
      written when at least one window was produced.
    * ``seed_whole_document.parquet`` - one row per document.

    In every file ``png_images_base64`` holds a JSON array of base64-encoded
    PNG strings.

    Rows whose PDF cannot be downloaded or rendered, has no pages, or exceeds
    ``--max-pages`` are skipped and do not count towards ``--num-docs``.
    """
    parser = ArgumentParser(description="Prepare seed parquets from FinePDFs")
    parser.add_argument(
        "--output-dir",
        type=str,
        required=True,
        help="Directory for output parquet files",
    )
    parser.add_argument(
        "--num-docs",
        type=int,
        default=10,
        help="Number of PDF documents to process (default: 10)",
    )
    parser.add_argument(
        "--subset",
        type=str,
        default="eng_Latn",
        help="FinePDFs language subset (default: eng_Latn)",
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=20,
        help="HTTP download timeout in seconds (default: 20)",
    )
    parser.add_argument(
        "--dpi",
        type=int,
        default=DPI,
        help=f"Render resolution in DPI (default: {DPI})",
    )
    parser.add_argument(
        "--max-pages",
        type=int,
        default=50,
        help="Skip documents with more pages than this (default: 50)",
    )
    parser.add_argument(
        "--min-window-pages",
        type=int,
        default=2,
        help="Minimum pages in a window; documents shorter than this are skipped for windowed output (default: 2)",
    )
    args = parser.parse_args()

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    log.info(
        "Streaming %d documents from %s (subset=%s)",
        args.num_docs,
        FINEPDFS_REPO,
        args.subset,
    )

    # Streaming mode iterates rows lazily instead of downloading the whole
    # dataset split up front.
    ds = load_dataset(
        FINEPDFS_REPO,
        name=args.subset,
        split="train",
        streaming=True,
    )

    per_page_rows: list[dict] = []
    windowed_rows: list[dict] = []
    whole_doc_rows: list[dict] = []

    docs_processed = 0
    for row in ds:
        if docs_processed >= args.num_docs:
            break

        # Fall back to a synthetic id when the row carries none.
        doc_id = row.get("id", f"doc_{docs_processed:06d}")
        url = row["url"]
        date = row.get("date", "")

        pdf_bytes = download_pdf(url, timeout=args.timeout)
        if pdf_bytes is None:
            continue

        try:
            page_pngs = render_pages(pdf_bytes, dpi=args.dpi)
        except Exception as exc:
            # Best effort: a corrupt or non-PDF download only skips this row.
            log.warning("Failed to render %s: %s", url, exc)
            continue

        if not page_pngs:
            log.warning("No pages rendered for %s, skipping", url)
            continue

        n_pages = len(page_pngs)
        if n_pages > args.max_pages:
            log.info(
                "Skipping %s (%d pages > --max-pages %d)",
                url,
                n_pages,
                args.max_pages,
            )
            continue

        page_b64s = [png_to_base64(png_bytes) for png_bytes in page_pngs]

        for page_idx, b64 in enumerate(page_b64s):
            per_page_rows.append(
                {
                    "id": doc_id,
                    "url": url,
                    "date": date,
                    "page_number": page_idx,
                    "total_pages": n_pages,
                    "png_images_base64": json.dumps([b64]),
                }
            )

        whole_doc_rows.append(
            {
                "id": doc_id,
                "url": url,
                "date": date,
                "total_pages": n_pages,
                "png_images_base64": json.dumps(page_b64s),
            }
        )

        win_size = adaptive_window_size(n_pages)
        # Every emitted window holds exactly ``win_size`` pages, so the
        # minimum-size requirement is decided once per document. Integer
        # division keeps only complete windows; a document with fewer pages
        # than ``win_size`` therefore yields none.
        if win_size >= args.min_window_pages:
            for i in range(n_pages // win_size):
                win_start = i * win_size
                win_end = win_start + win_size
                windowed_rows.append(
                    {
                        "id": doc_id,
                        "url": url,
                        "date": date,
                        "total_pages": n_pages,
                        "start_page": win_start,
                        "end_page": win_end,
                        "window_size": win_size,
                        "png_images_base64": json.dumps(page_b64s[win_start:win_end]),
                    }
                )

        docs_processed += 1
        log.info(
            "[%d/%d] %s — %d pages",
            docs_processed,
            args.num_docs,
            url,
            n_pages,
        )

    if not per_page_rows:
        log.error("No documents were successfully processed. Exiting.")
        return

    per_page_path = output_dir / "seed_per_page.parquet"
    windowed_path = output_dir / "seed_windowed.parquet"
    whole_doc_path = output_dir / "seed_whole_document.parquet"

    pd.DataFrame(per_page_rows).to_parquet(per_page_path, index=False)
    if windowed_rows:
        pd.DataFrame(windowed_rows).to_parquet(windowed_path, index=False)
    pd.DataFrame(whole_doc_rows).to_parquet(whole_doc_path, index=False)

    log.info("Per-page seed: %s (%d rows)", per_page_path, len(per_page_rows))
    if windowed_rows:
        log.info("Windowed seed: %s (%d rows)", windowed_path, len(windowed_rows))
    else:
        # Do not report a file that was never written in this run.
        log.warning(
            "Windowed seed: no windows were produced; %s was not written",
            windowed_path,
        )
    log.info("Whole-document seed: %s (%d rows)", whole_doc_path, len(whole_doc_rows))
| 300 | |
| 301 | |
if __name__ == "__main__":
    main()
    # ``os._exit`` terminates the process without running the interpreter's
    # normal shutdown (atexit handlers, stdio flushing), so flush and close
    # the logging handlers explicitly before exiting.
    logging.shutdown()
    # Force-exit to avoid hanging on background threads from datasets/fsspec.
    os._exit(0)