For AI agents: a documentation index is available at the root level at /llms.txt and /llms-full.txt. Append /llms.txt to any URL for a page-level index, or .md for the markdown version of any page.
  • Getting Started
    • Welcome
    • Contributing
  • Concepts
    • Columns
    • Seed Datasets
    • Agent Rollout Ingestion
    • Custom Columns
    • Validators
    • Processors
    • Person Sampling
    • Traces
    • Architecture & Performance
    • Deployment Options
    • Security
  • Tutorials
    • Overview
    • The Basics
    • Structured Outputs, Jinja Expressions, and Conditional Generation
    • Seeding with an External Dataset
    • Providing Images as Context
    • Generating Images
    • Image-to-Image Editing
  • Recipes
    • Recipe Cards
      • Markdown Section Seed Reader Plugin
  • Plugins
    • Overview
    • Example Plugin
    • FileSystemSeedReader Plugins
    • Discover
  • Code Reference
    • Overview
  • Dev Notes
    • Overview
    • Have It Your Way
    • VLM Long Document Understanding
    • Push Datasets to Hugging Face Hub
    • Text-to-SQL for Nemotron Super
    • Async All the Way Down
    • Owning the Model Stack
NVIDIA
Developer-friendly docs for your API
Privacy Policy | Your Privacy Choices | Terms of Service | Accessibility | Corporate Policies | Product Security | Contact

Copyright © 2026, NVIDIA Corporation.

NeMo Data Designer
On this page
  • Run the Recipe
Recipes > Plugin Development

Markdown Section Seed Reader

View as Markdown
Previous

Nemotron Super Text to SQL

Next

Product Info QA

Turn a directory of Markdown files into a seed dataset with one row per section. This recipe stays in the same single-file format as the other recipes: it creates sample files, defines an inline FileSystemSeedReader[DirectorySeedSource], and passes that reader to DataDesigner(seed_readers=[...]).

This keeps the example focused on the actual seed reader contract:

  • implementing build_manifest(...)
  • returning 1:N hydrated rows from hydrate_row(...)
  • declaring output_columns for the hydrated schema
  • keeping IndexRange selection manifest-based

Because the example reuses DirectorySeedSource, it does not register a brand-new seed_type. If you later want to package the same reader as an installable plugin, see FileSystemSeedReader Plugins.

Run the Recipe

Run the script directly:

$ uv run markdown_seed_reader.py

The script prints two previews:

  • the full section dataset across all Markdown files
  • a manifest-only selection using IndexRange(start=1, end=1) that still returns every section from the selected file
Download Recipe

Download the complete recipe script

1# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2# SPDX-License-Identifier: Apache-2.0
3# /// script
4# requires-python = ">=3.10"
5# dependencies = [
6# "data-designer",
7# ]
8# ///
9"""Markdown Section Seed Reader Recipe
10
11Prototype a custom FileSystemSeedReader inline by overriding how one
12DataDesigner instance handles DirectorySeedSource inputs. The reader keeps a
13file-based manifest and fans each Markdown file out into one row per section.
14This keeps the example in the same single-file format as the other recipes
15while still showing the core `build_manifest(...)` and `hydrate_row(...)`
16contract for a custom filesystem-backed seed reader.
17
18Run:
19 uv run markdown_seed_reader.py
20"""
21
22from __future__ import annotations
23
24import re
25from pathlib import Path
26from tempfile import TemporaryDirectory
27from typing import Any, ClassVar
28
29import data_designer.config as dd
30from data_designer.config.seed import IndexRange
31from data_designer.engine.resources.seed_reader import FileSystemSeedReader, SeedReaderFileSystemContext
32from data_designer.interface import DataDesigner
33
# Matches an ATX heading line: one to six "#" characters at the start of the
# line, at least one space or tab, then the heading text (captured as group 2).
# An optional closing run of "#" preceded by whitespace (e.g. "## Title ##") is
# decoration under CommonMark, so it is matched outside the capture group and
# does not end up in the heading text.
_ATX_HEADING_PATTERN = re.compile(r"^(#{1,6})[ \t]+(.+?)(?:[ \t]+#+)?\s*$")
35
36
class MarkdownSectionDirectorySeedReader(FileSystemSeedReader[dd.DirectorySeedSource]):
    """Seed reader that expands each matched Markdown file into per-section rows.

    The manifest holds one entry per file; hydration reads that file and emits
    one record for every section found in it.
    """

    # Column names carried by every hydrated row.
    output_columns: ClassVar[list[str]] = [
        "relative_path",
        "file_name",
        "section_index",
        "section_header",
        "section_content",
    ]

    def build_manifest(self, *, context: SeedReaderFileSystemContext) -> list[dict[str, str]]:
        """Build the manifest: one path/name entry per matching file, without reading contents."""

        manifest: list[dict[str, str]] = []
        for path_in_source in self.get_matching_relative_paths(
            context=context,
            file_pattern=self.source.file_pattern,
            recursive=self.source.recursive,
        ):
            manifest.append(
                {
                    "relative_path": path_in_source,
                    "file_name": Path(path_in_source).name,
                }
            )
        return manifest

    def hydrate_row(
        self,
        *,
        manifest_row: dict[str, Any],
        context: SeedReaderFileSystemContext,
    ) -> list[dict[str, Any]]:
        """Load the file named by `manifest_row` and return one record per section."""

        path_in_source = str(manifest_row["relative_path"])
        name = str(manifest_row["file_name"])

        with context.fs.open(path_in_source, "r", encoding="utf-8") as markdown_file:
            text = markdown_file.read()

        # The file name stands in as the header for any text before the first heading.
        parsed_sections = extract_markdown_sections(markdown_text=text, fallback_header=name)

        records: list[dict[str, Any]] = []
        for position, (header, content) in enumerate(parsed_sections):
            records.append(
                {
                    "relative_path": path_in_source,
                    "file_name": name,
                    "section_index": position,
                    "section_header": header,
                    "section_content": content,
                }
            )
        return records
88
89
def _leading_fence_run(line: str) -> str:
    """Return the run of fence characters that starts `line`, or "" if there is none.

    A fence line is indented by at most three spaces and begins with three or
    more backticks or tildes. A backtick run followed by another backtick on
    the same line is an inline code span rather than a fence, so it is ignored.
    """

    stripped = line.lstrip(" ")
    if len(line) - len(stripped) > 3:
        return ""
    for marker in ("`", "~"):
        run_length = len(stripped) - len(stripped.lstrip(marker))
        if run_length < 3:
            continue
        if marker == "`" and "`" in stripped[run_length:]:
            return ""
        return marker * run_length
    return ""


def extract_markdown_sections(*, markdown_text: str, fallback_header: str) -> list[tuple[str, str]]:
    """Split Markdown into `(header, content)` pairs using ATX headings.

    Lines inside fenced code blocks (delimited by three or more backticks or
    tildes) are always kept as content, so a `# comment` in a code sample does
    not start a new section.

    Args:
        markdown_text: Full text of one Markdown document.
        fallback_header: Header used for any non-blank text that appears before
            the first heading, or for a document that has no heading at all.

    Returns:
        Sections in document order. Content is stripped of surrounding
        whitespace; pairs whose header and content are both empty are dropped.
    """

    sections: list[tuple[str, str]] = []
    current_header = fallback_header
    current_lines: list[str] = []
    saw_heading = False
    open_fence = ""  # fence run that opened the current code block, "" when outside one

    for line in markdown_text.splitlines():
        fence_run = _leading_fence_run(line)
        if open_fence:
            # A closing fence uses the same character, is at least as long as
            # the opening fence, and carries nothing else on the line.
            if fence_run.startswith(open_fence) and line.strip() == fence_run:
                open_fence = ""
            current_lines.append(line)
            continue
        if fence_run:
            open_fence = fence_run
            current_lines.append(line)
            continue

        heading_match = _ATX_HEADING_PATTERN.match(line)
        if heading_match is not None:
            # Flush the section collected so far; leading blank-only preamble
            # before the very first heading is not a section of its own.
            if saw_heading or any(existing_line.strip() for existing_line in current_lines):
                sections.append((current_header, "\n".join(current_lines).strip()))
            current_header = heading_match.group(2).strip()
            current_lines = []
            saw_heading = True
            continue
        current_lines.append(line)

    # Flush the trailing section (or the whole document when it has no heading).
    if saw_heading or markdown_text.strip():
        sections.append((current_header, "\n".join(current_lines).strip()))

    return [
        (section_header, section_content)
        for section_header, section_content in sections
        if section_header or section_content
    ]
117
118
def create_sample_markdown_files(seed_dir: Path) -> None:
    """Write the two small Markdown documents the recipe uses as seed input.

    The files are created directly inside `seed_dir`, so the recipe needs no
    external data.
    """

    sample_documents = {
        "faq.md": "# FAQ\nAnswers to frequent questions.\n\n## Support\nContact support@example.com.",
        "guide.md": "# Quickstart\nInstall Data Designer.\n\n## Usage\nRun the recipe with uv.",
    }
    for document_name, document_text in sample_documents.items():
        seed_dir.joinpath(document_name).write_text(document_text, encoding="utf-8")
130
131
def build_config(
    *,
    seed_path: Path,
    selection_strategy: IndexRange | None = None,
) -> dd.DataDesignerConfigBuilder:
    """Assemble the config shared by the recipe's two preview runs.

    The seed dataset is a `DirectorySeedSource` over `seed_path` with the
    `*.md` file pattern, optionally narrowed by `selection_strategy`. A single
    expression column joins the file name and the section header.
    """

    builder = dd.DataDesignerConfigBuilder()

    markdown_source = dd.DirectorySeedSource(path=str(seed_path), file_pattern="*.md")
    builder.with_seed_dataset(markdown_source, selection_strategy=selection_strategy)

    summary_column = dd.ExpressionColumnConfig(
        name="section_summary",
        expr="{{ file_name }} :: {{ section_header }}",
    )
    builder.add_column(summary_column)

    return builder
151
152
def print_preview(
    *,
    data_designer: DataDesigner,
    title: str,
    config_builder: dd.DataDesignerConfigBuilder,
    num_records: int,
) -> None:
    """Print `title`, then a preview table limited to the walkthrough's key columns.

    The table is followed by a blank line so consecutive previews stay visually
    separated.
    """

    shown_columns = [
        "relative_path",
        "section_index",
        "section_header",
        "section_summary",
    ]

    print(title)
    result = data_designer.preview(config_builder, num_records=num_records)
    table_text = result.dataset[shown_columns].to_string(index=False)
    print(table_text)
    print()
175
176
def main() -> None:
    """Create the sample corpus in a temporary directory and print two previews.

    The first preview covers every Markdown file; the second applies
    `IndexRange(start=1, end=1)` to the seed dataset.
    """

    with TemporaryDirectory(prefix="markdown-seed-reader-") as workspace:
        markdown_dir = Path(workspace) / "sample_markdown"
        markdown_dir.mkdir()
        create_sample_markdown_files(markdown_dir)

        designer = DataDesigner(seed_readers=[MarkdownSectionDirectorySeedReader()])

        full_config = build_config(seed_path=markdown_dir)
        print_preview(
            data_designer=designer,
            title="Full preview across all markdown files",
            config_builder=full_config,
            num_records=4,
        )

        second_file_only = IndexRange(start=1, end=1)
        selected_config = build_config(
            seed_path=markdown_dir,
            selection_strategy=second_file_only,
        )
        print_preview(
            data_designer=designer,
            title="Manifest-based selection of only the second matched file",
            config_builder=selected_config,
            num_records=2,
        )


# Run the recipe only when this file is executed as a script.
if __name__ == "__main__":
    main()