Markdown Section Seed Reader | NVIDIA NeMo Data Designer

Turn a directory of Markdown files into a seed dataset with one row per section. This recipe stays in the same single-file format as the other recipes: it creates sample files, defines an inline FileSystemSeedReader[DirectorySeedSource], and passes that reader to DataDesigner(seed_readers=[...]).

This keeps the example focused on the actual seed reader contract:

- implementing `build_manifest(...)`
- returning 1:N hydrated rows from `hydrate_row(...)`
- declaring `output_columns` for the hydrated schema
- keeping `IndexRange` selection manifest-based

Because the example reuses DirectorySeedSource, it does not register a brand-new seed_type. If you later want to package the same reader as an installable plugin, see FileSystemSeedReader Plugins.

Run the Recipe

Run the script directly:

$ uv run markdown_seed_reader.py

The script prints two previews:

- the full section dataset across all Markdown files
- a manifest-only selection using `IndexRange(start=1, end=1)` that still returns every section from the selected file

Download Recipe

Download the complete recipe script

1 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2 # SPDX-License-Identifier: Apache-2.0
3 # /// script
4 # requires-python = ">=3.10"
5 # dependencies = [
6 #     "data-designer",
7 # ]
8 # ///
9 """Markdown Section Seed Reader Recipe
10 
11 Prototype a custom FileSystemSeedReader inline by overriding how one
12 DataDesigner instance handles DirectorySeedSource inputs. The reader keeps a
13 file-based manifest and fans each Markdown file out into one row per section.
14 This keeps the example in the same single-file format as the other recipes
15 while still showing the core `build_manifest(...)` and `hydrate_row(...)`
16 contract for a custom filesystem-backed seed reader.
17 
18 Run:
19     uv run markdown_seed_reader.py
20 """
21 
22 from __future__ import annotations
23 
24 import re
25 from pathlib import Path
26 from tempfile import TemporaryDirectory
27 from typing import Any, ClassVar
28 
29 import data_designer.config as dd
30 from data_designer.config.seed import IndexRange
31 from data_designer.engine.resources.seed_reader import FileSystemSeedReader, SeedReaderFileSystemContext
32 from data_designer.interface import DataDesigner
33 
34 _ATX_HEADING_PATTERN = re.compile(r"^(#{1,6})[ \t]+(.+?)\s*$")
35 
36 
class MarkdownSectionDirectorySeedReader(FileSystemSeedReader[dd.DirectorySeedSource]):
    """Seed reader that expands Markdown files from a DirectorySeedSource into section rows.

    The manifest holds one entry per matched file; hydrating an entry reads
    that file and emits one record for every section found in it.
    """

    # Column names carried by every record returned from ``hydrate_row``.
    output_columns: ClassVar[list[str]] = [
        "relative_path",
        "file_name",
        "section_index",
        "section_header",
        "section_content",
    ]

    def build_manifest(self, *, context: SeedReaderFileSystemContext) -> list[dict[str, str]]:
        """Build the manifest with one entry per matched file.

        Each entry is derived from the matched relative path alone: the path
        itself plus its final component as ``file_name``.
        """

        manifest: list[dict[str, str]] = []
        for matched_path in self.get_matching_relative_paths(
            context=context,
            file_pattern=self.source.file_pattern,
            recursive=self.source.recursive,
        ):
            manifest.append(
                {
                    "relative_path": matched_path,
                    "file_name": Path(matched_path).name,
                }
            )
        return manifest

    def hydrate_row(
        self,
        *,
        manifest_row: dict[str, Any],
        context: SeedReaderFileSystemContext,
    ) -> list[dict[str, Any]]:
        """Expand one manifest entry into a record per Markdown section.

        The file named by ``manifest_row["relative_path"]`` is read as UTF-8
        through ``context.fs`` and split with ``extract_markdown_sections``;
        the file name serves as the fallback header. ``section_index`` is the
        zero-based position of the section within its file.
        """

        source_path = str(manifest_row["relative_path"])
        source_name = str(manifest_row["file_name"])
        with context.fs.open(source_path, "r", encoding="utf-8") as markdown_file:
            file_contents = markdown_file.read()

        records: list[dict[str, Any]] = []
        parsed_sections = extract_markdown_sections(markdown_text=file_contents, fallback_header=source_name)
        for position, (header, body) in enumerate(parsed_sections):
            records.append(
                {
                    "relative_path": source_path,
                    "file_name": source_name,
                    "section_index": position,
                    "section_header": header,
                    "section_content": body,
                }
            )
        return records
88 
89 
def _split_fence_marker(line: str) -> tuple[str, str]:
    """Return ``(marker, remainder)`` when *line* starts with a code-fence run.

    ``marker`` is the leading run of three or more backticks or tildes
    (preceded by at most three spaces) and ``remainder`` is the text after
    it. When the line does not start with such a run, ``marker`` is ``""``.
    """

    unindented = line.lstrip(" ")
    if len(line) - len(unindented) > 3:
        # Four or more spaces of indentation never starts a fence.
        return "", line
    for fence_char in ("`", "~"):
        run_length = len(unindented) - len(unindented.lstrip(fence_char))
        if run_length >= 3:
            return unindented[:run_length], unindented[run_length:]
    return "", line


def extract_markdown_sections(*, markdown_text: str, fallback_header: str) -> list[tuple[str, str]]:
    """Split Markdown into `(header, content)` pairs using ATX headings.

    Text that appears before the first heading is filed under
    ``fallback_header``. Lines inside fenced code blocks (``` or ~~~) are
    always treated as content, so a ``# comment`` in a code sample does not
    start a new section. An unterminated fence runs to the end of the text.

    Args:
        markdown_text: Raw Markdown document.
        fallback_header: Header used for content that precedes any heading.

    Returns:
        Sections in document order, each with its content stripped of
        surrounding whitespace. Pairs whose header and content are both empty
        are dropped.
    """

    sections: list[tuple[str, str]] = []
    current_header = fallback_header
    current_lines: list[str] = []
    saw_heading = False
    open_fence = ""  # marker of the fence currently open, "" when outside one

    for line in markdown_text.splitlines():
        marker, remainder = _split_fence_marker(line)

        if open_fence:
            # A closing fence uses the same character, is at least as long as
            # the opening run, and carries nothing but whitespace after it.
            if marker[:1] == open_fence[0] and len(marker) >= len(open_fence) and not remainder.strip():
                open_fence = ""
            current_lines.append(line)
            continue

        # A backtick fence whose trailing text contains a backtick is inline
        # code, not the start of a fenced block.
        if marker and not (marker[0] == "`" and "`" in remainder):
            open_fence = marker
            current_lines.append(line)
            continue

        heading_match = _ATX_HEADING_PATTERN.match(line)
        if heading_match is None:
            current_lines.append(line)
            continue

        # Flush the section collected so far; a leading run of blank lines
        # before the very first heading does not count as a section.
        if saw_heading or any(existing_line.strip() for existing_line in current_lines):
            sections.append((current_header, "\n".join(current_lines).strip()))
        current_header = heading_match.group(2).strip()
        current_lines = []
        saw_heading = True

    if saw_heading or markdown_text.strip():
        sections.append((current_header, "\n".join(current_lines).strip()))

    return [section for section in sections if any(section)]
117 
118 
def create_sample_markdown_files(seed_dir: Path) -> None:
    """Write the two sample Markdown files (``faq.md`` and ``guide.md``) into ``seed_dir``.

    Each file holds a top-level heading and one second-level heading so the
    recipe has input to work with without any external data.
    """

    sample_documents = {
        "faq.md": "# FAQ\nAnswers to frequent questions.\n\n## Support\nContact support@example.com.",
        "guide.md": "# Quickstart\nInstall Data Designer.\n\n## Usage\nRun the recipe with uv.",
    }
    for document_name, document_text in sample_documents.items():
        target = seed_dir / document_name
        target.write_text(document_text, encoding="utf-8")
130 
131 
def build_config(
    *,
    seed_path: Path,
    selection_strategy: IndexRange | None = None,
) -> dd.DataDesignerConfigBuilder:
    """Assemble the config builder shared by both preview runs.

    The builder seeds from the ``*.md`` files under ``seed_path`` (optionally
    narrowed by ``selection_strategy``) and adds a ``section_summary``
    expression column combining the file name and section header.
    """

    builder = dd.DataDesignerConfigBuilder()

    markdown_source = dd.DirectorySeedSource(path=str(seed_path), file_pattern="*.md")
    builder.with_seed_dataset(markdown_source, selection_strategy=selection_strategy)

    summary_column = dd.ExpressionColumnConfig(
        name="section_summary",
        expr="{{ file_name }} :: {{ section_header }}",
    )
    builder.add_column(summary_column)

    return builder
151 
152 
def print_preview(
    *,
    data_designer: DataDesigner,
    title: str,
    config_builder: dd.DataDesignerConfigBuilder,
    num_records: int,
) -> None:
    """Print ``title``, run a preview, and print a selected subset of its columns.

    The table is rendered without the index column and is followed by a blank
    line so consecutive previews stay visually separated.
    """

    shown_columns = [
        "relative_path",
        "section_index",
        "section_header",
        "section_summary",
    ]

    # The title goes out before the preview runs, so it is visible even if the
    # preview call fails.
    print(title)
    result = data_designer.preview(config_builder, num_records=num_records)
    table = result.dataset[shown_columns]
    print(table.to_string(index=False))
    print()
175 
176 
def main() -> None:
    """Write the sample corpus to a temporary directory and print both previews."""

    with TemporaryDirectory(prefix="markdown-seed-reader-") as scratch_root:
        markdown_dir = Path(scratch_root) / "sample_markdown"
        markdown_dir.mkdir()
        create_sample_markdown_files(markdown_dir)

        # The custom reader is handed to this DataDesigner instance only.
        designer = DataDesigner(seed_readers=[MarkdownSectionDirectorySeedReader()])

        # First preview: no selection strategy.
        unfiltered_config = build_config(seed_path=markdown_dir)
        print_preview(
            data_designer=designer,
            title="Full preview across all markdown files",
            config_builder=unfiltered_config,
            num_records=4,
        )

        # Second preview: the same config narrowed with an IndexRange.
        second_file_only = IndexRange(start=1, end=1)
        filtered_config = build_config(
            seed_path=markdown_dir,
            selection_strategy=second_file_only,
        )
        print_preview(
            data_designer=designer,
            title="Manifest-based selection of only the second matched file",
            config_builder=filtered_config,
            num_records=2,
        )
202 
203 
204 if __name__ == "__main__":
205     main()