Turn a directory of Markdown files into a seed dataset with one row per section. This recipe stays in the same single-file format as the other recipes: it creates sample files, defines an inline FileSystemSeedReader[DirectorySeedSource], and passes that reader to DataDesigner(seed_readers=[...]).
This keeps the example focused on the actual seed reader contract:
- build_manifest(...)
- 1:N hydrated rows from hydrate_row(...)
- output_columns for the hydrated schema
- manifest-based IndexRange selection

Because the example reuses DirectorySeedSource, it does not register a brand-new seed_type. If you later want to package the same reader as an installable plugin, see FileSystemSeedReader Plugins.
Run the script directly:
$ uv run markdown_seed_reader.py
The script prints two previews:
- a full preview across all Markdown files
- a manifest-based selection with IndexRange(start=1, end=1) that still returns every section from the selected file
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "data-designer",
# ]
# ///
"""Markdown Section Seed Reader Recipe

Prototype a custom FileSystemSeedReader inline by overriding how one
DataDesigner instance handles DirectorySeedSource inputs. The reader keeps a
file-based manifest and fans each Markdown file out into one row per section.
This keeps the example in the same single-file format as the other recipes
while still showing the core `build_manifest(...)` and `hydrate_row(...)`
contract for a custom filesystem-backed seed reader.

Run:
    uv run markdown_seed_reader.py
"""

from __future__ import annotations

import re
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Any, ClassVar

import data_designer.config as dd
from data_designer.config.seed import IndexRange
from data_designer.engine.resources.seed_reader import FileSystemSeedReader, SeedReaderFileSystemContext
from data_designer.interface import DataDesigner

# An ATX heading: one to six `#` characters, at least one space or tab, then
# the heading text (captured without trailing whitespace).
_ATX_HEADING_PATTERN = re.compile(r"^(#{1,6})[ \t]+(.+?)\s*$")

# A code-fence delimiter: up to three spaces of indentation, a run of at least
# three backticks or tildes (group 1), then whatever follows on the line
# (group 2, the info string on an opening fence).
_CODE_FENCE_PATTERN = re.compile(r"^ {0,3}(`{3,}|~{3,})(.*)$")


class MarkdownSectionDirectorySeedReader(FileSystemSeedReader[dd.DirectorySeedSource]):
    """Turn each Markdown file matched by DirectorySeedSource into section rows.

    The manifest holds one row per matched file; hydration expands each
    manifest row into one record per heading section of that file.
    """

    # Column names of the hydrated records returned by `hydrate_row`.
    output_columns: ClassVar[list[str]] = [
        "relative_path",
        "file_name",
        "section_index",
        "section_header",
        "section_content",
    ]

    def build_manifest(self, *, context: SeedReaderFileSystemContext) -> list[dict[str, str]]:
        """Return one cheap manifest row per matched Markdown file.

        Files are not opened here; each row carries only the matched relative
        path and the file name derived from it.
        """

        matched_paths = self.get_matching_relative_paths(
            context=context,
            file_pattern=self.source.file_pattern,
            recursive=self.source.recursive,
        )
        return [
            {
                "relative_path": relative_path,
                "file_name": Path(relative_path).name,
            }
            for relative_path in matched_paths
        ]

    def hydrate_row(
        self,
        *,
        manifest_row: dict[str, Any],
        context: SeedReaderFileSystemContext,
    ) -> list[dict[str, Any]]:
        """Read one Markdown file and fan it out into one record per heading section.

        Args:
            manifest_row: A row produced by `build_manifest`; its
                `relative_path` and `file_name` values are read.
            context: Filesystem context whose `fs` is used to open the file.

        Returns:
            One record per extracted section, keyed by `output_columns`, with
            `section_index` counting from zero within the file.
        """

        relative_path = str(manifest_row["relative_path"])
        file_name = str(manifest_row["file_name"])
        with context.fs.open(relative_path, "r", encoding="utf-8") as handle:
            markdown_text = handle.read()

        # The file name stands in as the header for text that precedes the
        # first heading (or for files without any heading at all).
        sections = extract_markdown_sections(markdown_text=markdown_text, fallback_header=file_name)
        return [
            {
                "relative_path": relative_path,
                "file_name": file_name,
                "section_index": section_index,
                "section_header": section_header,
                "section_content": section_content,
            }
            for section_index, (section_header, section_content) in enumerate(sections)
        ]


def extract_markdown_sections(*, markdown_text: str, fallback_header: str) -> list[tuple[str, str]]:
    """Split Markdown into `(header, content)` pairs using ATX headings.

    Lines inside fenced code blocks (delimited by three or more backticks or
    tildes) are never treated as headings, so a `# comment` line in a shell or
    Python sample stays part of the surrounding section's content.

    Args:
        markdown_text: Full text of one Markdown document.
        fallback_header: Header assigned to non-blank text that appears before
            the first heading, or to a document without headings.

    Returns:
        Sections in document order. Content is the text between headings with
        surrounding whitespace stripped; pairs whose header and content are
        both empty are dropped. Blank input yields an empty list.
    """

    sections: list[tuple[str, str]] = []
    current_header = fallback_header
    current_lines: list[str] = []
    saw_heading = False
    # Marker run (e.g. "```") that opened the fenced code block we are
    # currently inside; empty while outside any fence.
    open_fence = ""

    for line in markdown_text.splitlines():
        fence_match = _CODE_FENCE_PATTERN.match(line)
        if fence_match is not None:
            marker, trailing_text = fence_match.groups()
            if open_fence:
                # Only a fence of the same character, at least as long as the
                # opener and followed by nothing but whitespace, closes it.
                if (
                    marker[0] == open_fence[0]
                    and len(marker) >= len(open_fence)
                    and not trailing_text.strip()
                ):
                    open_fence = ""
            elif marker[0] == "~" or "`" not in trailing_text:
                # A backtick run whose trailing text contains a backtick is
                # inline code, not an opening fence.
                open_fence = marker
            # A line starting with a fence marker can never be a heading.
            current_lines.append(line)
            continue

        heading_match = None if open_fence else _ATX_HEADING_PATTERN.match(line)
        if heading_match is not None:
            # Flush the previous section; a leading preamble is kept only when
            # it contains some non-blank text.
            if saw_heading or any(existing_line.strip() for existing_line in current_lines):
                sections.append((current_header, "\n".join(current_lines).strip()))
            current_header = heading_match.group(2).strip()
            current_lines = []
            saw_heading = True
            continue
        current_lines.append(line)

    # Flush the trailing section (or the whole document when it has no heading).
    if saw_heading or markdown_text.strip():
        sections.append((current_header, "\n".join(current_lines).strip()))

    return [
        (section_header, section_content)
        for section_header, section_content in sections
        if section_header or section_content
    ]


def create_sample_markdown_files(seed_dir: Path) -> None:
    """Create a tiny Markdown corpus that keeps the recipe self-contained.

    Writes `faq.md` and `guide.md` into `seed_dir`, each with two headings.
    """

    (seed_dir / "faq.md").write_text(
        "# FAQ\nAnswers to frequent questions.\n\n## Support\nContact support@example.com.",
        encoding="utf-8",
    )
    (seed_dir / "guide.md").write_text(
        "# Quickstart\nInstall Data Designer.\n\n## Usage\nRun the recipe with uv.",
        encoding="utf-8",
    )


def build_config(
    *,
    seed_path: Path,
    selection_strategy: IndexRange | None = None,
) -> dd.DataDesignerConfigBuilder:
    """Create the dataset config used by both preview runs in the recipe.

    Args:
        seed_path: Directory containing the Markdown files to seed from.
        selection_strategy: Optional `IndexRange` forwarded to
            `with_seed_dataset`; `None` is passed through unchanged.

    Returns:
        A config builder with the directory seed source and one expression
        column, `section_summary`, combining file name and section header.
    """

    config_builder = dd.DataDesignerConfigBuilder()
    config_builder.with_seed_dataset(
        dd.DirectorySeedSource(path=str(seed_path), file_pattern="*.md"),
        selection_strategy=selection_strategy,
    )
    config_builder.add_column(
        dd.ExpressionColumnConfig(
            name="section_summary",
            expr="{{ file_name }} :: {{ section_header }}",
        )
    )
    return config_builder


def print_preview(
    *,
    data_designer: DataDesigner,
    title: str,
    config_builder: dd.DataDesignerConfigBuilder,
    num_records: int,
) -> None:
    """Run a preview and print the columns that matter for the walkthrough.

    Prints `title`, then the `relative_path`, `section_index`,
    `section_header` and `section_summary` columns of the preview dataset,
    then a blank line.
    """

    print(title)
    preview = data_designer.preview(config_builder, num_records=num_records)
    print(
        preview.dataset[
            [
                "relative_path",
                "section_index",
                "section_header",
                "section_summary",
            ]
        ].to_string(index=False)
    )
    print()


def main() -> None:
    """Build sample input files and print previews with and without selection."""

    with TemporaryDirectory(prefix="markdown-seed-reader-") as temp_dir:
        seed_dir = Path(temp_dir) / "sample_markdown"
        seed_dir.mkdir()
        create_sample_markdown_files(seed_dir)

        # Passing the reader here overrides how this DataDesigner instance
        # handles DirectorySeedSource inputs.
        data_designer = DataDesigner(seed_readers=[MarkdownSectionDirectorySeedReader()])

        print_preview(
            data_designer=data_designer,
            title="Full preview across all markdown files",
            config_builder=build_config(seed_path=seed_dir),
            num_records=4,
        )
        print_preview(
            data_designer=data_designer,
            title="Manifest-based selection of only the second matched file",
            config_builder=build_config(
                seed_path=seed_dir,
                selection_strategy=IndexRange(start=1, end=1),
            ),
            num_records=2,
        )


if __name__ == "__main__":
    main()