For AI agents: a documentation index is available at the root level at /llms.txt and /llms-full.txt. Append /llms.txt to any URL for a page-level index, or .md for the markdown version of any page.
  • Getting Started
    • Welcome
    • Contributing
  • Concepts
    • Columns
    • Seed Datasets
    • Agent Rollout Ingestion
    • Custom Columns
    • Validators
    • Processors
    • Person Sampling
    • Traces
    • Architecture & Performance
    • Deployment Options
    • Security
  • Tutorials
    • Overview
    • The Basics
    • Structured Outputs, Jinja Expressions, and Conditional Generation
    • Seeding with an External Dataset
    • Providing Images as Context
    • Generating Images
    • Image-to-Image Editing
  • Recipes
    • Recipe Cards
      • Text to Python
      • Text to SQL
      • Nemotron Super Text to SQL
  • Plugins
    • Overview
    • Example Plugin
    • FileSystemSeedReader Plugins
    • Discover
  • Code Reference
    • Overview
  • Dev Notes
    • Overview
    • Prompt Sensitivity
    • Retriever SDG Toolkit
    • Have It Your Way
    • VLM Long Document Understanding
    • Push Datasets to Hugging Face Hub
    • Text-to-SQL for Nemotron Super
    • Async All the Way Down
    • Owning the Model Stack
NVIDIANVIDIA
Developer-friendly docs for your API
Privacy Policy | Manage My Privacy | Do Not Sell or Share My Data | Terms of Service | Accessibility | Corporate Policies | Product Security | Contact

Copyright © 2026, NVIDIA Corporation.

LogoLogoNeMo Data Designer
RecipesCode Generation

Text to SQL

||View as Markdown|
Previous

Text to Python

Next

Nemotron Super Text to SQL

Download Recipe

Download the complete recipe script

1# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2# SPDX-License-Identifier: Apache-2.0
3# /// script
4# requires-python = ">=3.10"
5# dependencies = [
6# "data-designer",
7# ]
8# ///
9"""Text-to-SQL Code Generation Recipe
10
11Generate synthetic instruction-SQL pairs for database tasks across different
12industries, complexity levels, and SQL concepts. Each record includes an
13instruction, database context (CREATE TABLE + sample data), generated SQL query,
14code validation, and judge evaluation.
15
16Prerequisites:
17 - OPENAI_API_KEY environment variable for OpenAI provider model aliases (default model alias is "openai-text").
18 - NVIDIA_API_KEY environment variable for NVIDIA provider model aliases.
19
20Run:
21 # Basic usage (generates 5 records by default)
22 uv run text_to_sql.py
23
24 # For help message and available options
25 uv run text_to_sql.py --help
26"""
27
28from pathlib import Path
29
30import data_designer.config as dd
31from data_designer.interface import DataDesigner, DatasetCreationResults
32
33
34def build_config(model_alias: str) -> dd.DataDesignerConfigBuilder:
35 config_builder = dd.DataDesignerConfigBuilder()
36
37 config_builder.add_column(
38 dd.SamplerColumnConfig(
39 name="industry_sector",
40 sampler_type=dd.SamplerType.CATEGORY,
41 params=dd.CategorySamplerParams(
42 values=["Healthcare", "Finance", "Technology"],
43 ),
44 )
45 )
46
47 config_builder.add_column(
48 dd.SamplerColumnConfig(
49 name="topic",
50 sampler_type=dd.SamplerType.SUBCATEGORY,
51 params=dd.SubcategorySamplerParams(
52 category="industry_sector",
53 values={
54 "Healthcare": [
55 "Electronic Health Records (EHR) Systems",
56 "Telemedicine Platforms",
57 "AI-Powered Diagnostic Tools",
58 ],
59 "Finance": [
60 "Fraud Detection Software",
61 "Automated Trading Systems",
62 "Personal Finance Apps",
63 ],
64 "Technology": [
65 "Cloud Computing Platforms",
66 "Artificial Intelligence and Machine Learning Platforms",
67 "DevOps and CI/CD Tools",
68 ],
69 },
70 ),
71 )
72 )
73
74 config_builder.add_column(
75 dd.SamplerColumnConfig(
76 name="sql_complexity",
77 sampler_type=dd.SamplerType.CATEGORY,
78 params=dd.CategorySamplerParams(
79 values=["Beginner", "Intermediate", "Advanced"],
80 ),
81 )
82 )
83
84 config_builder.add_column(
85 dd.SamplerColumnConfig(
86 name="sql_concept",
87 sampler_type=dd.SamplerType.SUBCATEGORY,
88 params=dd.SubcategorySamplerParams(
89 category="sql_complexity",
90 values={
91 "Beginner": [
92 "Basic SELECT Statements",
93 "WHERE Clauses",
94 "Basic JOINs",
95 "INSERT, UPDATE, DELETE",
96 ],
97 "Intermediate": [
98 "Aggregation Functions",
99 "Multiple JOINs",
100 "Subqueries",
101 "Views",
102 ],
103 "Advanced": [
104 "Window Functions",
105 "Common Table Expressions (CTEs)",
106 "Stored Procedures",
107 "Query Optimization",
108 ],
109 },
110 ),
111 )
112 )
113
114 config_builder.add_column(
115 dd.SamplerColumnConfig(
116 name="sql_task_type",
117 sampler_type=dd.SamplerType.CATEGORY,
118 params=dd.CategorySamplerParams(
119 values=[
120 "Data Retrieval",
121 "Data Manipulation",
122 "Analytics and Reporting",
123 "Data Transformation",
124 ],
125 ),
126 )
127 )
128
129 config_builder.add_column(
130 dd.SamplerColumnConfig(
131 name="instruction_phrase",
132 sampler_type=dd.SamplerType.CATEGORY,
133 params=dd.CategorySamplerParams(
134 values=[
135 "Write an SQL query that",
136 "Create an SQL statement to",
137 "Develop an SQL query to",
138 "Can you write SQL that",
139 "Formulate an SQL query that",
140 ],
141 ),
142 )
143 )
144
145 config_builder.add_column(
146 dd.LLMTextColumnConfig(
147 name="sql_prompt",
148 model_alias=model_alias,
149 system_prompt="You are an expert at generating clear and specific SQL tasks.",
150 prompt=SQL_PROMPT_TEXT,
151 )
152 )
153
154 config_builder.add_column(
155 dd.LLMCodeColumnConfig(
156 name="sql_context",
157 model_alias=model_alias,
158 code_lang=dd.CodeLang.SQL_ANSI,
159 system_prompt=(
160 "You are an expert SQL database designer who creates clean, efficient, and "
161 "well-structured database schemas."
162 ),
163 prompt=SQL_CONTEXT_TEXT,
164 )
165 )
166
167 config_builder.add_column(
168 dd.LLMCodeColumnConfig(
169 name="sql",
170 model_alias=model_alias,
171 code_lang=dd.CodeLang.SQL_ANSI,
172 system_prompt="You are an expert SQL programmer who writes clean, efficient, and well-structured queries.",
173 prompt=SQL_CODE_TEXT,
174 )
175 )
176
177 config_builder.add_column(
178 dd.ValidationColumnConfig(
179 name="code_validity_result",
180 validator_type=dd.ValidatorType.CODE,
181 target_columns=["sql"],
182 validator_params=dd.CodeValidatorParams(
183 code_lang=dd.CodeLang.SQL_ANSI,
184 ),
185 batch_size=100,
186 )
187 )
188
189 config_builder.add_column(
190 dd.LLMJudgeColumnConfig(
191 name="code_judge_result",
192 model_alias=model_alias,
193 prompt=TEXT_TO_SQL_JUDGE_TEMPLATE,
194 scores=sql_scoring,
195 )
196 )
197
198 return config_builder
199
200
201def create_dataset(
202 config_builder: dd.DataDesignerConfigBuilder,
203 num_records: int,
204 artifact_path: Path | str | None = None,
205) -> DatasetCreationResults:
206 data_designer = DataDesigner(artifact_path=artifact_path)
207 results = data_designer.create(config_builder, num_records=num_records)
208 return results
209
210
211SQL_PROMPT_TEXT = (
212 "Generate an instruction to create SQL code that solves a specific problem.\n"
213 "Each instruction should begin with one of the following phrases: {{instruction_phrase}}.\n\n"
214 "Important Guidelines:\n"
215 "* Industry Relevance: Ensure the instruction pertains to the {{industry_sector}} sector and {{topic}} topic.\n"
216 "* SQL Complexity: Tailor the instruction to the {{sql_complexity}} level. Utilize relevant {{sql_concept}} "
217 "where appropriate to match the complexity level.\n"
218 "* Task Type: The instruction should involve a {{sql_task_type}} task.\n"
219 "* Clarity and Specificity: Make the problem statement clear and unambiguous. Provide sufficient context to "
220 "understand the requirements without being overly verbose.\n"
221 "* Response Formatting: Do not include any markers such as ### Response ### in the instruction.\n"
222)
223
224SQL_CONTEXT_TEXT = (
225 "Generate the SQL for creating database tables that would be relevant for the following instruction:\n"
226 "Instruction: {{sql_prompt}}\n\n"
227 "Important Guidelines:\n"
228 "* Relevance: Ensure all tables are directly related to the {{industry_sector}} sector and {{topic}} topic.\n"
229 "* Completeness: Include all essential columns with appropriate data types, primary/foreign keys, and necessary constraints.\n"
230 "* Realism: Use realistic table structures typical for the specified industry.\n"
231 "* Executable SQL: Provide complete CREATE TABLE statements that can be run without modification.\n"
232 "* Consistency: Use consistent naming conventions (e.g., snake_case for table and column names).\n"
233 "* Sample Data: Include INSERT statements with sample data that makes sense for the tables (at least 5-10 rows per table)."
234)
235
236SQL_CODE_TEXT = (
237 "Write SQL code for the following instruction based on the provided database context:\n"
238 "Instruction: {{sql_prompt}}\n\n"
239 "Database Context:\n"
240 "{{sql_context}}\n\n"
241 "Important Guidelines:\n"
242 "* Code Quality: Your SQL should be clean, complete, self-contained and accurate.\n"
243 "* Code Validity: Please ensure that your SQL code is executable and does not contain any errors.\n"
244 "* Context: Base your query on the provided database context. Only reference tables and columns that "
245 "exist in the context.\n"
246 "* Complexity & Concepts: The SQL should be written at a {{sql_complexity}} level, making use of "
247 "concepts such as {{sql_concept}}.\n"
248 "* Task Type: Ensure your solution implements the appropriate {{sql_task_type}} operation.\n"
249 "* Comments: Include brief comments explaining the key parts of your query.\n"
250)
251
252
253TEXT_TO_SQL_JUDGE_TEMPLATE = """\
254You are an expert in SQL with deep knowledge of relational modeling, query semantics,
255and performance tuning across common dialects (e.g., PostgreSQL, MySQL, SQLite, SQL Server).
256You think critically about correctness, readability, and efficiency.
257
258Use the SQL Query Quality Rubric below to score the **Generated SQL Query** based on the INSTRUCTIONS.
259
260#### INSTRUCTIONS
261The Generated SQL Query should be a valid response to the Natural Language Prompt below
262
263Natural Language Prompt:
264{{ sql_prompt }}
265
266Database Context:
267{{ sql_context }}
268
269Generated SQL Query
270{{ sql }}
271"""
272
273
274sql_scoring = [
275 dd.Score(
276 name="Relevance",
277 description="Adherence to INSTRUCTIONS and CONTEXT",
278 options={
279 4: "Perfectly meets all specified requirements.",
280 3: "Meets most requirements with minor deviations.",
281 2: "Moderate deviation from the instructions.",
282 1: "Significant deviations from the instructions.",
283 0: "Does not adhere to the instructions.",
284 },
285 ),
286 dd.Score(
287 name="SQL Correctness",
288 description="Syntax and semantic correctness; returns the intended result",
289 options={
290 4: "Valid SQL with correct joins, filters, grouping/aggregation, and NULL handling; produces the intended result set under the stated/implicit dialect.",
291 3: "Generally correct with minor issues (e.g., edge-case NULLs, minor grouping detail) but still likely yields the intended result.",
292 2: "Partially correct; noticeable semantic mistakes (joins, grouping, filters) that may change results or fail in edge cases.",
293 1: "Largely incorrect; major semantic or syntactic errors likely causing failure or wrong results.",
294 0: "Invalid SQL or unrelated to the task; will not run or cannot produce a meaningful result.",
295 },
296 ),
297 dd.Score(
298 name="Readability",
299 description="Formatting, clarity, and maintainability",
300 options={
301 4: "Cleanly formatted (keywords/clauses consistently styled), clear structure (CTEs/subqueries where helpful), meaningful table/column aliases, and concise.",
302 3: "Generally readable with consistent formatting and understandable aliases; could be organized slightly better.",
303 2: "Somewhat readable but inconsistent formatting or confusing aliasing; structure is harder to follow.",
304 1: "Poorly formatted and hard to read; unclear structure and aliasing.",
305 0: "Unreadable or chaotic; no meaningful structure or styling.",
306 },
307 ),
308 dd.Score(
309 name="Efficiency",
310 description="Query performance best practices",
311 options={
312 4: "Uses sargable predicates, appropriate joins, selective filters early, avoids SELECT *, unnecessary DISTINCT, and wasteful subqueries; likely to use indexes effectively.",
313 3: "Mostly efficient; minor opportunities for improvement (e.g., simplifying expressions, reducing data early).",
314 2: "Moderate inefficiencies (e.g., non-sargable filters, unnecessary nested subqueries, broad SELECT *).",
315 1: "Notably inefficient patterns likely causing large scans or poor plans.",
316 0: "Highly inefficient; ignores basic best practices and likely to perform very poorly.",
317 },
318 ),
319]
320
321if __name__ == "__main__":
322 from argparse import ArgumentParser
323
324 parser = ArgumentParser()
325 parser.add_argument("--model-alias", type=str, default="openai-text")
326 parser.add_argument("--num-records", type=int, default=5)
327 parser.add_argument("--artifact-path", type=str, default=None)
328 args = parser.parse_args()
329
330 config_builder = build_config(model_alias=args.model_alias)
331 results = create_dataset(config_builder, num_records=args.num_records, artifact_path=args.artifact_path)
332
333 print(f"Dataset saved to: {results.artifact_storage.final_dataset_path}")
334
335 results.load_analysis().to_report()