For AI agents: a documentation index is available at the root level at /llms.txt and /llms-full.txt. Append /llms.txt to any URL for a page-level index, or .md for the markdown version of any page.
  • Getting Started
    • Welcome
    • Contributing
  • Concepts
    • Columns
    • Seed Datasets
    • Agent Rollout Ingestion
    • Custom Columns
    • Validators
    • Processors
    • Person Sampling
    • Traces
    • Architecture & Performance
    • Deployment Options
    • Security
  • Tutorials
    • Overview
    • The Basics
    • Structured Outputs, Jinja Expressions, and Conditional Generation
    • Seeding with an External Dataset
    • Providing Images as Context
    • Generating Images
    • Image-to-Image Editing
  • Recipes
    • Recipe Cards
      • Text to Python
      • Text to SQL
      • Nemotron Super Text to SQL
  • Plugins
    • Overview
    • Example Plugin
    • FileSystemSeedReader Plugins
    • Discover
  • Code Reference
    • Overview
  • Dev Notes
    • Overview
    • Have It Your Way
    • VLM Long Document Understanding
    • Push Datasets to Hugging Face Hub
    • Text-to-SQL for Nemotron Super
    • Async All the Way Down
    • Owning the Model Stack
NVIDIA
Developer-friendly docs for your API
Privacy Policy | Your Privacy Choices | Terms of Service | Accessibility | Corporate Policies | Product Security | Contact

Copyright © 2026, NVIDIA Corporation.

NeMo Data Designer
Recipes > Code Generation

Text to Python

View as Markdown
Previous

Use Case Recipes

Next

Text to SQL

Download Recipe

Download the complete recipe script

1# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2# SPDX-License-Identifier: Apache-2.0
3# /// script
4# requires-python = ">=3.10"
5# dependencies = [
6# "data-designer",
7# ]
8# ///
9"""Text-to-Python Code Generation Recipe
10
11Generate synthetic instruction-code pairs for Python programming tasks across
12different industries, complexity levels, and programming concepts. Each record
13includes an instruction, generated code, judge evaluation, and code validation.
14
15Prerequisites:
16 - OPENAI_API_KEY environment variable for OpenAI provider model aliases (default model alias is "openai-text").
17 - NVIDIA_API_KEY environment variable for NVIDIA provider model aliases.
18
19Run:
20 # Basic usage (generates 5 records by default)
21 uv run text_to_python.py
22
23 # For help message and available options
24 uv run text_to_python.py --help
25"""
26
27from pathlib import Path
28
29import data_designer.config as dd
30from data_designer.interface import DataDesigner, DatasetCreationResults
31
32
def build_config(model_alias: str) -> dd.DataDesignerConfigBuilder:
    """Assemble the column pipeline for the text-to-Python recipe.

    Columns are registered in this order: five sampler columns
    (``industry_sector``, ``topic``, ``code_complexity``, ``code_concept``,
    ``instruction_phrase``), an LLM text column (``instruction``), an LLM code
    column (``code_implementation``), an LLM judge column
    (``code_judge_result``) and a code validation column
    (``code_validity_result``).

    Args:
        model_alias: Model alias passed to every LLM-backed column.

    Returns:
        The config builder with all recipe columns added.
    """
    builder = dd.DataDesignerConfigBuilder()

    def add_category(name: str, values: list[str]) -> None:
        # Register a CATEGORY sampler column over ``values``.
        builder.add_column(
            dd.SamplerColumnConfig(
                name=name,
                sampler_type=dd.SamplerType.CATEGORY,
                params=dd.CategorySamplerParams(values=values),
            ),
        )

    def add_subcategory(name: str, category: str, values: dict[str, list[str]]) -> None:
        # Register a SUBCATEGORY sampler column keyed on the ``category`` column.
        builder.add_column(
            dd.SamplerColumnConfig(
                name=name,
                sampler_type=dd.SamplerType.SUBCATEGORY,
                params=dd.SubcategorySamplerParams(category=category, values=values),
            ),
        )

    # --- Sampler columns -------------------------------------------------
    add_category("industry_sector", ["Healthcare", "Finance", "Technology"])
    add_subcategory(
        "topic",
        "industry_sector",
        {
            "Healthcare": [
                "Electronic Health Records (EHR) Systems",
                "Telemedicine Platforms",
                "AI-Powered Diagnostic Tools",
            ],
            "Finance": [
                "Fraud Detection Software",
                "Automated Trading Systems",
                "Personal Finance Apps",
            ],
            "Technology": [
                "Cloud Computing Platforms",
                "Artificial Intelligence and Machine Learning Platforms",
                "DevOps and CI/CD Tools",
            ],
        },
    )
    add_category("code_complexity", ["Beginner", "Intermediate", "Advanced"])
    add_subcategory(
        "code_concept",
        "code_complexity",
        {
            "Beginner": [
                "Variables",
                "Data Types",
                "Functions",
                "Loops",
                "Classes",
            ],
            "Intermediate": [
                "List Comprehensions",
                "Object-oriented programming",
                "Lambda Functions",
                "Web frameworks",
                "Pandas",
            ],
            "Advanced": [
                "Multithreading",
                "Context Managers",
                "Generators",
            ],
        },
    )
    add_category(
        "instruction_phrase",
        [
            "Write a function that",
            "Create a class that",
            "Implement a script",
            "Can you create a function",
            "Develop a module that",
        ],
    )

    # --- LLM-generated columns -------------------------------------------
    # The prompts are Jinja templates that reference the sampler columns above.
    instruction_prompt = (
        "Generate an instruction to create Python code that solves a specific problem.\n"
        'The instruction should begin with the following phrase: "{{ instruction_phrase }}".\n\n'
        "Important Guidelines:\n"
        "* Industry Relevance: Ensure the instruction pertains to the {{ industry_sector }} sector and {{ topic }} topic.\n"
        "* Code Complexity: Tailor the instruction to the {{ code_complexity }} level. Utilize relevant {{ code_concept }} where appropriate to match the complexity level.\n"
        "* Clarity and Specificity: Make the problem statement clear and unambiguous. Provide sufficient context to understand the requirements without being overly verbose.\n"
        "* Response Formatting: Do not include any markers such as ### Response ### in the instruction.\n"
    )
    builder.add_column(
        dd.LLMTextColumnConfig(
            name="instruction",
            model_alias=model_alias,
            system_prompt="You are an expert at generating clear and specific programming tasks.",
            prompt=instruction_prompt,
        )
    )

    code_prompt = (
        "Write Python code for the following instruction:\n"
        "Instruction: {{ instruction }}\n\n"
        "Important Guidelines:\n"
        "* Code Quality: Your code should be clean, complete, self-contained, and accurate.\n"
        "* Code Validity: Please ensure that your Python code is executable and does not contain any errors.\n"
        "* Packages: Remember to import any necessary libraries, and to use all libraries you import.\n"
        "* Complexity & Concepts: The code should be written at a {{ code_complexity }} level, making use of concepts such as {{ code_concept }}.\n"
    )
    builder.add_column(
        dd.LLMCodeColumnConfig(
            name="code_implementation",
            model_alias=model_alias,
            code_lang=dd.CodeLang.PYTHON,
            system_prompt="You are an expert Python programmer who writes clean, efficient, and well-documented code.",
            prompt=code_prompt,
        )
    )

    # --- Evaluation columns ----------------------------------------------
    # The judge template and rubric are module-level constants; they are
    # looked up when this function is called, not when it is defined.
    builder.add_column(
        dd.LLMJudgeColumnConfig(
            name="code_judge_result",
            model_alias=model_alias,
            prompt=TEXT_TO_PYTHON_JUDGE_TEMPLATE,
            scores=python_scoring,
        )
    )
    builder.add_column(
        dd.ValidationColumnConfig(
            name="code_validity_result",
            validator_type=dd.ValidatorType.CODE,
            target_columns=["code_implementation"],
            validator_params=dd.CodeValidatorParams(code_lang=dd.CodeLang.PYTHON),
            batch_size=100,
        )
    )

    return builder
193
194
def create_dataset(
    config_builder: dd.DataDesignerConfigBuilder,
    num_records: int,
    artifact_path: Path | str | None = None,
) -> DatasetCreationResults:
    """Create a dataset from ``config_builder`` with a fresh ``DataDesigner``.

    Args:
        config_builder: Column configuration handed to ``DataDesigner.create``.
        num_records: Record count forwarded to ``DataDesigner.create``.
        artifact_path: Forwarded to the ``DataDesigner`` constructor; defaults
            to ``None``.

    Returns:
        The result object returned by ``DataDesigner.create``.
    """
    designer = DataDesigner(artifact_path=artifact_path)
    return designer.create(config_builder, num_records=num_records)
203
204
# Prompt template used by the ``code_judge_result`` judge column. The Jinja
# placeholders refer to the ``instruction`` and ``code_implementation`` columns.
TEXT_TO_PYTHON_JUDGE_TEMPLATE = """\
You are an expert in Python programming, with specialized knowledge in software engineering, data science, and algorithmic problem-solving.

You think about potential flaws and errors in the code. You are a tough critic, but a fair one.

Take a deep breath and use the Python Code Quality Rubric below to score the **Generated Python Code** based on the INSTRUCTIONS.

#### INSTRUCTIONS
The Generated Python Code should be a valid response to the Natural Language Prompt below

Natural Language Prompt:
{{ instruction }}

Generated Python Code
{{ code_implementation }}
"""
221
222
# Rubric passed as ``scores`` to the judge column. Each dimension maps the
# integer grades 4 (best) down to 0 (worst) to a textual description.
python_scoring = [
    # How well the code follows the instruction.
    dd.Score(
        name="Relevance",
        description="Adherence to INSTRUCTIONS and CONTEXT",
        options={
            4: "Perfectly meets all specified requirements.",
            3: "Meets most requirements with minor deviations.",
            2: "Moderate deviation from the instructions.",
            1: "Significant deviations from the instructions.",
            0: "Does not adhere to the instructions.",
        },
    ),
    # Use of Python conventions and idioms.
    dd.Score(
        name="Pythonic",
        description="Pythonic Code and Best Practices (Does the code follow Python conventions and best practices?)",
        options={
            4: "The code exemplifies Pythonic principles, making excellent use of Python-specific constructs, standard library modules and programming idioms; follows all relevant PEPs.",
            3: "The code closely follows Python conventions and adheres to many best practices; good use of Python-specific constructs, standard library modules and programming idioms.",
            2: "The code generally follows Python conventions but has room for better alignment with Pythonic practices.",
            1: "The code loosely follows Python conventions, with several deviations from best practices.",
            0: "The code does not follow Python conventions or best practices, using non-Pythonic approaches.",
        },
    ),
    # Formatting, naming and maintainability.
    dd.Score(
        name="Readability",
        description="Readability and Maintainability (Is the Python code easy to understand and maintain?)",
        options={
            4: (
                "The code is excellently formatted, follows PEP 8 guidelines, is elegantly concise and clear, uses meaningful variable names, "
                "ensuring high readability and ease of maintenance; organizes complex logic well. Docstrings are given in a Google Docstring format."
            ),
            3: "The code is well-formatted in the sense of code-as-documentation, making it relatively easy to understand and maintain; uses descriptive names and organizes logic clearly.",
            2: "The code is somewhat readable with basic formatting and some comments, but improvements are needed; needs better use of descriptive names and organization.",
            1: "The code has minimal formatting, making it hard to understand; lacks meaningful names and organization.",
            0: "The code is unreadable, with no attempt at formatting or description.",
        },
    ),
    # Algorithmic and runtime efficiency.
    dd.Score(
        name="Efficiency",
        description="Efficiency and Performance (Is the code optimized for performance?)",
        options={
            4: "The solution is highly efficient, using appropriate data structures and algorithms; avoids unnecessary computations and optimizes for both time and space complexity.",
            3: "The solution is efficient, with good use of Python's built-in functions and libraries; minor areas for optimization.",
            2: "The solution is moderately efficient, but misses some opportunities for optimization; uses some inefficient patterns.",
            1: "The solution shows poor efficiency, with notable performance issues; lacks effective optimization techniques.",
            0: "The solution is highly inefficient; overlooks fundamental optimization practices, resulting in significant performance issues.",
        },
    ),
]
272
273
if __name__ == "__main__":
    import argparse

    # Command-line options: model alias, record count, optional artifact path.
    cli = argparse.ArgumentParser()
    cli.add_argument("--model-alias", type=str, default="openai-text")
    cli.add_argument("--num-records", type=int, default=5)
    cli.add_argument("--artifact-path", type=str, default=None)
    options = cli.parse_args()

    # Build the recipe configuration, then generate the dataset from it.
    dataset = create_dataset(
        build_config(model_alias=options.model_alias),
        num_records=options.num_records,
        artifact_path=options.artifact_path,
    )

    print(f"Dataset saved to: {dataset.artifact_storage.final_dataset_path}")

    dataset.load_analysis().to_report()