Text to Python | NVIDIA NeMo Data Designer

Download Recipe

1 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2 # SPDX-License-Identifier: Apache-2.0
3 # /// script
4 # requires-python = ">=3.10"
5 # dependencies = [
6 #     "data-designer",
7 # ]
8 # ///
9 """Text-to-Python Code Generation Recipe
10 
11 Generate synthetic instruction-code pairs for Python programming tasks across
12 different industries, complexity levels, and programming concepts. Each record
13 includes an instruction, generated code, judge evaluation, and code validation.
14 
15 Prerequisites:
16     - OPENAI_API_KEY environment variable for OpenAI provider model aliases (default model alias is "openai-text").
17     - NVIDIA_API_KEY environment variable for NVIDIA provider model aliases.
18 
19 Run:
20     # Basic usage (generates 5 records by default)
21     uv run text_to_python.py
22 
23     # For help message and available options
24     uv run text_to_python.py --help
25 """
26 
27 from pathlib import Path
28 
29 import data_designer.config as dd
30 from data_designer.interface import DataDesigner, DatasetCreationResults
31 
32 
def build_config(model_alias: str) -> dd.DataDesignerConfigBuilder:
    """Assemble the column configuration for the text-to-Python recipe.

    Columns are added in this order:

    1. ``industry_sector`` (category sampler) and ``topic`` (subcategory
       sampler keyed on ``industry_sector``).
    2. ``code_complexity`` (category sampler) and ``code_concept``
       (subcategory sampler keyed on ``code_complexity``).
    3. ``instruction_phrase`` (category sampler).
    4. ``instruction`` (LLM text column) whose prompt references the sampled
       columns above.
    5. ``code_implementation`` (LLM code column, Python) whose prompt
       references ``instruction``, ``code_complexity`` and ``code_concept``.
    6. ``code_judge_result`` (LLM judge column) using the module-level judge
       template and scoring rubric.
    7. ``code_validity_result`` (code-validator column) targeting
       ``code_implementation``.

    Args:
        model_alias: Model alias passed to the instruction, code, and judge
            columns.

    Returns:
        The config builder with all of the columns above added.
    """

    def category_column(name: str, values: list[str]):
        # Sampler column that draws from a flat list of category values.
        return dd.SamplerColumnConfig(
            name=name,
            sampler_type=dd.SamplerType.CATEGORY,
            params=dd.CategorySamplerParams(values=values),
        )

    def subcategory_column(name: str, category: str, values: dict[str, list[str]]):
        # Sampler column whose candidate values are keyed by the value of the
        # parent ``category`` column.
        return dd.SamplerColumnConfig(
            name=name,
            sampler_type=dd.SamplerType.SUBCATEGORY,
            params=dd.SubcategorySamplerParams(category=category, values=values),
        )

    builder = dd.DataDesignerConfigBuilder()

    # --- Industry sector and a topic within that sector ---------------------
    builder.add_column(
        category_column("industry_sector", ["Healthcare", "Finance", "Technology"]),
    )
    builder.add_column(
        subcategory_column(
            "topic",
            "industry_sector",
            {
                "Healthcare": [
                    "Electronic Health Records (EHR) Systems",
                    "Telemedicine Platforms",
                    "AI-Powered Diagnostic Tools",
                ],
                "Finance": [
                    "Fraud Detection Software",
                    "Automated Trading Systems",
                    "Personal Finance Apps",
                ],
                "Technology": [
                    "Cloud Computing Platforms",
                    "Artificial Intelligence and Machine Learning Platforms",
                    "DevOps and CI/CD Tools",
                ],
            },
        ),
    )

    # --- Complexity level and a programming concept for that level ----------
    builder.add_column(
        category_column("code_complexity", ["Beginner", "Intermediate", "Advanced"]),
    )
    builder.add_column(
        subcategory_column(
            "code_concept",
            "code_complexity",
            {
                "Beginner": [
                    "Variables",
                    "Data Types",
                    "Functions",
                    "Loops",
                    "Classes",
                ],
                "Intermediate": [
                    "List Comprehensions",
                    "Object-oriented programming",
                    "Lambda Functions",
                    "Web frameworks",
                    "Pandas",
                ],
                "Advanced": [
                    "Multithreading",
                    "Context Managers",
                    "Generators",
                ],
            },
        ),
    )

    # --- Opening phrase the generated instruction must start with -----------
    builder.add_column(
        category_column(
            "instruction_phrase",
            [
                "Write a function that",
                "Create a class that",
                "Implement a script",
                "Can you create a function",
                "Develop a module that",
            ],
        ),
    )

    # --- Natural-language instruction ----------------------------------------
    instruction_prompt = (
        "Generate an instruction to create Python code that solves a specific problem.\n"
        'The instruction should begin with the following phrase: "{{ instruction_phrase }}".\n\n'
        "Important Guidelines:\n"
        "* Industry Relevance: Ensure the instruction pertains to the {{ industry_sector }} sector and {{ topic }} topic.\n"
        "* Code Complexity: Tailor the instruction to the {{ code_complexity }} level. Utilize relevant {{ code_concept }} where appropriate to match the complexity level.\n"
        "* Clarity and Specificity: Make the problem statement clear and unambiguous. Provide sufficient context to understand the requirements without being overly verbose.\n"
        "* Response Formatting: Do not include any markers such as ### Response ### in the instruction.\n"
    )
    instruction_column = dd.LLMTextColumnConfig(
        name="instruction",
        model_alias=model_alias,
        system_prompt="You are an expert at generating clear and specific programming tasks.",
        prompt=instruction_prompt,
    )
    builder.add_column(instruction_column)

    # --- Python code answering the instruction -------------------------------
    code_prompt = (
        "Write Python code for the following instruction:\n"
        "Instruction: {{ instruction }}\n\n"
        "Important Guidelines:\n"
        "* Code Quality: Your code should be clean, complete, self-contained, and accurate.\n"
        "* Code Validity: Please ensure that your Python code is executable and does not contain any errors.\n"
        "* Packages: Remember to import any necessary libraries, and to use all libraries you import.\n"
        "* Complexity & Concepts: The code should be written at a {{ code_complexity }} level, making use of concepts such as {{ code_concept }}.\n"
    )
    code_column = dd.LLMCodeColumnConfig(
        name="code_implementation",
        model_alias=model_alias,
        code_lang=dd.CodeLang.PYTHON,
        system_prompt="You are an expert Python programmer who writes clean, efficient, and well-documented code.",
        prompt=code_prompt,
    )
    builder.add_column(code_column)

    # --- LLM judge scoring the generated code --------------------------------
    # The template and rubric are module-level names defined further down the
    # file; they are resolved when this function is called, not when it is
    # defined.
    judge_column = dd.LLMJudgeColumnConfig(
        name="code_judge_result",
        model_alias=model_alias,
        prompt=TEXT_TO_PYTHON_JUDGE_TEMPLATE,
        scores=python_scoring,
    )
    builder.add_column(judge_column)

    # --- Code validation of the generated implementation ---------------------
    validity_column = dd.ValidationColumnConfig(
        name="code_validity_result",
        validator_type=dd.ValidatorType.CODE,
        target_columns=["code_implementation"],
        validator_params=dd.CodeValidatorParams(code_lang=dd.CodeLang.PYTHON),
        batch_size=100,
    )
    builder.add_column(validity_column)

    return builder
193 
194 
def create_dataset(
    config_builder: dd.DataDesignerConfigBuilder,
    num_records: int,
    artifact_path: Path | str | None = None,
) -> DatasetCreationResults:
    """Generate a dataset from a prepared configuration.

    Args:
        config_builder: Column configuration to generate from.
        num_records: Number of records requested from ``DataDesigner.create``.
        artifact_path: Forwarded unchanged to the ``DataDesigner``
            constructor; defaults to ``None``.

    Returns:
        The results object returned by ``DataDesigner.create``.
    """
    return DataDesigner(artifact_path=artifact_path).create(
        config_builder,
        num_records=num_records,
    )
203 
204 
# Prompt template for the judge column. The ``{{ instruction }}`` and
# ``{{ code_implementation }}`` placeholders name the columns produced by
# ``build_config``. Written one line per string fragment so each line break in
# the rendered prompt is explicit.
TEXT_TO_PYTHON_JUDGE_TEMPLATE = (
    "You are an expert in Python programming, with specialized knowledge in software engineering, data science, and algorithmic problem-solving.\n"
    "\n"
    "You think about potential flaws and errors in the code. You are a tough critic, but a fair one.\n"
    "\n"
    "Take a deep breath and use the Python Code Quality Rubric below to score the **Generated Python Code** based on the INSTRUCTIONS.\n"
    "\n"
    "#### INSTRUCTIONS\n"
    "The Generated Python Code should be a valid response to the Natural Language Prompt below\n"
    "\n"
    "Natural Language Prompt:\n"
    "{{ instruction }}\n"
    "\n"
    "Generated Python Code\n"
    "{{ code_implementation }}\n"
)
221 
222 
# Scoring rubric handed to the judge column. Each entry becomes one
# ``dd.Score`` with a name, a description, and option texts keyed by the
# integer scores 4 down to 0.
python_scoring = [
    dd.Score(name=score_name, description=score_description, options=score_options)
    for score_name, score_description, score_options in (
        (
            "Relevance",
            "Adherence to INSTRUCTIONS and CONTEXT",
            {
                4: "Perfectly meets all specified requirements.",
                3: "Meets most requirements with minor deviations.",
                2: "Moderate deviation from the instructions.",
                1: "Significant deviations from the instructions.",
                0: "Does not adhere to the instructions.",
            },
        ),
        (
            "Pythonic",
            "Pythonic Code and Best Practices (Does the code follow Python conventions and best practices?)",
            {
                4: "The code exemplifies Pythonic principles, making excellent use of Python-specific constructs, standard library modules and programming idioms; follows all relevant PEPs.",
                3: "The code closely follows Python conventions and adheres to many best practices; good use of Python-specific constructs, standard library modules and programming idioms.",
                2: "The code generally follows Python conventions but has room for better alignment with Pythonic practices.",
                1: "The code loosely follows Python conventions, with several deviations from best practices.",
                0: "The code does not follow Python conventions or best practices, using non-Pythonic approaches.",
            },
        ),
        (
            "Readability",
            "Readability and Maintainability (Is the Python code easy to understand and maintain?)",
            {
                4: (
                    "The code is excellently formatted, follows PEP 8 guidelines, is elegantly concise and clear, uses meaningful variable names, "
                    "ensuring high readability and ease of maintenance; organizes complex logic well. Docstrings are given in a Google Docstring format."
                ),
                3: "The code is well-formatted in the sense of code-as-documentation, making it relatively easy to understand and maintain; uses descriptive names and organizes logic clearly.",
                2: "The code is somewhat readable with basic formatting and some comments, but improvements are needed; needs better use of descriptive names and organization.",
                1: "The code has minimal formatting, making it hard to understand; lacks meaningful names and organization.",
                0: "The code is unreadable, with no attempt at formatting or description.",
            },
        ),
        (
            "Efficiency",
            "Efficiency and Performance (Is the code optimized for performance?)",
            {
                4: "The solution is highly efficient, using appropriate data structures and algorithms; avoids unnecessary computations and optimizes for both time and space complexity.",
                3: "The solution is efficient, with good use of Python's built-in functions and libraries; minor areas for optimization.",
                2: "The solution is moderately efficient, but misses some opportunities for optimization; uses some inefficient patterns.",
                1: "The solution shows poor efficiency, with notable performance issues; lacks effective optimization techniques.",
                0: "The solution is highly inefficient; overlooks fundamental optimization practices, resulting in significant performance issues.",
            },
        ),
    )
]
272 
273 
if __name__ == "__main__":
    from argparse import ArgumentParser

    # Command-line options; every flag is optional and has a default.
    cli = ArgumentParser()
    cli.add_argument("--model-alias", type=str, default="openai-text")
    cli.add_argument("--num-records", type=int, default=5)
    cli.add_argument("--artifact-path", type=str, default=None)
    options = cli.parse_args()

    # Build the column configuration, then generate the dataset from it.
    dataset_results = create_dataset(
        build_config(model_alias=options.model_alias),
        num_records=options.num_records,
        artifact_path=options.artifact_path,
    )

    print(f"Dataset saved to: {dataset_results.artifact_storage.final_dataset_path}")

    # Load the analysis for the generated dataset and render its report.
    dataset_results.load_analysis().to_report()