| 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. |
| 2 | # SPDX-License-Identifier: Apache-2.0 |
| 3 | # /// script |
| 4 | # requires-python = ">=3.10" |
| 5 | # dependencies = [ |
| 6 | # "data-designer", |
| 7 | # ] |
| 8 | # /// |
| 9 | """Text-to-Python Code Generation Recipe |
| 10 | |
| 11 | Generate synthetic instruction-code pairs for Python programming tasks across |
| 12 | different industries, complexity levels, and programming concepts. Each record |
| 13 | includes an instruction, generated code, judge evaluation, and code validation. |
| 14 | |
| 15 | Prerequisites: |
| 16 | - OPENAI_API_KEY environment variable for OpenAI provider model aliases (default model alias is "openai-text"). |
| 17 | - NVIDIA_API_KEY environment variable for NVIDIA provider model aliases. |
| 18 | |
| 19 | Run: |
| 20 | # Basic usage (generates 5 records by default) |
| 21 | uv run text_to_python.py |
| 22 | |
| 23 | # For help message and available options |
| 24 | uv run text_to_python.py --help |
| 25 | """ |
| 26 | |
| 27 | from pathlib import Path |
| 28 | |
| 29 | import data_designer.config as dd |
| 30 | from data_designer.interface import DataDesigner, DatasetCreationResults |
| 31 | |
| 32 | |
def build_config(model_alias: str) -> dd.DataDesignerConfigBuilder:
    """Assemble the column pipeline for the text-to-Python recipe.

    Columns are registered in this order: five sampler columns
    (``industry_sector``, ``topic``, ``code_complexity``, ``code_concept``,
    ``instruction_phrase``), an LLM text column (``instruction``), an LLM code
    column (``code_implementation``), an LLM judge column
    (``code_judge_result``) and a code validation column
    (``code_validity_result``).

    Args:
        model_alias: Model alias handed to every LLM-backed column.

    Returns:
        A config builder with all of the columns above added.
    """
    # Topic choices, keyed by the value sampled for ``industry_sector``.
    topics_by_sector = {
        "Healthcare": [
            "Electronic Health Records (EHR) Systems",
            "Telemedicine Platforms",
            "AI-Powered Diagnostic Tools",
        ],
        "Finance": [
            "Fraud Detection Software",
            "Automated Trading Systems",
            "Personal Finance Apps",
        ],
        "Technology": [
            "Cloud Computing Platforms",
            "Artificial Intelligence and Machine Learning Platforms",
            "DevOps and CI/CD Tools",
        ],
    }

    # Concept choices, keyed by the value sampled for ``code_complexity``.
    concepts_by_complexity = {
        "Beginner": [
            "Variables",
            "Data Types",
            "Functions",
            "Loops",
            "Classes",
        ],
        "Intermediate": [
            "List Comprehensions",
            "Object-oriented programming",
            "Lambda Functions",
            "Web frameworks",
            "Pandas",
        ],
        "Advanced": [
            "Multithreading",
            "Context Managers",
            "Generators",
        ],
    }

    # Prompt for the natural-language task; placeholders name sampler columns.
    instruction_prompt = (
        "Generate an instruction to create Python code that solves a specific problem.\n"
        'The instruction should begin with the following phrase: "{{ instruction_phrase }}".\n\n'
        "Important Guidelines:\n"
        "* Industry Relevance: Ensure the instruction pertains to the {{ industry_sector }} sector and {{ topic }} topic.\n"
        "* Code Complexity: Tailor the instruction to the {{ code_complexity }} level. Utilize relevant {{ code_concept }} where appropriate to match the complexity level.\n"
        "* Clarity and Specificity: Make the problem statement clear and unambiguous. Provide sufficient context to understand the requirements without being overly verbose.\n"
        "* Response Formatting: Do not include any markers such as ### Response ### in the instruction.\n"
    )

    # Prompt for the implementation; it consumes the ``instruction`` column.
    implementation_prompt = (
        "Write Python code for the following instruction:\n"
        "Instruction: {{ instruction }}\n\n"
        "Important Guidelines:\n"
        "* Code Quality: Your code should be clean, complete, self-contained, and accurate.\n"
        "* Code Validity: Please ensure that your Python code is executable and does not contain any errors.\n"
        "* Packages: Remember to import any necessary libraries, and to use all libraries you import.\n"
        "* Complexity & Concepts: The code should be written at a {{ code_complexity }} level, making use of concepts such as {{ code_concept }}.\n"
    )

    # List order is registration order: each column only refers to columns
    # that appear before it.
    column_configs = [
        dd.SamplerColumnConfig(
            name="industry_sector",
            sampler_type=dd.SamplerType.CATEGORY,
            params=dd.CategorySamplerParams(values=["Healthcare", "Finance", "Technology"]),
        ),
        dd.SamplerColumnConfig(
            name="topic",
            sampler_type=dd.SamplerType.SUBCATEGORY,
            params=dd.SubcategorySamplerParams(category="industry_sector", values=topics_by_sector),
        ),
        dd.SamplerColumnConfig(
            name="code_complexity",
            sampler_type=dd.SamplerType.CATEGORY,
            params=dd.CategorySamplerParams(values=["Beginner", "Intermediate", "Advanced"]),
        ),
        dd.SamplerColumnConfig(
            name="code_concept",
            sampler_type=dd.SamplerType.SUBCATEGORY,
            params=dd.SubcategorySamplerParams(category="code_complexity", values=concepts_by_complexity),
        ),
        dd.SamplerColumnConfig(
            name="instruction_phrase",
            sampler_type=dd.SamplerType.CATEGORY,
            params=dd.CategorySamplerParams(
                values=[
                    "Write a function that",
                    "Create a class that",
                    "Implement a script",
                    "Can you create a function",
                    "Develop a module that",
                ],
            ),
        ),
        dd.LLMTextColumnConfig(
            name="instruction",
            model_alias=model_alias,
            system_prompt="You are an expert at generating clear and specific programming tasks.",
            prompt=instruction_prompt,
        ),
        dd.LLMCodeColumnConfig(
            name="code_implementation",
            model_alias=model_alias,
            code_lang=dd.CodeLang.PYTHON,
            system_prompt="You are an expert Python programmer who writes clean, efficient, and well-documented code.",
            prompt=implementation_prompt,
        ),
        dd.LLMJudgeColumnConfig(
            name="code_judge_result",
            model_alias=model_alias,
            prompt=TEXT_TO_PYTHON_JUDGE_TEMPLATE,
            scores=python_scoring,
        ),
        dd.ValidationColumnConfig(
            name="code_validity_result",
            validator_type=dd.ValidatorType.CODE,
            target_columns=["code_implementation"],
            validator_params=dd.CodeValidatorParams(code_lang=dd.CodeLang.PYTHON),
            batch_size=100,
        ),
    ]

    builder = dd.DataDesignerConfigBuilder()
    for column_config in column_configs:
        builder.add_column(column_config)
    return builder
| 193 | |
| 194 | |
def create_dataset(
    config_builder: dd.DataDesignerConfigBuilder,
    num_records: int,
    artifact_path: Path | str | None = None,
) -> DatasetCreationResults:
    """Run a Data Designer creation job for the given configuration.

    Args:
        config_builder: Column configuration to generate the dataset from.
        num_records: Record count forwarded to ``DataDesigner.create``.
        artifact_path: Forwarded to the ``DataDesigner`` constructor;
            ``None`` by default.

    Returns:
        The results object returned by ``DataDesigner.create``.
    """
    return DataDesigner(artifact_path=artifact_path).create(config_builder, num_records=num_records)
| 203 | |
| 204 | |
# Prompt for the `code_judge_result` judge column built in `build_config`.
# The `{{ instruction }}` and `{{ code_implementation }}` placeholders name the
# columns produced earlier in that pipeline.
# NOTE(review): the prompt mentions a rubric "below" that is not part of this
# string; presumably the judge column appends it from its `scores` argument
# (see `python_scoring`) -- confirm against the data-designer documentation.
TEXT_TO_PYTHON_JUDGE_TEMPLATE = """\
You are an expert in Python programming, with specialized knowledge in software engineering, data science, and algorithmic problem-solving.

You think about potential flaws and errors in the code. You are a tough critic, but a fair one.

Take a deep breath and use the Python Code Quality Rubric below to score the **Generated Python Code** based on the INSTRUCTIONS.

#### INSTRUCTIONS
The Generated Python Code should be a valid response to the Natural Language Prompt below

Natural Language Prompt:
{{ instruction }}

Generated Python Code
{{ code_implementation }}
"""
| 221 | |
| 222 | |
# Scoring rubric passed as `scores` to the `code_judge_result` judge column in
# `build_config`. There are four dimensions (Relevance, Pythonic, Readability,
# Efficiency); each maps the integer grades 4 (best) down to 0 (worst) to a
# textual description of that grade.
python_scoring = [
    # How closely the generated code follows the instruction.
    dd.Score(
        name="Relevance",
        description="Adherence to INSTRUCTIONS and CONTEXT",
        options={
            4: "Perfectly meets all specified requirements.",
            3: "Meets most requirements with minor deviations.",
            2: "Moderate deviation from the instructions.",
            1: "Significant deviations from the instructions.",
            0: "Does not adhere to the instructions.",
        },
    ),
    # Use of Python conventions, idioms and best practices.
    dd.Score(
        name="Pythonic",
        description="Pythonic Code and Best Practices (Does the code follow Python conventions and best practices?)",
        options={
            4: "The code exemplifies Pythonic principles, making excellent use of Python-specific constructs, standard library modules and programming idioms; follows all relevant PEPs.",
            3: "The code closely follows Python conventions and adheres to many best practices; good use of Python-specific constructs, standard library modules and programming idioms.",
            2: "The code generally follows Python conventions but has room for better alignment with Pythonic practices.",
            1: "The code loosely follows Python conventions, with several deviations from best practices.",
            0: "The code does not follow Python conventions or best practices, using non-Pythonic approaches.",
        },
    ),
    # Formatting, naming and ease of maintenance.
    dd.Score(
        name="Readability",
        description="Readability and Maintainability (Is the Python code easy to understand and maintain?)",
        options={
            4: (
                "The code is excellently formatted, follows PEP 8 guidelines, is elegantly concise and clear, uses meaningful variable names, "
                "ensuring high readability and ease of maintenance; organizes complex logic well. Docstrings are given in a Google Docstring format."
            ),
            3: "The code is well-formatted in the sense of code-as-documentation, making it relatively easy to understand and maintain; uses descriptive names and organizes logic clearly.",
            2: "The code is somewhat readable with basic formatting and some comments, but improvements are needed; needs better use of descriptive names and organization.",
            1: "The code has minimal formatting, making it hard to understand; lacks meaningful names and organization.",
            0: "The code is unreadable, with no attempt at formatting or description.",
        },
    ),
    # Choice of data structures and algorithms, and avoidance of wasted work.
    dd.Score(
        name="Efficiency",
        description="Efficiency and Performance (Is the code optimized for performance?)",
        options={
            4: "The solution is highly efficient, using appropriate data structures and algorithms; avoids unnecessary computations and optimizes for both time and space complexity.",
            3: "The solution is efficient, with good use of Python's built-in functions and libraries; minor areas for optimization.",
            2: "The solution is moderately efficient, but misses some opportunities for optimization; uses some inefficient patterns.",
            1: "The solution shows poor efficiency, with notable performance issues; lacks effective optimization techniques.",
            0: "The solution is highly inefficient; overlooks fundamental optimization practices, resulting in significant performance issues.",
        },
    ),
]
| 272 | |
| 273 | |
if __name__ == "__main__":
    from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser

    # Command-line entry point: parse the options, build the column
    # configuration, generate the dataset, then report where it was saved.
    # The description and per-option help make `--help` (advertised in the
    # module docstring) informative; the formatter appends each default value.
    parser = ArgumentParser(
        description="Generate synthetic instruction-code pairs for Python programming tasks.",
        formatter_class=ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--model-alias",
        type=str,
        default="openai-text",
        help="Model alias used by the instruction, code and judge columns.",
    )
    parser.add_argument(
        "--num-records",
        type=int,
        default=5,
        help="Number of records to generate.",
    )
    parser.add_argument(
        "--artifact-path",
        type=str,
        default=None,
        help="Artifact path passed to DataDesigner.",
    )
    args = parser.parse_args()

    config_builder = build_config(model_alias=args.model_alias)
    results = create_dataset(config_builder, num_records=args.num_records, artifact_path=args.artifact_path)

    print(f"Dataset saved to: {results.artifact_storage.final_dataset_path}")

    # Load the analysis for the generated dataset and emit it via to_report().
    results.load_analysis().to_report()