| 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. |
| 2 | # SPDX-License-Identifier: Apache-2.0 |
| 3 | # /// script |
| 4 | # requires-python = ">=3.10" |
| 5 | # dependencies = [ |
| 6 | # "data-designer", |
| 7 | # ] |
| 8 | # /// |
| 9 | """Nemotron Super Text-to-SQL Recipe: Distractors, Dirty Data, and Multi-Judge Scoring |
| 10 | |
| 11 | Generate enterprise-grade text-to-SQL training data with dialect-specific SQL |
| 12 | (SQLite, MySQL, PostgreSQL), distractor table/column injection, dirty data |
| 13 | handling, conditional sampling, and multi-dimensional LLM judge scoring. |
| 14 | |
| 15 | This recipe implements the pipeline used to produce 96.5k validated text-to-SQL |
| 16 | records for Nemotron Super v3 SFT training, which raised BIRD benchmark |
| 17 | execution accuracy from 26.77% to 41.80%. |
| 18 | |
| 19 | Pipeline architecture: |
| 20 | |
| 21 | ┌─────────────────────────────────────────────────────────────────────────┐ |
| 22 | │ STAGE 1: SEEDING & DIVERSIFICATION │ |
| 23 | │ │ |
| 24 | │ Domain Controls SQL Controls Prompt Controls │ |
| 25 | │ ├─ industry_sector ├─ sql_complexity ├─ instruction_style │ |
| 26 | │ ├─ topic (conditional) ├─ sql_concept ├─ linguistic_register │ |
| 27 | │ ├─ data_quality_challenge├─ sql_task_type └─ politeness_level │ |
| 28 | │ ├─ data_quality_concept │ (conditional) │ |
| 29 | │ ├─ knowledge_dependency └─ sql_task_concept │ |
| 30 | │ └─ knowledge_concept │ |
| 31 | ├─────────────────────────────────────────────────────────────────────────┤ |
| 32 | │ STAGE 2: PROMPT GENERATION (LLM) │ |
| 33 | │ Natural-language request grounded in metadata; no SQL jargon. │ |
| 34 | │ Style adapts to instruction_style × register × politeness. │ |
| 35 | ├─────────────────────────────────────────────────────────────────────────┤ |
| 36 | │ STAGE 3: SCHEMA + DATA GENERATION (LLM) │ |
| 37 | │ Dialect-specific DDL + INSERT with 3-5 core tables, 1-2 distractor │ |
| 38 | │ tables, 3-5 distractor columns per table, dirty data injection. │ |
| 39 | ├─────────────────────────────────────────────────────────────────────────┤ |
| 40 | │ STAGE 4: SQL GENERATION (LLM) │ |
| 41 | │ Dialect-specific SQL; ignores distractors; handles dirty data. │ |
| 42 | ├─────────────────────────────────────────────────────────────────────────┤ |
| 43 | │ STAGE 5: VALIDATION + QUALITY SCORING │ |
| 44 | │ │ |
| 45 | │ Syntax Validator 5 LLM Judges (0-4 scores) │ |
| 46 | │ ├─ SQL_SQLITE ├─ Prompt: naturalness, specificity, │ |
| 47 | │ ├─ SQL_MYSQL │ absence of SQL jargon │ |
| 48 | │ └─ SQL_POSTGRES ├─ SQL: relevance, readability, │ |
| 49 | │ │ scalability, standards │ |
| 50 | │ ├─ Context: relevance, readability, │ |
| 51 | │ │ scalability, standards │ |
| 52 | │ ├─ Data Quality: cleaning correctness, │ |
| 53 | │ │ efficiency │ |
| 54 | │ └─ Knowledge: application correctness, │ |
| 55 | │ clarity of inference │ |
| 56 | │ │ |
| 57 | │ 15 score columns extracted for downstream filtering │ |
| 58 | └─────────────────────────────────────────────────────────────────────────┘ |
| 59 | |
| 60 | Prerequisites: |
| 61 | - OPENAI_API_KEY environment variable for OpenAI provider model aliases (default model alias is "openai-text"). |
| 62 | - NVIDIA_API_KEY environment variable for NVIDIA provider model aliases. |
| 63 | |
| 64 | Run: |
| 65 | # Basic usage (generates 5 records by default, SQLite dialect) |
| 66 | uv run enterprise_text_to_sql.py |
| 67 | |
| 68 | # Generate for a specific dialect |
| 69 | uv run enterprise_text_to_sql.py --dialect postgres |
| 70 | |
| 71 | # For help message and available options |
| 72 | uv run enterprise_text_to_sql.py --help |
| 73 | """ |
| 74 | |
| 75 | from pathlib import Path |
| 76 | |
| 77 | import data_designer.config as dd |
| 78 | from data_designer.interface import DataDesigner, DatasetCreationResults |
| 79 | |
| 80 | SQL_DIALECTS = { |
| 81 | "sqlite": dd.CodeLang.SQL_SQLITE, |
| 82 | "mysql": dd.CodeLang.SQL_MYSQL, |
| 83 | "postgres": dd.CodeLang.SQL_POSTGRES, |
| 84 | } |
| 85 | |
| 86 | |
| 87 | def build_config(model_alias: str, dialect: str = "sqlite") -> dd.DataDesignerConfigBuilder: |
| 88 | code_lang = SQL_DIALECTS[dialect] |
| 89 | config_builder = dd.DataDesignerConfigBuilder() |
| 90 | |
| 91 | # ========================================================================= |
| 92 | # Stage 1: Seeding & diversification |
| 93 | # ========================================================================= |
| 94 | |
| 95 | config_builder.add_column( |
| 96 | dd.SamplerColumnConfig( |
| 97 | name="sql_dialect", |
| 98 | sampler_type=dd.SamplerType.CATEGORY, |
| 99 | params=dd.CategorySamplerParams(values=[dialect]), |
| 100 | ) |
| 101 | ) |
| 102 | |
| 103 | config_builder.add_column( |
| 104 | dd.SamplerColumnConfig( |
| 105 | name="industry_sector", |
| 106 | sampler_type=dd.SamplerType.CATEGORY, |
| 107 | params=dd.CategorySamplerParams( |
| 108 | values=[ |
| 109 | "Healthcare", |
| 110 | "Financial Services", |
| 111 | "Retail", |
| 112 | "Technology", |
| 113 | "Manufacturing", |
| 114 | "Aerospace", |
| 115 | "Energy", |
| 116 | "Telecommunications", |
| 117 | "Transportation", |
| 118 | "Education", |
| 119 | ], |
| 120 | ), |
| 121 | ) |
| 122 | ) |
| 123 | |
| 124 | config_builder.add_column( |
| 125 | dd.SamplerColumnConfig( |
| 126 | name="topic", |
| 127 | sampler_type=dd.SamplerType.SUBCATEGORY, |
| 128 | params=dd.SubcategorySamplerParams( |
| 129 | category="industry_sector", |
| 130 | values={ |
| 131 | "Healthcare": [ |
| 132 | "Electronic Health Records", |
| 133 | "Telemedicine Platforms", |
| 134 | "Clinical Trials", |
| 135 | "Patient Scheduling", |
| 136 | "Insurance Claims", |
| 137 | ], |
| 138 | "Financial Services": [ |
| 139 | "Fraud Detection", |
| 140 | "Trading Systems", |
| 141 | "Risk Assessment", |
| 142 | "Portfolio Management", |
| 143 | "Regulatory Compliance", |
| 144 | ], |
| 145 | "Retail": [ |
| 146 | "Inventory Management", |
| 147 | "Customer Segmentation", |
| 148 | "Pricing Optimization", |
| 149 | "Supply Chain", |
| 150 | "Returns Processing", |
| 151 | ], |
| 152 | "Technology": [ |
| 153 | "Cloud Platforms", |
| 154 | "ML Pipelines", |
| 155 | "DevOps Tools", |
| 156 | "API Gateway Logs", |
| 157 | "User Analytics", |
| 158 | ], |
| 159 | "Manufacturing": [ |
| 160 | "Quality Control", |
| 161 | "Production Scheduling", |
| 162 | "Equipment Maintenance", |
| 163 | "Supply Chain Optimization", |
| 164 | "Safety Compliance", |
| 165 | ], |
| 166 | "Aerospace": [ |
| 167 | "Flight Operations", |
| 168 | "Satellite Systems", |
| 169 | "Parts Procurement", |
| 170 | "Maintenance Scheduling", |
| 171 | "Crew Management", |
| 172 | ], |
| 173 | "Energy": [ |
| 174 | "Grid Management", |
| 175 | "Renewable Forecasting", |
| 176 | "Asset Monitoring", |
| 177 | "Trading and Markets", |
| 178 | "Regulatory Reporting", |
| 179 | ], |
| 180 | "Telecommunications": [ |
| 181 | "Network Operations", |
| 182 | "Customer Billing", |
| 183 | "Service Provisioning", |
| 184 | "Call Detail Records", |
| 185 | "Churn Prediction", |
| 186 | ], |
| 187 | "Transportation": [ |
| 188 | "Fleet Management", |
| 189 | "Route Optimization", |
| 190 | "Freight Tracking", |
| 191 | "Driver Scheduling", |
| 192 | "Maintenance Records", |
| 193 | ], |
| 194 | "Education": [ |
| 195 | "Student Records", |
| 196 | "Course Enrollment", |
| 197 | "Learning Analytics", |
| 198 | "Financial Aid", |
| 199 | "Faculty Management", |
| 200 | ], |
| 201 | }, |
| 202 | ), |
| 203 | ) |
| 204 | ) |
| 205 | |
| 206 | config_builder.add_column( |
| 207 | dd.SamplerColumnConfig( |
| 208 | name="sql_complexity", |
| 209 | sampler_type=dd.SamplerType.CATEGORY, |
| 210 | params=dd.CategorySamplerParams( |
| 211 | values=["Beginner", "Intermediate", "Advanced"], |
| 212 | ), |
| 213 | ) |
| 214 | ) |
| 215 | |
| 216 | config_builder.add_column( |
| 217 | dd.SamplerColumnConfig( |
| 218 | name="sql_concept", |
| 219 | sampler_type=dd.SamplerType.SUBCATEGORY, |
| 220 | params=dd.SubcategorySamplerParams( |
| 221 | category="sql_complexity", |
| 222 | values={ |
| 223 | "Beginner": [ |
| 224 | "Basic SELECT Statements", |
| 225 | "WHERE Clauses", |
| 226 | "Simple Aggregations", |
| 227 | "Basic JOINs", |
| 228 | "INSERT, UPDATE, DELETE", |
| 229 | "ORDER BY and LIMIT", |
| 230 | ], |
| 231 | "Intermediate": [ |
| 232 | "Window Functions", |
| 233 | "Correlated Subqueries", |
| 234 | "Multiple JOINs with Aggregations", |
| 235 | "CASE Expressions", |
| 236 | "GROUP BY with HAVING", |
| 237 | "Set Operations (UNION, INTERSECT, EXCEPT)", |
| 238 | ], |
| 239 | "Advanced": [ |
| 240 | "Recursive CTEs", |
| 241 | "Frame Clauses", |
| 242 | "Pivot/Unpivot Patterns", |
| 243 | "Complex Analytical Functions", |
| 244 | "Self-Joins for Hierarchies", |
| 245 | "Conditional Aggregation", |
| 246 | ], |
| 247 | }, |
| 248 | ), |
| 249 | ) |
| 250 | ) |
| 251 | |
| 252 | config_builder.add_column( |
| 253 | dd.SamplerColumnConfig( |
| 254 | name="sql_task_type", |
| 255 | sampler_type=dd.SamplerType.CATEGORY, |
| 256 | params=dd.CategorySamplerParams( |
| 257 | values=[ |
| 258 | "Foundational Queries & DML", |
| 259 | "Data Quality & Validation", |
| 260 | "Advanced Analytics & Windowing", |
| 261 | "Schema, DDL & Performance", |
| 262 | ], |
| 263 | ), |
| 264 | conditional_params={ |
| 265 | "sql_complexity == 'Beginner'": dd.CategorySamplerParams( |
| 266 | values=["Foundational Queries & DML", "Data Quality & Validation"], |
| 267 | ), |
| 268 | "sql_complexity == 'Intermediate'": dd.CategorySamplerParams( |
| 269 | values=[ |
| 270 | "Foundational Queries & DML", |
| 271 | "Data Quality & Validation", |
| 272 | "Advanced Analytics & Windowing", |
| 273 | ], |
| 274 | ), |
| 275 | "sql_complexity == 'Advanced'": dd.CategorySamplerParams( |
| 276 | values=[ |
| 277 | "Advanced Analytics & Windowing", |
| 278 | "Schema, DDL & Performance", |
| 279 | "Data Quality & Validation", |
| 280 | ], |
| 281 | ), |
| 282 | }, |
| 283 | ) |
| 284 | ) |
| 285 | |
| 286 | config_builder.add_column( |
| 287 | dd.SamplerColumnConfig( |
| 288 | name="sql_task_concept", |
| 289 | sampler_type=dd.SamplerType.SUBCATEGORY, |
| 290 | params=dd.SubcategorySamplerParams( |
| 291 | category="sql_task_type", |
| 292 | values={ |
| 293 | "Foundational Queries & DML": [ |
| 294 | "Multi-table retrieval", |
| 295 | "Filtered aggregation", |
| 296 | "Conditional insert/update", |
| 297 | "Ranked retrieval", |
| 298 | ], |
| 299 | "Data Quality & Validation": [ |
| 300 | "NULL detection and handling", |
| 301 | "Duplicate detection", |
| 302 | "Data type casting and cleanup", |
| 303 | "Referential integrity check", |
| 304 | ], |
| 305 | "Advanced Analytics & Windowing": [ |
| 306 | "Running totals and moving averages", |
| 307 | "Ranking and percentile computation", |
| 308 | "Gap and island detection", |
| 309 | "Year-over-year comparison", |
| 310 | ], |
| 311 | "Schema, DDL & Performance": [ |
| 312 | "Index-aware query optimization", |
| 313 | "Partitioned query design", |
| 314 | "Constraint-based validation", |
| 315 | "Schema migration pattern", |
| 316 | ], |
| 317 | }, |
| 318 | ), |
| 319 | ) |
| 320 | ) |
| 321 | |
| 322 | config_builder.add_column( |
| 323 | dd.SamplerColumnConfig( |
| 324 | name="data_quality_challenge", |
| 325 | sampler_type=dd.SamplerType.CATEGORY, |
| 326 | params=dd.CategorySamplerParams( |
| 327 | values=[ |
| 328 | "Type Mismatches", |
| 329 | "Temporal Drift", |
| 330 | "Embedded Special Characters", |
| 331 | "Mixed Formats", |
| 332 | "NULL Handling", |
| 333 | ], |
| 334 | ), |
| 335 | ) |
| 336 | ) |
| 337 | |
| 338 | config_builder.add_column( |
| 339 | dd.SamplerColumnConfig( |
| 340 | name="data_quality_concept", |
| 341 | sampler_type=dd.SamplerType.SUBCATEGORY, |
| 342 | params=dd.SubcategorySamplerParams( |
| 343 | category="data_quality_challenge", |
| 344 | values={ |
| 345 | "Type Mismatches": [ |
| 346 | "Currency stored as text with symbols ($57,500)", |
| 347 | "Boolean stored as string (yes/no/true/false/1/0)", |
| 348 | ], |
| 349 | "Temporal Drift": [ |
| 350 | "Dates stored as text in mixed formats (01-Jan-2023 vs 2023/01/01)", |
| 351 | "Timestamps with inconsistent timezone handling", |
| 352 | ], |
| 353 | "Embedded Special Characters": [ |
| 354 | "Newlines or tabs inside text fields", |
| 355 | "Unicode or accented characters in names", |
| 356 | ], |
| 357 | "Mixed Formats": [ |
| 358 | "Phone numbers in mixed formats (555-1234 vs (555) 123-4567)", |
| 359 | "Addresses with inconsistent abbreviations", |
| 360 | ], |
| 361 | "NULL Handling": [ |
| 362 | "NULLs disguised as empty strings or sentinel values (-1, N/A)", |
| 363 | "Optional FKs with NULL references", |
| 364 | ], |
| 365 | }, |
| 366 | ), |
| 367 | ) |
| 368 | ) |
| 369 | |
| 370 | config_builder.add_column( |
| 371 | dd.SamplerColumnConfig( |
| 372 | name="knowledge_dependency", |
| 373 | sampler_type=dd.SamplerType.CATEGORY, |
| 374 | params=dd.CategorySamplerParams( |
| 375 | values=["Domain Knowledge", "Implicit Logic", "Common Sense"], |
| 376 | ), |
| 377 | ) |
| 378 | ) |
| 379 | |
| 380 | config_builder.add_column( |
| 381 | dd.SamplerColumnConfig( |
| 382 | name="knowledge_concept", |
| 383 | sampler_type=dd.SamplerType.SUBCATEGORY, |
| 384 | params=dd.SubcategorySamplerParams( |
| 385 | category="knowledge_dependency", |
| 386 | values={ |
| 387 | "Domain Knowledge": [ |
| 388 | "Industry-specific business rules", |
| 389 | "Regulatory thresholds and compliance criteria", |
| 390 | "Domain-specific KPI definitions", |
| 391 | ], |
| 392 | "Implicit Logic": [ |
| 393 | "Fiscal year vs calendar year reasoning", |
| 394 | "Business-day exclusion logic", |
| 395 | "Implied sort/filter criteria from context", |
| 396 | ], |
| 397 | "Common Sense": [ |
| 398 | "Unit conversion (e.g., cents to dollars)", |
| 399 | "Age or duration calculation from dates", |
| 400 | "Geographic or hierarchical inference", |
| 401 | ], |
| 402 | }, |
| 403 | ), |
| 404 | ) |
| 405 | ) |
| 406 | |
| 407 | config_builder.add_column( |
| 408 | dd.SamplerColumnConfig( |
| 409 | name="instruction_style", |
| 410 | sampler_type=dd.SamplerType.CATEGORY, |
| 411 | params=dd.CategorySamplerParams( |
| 412 | values=["imperative", "declarative", "interrogative", "contextual", "abbreviated"], |
| 413 | ), |
| 414 | ) |
| 415 | ) |
| 416 | |
| 417 | config_builder.add_column( |
| 418 | dd.SamplerColumnConfig( |
| 419 | name="linguistic_register", |
| 420 | sampler_type=dd.SamplerType.CATEGORY, |
| 421 | params=dd.CategorySamplerParams( |
| 422 | values=["formal", "conversational", "technical", "academic", "direct"], |
| 423 | ), |
| 424 | ) |
| 425 | ) |
| 426 | |
| 427 | config_builder.add_column( |
| 428 | dd.SamplerColumnConfig( |
| 429 | name="politeness_level", |
| 430 | sampler_type=dd.SamplerType.CATEGORY, |
| 431 | params=dd.CategorySamplerParams( |
| 432 | values=["none", "minimal", "polite", "very polite"], |
| 433 | ), |
| 434 | ) |
| 435 | ) |
| 436 | |
| 437 | # ========================================================================= |
| 438 | # Stage 2: Prompt generation |
| 439 | # ========================================================================= |
| 440 | |
| 441 | config_builder.add_column( |
| 442 | dd.LLMTextColumnConfig( |
| 443 | name="sql_prompt", |
| 444 | model_alias=model_alias, |
| 445 | system_prompt=PROMPT_GEN_SYSTEM_PROMPT, |
| 446 | prompt=PROMPT_GEN_TEXT, |
| 447 | ) |
| 448 | ) |
| 449 | |
| 450 | # ========================================================================= |
| 451 | # Stage 3: Schema + data with distractor injection |
| 452 | # ========================================================================= |
| 453 | |
| 454 | config_builder.add_column( |
| 455 | dd.LLMCodeColumnConfig( |
| 456 | name="sql_context", |
| 457 | model_alias=model_alias, |
| 458 | system_prompt="You are an expert SQL database architect who designs well-structured, normalized schemas.", |
| 459 | prompt=SCHEMA_GEN_PROMPTS[dialect], |
| 460 | code_lang=code_lang, |
| 461 | ) |
| 462 | ) |
| 463 | |
| 464 | # ========================================================================= |
| 465 | # Stage 4: Dialect-specific SQL generation |
| 466 | # ========================================================================= |
| 467 | |
| 468 | config_builder.add_column( |
| 469 | dd.LLMCodeColumnConfig( |
| 470 | name="sql", |
| 471 | model_alias=model_alias, |
| 472 | system_prompt="You are an expert SQL programmer who solves problems with clean, efficient, and perfectly structured queries. Return only the final SQL.", |
| 473 | prompt=SQL_GEN_PROMPTS[dialect], |
| 474 | code_lang=code_lang, |
| 475 | ) |
| 476 | ) |
| 477 | |
| 478 | # ========================================================================= |
| 479 | # Stage 5: Validation + 5 LLM judges |
| 480 | # ========================================================================= |
| 481 | |
| 482 | config_builder.add_column( |
| 483 | dd.ValidationColumnConfig( |
| 484 | name="sql_validity_result", |
| 485 | target_columns=["sql"], |
| 486 | validator_type=dd.ValidatorType.CODE, |
| 487 | validator_params=dd.CodeValidatorParams(code_lang=code_lang), |
| 488 | ) |
| 489 | ) |
| 490 | |
| 491 | config_builder.add_column( |
| 492 | dd.LLMJudgeColumnConfig( |
| 493 | name="sql_prompt_judge_result", |
| 494 | model_alias=model_alias, |
| 495 | prompt=PROMPT_JUDGE_TEXT, |
| 496 | scores=PROMPT_SCORES, |
| 497 | ) |
| 498 | ) |
| 499 | |
| 500 | config_builder.add_column( |
| 501 | dd.LLMJudgeColumnConfig( |
| 502 | name="sql_judge_result", |
| 503 | model_alias=model_alias, |
| 504 | prompt=SQL_JUDGE_TEXT, |
| 505 | scores=SQL_SCORES, |
| 506 | ) |
| 507 | ) |
| 508 | |
| 509 | config_builder.add_column( |
| 510 | dd.LLMJudgeColumnConfig( |
| 511 | name="sql_context_judge_result", |
| 512 | model_alias=model_alias, |
| 513 | prompt=CONTEXT_JUDGE_PROMPTS[dialect], |
| 514 | scores=SQL_SCORES, |
| 515 | ) |
| 516 | ) |
| 517 | |
| 518 | config_builder.add_column( |
| 519 | dd.LLMJudgeColumnConfig( |
| 520 | name="sql_data_quality_judge_result", |
| 521 | model_alias=model_alias, |
| 522 | prompt=DATA_QUALITY_JUDGE_TEXT, |
| 523 | scores=DATA_QUALITY_SCORES, |
| 524 | ) |
| 525 | ) |
| 526 | |
| 527 | config_builder.add_column( |
| 528 | dd.LLMJudgeColumnConfig( |
| 529 | name="sql_knowledge_judge_result", |
| 530 | model_alias=model_alias, |
| 531 | prompt=KNOWLEDGE_JUDGE_TEXT, |
| 532 | scores=KNOWLEDGE_SCORES, |
| 533 | ) |
| 534 | ) |
| 535 | |
| 536 | # ========================================================================= |
| 537 | # Score extraction (15 flat columns for downstream filtering) |
| 538 | # ========================================================================= |
| 539 | |
| 540 | for judge_name, rubric_names in SCORE_EXTRACTION_MAP: |
| 541 | prefix = judge_name.replace("_judge_result", "").replace("sql_", "") |
| 542 | for rubric in rubric_names: |
| 543 | config_builder.add_column( |
| 544 | dd.ExpressionColumnConfig( |
| 545 | name=f"{prefix}_{rubric}_score", |
| 546 | expr=f"{{{{ {judge_name}.{rubric}.score if {judge_name}.{rubric}.score is not none else '' }}}}", |
| 547 | ) |
| 548 | ) |
| 549 | |
| 550 | return config_builder |
| 551 | |
| 552 | |
| 553 | def create_dataset( |
| 554 | config_builder: dd.DataDesignerConfigBuilder, |
| 555 | num_records: int, |
| 556 | artifact_path: Path | str | None = None, |
| 557 | ) -> DatasetCreationResults: |
| 558 | data_designer = DataDesigner(artifact_path=artifact_path) |
| 559 | results = data_designer.create(config_builder, num_records=num_records) |
| 560 | return results |
| 561 | |
| 562 | |
| 563 | # ============================================================================= |
| 564 | # Prompt Templates |
| 565 | # ============================================================================= |
| 566 | |
| 567 | PROMPT_GEN_SYSTEM_PROMPT = """\ |
| 568 | You write natural-language requests to a data assistant. You adapt your writing \ |
| 569 | style based on the specified instruction style, linguistic register, and politeness level. \ |
| 570 | Requests avoid meta-instructions, code, or explicit SQL jargon unless it's common-sense terminology.\ |
| 571 | """ |
| 572 | |
| 573 | PROMPT_GEN_TEXT = """\ |
| 574 | Write a single-sentence, natural-language request to a data assistant or agent to solve a specific SQL problem. |
| 575 | |
| 576 | ## Style Requirements |
| 577 | |
| 578 | * **Instruction Style:** Use a {{ instruction_style }} style. |
| 579 | * **Linguistic Register:** Use a {{ linguistic_register }} register. |
| 580 | * **Politeness Level:** Apply {{ politeness_level }} politeness. |
| 581 | |
| 582 | ## Content Constraints |
| 583 | |
| 584 | * Do NOT use explicit SQL keywords or technical jargon. Describe the **business problem**. |
| 585 | * Keep the intent specific; mention outputs, filters, and aggregations clearly. |
| 586 | * Do not include code, backticks, or any fenced blocks. |
| 587 | * Realistic Thresholds: The sample data is small (5-10 rows per table). Keep thresholds small. |
| 588 | * Relative Time: It's okay to say "recent", "last year", "past few months" instead of exact dates. |
| 589 | |
| 590 | ## Grounding Requirements |
| 591 | |
| 592 | * The request must pertain to the {{ industry_sector }} sector and {{ topic }} topic. |
| 593 | * The request must implicitly require SQL at the {{ sql_complexity }} level involving {{ sql_concept }}. |
| 594 | * The request must require a {{ sql_task_type }} task, specifically "{{ sql_task_concept }}". |
| 595 | * The problem must implicitly require handling "{{ data_quality_concept }}". |
| 596 | * The problem must implicitly require "{{ knowledge_concept }}".\ |
| 597 | """ |
| 598 | |
| 599 | _SCHEMA_GEN_TEMPLATE = """\ |
| 600 | Generate {dialect_label} DDL and sample data for tables relevant to the instruction. |
| 601 | Instruction: {{{{ sql_prompt }}}} |
| 602 | |
| 603 | Requirements: |
| 604 | |
| 605 | * Scope: Provide only CREATE TABLE and INSERT statements. |
| 606 | * Integrity: Define PRIMARY KEYs and FOREIGN KEYs with consistent data types. Use snake_case names. |
| 607 | * **Section Headers (REQUIRED):** |
| 608 | - `-- Core Tables` |
| 609 | - `-- Distractor Tables` |
| 610 | - `-- Sample Data for Core Tables` |
| 611 | - `-- Sample Data for Distractor Tables` |
| 612 | - Do NOT include any other comments. |
| 613 | * Coverage: Include 3-5 core tables for {{{{ industry_sector }}}}/{{{{ topic }}}} connected via FKs. |
| 614 | * **Distractor Tables:** Include 1-2 additional tables plausible for the domain but NOT needed \ |
| 615 | for the instruction. Each with FK links to core tables and 5-10 rows of realistic data. |
| 616 | * Realism: Include 3-5 distractor columns per table (created_at, updated_by, description, is_active). |
| 617 | * **Dirty Data:** Introduce "{{{{ data_quality_concept }}}}" issues. Dirty columns MUST be {text_type}. |
| 618 | * Sample Data: 5-10 realistic rows per table. Mix clean and dirty rows. |
| 619 | * **No Data Comments:** Do NOT explain which rows are dirty. |
| 620 | * **Determinism:** No NOW()/CURRENT_DATE in INSERT statements. Use explicit literal dates. |
| 621 | * Executability: End each statement with a semicolon. Use {dialect_label} syntax. |
| 622 | * Do not include meta-instructions or reasoning traces.\ |
| 623 | """ |
| 624 | |
| 625 | SCHEMA_GEN_PROMPTS = { |
| 626 | "sqlite": _SCHEMA_GEN_TEMPLATE.format(dialect_label="SQLite", text_type="TEXT"), |
| 627 | "mysql": _SCHEMA_GEN_TEMPLATE.format(dialect_label="MySQL", text_type="VARCHAR or TEXT"), |
| 628 | "postgres": _SCHEMA_GEN_TEMPLATE.format(dialect_label="PostgreSQL", text_type="TEXT or VARCHAR"), |
| 629 | } |
| 630 | |
| 631 | _SQL_GEN_BASE = """\ |
| 632 | Write {dialect_label} SQL for the instruction using only the provided database context. |
| 633 | Instruction: {{{{ sql_prompt }}}} |
| 634 | |
| 635 | Database Context: |
| 636 | {{{{ sql_context }}}} |
| 637 | |
| 638 | Requirements: |
| 639 | |
| 640 | * Validity: You are strictly forbidden from referencing any table or column not in the context. |
| 641 | * Handle Data Quality: Correctly handle "{{{{ data_quality_concept }}}}" using appropriate cleaning functions. |
| 642 | * Apply Knowledge: Apply "{{{{ knowledge_concept }}}}" even if it requires inferring unstated logic. |
| 643 | * Grounding: The SQL must demonstrate {{{{ sql_concept }}}} and {{{{ sql_task_type }}}}. |
| 644 | * Precision: Avoid SELECT *. Explicitly list columns; alias computed columns descriptively. |
| 645 | * Alignment: Match the {{{{ sql_complexity }}}} level. |
| 646 | * **Relative Time Anchoring:** Do NOT use CURRENT_DATE or real-time functions. Anchor to max date in data. |
| 647 | * **No Unasked Joins:** Do NOT join distractor tables or select distractor columns. |
| 648 | * **Logic:** Prefer CTEs to clean/normalize first, then compute/aggregate. |
| 649 | * Comments: Do not include inline comments. |
| 650 | * Formatting: Terminate with semicolons.\ |
| 651 | """ |
| 652 | |
| 653 | _SQLITE_EXTRAS = """ |
| 654 | * Use SQLite syntax: strftime for dates, LIMIT instead of TOP. |
| 655 | * Do NOT use LATERAL joins or REGEXP_REPLACE. Use REPLACE()/SUBSTR() for cleaning. |
| 656 | * Date Parsing: Normalize with REPLACE(date_col, '/', '-') inside date()/strftime().\ |
| 657 | """ |
| 658 | |
| 659 | _MYSQL_EXTRAS = """ |
| 660 | * Use MySQL syntax: DATE_ADD, DATEDIFF for dates, LIMIT for pagination, backticks for identifiers. |
| 661 | * Do NOT use REGEXP_REPLACE or CONVERT_TZ. Use REPLACE(), TRIM(), SUBSTRING().\ |
| 662 | """ |
| 663 | |
| 664 | _POSTGRES_EXTRAS = """ |
| 665 | * Use PostgreSQL syntax: :: for casting, ILIKE for case-insensitive matching, LIMIT and OFFSET. |
| 666 | * regexp_replace is available for cleaning.\ |
| 667 | """ |
| 668 | |
| 669 | SQL_GEN_PROMPTS = { |
| 670 | "sqlite": _SQL_GEN_BASE.format(dialect_label="SQLite") + _SQLITE_EXTRAS, |
| 671 | "mysql": _SQL_GEN_BASE.format(dialect_label="MySQL") + _MYSQL_EXTRAS, |
| 672 | "postgres": _SQL_GEN_BASE.format(dialect_label="PostgreSQL") + _POSTGRES_EXTRAS, |
| 673 | } |
| 674 | |
| 675 | # ============================================================================= |
| 676 | # Judge Prompts |
| 677 | # ============================================================================= |
| 678 | |
| 679 | PROMPT_JUDGE_TEXT = """\ |
| 680 | You are an expert product analyst who writes and reviews natural, human-like data requests. |
| 681 | Evaluate the **NL Prompt** quality. |
| 682 | |
| 683 | ## NL Prompt |
| 684 | |
| 685 | {{ sql_prompt }}\ |
| 686 | """ |
| 687 | |
| 688 | SQL_JUDGE_TEXT = """\ |
| 689 | You are a SQL data expert. Grade the quality of **Generated SQL** based on the prompt and context. |
| 690 | |
| 691 | Natural Language Prompt: |
| 692 | {{ sql_prompt }} |
| 693 | |
| 694 | Database Context: |
| 695 | {{ sql_context }} |
| 696 | |
| 697 | Generated SQL: |
| 698 | {{ sql }} |
| 699 | |
| 700 | When scoring, pay special attention to: |
| 701 | - **Minimal Table Usage:** Penalize queries that unnecessarily join distractor tables. |
| 702 | - **Minimal Column Usage:** Distractor columns should be ignored unless explicitly needed. |
| 703 | - **Correctness:** The query must produce the correct result. |
| 704 | - **Efficiency:** Prefer simple, readable solutions over unnecessarily complex ones.\ |
| 705 | """ |
| 706 | |
| 707 | _CONTEXT_JUDGE_TEMPLATE = """\ |
| 708 | You are a SQL database architect. Evaluate the **Generated Database Context** quality. |
| 709 | |
| 710 | Natural Language Prompt: |
| 711 | {{{{ sql_prompt }}}} |
| 712 | |
| 713 | Generated Database Context ({dialect}): |
| 714 | {{{{ sql_context }}}} |
| 715 | |
| 716 | When scoring, verify: |
| 717 | - **Sufficient Tables:** 3-5 core tables plus 1-2 distractor tables. Penalize bare-minimum schemas. |
| 718 | - **Distractor Columns:** Each table should include realistic columns beyond those needed for the query. |
| 719 | - **Realistic Relationships:** Appropriate PK/FK relationships. Distractor tables should have logical FK links. |
| 720 | - **Sample Data Quality:** Realistic, varied INSERT data. |
| 721 | - **Executability:** Syntactically correct for {dialect}.\ |
| 722 | """ |
| 723 | |
| 724 | CONTEXT_JUDGE_PROMPTS = { |
| 725 | "sqlite": _CONTEXT_JUDGE_TEMPLATE.format(dialect="SQLite"), |
| 726 | "mysql": _CONTEXT_JUDGE_TEMPLATE.format(dialect="MySQL"), |
| 727 | "postgres": _CONTEXT_JUDGE_TEMPLATE.format(dialect="PostgreSQL"), |
| 728 | } |
| 729 | |
| 730 | DATA_QUALITY_JUDGE_TEXT = """\ |
| 731 | You are an expert in data quality and validation. Score the SQL's handling of messy data. |
| 732 | |
| 733 | ## Natural Language Prompt |
| 734 | {{ sql_prompt }} |
| 735 | |
| 736 | ## Data Quality Challenge |
| 737 | {{ data_quality_challenge }} / {{ data_quality_concept }} |
| 738 | |
| 739 | ## Database Context |
| 740 | {{ sql_context }} |
| 741 | |
| 742 | ## Generated SQL |
| 743 | {{ sql }}\ |
| 744 | """ |
| 745 | |
| 746 | KNOWLEDGE_JUDGE_TEXT = """\ |
| 747 | You are an expert in business intelligence and semantic interpretation. \ |
| 748 | Score the SQL's application of implicit business knowledge. |
| 749 | |
| 750 | ## Natural Language Prompt |
| 751 | {{ sql_prompt }} |
| 752 | |
| 753 | ## Knowledge Dependency |
| 754 | {{ knowledge_dependency }} / {{ knowledge_concept }} |
| 755 | |
| 756 | ## Database Context |
| 757 | {{ sql_context }} |
| 758 | |
| 759 | ## Generated SQL |
| 760 | {{ sql }}\ |
| 761 | """ |
| 762 | |
| 763 | # ============================================================================= |
| 764 | # Scoring Rubrics (5 judges, 15 dimensions) |
| 765 | # ============================================================================= |
| 766 | |
| 767 | SQL_SCORES = [ |
| 768 | dd.Score( |
| 769 | name="relevance", |
| 770 | description="Uses only necessary tables/columns; ignores distractors", |
| 771 | options={ |
| 772 | "4": "Perfectly meets all requirements; uses only strictly necessary tables and columns.", |
| 773 | "3": "Meets most requirements with minor deviations; may include a slightly unnecessary column.", |
| 774 | "2": "Moderate deviation; joins an unnecessary table or selects several irrelevant columns.", |
| 775 | "1": "Significant deviations; multiple unnecessary table joins or largely irrelevant output.", |
| 776 | "0": "Does not adhere to instructions; query is unrelated or joins many unnecessary tables.", |
| 777 | }, |
| 778 | ), |
| 779 | dd.Score( |
| 780 | name="readability", |
| 781 | description="Formatting, clarity, and maintainability", |
| 782 | options={ |
| 783 | "4": "Excellently formatted, meaningful aliases, high readability and ease of maintenance.", |
| 784 | "3": "Well-formatted, relatively easy to understand; uses aliases with some organization.", |
| 785 | "2": "Somewhat readable with basic formatting but needs improvement.", |
| 786 | "1": "Minimal formatting, hard to understand; lacks meaningful names.", |
| 787 | "0": "Unreadable, no attempt at formatting.", |
| 788 | }, |
| 789 | ), |
| 790 | dd.Score( |
| 791 | name="scalability", |
| 792 | description="Scales well with larger datasets; avoids inefficient patterns", |
| 793 | options={ |
| 794 | "4": "Highly scalable; avoids Cartesian joins and unnecessary table joins.", |
| 795 | "3": "Scales well; minor areas for optimization such as an extra join.", |
| 796 | "2": "Moderately scalable; includes unnecessary joins or suboptimal access patterns.", |
| 797 | "1": "Poor scalability; joins multiple unnecessary tables or uses inefficient patterns.", |
| 798 | "0": "Does not scale; overlooks fundamental scalability practices.", |
| 799 | }, |
| 800 | ), |
| 801 | dd.Score( |
| 802 | name="standards", |
| 803 | description="Compliance with SQL standards and best practices", |
| 804 | options={ |
| 805 | "4": "Strictly adheres to SQL standards and best practices.", |
| 806 | "3": "Closely follows SQL standards and many best practices.", |
| 807 | "2": "Generally follows standards but has room for better alignment.", |
| 808 | "1": "Loosely follows standards, with several deviations.", |
| 809 | "0": "Does not follow standards; uses deprecated or non-standard syntax.", |
| 810 | }, |
| 811 | ), |
| 812 | ] |
| 813 | |
| 814 | PROMPT_SCORES = [ |
| 815 | dd.Score( |
| 816 | name="naturalness_of_wording", |
| 817 | description="How human-like, colloquial, and non-robotic the phrasing is", |
| 818 | options={ |
| 819 | "4": "Reads like a native speaker; concise, fluent, and natural.", |
| 820 | "3": "Generally natural; minor stiffness or formalism.", |
| 821 | "2": "Somewhat stilted or templated; noticeable artifacts.", |
| 822 | "1": "Robotic or awkward; obviously machine-generated.", |
| 823 | "0": "Unnatural and hard to read.", |
| 824 | }, |
| 825 | ), |
| 826 | dd.Score( |
| 827 | name="specificity_and_clarity", |
| 828 | description="Is the request specific about outputs, filters, and operations?", |
| 829 | options={ |
| 830 | "4": "Very specific and clear outputs/filters/aggregations; minimal ambiguity.", |
| 831 | "3": "Mostly specific; minor ambiguity remains.", |
| 832 | "2": "Partially specific; key details are missing.", |
| 833 | "1": "Vague; unclear what should be returned or computed.", |
| 834 | "0": "Completely ambiguous.", |
| 835 | }, |
| 836 | ), |
| 837 | dd.Score( |
| 838 | name="absence_of_sql_jargon", |
| 839 | description="Avoids explicit SQL terms, table/column names, or schema hints", |
| 840 | options={ |
| 841 | "4": "No SQL jargon at all; entirely tool-agnostic phrasing.", |
| 842 | "3": "Tiny hints but no explicit SQL or schema leakage.", |
| 843 | "2": "Occasional SQL terms or schema leakage present.", |
| 844 | "1": "Frequent SQL jargon and schema references.", |
| 845 | "0": "Reads like a SQL spec; heavy jargon.", |
| 846 | }, |
| 847 | ), |
| 848 | ] |
| 849 | |
| 850 | DATA_QUALITY_SCORES = [ |
| 851 | dd.Score( |
| 852 | name="correctness_of_cleaning_logic", |
| 853 | description="Does the query correctly and fully clean the messy data?", |
| 854 | options={ |
| 855 | "4": "Flawless cleaning logic; handles all transformations and edge cases perfectly.", |
| 856 | "3": "Correctly cleans data for most cases but might miss minor edge cases.", |
| 857 | "2": "Attempts to clean data, but logic is only partially correct.", |
| 858 | "1": "Cleaning logic is fundamentally flawed.", |
| 859 | "0": "No attempt to clean the data.", |
| 860 | }, |
| 861 | ), |
| 862 | dd.Score( |
| 863 | name="efficiency_of_cleaning_method", |
| 864 | description="Uses efficient, standard functions for cleaning?", |
| 865 | options={ |
| 866 | "4": "Highly efficient, optimal SQL functions for the task.", |
| 867 | "3": "Correct and standard functions, but a more performant approach exists.", |
| 868 | "2": "Convoluted or inefficient method where a simpler one would suffice.", |
| 869 | "1": "Very inefficient or non-standard method that would scale poorly.", |
| 870 | "0": "Completely inappropriate or non-functional method.", |
| 871 | }, |
| 872 | ), |
| 873 | ] |
| 874 | |
| 875 | KNOWLEDGE_SCORES = [ |
| 876 | dd.Score( |
| 877 | name="correctness_of_knowledge_application", |
| 878 | description="Does the query correctly translate the implicit knowledge into SQL logic?", |
| 879 | options={ |
| 880 | "4": "Flawlessly translates abstract concept into precise, correct SQL logic.", |
| 881 | "3": "Logic correctly reflects the knowledge concept but could be expressed more directly.", |
| 882 | "2": "Partially applies the logic, misinterpreting some nuances.", |
| 883 | "1": "Fundamentally misinterprets the knowledge concept.", |
| 884 | "0": "No attempt to apply the required knowledge.", |
| 885 | }, |
| 886 | ), |
| 887 | dd.Score( |
| 888 | name="clarity_of_inference", |
| 889 | description="Is the applied logic clear and self-explanatory within the query?", |
| 890 | options={ |
| 891 | "4": "Logic is immediately obvious through well-chosen aliases, CTEs, or clear filtering.", |
| 892 | "3": "Logic is correct but requires some inspection to understand.", |
| 893 | "2": "Logic is technically correct but obscure, using magic numbers or hard-to-read conditions.", |
| 894 | "1": "Logic is convoluted and appears incorrect.", |
| 895 | "0": "Query is completely opaque, with no discernible link to the required knowledge.", |
| 896 | }, |
| 897 | ), |
| 898 | ] |
| 899 | |
| 900 | SCORE_EXTRACTION_MAP = [ |
| 901 | ("sql_judge_result", ["relevance", "readability", "scalability", "standards"]), |
| 902 | ("sql_context_judge_result", ["relevance", "readability", "scalability", "standards"]), |
| 903 | ("sql_prompt_judge_result", ["naturalness_of_wording", "specificity_and_clarity", "absence_of_sql_jargon"]), |
| 904 | ("sql_data_quality_judge_result", ["correctness_of_cleaning_logic", "efficiency_of_cleaning_method"]), |
| 905 | ("sql_knowledge_judge_result", ["correctness_of_knowledge_application", "clarity_of_inference"]), |
| 906 | ] |
| 907 | |
| 908 | if __name__ == "__main__": |
| 909 | from argparse import ArgumentParser |
| 910 | |
| 911 | parser = ArgumentParser() |
| 912 | parser.add_argument("--model-alias", type=str, default="openai-text") |
| 913 | parser.add_argument("--num-records", type=int, default=5) |
| 914 | parser.add_argument("--artifact-path", type=str, default=None) |
| 915 | parser.add_argument( |
| 916 | "--dialect", |
| 917 | type=str, |
| 918 | default="sqlite", |
| 919 | choices=list(SQL_DIALECTS.keys()), |
| 920 | help="SQL dialect to generate for (default: sqlite)", |
| 921 | ) |
| 922 | args = parser.parse_args() |
| 923 | |
| 924 | config_builder = build_config(model_alias=args.model_alias, dialect=args.dialect) |
| 925 | results = create_dataset(config_builder, num_records=args.num_records, artifact_path=args.artifact_path) |
| 926 | |
| 927 | print(f"Dataset saved to: {results.artifact_storage.final_dataset_path}") |
| 928 | |
| 929 | results.load_analysis().to_report() |
|