Dev Note For a deep dive into the pipeline design, distractor injection strategy, quality waterfall analysis, and BIRD benchmark results, see Engineering an Enterprise-Grade Text-to-SQL Dataset with NeMo Data Designer.
1 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 # SPDX-License-Identifier: Apache-2.0 3 # /// script 4 # requires-python = ">=3.10" 5 # dependencies = [ 6 # "data-designer", 7 # ] 8 # /// 9 """Nemotron Super Text-to-SQL Recipe: Distractors, Dirty Data, and Multi-Judge Scoring 10 11 Generate enterprise-grade text-to-SQL training data with dialect-specific SQL 12 (SQLite, MySQL, PostgreSQL), distractor table/column injection, dirty data 13 handling, conditional sampling, and multi-dimensional LLM judge scoring. 14 15 This recipe implements the pipeline used to produce 96.5k validated text-to-SQL 16 records for Nemotron Super v3 SFT training, which raised BIRD benchmark 17 execution accuracy from 26.77% to 41.80%. 18 19 Pipeline architecture: 20 21 ┌─────────────────────────────────────────────────────────────────────────┐ 22 │ STAGE 1: SEEDING & DIVERSIFICATION │ 23 │ │ 24 │ Domain Controls SQL Controls Prompt Controls │ 25 │ ├─ industry_sector ├─ sql_complexity ├─ instruction_style │ 26 │ ├─ topic (conditional) ├─ sql_concept ├─ linguistic_register │ 27 │ ├─ data_quality_challenge├─ sql_task_type └─ politeness_level │ 28 │ ├─ data_quality_concept │ (conditional) │ 29 │ ├─ knowledge_dependency └─ sql_task_concept │ 30 │ └─ knowledge_concept │ 31 ├─────────────────────────────────────────────────────────────────────────┤ 32 │ STAGE 2: PROMPT GENERATION (LLM) │ 33 │ Natural-language request grounded in metadata; no SQL jargon. │ 34 │ Style adapts to instruction_style × register × politeness. │ 35 ├─────────────────────────────────────────────────────────────────────────┤ 36 │ STAGE 3: SCHEMA + DATA GENERATION (LLM) │ 37 │ Dialect-specific DDL + INSERT with 3-5 core tables, 1-2 distractor │ 38 │ tables, 3-5 distractor columns per table, dirty data injection. │ 39 ├─────────────────────────────────────────────────────────────────────────┤ 40 │ STAGE 4: SQL GENERATION (LLM) │ 41 │ Dialect-specific SQL; ignores distractors; handles dirty data. │ 42 ├─────────────────────────────────────────────────────────────────────────┤ 43 │ STAGE 5: VALIDATION + QUALITY SCORING │ 44 │ │ 45 │ Syntax Validator 5 LLM Judges (0-4 scores) │ 46 │ ├─ SQL_SQLITE ├─ Prompt: naturalness, specificity, │ 47 │ ├─ SQL_MYSQL │ absence of SQL jargon │ 48 │ └─ SQL_POSTGRES ├─ SQL: relevance, readability, │ 49 │ │ scalability, standards │ 50 │ ├─ Context: relevance, readability, │ 51 │ │ scalability, standards │ 52 │ ├─ Data Quality: cleaning correctness, │ 53 │ │ efficiency │ 54 │ └─ Knowledge: application correctness, │ 55 │ clarity of inference │ 56 │ │ 57 │ 15 score columns extracted for downstream filtering │ 58 └─────────────────────────────────────────────────────────────────────────┘ 59 60 Prerequisites: 61 - OPENAI_API_KEY environment variable for OpenAI provider model aliases (default model alias is "openai-text"). 62 - NVIDIA_API_KEY environment variable for NVIDIA provider model aliases. 63 64 Run: 65 # Basic usage (generates 5 records by default, SQLite dialect) 66 uv run enterprise_text_to_sql.py 67 68 # Generate for a specific dialect 69 uv run enterprise_text_to_sql.py --dialect postgres 70 71 # For help message and available options 72 uv run enterprise_text_to_sql.py --help 73 """ 74 75 from pathlib import Path 76 77 import data_designer.config as dd 78 from data_designer.interface import DataDesigner, DatasetCreationResults 79 80 SQL_DIALECTS = { 81 "sqlite": dd.CodeLang.SQL_SQLITE, 82 "mysql": dd.CodeLang.SQL_MYSQL, 83 "postgres": dd.CodeLang.SQL_POSTGRES, 84 } 85 86 87 def build_config(model_alias: str, dialect: str = "sqlite") -> dd.DataDesignerConfigBuilder: 88 code_lang = SQL_DIALECTS[dialect] 89 config_builder = dd.DataDesignerConfigBuilder() 90 91 # ========================================================================= 92 # Stage 1: Seeding & diversification 93 # ========================================================================= 94 95 config_builder.add_column( 96 dd.SamplerColumnConfig( 97 name="sql_dialect", 98 sampler_type=dd.SamplerType.CATEGORY, 99 params=dd.CategorySamplerParams(values=[dialect]), 100 ) 101 ) 102 103 config_builder.add_column( 104 dd.SamplerColumnConfig( 105 name="industry_sector", 106 sampler_type=dd.SamplerType.CATEGORY, 107 params=dd.CategorySamplerParams( 108 values=[ 109 "Healthcare", 110 "Financial Services", 111 "Retail", 112 "Technology", 113 "Manufacturing", 114 "Aerospace", 115 "Energy", 116 "Telecommunications", 117 "Transportation", 118 "Education", 119 ], 120 ), 121 ) 122 ) 123 124 config_builder.add_column( 125 dd.SamplerColumnConfig( 126 name="topic", 127 sampler_type=dd.SamplerType.SUBCATEGORY, 128 params=dd.SubcategorySamplerParams( 129 category="industry_sector", 130 values={ 131 "Healthcare": [ 132 "Electronic Health Records", 133 "Telemedicine Platforms", 134 "Clinical Trials", 135 "Patient Scheduling", 136 "Insurance Claims", 137 ], 138 "Financial Services": [ 139 "Fraud Detection", 140 "Trading Systems", 141 "Risk Assessment", 142 "Portfolio Management", 143 "Regulatory Compliance", 144 ], 145 "Retail": [ 146 "Inventory Management", 147 "Customer Segmentation", 148 "Pricing Optimization", 149 "Supply Chain", 150 "Returns Processing", 151 ], 152 "Technology": [ 153 "Cloud Platforms", 154 "ML Pipelines", 155 "DevOps Tools", 156 "API Gateway Logs", 157 "User Analytics", 158 ], 159 "Manufacturing": [ 160 "Quality Control", 161 "Production Scheduling", 162 "Equipment Maintenance", 163 "Supply Chain Optimization", 164 "Safety Compliance", 165 ], 166 "Aerospace": [ 167 "Flight Operations", 168 "Satellite Systems", 169 "Parts Procurement", 170 "Maintenance Scheduling", 171 "Crew Management", 172 ], 173 "Energy": [ 174 "Grid Management", 175 "Renewable Forecasting", 176 "Asset Monitoring", 177 "Trading and Markets", 178 "Regulatory Reporting", 179 ], 180 "Telecommunications": [ 181 "Network Operations", 182 "Customer Billing", 183 "Service Provisioning", 184 "Call Detail Records", 185 "Churn Prediction", 186 ], 187 "Transportation": [ 188 "Fleet Management", 189 "Route Optimization", 190 "Freight Tracking", 191 "Driver Scheduling", 192 "Maintenance Records", 193 ], 194 "Education": [ 195 "Student Records", 196 "Course Enrollment", 197 "Learning Analytics", 198 "Financial Aid", 199 "Faculty Management", 200 ], 201 }, 202 ), 203 ) 204 ) 205 206 config_builder.add_column( 207 dd.SamplerColumnConfig( 208 name="sql_complexity", 209 sampler_type=dd.SamplerType.CATEGORY, 210 params=dd.CategorySamplerParams( 211 values=["Beginner", "Intermediate", "Advanced"], 212 ), 213 ) 214 ) 215 216 config_builder.add_column( 217 dd.SamplerColumnConfig( 218 name="sql_concept", 219 sampler_type=dd.SamplerType.SUBCATEGORY, 220 params=dd.SubcategorySamplerParams( 221 category="sql_complexity", 222 values={ 223 "Beginner": [ 224 "Basic SELECT Statements", 225 "WHERE Clauses", 226 "Simple Aggregations", 227 "Basic JOINs", 228 "INSERT, UPDATE, DELETE", 229 "ORDER BY and LIMIT", 230 ], 231 "Intermediate": [ 232 "Window Functions", 233 "Correlated Subqueries", 234 "Multiple JOINs with Aggregations", 235 "CASE Expressions", 236 "GROUP BY with HAVING", 237 "Set Operations (UNION, INTERSECT, EXCEPT)", 238 ], 239 "Advanced": [ 240 "Recursive CTEs", 241 "Frame Clauses", 242 "Pivot/Unpivot Patterns", 243 "Complex Analytical Functions", 244 "Self-Joins for Hierarchies", 245 "Conditional Aggregation", 246 ], 247 }, 248 ), 249 ) 250 ) 251 252 config_builder.add_column( 253 dd.SamplerColumnConfig( 254 name="sql_task_type", 255 sampler_type=dd.SamplerType.CATEGORY, 256 params=dd.CategorySamplerParams( 257 values=[ 258 "Foundational Queries & DML", 259 "Data Quality & Validation", 260 "Advanced Analytics & Windowing", 261 "Schema, DDL & Performance", 262 ], 263 ), 264 conditional_params={ 265 "sql_complexity == 'Beginner'": dd.CategorySamplerParams( 266 values=["Foundational Queries & DML", "Data Quality & Validation"], 267 ), 268 "sql_complexity == 'Intermediate'": dd.CategorySamplerParams( 269 values=[ 270 "Foundational Queries & DML", 271 "Data Quality & Validation", 272 "Advanced Analytics & Windowing", 273 ], 274 ), 275 "sql_complexity == 'Advanced'": dd.CategorySamplerParams( 276 values=[ 277 "Advanced Analytics & Windowing", 278 "Schema, DDL & Performance", 279 "Data Quality & Validation", 280 ], 281 ), 282 }, 283 ) 284 ) 285 286 config_builder.add_column( 287 dd.SamplerColumnConfig( 288 name="sql_task_concept", 289 sampler_type=dd.SamplerType.SUBCATEGORY, 290 params=dd.SubcategorySamplerParams( 291 category="sql_task_type", 292 values={ 293 "Foundational Queries & DML": [ 294 "Multi-table retrieval", 295 "Filtered aggregation", 296 "Conditional insert/update", 297 "Ranked retrieval", 298 ], 299 "Data Quality & Validation": [ 300 "NULL detection and handling", 301 "Duplicate detection", 302 "Data type casting and cleanup", 303 "Referential integrity check", 304 ], 305 "Advanced Analytics & Windowing": [ 306 "Running totals and moving averages", 307 "Ranking and percentile computation", 308 "Gap and island detection", 309 "Year-over-year comparison", 310 ], 311 "Schema, DDL & Performance": [ 312 "Index-aware query optimization", 313 "Partitioned query design", 314 "Constraint-based validation", 315 "Schema migration pattern", 316 ], 317 }, 318 ), 319 ) 320 ) 321 322 config_builder.add_column( 323 dd.SamplerColumnConfig( 324 name="data_quality_challenge", 325 sampler_type=dd.SamplerType.CATEGORY, 326 params=dd.CategorySamplerParams( 327 values=[ 328 "Type Mismatches", 329 "Temporal Drift", 330 "Embedded Special Characters", 331 "Mixed Formats", 332 "NULL Handling", 333 ], 334 ), 335 ) 336 ) 337 338 config_builder.add_column( 339 dd.SamplerColumnConfig( 340 name="data_quality_concept", 341 sampler_type=dd.SamplerType.SUBCATEGORY, 342 params=dd.SubcategorySamplerParams( 343 category="data_quality_challenge", 344 values={ 345 "Type Mismatches": [ 346 "Currency stored as text with symbols ($57,500)", 347 "Boolean stored as string (yes/no/true/false/1/0)", 348 ], 349 "Temporal Drift": [ 350 "Dates stored as text in mixed formats (01-Jan-2023 vs 2023/01/01)", 351 "Timestamps with inconsistent timezone handling", 352 ], 353 "Embedded Special Characters": [ 354 "Newlines or tabs inside text fields", 355 "Unicode or accented characters in names", 356 ], 357 "Mixed Formats": [ 358 "Phone numbers in mixed formats (555-1234 vs (555) 123-4567)", 359 "Addresses with inconsistent abbreviations", 360 ], 361 "NULL Handling": [ 362 "NULLs disguised as empty strings or sentinel values (-1, N/A)", 363 "Optional FKs with NULL references", 364 ], 365 }, 366 ), 367 ) 368 ) 369 370 config_builder.add_column( 371 dd.SamplerColumnConfig( 372 name="knowledge_dependency", 373 sampler_type=dd.SamplerType.CATEGORY, 374 params=dd.CategorySamplerParams( 375 values=["Domain Knowledge", "Implicit Logic", "Common Sense"], 376 ), 377 ) 378 ) 379 380 config_builder.add_column( 381 dd.SamplerColumnConfig( 382 name="knowledge_concept", 383 sampler_type=dd.SamplerType.SUBCATEGORY, 384 params=dd.SubcategorySamplerParams( 385 category="knowledge_dependency", 386 values={ 387 "Domain Knowledge": [ 388 "Industry-specific business rules", 389 "Regulatory thresholds and compliance criteria", 390 "Domain-specific KPI definitions", 391 ], 392 "Implicit Logic": [ 393 "Fiscal year vs calendar year reasoning", 394 "Business-day exclusion logic", 395 "Implied sort/filter criteria from context", 396 ], 397 "Common Sense": [ 398 "Unit conversion (e.g., cents to dollars)", 399 "Age or duration calculation from dates", 400 "Geographic or hierarchical inference", 401 ], 402 }, 403 ), 404 ) 405 ) 406 407 config_builder.add_column( 408 dd.SamplerColumnConfig( 409 name="instruction_style", 410 sampler_type=dd.SamplerType.CATEGORY, 411 params=dd.CategorySamplerParams( 412 values=["imperative", "declarative", "interrogative", "contextual", "abbreviated"], 413 ), 414 ) 415 ) 416 417 config_builder.add_column( 418 dd.SamplerColumnConfig( 419 name="linguistic_register", 420 sampler_type=dd.SamplerType.CATEGORY, 421 params=dd.CategorySamplerParams( 422 values=["formal", "conversational", "technical", "academic", "direct"], 423 ), 424 ) 425 ) 426 427 config_builder.add_column( 428 dd.SamplerColumnConfig( 429 name="politeness_level", 430 sampler_type=dd.SamplerType.CATEGORY, 431 params=dd.CategorySamplerParams( 432 values=["none", "minimal", "polite", "very polite"], 433 ), 434 ) 435 ) 436 437 # ========================================================================= 438 # Stage 2: Prompt generation 439 # ========================================================================= 440 441 config_builder.add_column( 442 dd.LLMTextColumnConfig( 443 name="sql_prompt", 444 model_alias=model_alias, 445 system_prompt=PROMPT_GEN_SYSTEM_PROMPT, 446 prompt=PROMPT_GEN_TEXT, 447 ) 448 ) 449 450 # ========================================================================= 451 # Stage 3: Schema + data with distractor injection 452 # ========================================================================= 453 454 config_builder.add_column( 455 dd.LLMCodeColumnConfig( 456 name="sql_context", 457 model_alias=model_alias, 458 system_prompt="You are an expert SQL database architect who designs well-structured, normalized schemas.", 459 prompt=SCHEMA_GEN_PROMPTS[dialect], 460 code_lang=code_lang, 461 ) 462 ) 463 464 # ========================================================================= 465 # Stage 4: Dialect-specific SQL generation 466 # ========================================================================= 467 468 config_builder.add_column( 469 dd.LLMCodeColumnConfig( 470 name="sql", 471 model_alias=model_alias, 472 system_prompt="You are an expert SQL programmer who solves problems with clean, efficient, and perfectly structured queries. Return only the final SQL.", 473 prompt=SQL_GEN_PROMPTS[dialect], 474 code_lang=code_lang, 475 ) 476 ) 477 478 # ========================================================================= 479 # Stage 5: Validation + 5 LLM judges 480 # ========================================================================= 481 482 config_builder.add_column( 483 dd.ValidationColumnConfig( 484 name="sql_validity_result", 485 target_columns=["sql"], 486 validator_type=dd.ValidatorType.CODE, 487 validator_params=dd.CodeValidatorParams(code_lang=code_lang), 488 ) 489 ) 490 491 config_builder.add_column( 492 dd.LLMJudgeColumnConfig( 493 name="sql_prompt_judge_result", 494 model_alias=model_alias, 495 prompt=PROMPT_JUDGE_TEXT, 496 scores=PROMPT_SCORES, 497 ) 498 ) 499 500 config_builder.add_column( 501 dd.LLMJudgeColumnConfig( 502 name="sql_judge_result", 503 model_alias=model_alias, 504 prompt=SQL_JUDGE_TEXT, 505 scores=SQL_SCORES, 506 ) 507 ) 508 509 config_builder.add_column( 510 dd.LLMJudgeColumnConfig( 511 name="sql_context_judge_result", 512 model_alias=model_alias, 513 prompt=CONTEXT_JUDGE_PROMPTS[dialect], 514 scores=SQL_SCORES, 515 ) 516 ) 517 518 config_builder.add_column( 519 dd.LLMJudgeColumnConfig( 520 name="sql_data_quality_judge_result", 521 model_alias=model_alias, 522 prompt=DATA_QUALITY_JUDGE_TEXT, 523 scores=DATA_QUALITY_SCORES, 524 ) 525 ) 526 527 config_builder.add_column( 528 dd.LLMJudgeColumnConfig( 529 name="sql_knowledge_judge_result", 530 model_alias=model_alias, 531 prompt=KNOWLEDGE_JUDGE_TEXT, 532 scores=KNOWLEDGE_SCORES, 533 ) 534 ) 535 536 # ========================================================================= 537 # Score extraction (15 flat columns for downstream filtering) 538 # ========================================================================= 539 540 for judge_name, rubric_names in SCORE_EXTRACTION_MAP: 541 prefix = judge_name.replace("_judge_result", "").replace("sql_", "") 542 for rubric in rubric_names: 543 config_builder.add_column( 544 dd.ExpressionColumnConfig( 545 name=f"{prefix}_{rubric}_score", 546 expr=f"{{{{ {judge_name}.{rubric}.score if {judge_name}.{rubric}.score is not none else '' }}}}", 547 ) 548 ) 549 550 return config_builder 551 552 553 def create_dataset( 554 config_builder: dd.DataDesignerConfigBuilder, 555 num_records: int, 556 artifact_path: Path | str | None = None, 557 ) -> DatasetCreationResults: 558 data_designer = DataDesigner(artifact_path=artifact_path) 559 results = data_designer.create(config_builder, num_records=num_records) 560 return results 561 562 563 # ============================================================================= 564 # Prompt Templates 565 # ============================================================================= 566 567 PROMPT_GEN_SYSTEM_PROMPT = """\ 568 You write natural-language requests to a data assistant. You adapt your writing \ 569 style based on the specified instruction style, linguistic register, and politeness level. \ 570 Requests avoid meta-instructions, code, or explicit SQL jargon unless it's common-sense terminology.\ 571 """ 572 573 PROMPT_GEN_TEXT = """\ 574 Write a single-sentence, natural-language request to a data assistant or agent to solve a specific SQL problem. 575 576 ## Style Requirements 577 578 * **Instruction Style:** Use a {{ instruction_style }} style. 579 * **Linguistic Register:** Use a {{ linguistic_register }} register. 580 * **Politeness Level:** Apply {{ politeness_level }} politeness. 581 582 ## Content Constraints 583 584 * Do NOT use explicit SQL keywords or technical jargon. Describe the **business problem**. 585 * Keep the intent specific; mention outputs, filters, and aggregations clearly. 586 * Do not include code, backticks, or any fenced blocks. 587 * Realistic Thresholds: The sample data is small (5-10 rows per table). Keep thresholds small. 588 * Relative Time: It's okay to say "recent", "last year", "past few months" instead of exact dates. 589 590 ## Grounding Requirements 591 592 * The request must pertain to the {{ industry_sector }} sector and {{ topic }} topic. 593 * The request must implicitly require SQL at the {{ sql_complexity }} level involving {{ sql_concept }}. 594 * The request must require a {{ sql_task_type }} task, specifically "{{ sql_task_concept }}". 595 * The problem must implicitly require handling "{{ data_quality_concept }}". 596 * The problem must implicitly require "{{ knowledge_concept }}".\ 597 """ 598 599 _SCHEMA_GEN_TEMPLATE = """\ 600 Generate {dialect_label} DDL and sample data for tables relevant to the instruction. 601 Instruction: {{{{ sql_prompt }}}} 602 603 Requirements: 604 605 * Scope: Provide only CREATE TABLE and INSERT statements. 606 * Integrity: Define PRIMARY KEYs and FOREIGN KEYs with consistent data types. Use snake_case names. 607 * **Section Headers (REQUIRED):** 608 - `-- Core Tables` 609 - `-- Distractor Tables` 610 - `-- Sample Data for Core Tables` 611 - `-- Sample Data for Distractor Tables` 612 - Do NOT include any other comments. 613 * Coverage: Include 3-5 core tables for {{{{ industry_sector }}}}/{{{{ topic }}}} connected via FKs. 614 * **Distractor Tables:** Include 1-2 additional tables plausible for the domain but NOT needed \ 615 for the instruction. Each with FK links to core tables and 5-10 rows of realistic data. 616 * Realism: Include 3-5 distractor columns per table (created_at, updated_by, description, is_active). 617 * **Dirty Data:** Introduce "{{{{ data_quality_concept }}}}" issues. Dirty columns MUST be {text_type}. 618 * Sample Data: 5-10 realistic rows per table. Mix clean and dirty rows. 619 * **No Data Comments:** Do NOT explain which rows are dirty. 620 * **Determinism:** No NOW()/CURRENT_DATE in INSERT statements. Use explicit literal dates. 621 * Executability: End each statement with a semicolon. Use {dialect_label} syntax. 622 * Do not include meta-instructions or reasoning traces.\ 623 """ 624 625 SCHEMA_GEN_PROMPTS = { 626 "sqlite": _SCHEMA_GEN_TEMPLATE.format(dialect_label="SQLite", text_type="TEXT"), 627 "mysql": _SCHEMA_GEN_TEMPLATE.format(dialect_label="MySQL", text_type="VARCHAR or TEXT"), 628 "postgres": _SCHEMA_GEN_TEMPLATE.format(dialect_label="PostgreSQL", text_type="TEXT or VARCHAR"), 629 } 630 631 _SQL_GEN_BASE = """\ 632 Write {dialect_label} SQL for the instruction using only the provided database context. 633 Instruction: {{{{ sql_prompt }}}} 634 635 Database Context: 636 {{{{ sql_context }}}} 637 638 Requirements: 639 640 * Validity: You are strictly forbidden from referencing any table or column not in the context. 641 * Handle Data Quality: Correctly handle "{{{{ data_quality_concept }}}}" using appropriate cleaning functions. 642 * Apply Knowledge: Apply "{{{{ knowledge_concept }}}}" even if it requires inferring unstated logic. 643 * Grounding: The SQL must demonstrate {{{{ sql_concept }}}} and {{{{ sql_task_type }}}}. 644 * Precision: Avoid SELECT *. Explicitly list columns; alias computed columns descriptively. 645 * Alignment: Match the {{{{ sql_complexity }}}} level. 646 * **Relative Time Anchoring:** Do NOT use CURRENT_DATE or real-time functions. Anchor to max date in data. 647 * **No Unasked Joins:** Do NOT join distractor tables or select distractor columns. 648 * **Logic:** Prefer CTEs to clean/normalize first, then compute/aggregate. 649 * Comments: Do not include inline comments. 650 * Formatting: Terminate with semicolons.\ 651 """ 652 653 _SQLITE_EXTRAS = """ 654 * Use SQLite syntax: strftime for dates, LIMIT instead of TOP. 655 * Do NOT use LATERAL joins or REGEXP_REPLACE. Use REPLACE()/SUBSTR() for cleaning. 656 * Date Parsing: Normalize with REPLACE(date_col, '/', '-') inside date()/strftime().\ 657 """ 658 659 _MYSQL_EXTRAS = """ 660 * Use MySQL syntax: DATE_ADD, DATEDIFF for dates, LIMIT for pagination, backticks for identifiers. 661 * Do NOT use REGEXP_REPLACE or CONVERT_TZ. Use REPLACE(), TRIM(), SUBSTRING().\ 662 """ 663 664 _POSTGRES_EXTRAS = """ 665 * Use PostgreSQL syntax: :: for casting, ILIKE for case-insensitive matching, LIMIT and OFFSET. 666 * regexp_replace is available for cleaning.\ 667 """ 668 669 SQL_GEN_PROMPTS = { 670 "sqlite": _SQL_GEN_BASE.format(dialect_label="SQLite") + _SQLITE_EXTRAS, 671 "mysql": _SQL_GEN_BASE.format(dialect_label="MySQL") + _MYSQL_EXTRAS, 672 "postgres": _SQL_GEN_BASE.format(dialect_label="PostgreSQL") + _POSTGRES_EXTRAS, 673 } 674 675 # ============================================================================= 676 # Judge Prompts 677 # ============================================================================= 678 679 PROMPT_JUDGE_TEXT = """\ 680 You are an expert product analyst who writes and reviews natural, human-like data requests. 681 Evaluate the **NL Prompt** quality. 682 683 ## NL Prompt 684 685 {{ sql_prompt }}\ 686 """ 687 688 SQL_JUDGE_TEXT = """\ 689 You are a SQL data expert. Grade the quality of **Generated SQL** based on the prompt and context. 690 691 Natural Language Prompt: 692 {{ sql_prompt }} 693 694 Database Context: 695 {{ sql_context }} 696 697 Generated SQL: 698 {{ sql }} 699 700 When scoring, pay special attention to: 701 - **Minimal Table Usage:** Penalize queries that unnecessarily join distractor tables. 702 - **Minimal Column Usage:** Distractor columns should be ignored unless explicitly needed. 703 - **Correctness:** The query must produce the correct result. 704 - **Efficiency:** Prefer simple, readable solutions over unnecessarily complex ones.\ 705 """ 706 707 _CONTEXT_JUDGE_TEMPLATE = """\ 708 You are a SQL database architect. Evaluate the **Generated Database Context** quality. 709 710 Natural Language Prompt: 711 {{{{ sql_prompt }}}} 712 713 Generated Database Context ({dialect}): 714 {{{{ sql_context }}}} 715 716 When scoring, verify: 717 - **Sufficient Tables:** 3-5 core tables plus 1-2 distractor tables. Penalize bare-minimum schemas. 718 - **Distractor Columns:** Each table should include realistic columns beyond those needed for the query. 719 - **Realistic Relationships:** Appropriate PK/FK relationships. Distractor tables should have logical FK links. 720 - **Sample Data Quality:** Realistic, varied INSERT data. 721 - **Executability:** Syntactically correct for {dialect}.\ 722 """ 723 724 CONTEXT_JUDGE_PROMPTS = { 725 "sqlite": _CONTEXT_JUDGE_TEMPLATE.format(dialect="SQLite"), 726 "mysql": _CONTEXT_JUDGE_TEMPLATE.format(dialect="MySQL"), 727 "postgres": _CONTEXT_JUDGE_TEMPLATE.format(dialect="PostgreSQL"), 728 } 729 730 DATA_QUALITY_JUDGE_TEXT = """\ 731 You are an expert in data quality and validation. Score the SQL's handling of messy data. 732 733 ## Natural Language Prompt 734 {{ sql_prompt }} 735 736 ## Data Quality Challenge 737 {{ data_quality_challenge }} / {{ data_quality_concept }} 738 739 ## Database Context 740 {{ sql_context }} 741 742 ## Generated SQL 743 {{ sql }}\ 744 """ 745 746 KNOWLEDGE_JUDGE_TEXT = """\ 747 You are an expert in business intelligence and semantic interpretation. \ 748 Score the SQL's application of implicit business knowledge. 749 750 ## Natural Language Prompt 751 {{ sql_prompt }} 752 753 ## Knowledge Dependency 754 {{ knowledge_dependency }} / {{ knowledge_concept }} 755 756 ## Database Context 757 {{ sql_context }} 758 759 ## Generated SQL 760 {{ sql }}\ 761 """ 762 763 # ============================================================================= 764 # Scoring Rubrics (5 judges, 15 dimensions) 765 # ============================================================================= 766 767 SQL_SCORES = [ 768 dd.Score( 769 name="relevance", 770 description="Uses only necessary tables/columns; ignores distractors", 771 options={ 772 "4": "Perfectly meets all requirements; uses only strictly necessary tables and columns.", 773 "3": "Meets most requirements with minor deviations; may include a slightly unnecessary column.", 774 "2": "Moderate deviation; joins an unnecessary table or selects several irrelevant columns.", 775 "1": "Significant deviations; multiple unnecessary table joins or largely irrelevant output.", 776 "0": "Does not adhere to instructions; query is unrelated or joins many unnecessary tables.", 777 }, 778 ), 779 dd.Score( 780 name="readability", 781 description="Formatting, clarity, and maintainability", 782 options={ 783 "4": "Excellently formatted, meaningful aliases, high readability and ease of maintenance.", 784 "3": "Well-formatted, relatively easy to understand; uses aliases with some organization.", 785 "2": "Somewhat readable with basic formatting but needs improvement.", 786 "1": "Minimal formatting, hard to understand; lacks meaningful names.", 787 "0": "Unreadable, no attempt at formatting.", 788 }, 789 ), 790 dd.Score( 791 name="scalability", 792 description="Scales well with larger datasets; avoids inefficient patterns", 793 options={ 794 "4": "Highly scalable; avoids Cartesian joins and unnecessary table joins.", 795 "3": "Scales well; minor areas for optimization such as an extra join.", 796 "2": "Moderately scalable; includes unnecessary joins or suboptimal access patterns.", 797 "1": "Poor scalability; joins multiple unnecessary tables or uses inefficient patterns.", 798 "0": "Does not scale; overlooks fundamental scalability practices.", 799 }, 800 ), 801 dd.Score( 802 name="standards", 803 description="Compliance with SQL standards and best practices", 804 options={ 805 "4": "Strictly adheres to SQL standards and best practices.", 806 "3": "Closely follows SQL standards and many best practices.", 807 "2": "Generally follows standards but has room for better alignment.", 808 "1": "Loosely follows standards, with several deviations.", 809 "0": "Does not follow standards; uses deprecated or non-standard syntax.", 810 }, 811 ), 812 ] 813 814 PROMPT_SCORES = [ 815 dd.Score( 816 name="naturalness_of_wording", 817 description="How human-like, colloquial, and non-robotic the phrasing is", 818 options={ 819 "4": "Reads like a native speaker; concise, fluent, and natural.", 820 "3": "Generally natural; minor stiffness or formalism.", 821 "2": "Somewhat stilted or templated; noticeable artifacts.", 822 "1": "Robotic or awkward; obviously machine-generated.", 823 "0": "Unnatural and hard to read.", 824 }, 825 ), 826 dd.Score( 827 name="specificity_and_clarity", 828 description="Is the request specific about outputs, filters, and operations?", 829 options={ 830 "4": "Very specific and clear outputs/filters/aggregations; minimal ambiguity.", 831 "3": "Mostly specific; minor ambiguity remains.", 832 "2": "Partially specific; key details are missing.", 833 "1": "Vague; unclear what should be returned or computed.", 834 "0": "Completely ambiguous.", 835 }, 836 ), 837 dd.Score( 838 name="absence_of_sql_jargon", 839 description="Avoids explicit SQL terms, table/column names, or schema hints", 840 options={ 841 "4": "No SQL jargon at all; entirely tool-agnostic phrasing.", 842 "3": "Tiny hints but no explicit SQL or schema leakage.", 843 "2": "Occasional SQL terms or schema leakage present.", 844 "1": "Frequent SQL jargon and schema references.", 845 "0": "Reads like a SQL spec; heavy jargon.", 846 }, 847 ), 848 ] 849 850 DATA_QUALITY_SCORES = [ 851 dd.Score( 852 name="correctness_of_cleaning_logic", 853 description="Does the query correctly and fully clean the messy data?", 854 options={ 855 "4": "Flawless cleaning logic; handles all transformations and edge cases perfectly.", 856 "3": "Correctly cleans data for most cases but might miss minor edge cases.", 857 "2": "Attempts to clean data, but logic is only partially correct.", 858 "1": "Cleaning logic is fundamentally flawed.", 859 "0": "No attempt to clean the data.", 860 }, 861 ), 862 dd.Score( 863 name="efficiency_of_cleaning_method", 864 description="Uses efficient, standard functions for cleaning?", 865 options={ 866 "4": "Highly efficient, optimal SQL functions for the task.", 867 "3": "Correct and standard functions, but a more performant approach exists.", 868 "2": "Convoluted or inefficient method where a simpler one would suffice.", 869 "1": "Very inefficient or non-standard method that would scale poorly.", 870 "0": "Completely inappropriate or non-functional method.", 871 }, 872 ), 873 ] 874 875 KNOWLEDGE_SCORES = [ 876 dd.Score( 877 name="correctness_of_knowledge_application", 878 description="Does the query correctly translate the implicit knowledge into SQL logic?", 879 options={ 880 "4": "Flawlessly translates abstract concept into precise, correct SQL logic.", 881 "3": "Logic correctly reflects the knowledge concept but could be expressed more directly.", 882 "2": "Partially applies the logic, misinterpreting some nuances.", 883 "1": "Fundamentally misinterprets the knowledge concept.", 884 "0": "No attempt to apply the required knowledge.", 885 }, 886 ), 887 dd.Score( 888 name="clarity_of_inference", 889 description="Is the applied logic clear and self-explanatory within the query?", 890 options={ 891 "4": "Logic is immediately obvious through well-chosen aliases, CTEs, or clear filtering.", 892 "3": "Logic is correct but requires some inspection to understand.", 893 "2": "Logic is technically correct but obscure, using magic numbers or hard-to-read conditions.", 894 "1": "Logic is convoluted and appears incorrect.", 895 "0": "Query is completely opaque, with no discernible link to the required knowledge.", 896 }, 897 ), 898 ] 899 900 SCORE_EXTRACTION_MAP = [ 901 ("sql_judge_result", ["relevance", "readability", "scalability", "standards"]), 902 ("sql_context_judge_result", ["relevance", "readability", "scalability", "standards"]), 903 ("sql_prompt_judge_result", ["naturalness_of_wording", "specificity_and_clarity", "absence_of_sql_jargon"]), 904 ("sql_data_quality_judge_result", ["correctness_of_cleaning_logic", "efficiency_of_cleaning_method"]), 905 ("sql_knowledge_judge_result", ["correctness_of_knowledge_application", "clarity_of_inference"]), 906 ] 907 908 if __name__ == "__main__": 909 from argparse import ArgumentParser 910 911 parser = ArgumentParser() 912 parser.add_argument("--model-alias", type=str, default="openai-text") 913 parser.add_argument("--num-records", type=int, default=5) 914 parser.add_argument("--artifact-path", type=str, default=None) 915 parser.add_argument( 916 "--dialect", 917 type=str, 918 default="sqlite", 919 choices=list(SQL_DIALECTS.keys()), 920 help="SQL dialect to generate for (default: sqlite)", 921 ) 922 args = parser.parse_args() 923 924 config_builder = build_config(model_alias=args.model_alias, dialect=args.dialect) 925 results = create_dataset(config_builder, num_records=args.num_records, artifact_path=args.artifact_path) 926 927 print(f"Dataset saved to: {results.artifact_storage.final_dataset_path}") 928 929 results.load_analysis().to_report()