Dev Note: For a deep dive into the pipeline design, production yield analysis, correctness challenges, and key takeaways, see "Search Agent SFT Data: Teaching LLMs to Browse the Web".
Seed Dataset
This recipe includes built-in demo seeds (3 Wikidata knowledge graph paths) for quick testing. For production use, generate your own seed dataset from Wikidata random walks — the dev note above describes the seed generation process (SPARQL queries, anti-meta filters, hop range 4-8). Each seed row needs the following fields: `seed_entity`, `final_answer_entity`, `readable_path`, `num_hops_in_graph`, and `ground_truth`. Pass your seed file (Parquet or JSONL) via `--seed-path`.
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "data-designer",
# ]
# ///
"""Nemotron Super Search Agent Recipe: Trajectories with Tavily Web Search

Generate multi-turn search agent trajectories where an LLM iteratively
searches the web, reads results, reasons about evidence, and synthesizes
answers -- the kind of data needed to train BrowseComp-style search agents.

This recipe implements the pipeline used to produce ~7,000 high-quality
tool-use trajectories for Nemotron Super post-training, starting from
50,000 Wikidata knowledge graph seeds.

Pipeline architecture:

    ┌─────────────────────────────────────────────────────────────────────────┐
    │  STAGE 1: SEED DATA (Wikidata KG Walks)                                 │
    │                                                                         │
    │  Random walks on the Wikidata knowledge graph produce multi-hop paths.  │
    │  Each seed has: seed_entity, final_answer_entity, readable_path,        │
    │  num_hops_in_graph, ground_truth.                                       │
    │  Built-in demo seeds included; bring your own for production.           │
    ├─────────────────────────────────────────────────────────────────────────┤
    │  STAGE 2: SEARCH RIDDLE GENERATION (LLM)                                │
    │                                                                         │
    │  user_query_draft ────────► user_query_obfuscated                       │
    │  (chain clues from path,    (BrowseComp-style rewrite:                  │
    │   hide intermediate nodes,   concise, natural, no breadcrumbs,          │
    │   don't name the answer)     1-2 sentences max)                         │
    ├─────────────────────────────────────────────────────────────────────────┤
    │  STAGE 3: SEARCH TRAJECTORY ROLLOUTS (LLM + MCP)                        │
    │                                                                         │
    │  Thought-Action-Observation loop with live Tavily web search.           │
    │  ├─ tavily_search tool via hosted MCP endpoint                          │
    │  ├─ Maximum 25 tool call turns; 300s timeout                            │
    │  ├─ Full trace captured via with_trace=ALL_MESSAGES                     │
    │  └─ Structured JSON output: final_answer, supporting_urls,              │
    │     short_justification                                                 │
    ├─────────────────────────────────────────────────────────────────────────┤
    │  STAGE 4: STRUCTURED FORMATTING (LLM)                                   │
    │                                                                         │
    │  Normalize raw agent output into clean JSON via LLMStructuredColumn.    │
    │  Handles markdown fences, trailing text, single-quoted dicts.           │
    │                                                                         │
    │  The agent_solution_raw__trace column IS the SFT training data:         │
    │  complete ChatML conversation with every tool call and response.        │
    └─────────────────────────────────────────────────────────────────────────┘

Prerequisites:
    - TAVILY_API_KEY environment variable (get a free key at https://tavily.com)
    - OPENAI_API_KEY environment variable for OpenAI provider model aliases.
    - NVIDIA_API_KEY environment variable for NVIDIA provider model aliases (default model alias is "nvidia-text").

Run:
    # Basic usage with built-in demo seeds (generates 2 trajectories)
    uv run search_agent.py

    # Use a custom seed parquet
    uv run search_agent.py --seed-path /path/to/seeds.parquet --num-records 10

    # For help message and available options
    uv run search_agent.py --help
"""

from __future__ import annotations

import json
import os
import tempfile
from pathlib import Path
from urllib.parse import quote

from pydantic import BaseModel, Field

import data_designer.config as dd
from data_designer.interface import DataDesigner

# =============================================================================
# Structured Output Schema
# =============================================================================


class AgentSolution(BaseModel):
    """Structured output for the search agent's final answer.

    Used as the ``output_format`` of the Stage 4 formatter column, so the
    field names here must match the JSON keys requested in
    ``AGENT_SYSTEM_PROMPT`` and ``FORMATTER_PROMPT``.
    """

    # Required and non-empty: an empty answer fails validation.
    final_answer: str = Field(..., min_length=1, description="The final answer entity.")
    # Optional; defaults to an empty list when the agent cites no sources.
    supporting_urls: list[str] = Field(
        default_factory=list, description="Authoritative URLs used to verify the answer."
    )
    # Required and non-empty.
    short_justification: str = Field(..., min_length=1, description="Brief explanation of reasoning (1-2 sentences).")


# =============================================================================
# Prompt Templates
# =============================================================================

# Stage 2a: turns a knowledge-graph path into a first-draft riddle. The
# {{ ... }} placeholders are filled from the seed dataset columns.
QUERY_DRAFT_PROMPT = """\
You are an expert Search Evaluator designing Grandmaster-Level search tests.
Create a complex, multi-step search riddle based on this knowledge path:

{{ readable_path }}

Start Entity: {{ seed_entity }}
Final Answer Entity: {{ final_answer_entity }}

CRITICAL RULES:
1. DO NOT name the intermediate nodes. Hide them behind descriptions.
2. DO NOT name the Final Answer.
3. Chain the clues logically -- describe each step relative to the previous one.
4. Audit the logic: if a step is weak or nonsensical, IGNORE IT.
5. Salvage and simplify: use only the strongest, most logical hops.
6. No hallucinations: do not invent relationships not in the path.
7. Aim for 4-8 meaningful hops.

VALIDATION - Output "INVALID_PATH" if:
- Final answer is generic/abstract (e.g. "technology", "people", "field")
- Path has weak/illogical relationships
- No coherent question can be formed

Return ONLY the question string (or "INVALID_PATH").\
"""

# Stage 2b: rewrites the draft riddle into a concise BrowseComp-style question.
# Consumes the `user_query_draft` column produced by QUERY_DRAFT_PROMPT.
OBFUSCATE_PROMPT = """\
Rewrite this search riddle to better match BrowseComp-style tasks.

Original Riddle: {{ user_query_draft }}

Secret Path (do not leak entities): {{ readable_path }}
Start Entity: {{ seed_entity }}
Final Answer (do not leak): {{ final_answer_entity }}

HARD REQUIREMENTS:
1. NEVER reveal the step-by-step plan. No breadcrumb chains.
   Avoid: "X is member of Y; Y is based in Z; Z is the capital of..."
   Avoid meta language: "then search...", "next find...", "follow the chain..."
2. NEVER mention the final answer or any intermediate entity by name.
3. Keep it concise and natural: 1-2 sentences max (3 for very complex paths).
4. Use descriptive clues that require reasoning.
5. Include at least one disambiguating filter (date, count, or specific attribute).
6. If original == "INVALID_PATH", output exactly "INVALID_PATH".

Return ONLY the rewritten question string (or "INVALID_PATH").\
"""

# Stage 3: system prompt for the tool-calling agent. The tool name and the
# 25-call budget stated here correspond to the ToolConfig in build_config().
AGENT_SYSTEM_PROMPT = """\
You are an expert search agent that uses web search to answer questions accurately.

You MUST output ONLY valid JSON matching this exact schema:

{
  "final_answer": "string - the specific answer entity",
  "supporting_urls": ["url1", "url2"],
  "short_justification": "string - brief 1-2 sentence explanation"
}

AVAILABLE TOOLS:
You have access to ONE tool called "tavily_search" with parameter: query (string, required).

TOOL USAGE RULES:
1. Exact Tool Name: Always use "tavily_search" (no suffixes or prefixes).
2. Exact Args: Only send {"query": "..."} for the tool call.
3. Maximum 25 tool calls. Budget your searches wisely.
4. Search Strategy:
   - Start with broad queries to understand the domain
   - Refine to specific entities/relationships
   - Cross-verify facts across multiple sources
   - Use different query formulations for the same information
5. No Reasoning Tags: Do NOT use <think> tags or XML formatting.
6. No Intermediate Text: Do NOT output explanatory text between tool calls.
7. Final Output: After completing your searches, output ONLY the JSON object.

EXECUTION FLOW:
1. Read the user's question
2. Make tool calls using "tavily_search" to gather information
3. Verify information across multiple sources
4. Once confident, output the JSON result (no additional text)\
"""

# Stage 4: asks a model to repair/normalize the raw agent output into the
# AgentSolution schema. Consumes the `agent_solution_raw` column.
FORMATTER_PROMPT = """\
You are a JSON normalizer.

You will be given a messy model output that should contain a JSON object with:
- final_answer (string)
- supporting_urls (list of strings)
- short_justification (string)

Rules:
- Return ONLY a JSON object. No markdown. No extra text.
- If the input contains code fences, tool chatter, or extra prose, ignore it.
- If the input contains invalid JSON, repair it.
- supporting_urls must be a list of valid http(s) URLs (dedupe, keep best 1-5).

Input:
{{ agent_solution_raw }}\
"""


# =============================================================================
# Data Designer Configuration
# =============================================================================


def build_config(model_alias: str) -> tuple[dd.DataDesignerConfigBuilder, dd.MCPProvider]:
    """Build the Data Designer configuration for search agent trajectory generation.

    Registers four LLM columns, in dependency order: ``user_query_draft``,
    ``user_query_obfuscated``, ``agent_solution_raw`` (tool-calling, with the
    full message trace captured) and ``agent_solution`` (structured JSON).

    Args:
        model_alias: Model alias used for every LLM column in the pipeline.

    Returns:
        A tuple of (config_builder, mcp_provider). The seed dataset is not
        attached here; the caller adds it to the returned builder.
    """
    # The key travels as a URL query parameter, so percent-encode it: a stray
    # reserved character ("&", "#", "+", whitespace, ...) would otherwise
    # truncate or corrupt the endpoint URL. A missing variable yields an empty
    # key here; main() rejects that case before calling this function.
    tavily_api_key = quote(os.environ.get("TAVILY_API_KEY", ""), safe="")
    mcp_provider = dd.MCPProvider(
        name="tavily",
        endpoint=f"https://mcp.tavily.com/mcp/?tavilyApiKey={tavily_api_key}",
        provider_type="streamable_http",
    )

    # Only the single search tool is exposed; the turn budget matches the
    # "Maximum 25 tool calls" rule given to the agent in AGENT_SYSTEM_PROMPT.
    tool_config = dd.ToolConfig(
        tool_alias="tavily-search",
        providers=["tavily"],
        allow_tools=["tavily_search"],
        max_tool_call_turns=25,
        timeout_sec=300.0,
    )

    config_builder = dd.DataDesignerConfigBuilder(tool_configs=[tool_config])

    # Stage 2: Draft question from knowledge path
    config_builder.add_column(
        dd.LLMTextColumnConfig(
            name="user_query_draft",
            model_alias=model_alias,
            prompt=QUERY_DRAFT_PROMPT,
        )
    )

    # Stage 2: BrowseComp-style obfuscation
    config_builder.add_column(
        dd.LLMTextColumnConfig(
            name="user_query_obfuscated",
            model_alias=model_alias,
            prompt=OBFUSCATE_PROMPT,
        )
    )

    # Stage 3: Agent trajectory with MCP tool calling
    config_builder.add_column(
        dd.LLMTextColumnConfig(
            name="agent_solution_raw",
            model_alias=model_alias,
            system_prompt=AGENT_SYSTEM_PROMPT,
            prompt="Problem: {{ user_query_obfuscated }}",
            tool_alias="tavily-search",
            with_trace=dd.TraceType.ALL_MESSAGES,
        )
    )

    # Stage 4: Structured JSON formatting
    config_builder.add_column(
        dd.LLMStructuredColumnConfig(
            name="agent_solution",
            model_alias=model_alias,
            prompt=FORMATTER_PROMPT,
            output_format=AgentSolution,
        )
    )

    return config_builder, mcp_provider


# =============================================================================
# Demo Seed Data
# =============================================================================

# Three hand-written knowledge-graph paths (4, 2 and 8 hops) with the same
# five fields a production seed row needs.
DEMO_SEEDS = [
    {
        "seed_entity": "NVIDIA",
        "final_answer_entity": "Thomas Hart Benton",
        "readable_path": (
            "START ENTITY: NVIDIA (Q182477)\n"
            " \u2b07 [chief executive officer (P169)]\n"
            " NODE: Jensen Huang (Q332838)\n"
            " \u2b07 [educated at (P69)]\n"
            " NODE: Oregon State University (Q861888)\n"
            " \u2b07 [located in the administrative territorial entity (P131)]\n"
            " NODE: Benton County (Q115372)\n"
            " \u2b07 [named after (P138)]\n"
            " NODE: Thomas Hart Benton (Q178712)"
        ),
        "num_hops_in_graph": 4,
        "ground_truth": "Thomas Hart Benton",
    },
    {
        "seed_entity": "Python",
        "final_answer_entity": "Centrum Wiskunde & Informatica",
        "readable_path": (
            "START ENTITY: Python (Q28865)\n"
            " \u2b07 [developer (P178)]\n"
            " NODE: Guido van Rossum (Q19845)\n"
            " \u2b07 [employer (P108)]\n"
            " NODE: Centrum Wiskunde & Informatica (Q1060645)"
        ),
        "num_hops_in_graph": 2,
        "ground_truth": "Centrum Wiskunde & Informatica",
    },
    {
        "seed_entity": "toothache",
        "final_answer_entity": "ibuprofen",
        "readable_path": (
            "START ENTITY: toothache (Q143925)\n"
            " \u2b07 [risk factor (P564)]\n"
            " NODE: smoking (Q662860)\n"
            " \u2b07 [has effect (P1542)]\n"
            " NODE: Crohn's disease (Q1472)\n"
            " \u2b07 [drug or therapy used for treatment (P2176)]\n"
            " NODE: TNF inhibitor (Q1536078)\n"
            " \u2b07 [is possible treatment of (P2175)]\n"
            " NODE: Beh\u00e7et's disease (Q911427)\n"
            " \u2b07 [symptoms and signs (P780)]\n"
            " NODE: inflammation (Q101991)\n"
            " \u2b07 [drug or therapy used for treatment (P2176)]\n"
            " NODE: flurbiprofen (Q419890)\n"
            " \u2b07 [significant drug interaction (P769)]\n"
            " NODE: parecoxib (Q347941)\n"
            " \u2b07 [significant drug interaction (P769)]\n"
            " NODE: ibuprofen (Q186969)"
        ),
        "num_hops_in_graph": 8,
        "ground_truth": "ibuprofen",
    },
]


def write_demo_seeds(output_dir: Path) -> Path:
    """Write the built-in demo seeds to a JSONL file.

    Args:
        output_dir: Directory to write into; created (with parents) if missing.

    Returns:
        Path of the written ``demo_seeds.jsonl`` file, one JSON object per line.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    seed_path = output_dir / "demo_seeds.jsonl"
    with open(seed_path, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII characters (e.g. the arrow glyph
        # in readable_path) as literal UTF-8 rather than \\uXXXX escapes.
        f.writelines(json.dumps(seed, ensure_ascii=False) + "\n" for seed in DEMO_SEEDS)
    return seed_path


# =============================================================================
# Main Entry Point
# =============================================================================


def parse_args():
    """Parse command line arguments.

    Returns:
        The parsed ``argparse.Namespace`` with ``model_alias``, ``num_records``,
        ``seed_path`` and ``artifact_path`` attributes.
    """
    from argparse import ArgumentParser

    parser = ArgumentParser(description="Generate search agent trajectories using Tavily web search via MCP.")
    parser.add_argument("--model-alias", type=str, default="nvidia-text", help="Model alias to use for generation")
    parser.add_argument("--num-records", type=int, default=2, help="Number of trajectories to generate")
    parser.add_argument("--seed-path", type=str, default=None, help="Path to seed parquet or JSONL file")
    parser.add_argument("--artifact-path", type=str, default=None, help="Path to save artifacts")
    return parser.parse_args()


def main() -> None:
    """Run the recipe: validate credentials, pick seeds, generate a preview, display a sample.

    Raises:
        RuntimeError: If ``TAVILY_API_KEY`` is unset or empty, or if an
            ``nvidia*`` model alias is selected while ``NVIDIA_API_KEY`` is
            unset or empty.
    """
    args = parse_args()

    # Truthiness rather than `is None`: a variable exported as an empty string
    # is just as unusable as a missing one, so fail fast with a clear message
    # instead of letting the pipeline start with a blank credential.
    if not os.environ.get("TAVILY_API_KEY"):
        raise RuntimeError("TAVILY_API_KEY must be set. Get a free key at https://tavily.com")

    if args.model_alias.startswith("nvidia") and not os.environ.get("NVIDIA_API_KEY"):
        raise RuntimeError("NVIDIA_API_KEY must be set when using NVIDIA model aliases.")

    if args.seed_path:
        seed_path = args.seed_path
    else:
        # No seed file given: materialize the built-in demo seeds. The
        # temporary directory is not removed by this script, so the path
        # printed below remains valid after the run.
        demo_dir = Path(tempfile.mkdtemp(prefix="search_agent_demo_"))
        seed_path = str(write_demo_seeds(demo_dir))
        print(f"Using demo seeds in: {demo_dir}")

    config_builder, mcp_provider = build_config(model_alias=args.model_alias)
    config_builder.with_seed_dataset(
        dd.LocalFileSeedSource(path=seed_path),
        sampling_strategy=dd.SamplingStrategy.SHUFFLE,
    )

    data_designer = DataDesigner(artifact_path=args.artifact_path, mcp_providers=[mcp_provider])
    preview_results = data_designer.preview(config_builder, num_records=args.num_records)

    print("\n" + "=" * 60)
    print("GENERATED SEARCH AGENT TRAJECTORIES")
    print("=" * 60)
    preview_results.display_sample_record()


if __name__ == "__main__":
    main()