Nemotron Super Search Agent | NVIDIA NeMo Data Designer

Dev Note: For a deep dive into the pipeline design, production yield analysis, correctness challenges, and key takeaways, see "Search Agent SFT Data: Teaching LLMs to Browse the Web".
Seed Dataset: This recipe includes built-in demo seeds (3 Wikidata knowledge graph paths) for quick testing. For production use, generate your own seed dataset from Wikidata random walks — the dev note above describes the seed generation process (SPARQL queries, anti-meta filters, hop range 4-8). Each seed row needs these fields: seed_entity, final_answer_entity, readable_path, num_hops_in_graph, and ground_truth. Pass your seed file (Parquet or JSONL) via --seed-path.
Download Recipe

Download the complete recipe script
1 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2 # SPDX-License-Identifier: Apache-2.0
3 # /// script
4 # requires-python = ">=3.10"
5 # dependencies = [
6 #     "data-designer",
7 # ]
8 # ///
9 """Nemotron Super Search Agent Recipe: Trajectories with Tavily Web Search
10 
11 Generate multi-turn search agent trajectories where an LLM iteratively
12 searches the web, reads results, reasons about evidence, and synthesizes
13 answers -- the kind of data needed to train BrowseComp-style search agents.
14 
15 This recipe implements the pipeline used to produce ~7,000 high-quality
16 tool-use trajectories for Nemotron Super post-training, starting from
17 50,000 Wikidata knowledge graph seeds.
18 
19 Pipeline architecture:
20 
21     ┌─────────────────────────────────────────────────────────────────────────┐
22     │                   STAGE 1: SEED DATA (Wikidata KG Walks)                │
23     │                                                                         │
24     │  Random walks on the Wikidata knowledge graph produce multi-hop paths.  │
25     │  Each seed has: seed_entity, final_answer_entity, readable_path,        │
26     │  num_hops_in_graph, ground_truth.                                       │
27     │  Built-in demo seeds included; bring your own for production.           │
28     ├─────────────────────────────────────────────────────────────────────────┤
29     │                   STAGE 2: SEARCH RIDDLE GENERATION (LLM)               │
30     │                                                                         │
31     │  user_query_draft ────────► user_query_obfuscated                       │
32     │  (chain clues from path,     (BrowseComp-style rewrite:                 │
33     │   hide intermediate nodes,    concise, natural, no breadcrumbs,         │
34     │   don't name the answer)      1-2 sentences max)                        │
35     ├─────────────────────────────────────────────────────────────────────────┤
36     │                   STAGE 3: SEARCH TRAJECTORY ROLLOUTS (LLM + MCP)       │
37     │                                                                         │
38     │  Thought-Action-Observation loop with live Tavily web search.           │
39     │  ├─ tavily_search tool via hosted MCP endpoint                          │
40     │  ├─ Maximum 25 tool call turns; 300s timeout                            │
41     │  ├─ Full trace captured via with_trace=ALL_MESSAGES                     │
42     │  └─ Structured JSON output: final_answer, supporting_urls,              │
43     │     short_justification                                                 │
44     ├─────────────────────────────────────────────────────────────────────────┤
45     │                   STAGE 4: STRUCTURED FORMATTING (LLM)                  │
46     │                                                                         │
47     │  Normalize raw agent output into clean JSON via LLMStructuredColumn.    │
48     │  Handles markdown fences, trailing text, single-quoted dicts.           │
49     │                                                                         │
50     │  The agent_solution_raw__trace column IS the SFT training data:         │
51     │  complete ChatML conversation with every tool call and response.        │
52     └─────────────────────────────────────────────────────────────────────────┘
53 
54 Prerequisites:
55     - TAVILY_API_KEY environment variable (get a free key at https://tavily.com)
56     - OPENAI_API_KEY environment variable for OpenAI provider model aliases.
57     - NVIDIA_API_KEY environment variable for NVIDIA provider model aliases (default model alias is "nvidia-text").
58 
59 Run:
60     # Basic usage with built-in demo seeds (generates 2 trajectories)
61     uv run search_agent.py
62 
63     # Use a custom seed parquet
64     uv run search_agent.py --seed-path /path/to/seeds.parquet --num-records 10
65 
66     # For help message and available options
67     uv run search_agent.py --help
68 """
69 
70 from __future__ import annotations
71 
72 import json
73 import os
74 import tempfile
75 from pathlib import Path
76 
77 from pydantic import BaseModel, Field
78 
79 import data_designer.config as dd
80 from data_designer.interface import DataDesigner
81 
82 # =============================================================================
83 # Structured Output Schema
84 # =============================================================================
85 
86 
class AgentSolution(BaseModel):
    """Schema for the search agent's normalized final answer.

    ``final_answer`` and ``short_justification`` are required, non-empty
    strings; ``supporting_urls`` may be omitted and then defaults to an
    empty list.
    """

    # Required; must contain at least one character.
    final_answer: str = Field(
        ...,
        description="The final answer entity.",
        min_length=1,
    )
    # Optional; a new empty list is created for each instance when omitted.
    supporting_urls: list[str] = Field(
        description="Authoritative URLs used to verify the answer.",
        default_factory=list,
    )
    # Required; must contain at least one character.
    short_justification: str = Field(
        ...,
        description="Brief explanation of reasoning (1-2 sentences).",
        min_length=1,
    )
95 
96 
97 # =============================================================================
98 # Prompt Templates
99 # =============================================================================
100 
# Prompt for the "user_query_draft" column registered in build_config().
# Placeholders: {{ readable_path }}, {{ seed_entity }}, {{ final_answer_entity }}
# (presumably filled from the seed-row columns of the same names -- confirm
# against the Data Designer templating docs).
# The model is told to answer with the literal "INVALID_PATH" for unusable paths.
# The backslashes right after the opening quotes and before the closing quotes
# keep the prompt free of a leading/trailing newline.
QUERY_DRAFT_PROMPT = """\
You are an expert Search Evaluator designing Grandmaster-Level search tests.
Create a complex, multi-step search riddle based on this knowledge path:

{{ readable_path }}

Start Entity: {{ seed_entity }}
Final Answer Entity: {{ final_answer_entity }}

CRITICAL RULES:
1. DO NOT name the intermediate nodes. Hide them behind descriptions.
2. DO NOT name the Final Answer.
3. Chain the clues logically -- describe each step relative to the previous one.
4. Audit the logic: if a step is weak or nonsensical, IGNORE IT.
5. Salvage and simplify: use only the strongest, most logical hops.
6. No hallucinations: do not invent relationships not in the path.
7. Aim for 4-8 meaningful hops.

VALIDATION - Output "INVALID_PATH" if:
- Final answer is generic/abstract (e.g. "technology", "people", "field")
- Path has weak/illogical relationships
- No coherent question can be formed

Return ONLY the question string (or "INVALID_PATH").\
"""
126 
# Prompt for the "user_query_obfuscated" column registered in build_config().
# Placeholders: {{ user_query_draft }} (output of the draft column) plus
# {{ readable_path }}, {{ seed_entity }}, {{ final_answer_entity }}.
# Requirement 6 asks the model to pass the "INVALID_PATH" marker through
# unchanged when the draft stage rejected the path.
OBFUSCATE_PROMPT = """\
Rewrite this search riddle to better match BrowseComp-style tasks.

Original Riddle: {{ user_query_draft }}

Secret Path (do not leak entities): {{ readable_path }}
Start Entity: {{ seed_entity }}
Final Answer (do not leak): {{ final_answer_entity }}

HARD REQUIREMENTS:
1. NEVER reveal the step-by-step plan. No breadcrumb chains.
   Avoid: "X is member of Y; Y is based in Z; Z is the capital of..."
   Avoid meta language: "then search...", "next find...", "follow the chain..."
2. NEVER mention the final answer or any intermediate entity by name.
3. Keep it concise and natural: 1-2 sentences max (3 for very complex paths).
4. Use descriptive clues that require reasoning.
5. Include at least one disambiguating filter (date, count, or specific attribute).
6. If original == "INVALID_PATH", output exactly "INVALID_PATH".

Return ONLY the rewritten question string (or "INVALID_PATH").\
"""
148 
# System prompt for the tool-calling "agent_solution_raw" column in build_config().
# It contains no template placeholders; the question arrives via the column's
# user prompt.
# NOTE: the tool name "tavily_search" and the 25-call budget are repeated in the
# ToolConfig inside build_config() (allow_tools / max_tool_call_turns); keep the
# two in sync when changing either.
# The JSON keys listed here are the same three fields declared on AgentSolution.
AGENT_SYSTEM_PROMPT = """\
You are an expert search agent that uses web search to answer questions accurately.

You MUST output ONLY valid JSON matching this exact schema:

{
  "final_answer": "string - the specific answer entity",
  "supporting_urls": ["url1", "url2"],
  "short_justification": "string - brief 1-2 sentence explanation"
}

AVAILABLE TOOLS:
You have access to ONE tool called "tavily_search" with parameter: query (string, required).

TOOL USAGE RULES:
1. Exact Tool Name: Always use "tavily_search" (no suffixes or prefixes).
2. Exact Args: Only send {"query": "..."} for the tool call.
3. Maximum 25 tool calls. Budget your searches wisely.
4. Search Strategy:
   - Start with broad queries to understand the domain
   - Refine to specific entities/relationships
   - Cross-verify facts across multiple sources
   - Use different query formulations for the same information
5. No Reasoning Tags: Do NOT use <think> tags or XML formatting.
6. No Intermediate Text: Do NOT output explanatory text between tool calls.
7. Final Output: After completing your searches, output ONLY the JSON object.

EXECUTION FLOW:
1. Read the user's question
2. Make tool calls using "tavily_search" to gather information
3. Verify information across multiple sources
4. Once confident, output the JSON result (no additional text)\
"""
182 
# Prompt for the structured "agent_solution" column registered in build_config().
# Placeholder: {{ agent_solution_raw }} (the raw text produced by the agent column).
# The three fields named below are the ones declared on the AgentSolution model,
# which build_config() passes as the column's output_format.
FORMATTER_PROMPT = """\
You are a JSON normalizer.

You will be given a messy model output that should contain a JSON object with:
- final_answer (string)
- supporting_urls (list of strings)
- short_justification (string)

Rules:
- Return ONLY a JSON object. No markdown. No extra text.
- If the input contains code fences, tool chatter, or extra prose, ignore it.
- If the input contains invalid JSON, repair it.
- supporting_urls must be a list of valid http(s) URLs (dedupe, keep best 1-5).

Input:
{{ agent_solution_raw }}\
"""
200 
201 
202 # =============================================================================
203 # Data Designer Configuration
204 # =============================================================================
205 
206 
def build_config(model_alias: str) -> tuple[dd.DataDesignerConfigBuilder, dd.MCPProvider]:
    """Build the Data Designer configuration for search agent trajectory generation.

    Four LLM columns are registered, in this order:

    1. ``user_query_draft``      -- text, rendered from ``QUERY_DRAFT_PROMPT``
    2. ``user_query_obfuscated`` -- text, rendered from ``OBFUSCATE_PROMPT``
    3. ``agent_solution_raw``    -- text with the ``tavily-search`` tool alias
       and ``with_trace=ALL_MESSAGES``
    4. ``agent_solution``        -- structured, validated against ``AgentSolution``

    The Tavily API key is read from the ``TAVILY_API_KEY`` environment variable
    and embedded in the MCP endpoint URL. This function does not raise when the
    variable is unset; the caller is expected to validate it beforehand.

    Args:
        model_alias: Model alias used for every LLM column.

    Returns:
        A tuple of (config_builder, mcp_provider).
    """
    from urllib.parse import quote

    # The key ends up inside a URL query string: drop stray whitespace (e.g. a
    # trailing newline from a secrets file) and percent-encode it so characters
    # such as "&", "#" or "+" cannot corrupt the endpoint.
    tavily_api_key = os.environ.get("TAVILY_API_KEY", "").strip()
    encoded_api_key = quote(tavily_api_key, safe="")
    mcp_provider = dd.MCPProvider(
        name="tavily",
        endpoint=f"https://mcp.tavily.com/mcp/?tavilyApiKey={encoded_api_key}",
        provider_type="streamable_http",
    )

    # NOTE: the tool name and the 25-turn budget are also spelled out in
    # AGENT_SYSTEM_PROMPT; keep both places consistent.
    tool_config = dd.ToolConfig(
        tool_alias="tavily-search",
        providers=["tavily"],
        allow_tools=["tavily_search"],
        max_tool_call_turns=25,
        timeout_sec=300.0,
    )

    config_builder = dd.DataDesignerConfigBuilder(tool_configs=[tool_config])

    # Stage 2: Draft question from knowledge path
    config_builder.add_column(
        dd.LLMTextColumnConfig(
            name="user_query_draft",
            model_alias=model_alias,
            prompt=QUERY_DRAFT_PROMPT,
        )
    )

    # Stage 2: BrowseComp-style obfuscation
    config_builder.add_column(
        dd.LLMTextColumnConfig(
            name="user_query_obfuscated",
            model_alias=model_alias,
            prompt=OBFUSCATE_PROMPT,
        )
    )

    # Stage 3: Agent trajectory with MCP tool calling
    config_builder.add_column(
        dd.LLMTextColumnConfig(
            name="agent_solution_raw",
            model_alias=model_alias,
            system_prompt=AGENT_SYSTEM_PROMPT,
            prompt="Problem: {{ user_query_obfuscated }}",
            tool_alias="tavily-search",
            with_trace=dd.TraceType.ALL_MESSAGES,
        )
    )

    # Stage 4: Structured JSON formatting
    config_builder.add_column(
        dd.LLMStructuredColumnConfig(
            name="agent_solution",
            model_alias=model_alias,
            prompt=FORMATTER_PROMPT,
            output_format=AgentSolution,
        )
    )

    return config_builder, mcp_provider
271 
272 
273 # =============================================================================
274 # Demo Seed Data
275 # =============================================================================
276 
# Built-in demo seeds: three Wikidata knowledge-graph walks (4, 2 and 8 hops).
# Every row carries the five seed fields the pipeline expects; each
# "readable_path" is a newline-separated rendering of the walk with no
# trailing newline.
DEMO_SEEDS = [
    {
        "seed_entity": "NVIDIA",
        "final_answer_entity": "Thomas Hart Benton",
        "readable_path": "\n".join(
            [
                "START ENTITY: NVIDIA (Q182477)",
                "  \u2b07 [chief executive officer (P169)]",
                "  NODE: Jensen Huang (Q332838)",
                "  \u2b07 [educated at (P69)]",
                "  NODE: Oregon State University (Q861888)",
                "  \u2b07 [located in the administrative territorial entity (P131)]",
                "  NODE: Benton County (Q115372)",
                "  \u2b07 [named after (P138)]",
                "  NODE: Thomas Hart Benton (Q178712)",
            ]
        ),
        "num_hops_in_graph": 4,
        "ground_truth": "Thomas Hart Benton",
    },
    {
        "seed_entity": "Python",
        "final_answer_entity": "Centrum Wiskunde & Informatica",
        "readable_path": "\n".join(
            [
                "START ENTITY: Python (Q28865)",
                "  \u2b07 [developer (P178)]",
                "  NODE: Guido van Rossum (Q19845)",
                "  \u2b07 [employer (P108)]",
                "  NODE: Centrum Wiskunde & Informatica (Q1060645)",
            ]
        ),
        "num_hops_in_graph": 2,
        "ground_truth": "Centrum Wiskunde & Informatica",
    },
    {
        "seed_entity": "toothache",
        "final_answer_entity": "ibuprofen",
        "readable_path": "\n".join(
            [
                "START ENTITY: toothache (Q143925)",
                "  \u2b07 [risk factor (P564)]",
                "  NODE: smoking (Q662860)",
                "  \u2b07 [has effect (P1542)]",
                "  NODE: Crohn's disease (Q1472)",
                "  \u2b07 [drug or therapy used for treatment (P2176)]",
                "  NODE: TNF inhibitor (Q1536078)",
                "  \u2b07 [is possible treatment of (P2175)]",
                "  NODE: Beh\u00e7et's disease (Q911427)",
                "  \u2b07 [symptoms and signs (P780)]",
                "  NODE: inflammation (Q101991)",
                "  \u2b07 [drug or therapy used for treatment (P2176)]",
                "  NODE: flurbiprofen (Q419890)",
                "  \u2b07 [significant drug interaction (P769)]",
                "  NODE: parecoxib (Q347941)",
                "  \u2b07 [significant drug interaction (P769)]",
                "  NODE: ibuprofen (Q186969)",
            ]
        ),
        "num_hops_in_graph": 8,
        "ground_truth": "ibuprofen",
    },
]
334 
335 
def write_demo_seeds(output_dir: Path) -> Path:
    """Dump ``DEMO_SEEDS`` as ``demo_seeds.jsonl`` inside ``output_dir``.

    The directory (and any missing parents) is created first. Each seed is
    written as one UTF-8 JSON line with non-ASCII characters kept literal.

    Returns:
        Path of the JSONL file that was written.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    jsonl_file = output_dir / "demo_seeds.jsonl"
    with jsonl_file.open("w", encoding="utf-8") as handle:
        handle.writelines(json.dumps(record, ensure_ascii=False) + "\n" for record in DEMO_SEEDS)
    return jsonl_file
344 
345 
346 # =============================================================================
347 # Main Entry Point
348 # =============================================================================
349 
350 
def parse_args():
    """Read the recipe's command line options from ``sys.argv``.

    Returns:
        The parsed namespace with ``model_alias``, ``num_records``,
        ``seed_path`` and ``artifact_path`` attributes.
    """
    import argparse

    cli = argparse.ArgumentParser(
        description="Generate search agent trajectories using Tavily web search via MCP."
    )

    # (flag, value type, default, help text) -- registered in this order.
    option_table = (
        ("--model-alias", str, "nvidia-text", "Model alias to use for generation"),
        ("--num-records", int, 2, "Number of trajectories to generate"),
        ("--seed-path", str, None, "Path to seed parquet or JSONL file"),
        ("--artifact-path", str, None, "Path to save artifacts"),
    )
    for flag, value_type, fallback, help_text in option_table:
        cli.add_argument(flag, type=value_type, default=fallback, help=help_text)

    return cli.parse_args()
361 
362 
def main() -> None:
    """Main entry point for the demo.

    Parses the CLI options, validates the required API keys, picks the seed
    file (``--seed-path`` or freshly written demo seeds in a new temp
    directory), runs a Data Designer preview and displays one sample record.

    Raises:
        RuntimeError: If ``TAVILY_API_KEY`` is unset or blank, or if
            ``NVIDIA_API_KEY`` is unset or blank while the model alias starts
            with ``"nvidia"``.
    """
    args = parse_args()

    # An exported-but-empty variable (e.g. `export TAVILY_API_KEY=`) is just as
    # unusable as an unset one, so reject blank values up front instead of
    # failing later with an opaque authentication error.
    if not os.environ.get("TAVILY_API_KEY", "").strip():
        raise RuntimeError("TAVILY_API_KEY must be set. Get a free key at https://tavily.com")

    if args.model_alias.startswith("nvidia") and not os.environ.get("NVIDIA_API_KEY", "").strip():
        raise RuntimeError("NVIDIA_API_KEY must be set when using NVIDIA model aliases.")

    if args.seed_path:
        seed_path = args.seed_path
    else:
        # The temp directory is intentionally left on disk and its location
        # printed so the demo seeds can be inspected after the run.
        demo_dir = Path(tempfile.mkdtemp(prefix="search_agent_demo_"))
        seed_path = str(write_demo_seeds(demo_dir))
        print(f"Using demo seeds in: {demo_dir}")

    config_builder, mcp_provider = build_config(model_alias=args.model_alias)
    config_builder.with_seed_dataset(
        dd.LocalFileSeedSource(path=seed_path),
        sampling_strategy=dd.SamplingStrategy.SHUFFLE,
    )

    data_designer = DataDesigner(artifact_path=args.artifact_path, mcp_providers=[mcp_provider])
    preview_results = data_designer.preview(config_builder, num_records=args.num_records)

    print("\n" + "=" * 60)
    print("GENERATED SEARCH AGENT TRAJECTORIES")
    print("=" * 60)
    preview_results.display_sample_record()
393 
394 
395 if __name__ == "__main__":
396     main()