This recipe ingests built-in agent rollout traces with AgentRolloutSeedSource(...), selecting the format with
--format and optionally overriding the input directory with --trace-dir. It works with atif, claude_code,
codex, and hermes_agent; atif expects standalone .json trajectory files and requires --trace-dir, while
claude_code, codex, and hermes_agent can use their default locations when --trace-dir is omitted. The pipeline turns each
imported trace into a compact task digest, a standalone instruction-response pair for coding-assistant SFT, and a
judge-scored quality signal you can use for downstream filtering. It supports both full dataset creation and in-memory
preview mode via --preview.
Looking for ingestion details? See Agent Rollout Ingestion for supported formats, default paths, normalized columns, and rollout-specific parsing behavior. This recipe stays focused on the distillation pipeline.
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "data-designer",
#     "pydantic",
# ]
# ///
"""Agent Rollout Trace Distillation Recipe

Read agent rollout traces from disk and turn them into a practical
supervised fine-tuning dataset for coding assistants.

This recipe demonstrates:
    - ingesting built-in agent rollout formats with `AgentRolloutSeedSource`
    - distilling traces into compact task digests
    - generating standalone instruction-response training examples
    - scoring each candidate for SFT utility with an LLM judge
    - flattening the result into convenient `sft_instruction` / `sft_response` columns

Prerequisites:
    - NVIDIA_API_KEY environment variable for NVIDIA provider model aliases (default model alias is "nvidia-super").
    - Agent rollout files for one of the built-in formats. `atif` expects standalone JSON trajectory files and
      requires `--trace-dir`. `claude_code`, `codex`, and `hermes_agent` can use their default locations when
      `--trace-dir` is omitted.

Run:
    uv run agent_rollout_distillation.py --format atif --trace-dir ./atif_traces
    uv run agent_rollout_distillation.py --format claude_code
    uv run agent_rollout_distillation.py --format codex --shuffle --num-records 20
    uv run agent_rollout_distillation.py --format hermes_agent --num-records 20
    uv run agent_rollout_distillation.py --format claude_code --num-records 32 --preview
    uv run agent_rollout_distillation.py --format codex --partition-index 0 --num-partitions 8
"""

from __future__ import annotations

from argparse import ArgumentParser
from pathlib import Path
from typing import Literal

from pydantic import BaseModel, Field

import data_designer.config as dd
from data_designer.config.preview_results import PreviewResults
from data_designer.interface import DataDesigner, DatasetCreationResults


# Structured output schema for the first LLM step: a compact summary of one trace.
# NOTE: documented with `#` comments on purpose -- a class docstring would become part of
# the JSON schema description and therefore change what is sent to the model.
class AgentRolloutTraceDigest(BaseModel):
    user_goal: str = Field(..., description="Standalone summary of the concrete user or delegated agent task.")
    repository_context: str = Field(
        ...,
        description="The repo, codebase, or environment context that materially shaped the task.",
    )
    task_type: str = Field(..., description="Short label for the kind of work in the trace.")
    notable_actions: list[str] = Field(
        ...,
        min_length=1,
        max_length=6,
        description="Most important assistant actions, tools, or repo operations from the trace.",
    )
    useful_outcome: str = Field(
        ...,
        description="The most useful result, conclusion, or next-step learned from the trace.",
    )
    training_value: Literal["high", "medium", "low"] = Field(
        ...,
        description="Assessment of whether this trace is a good source for assistant fine-tuning.",
    )
    quality_notes: str = Field(
        ...,
        description="Short note about anything that makes the trace especially useful, narrow, noisy, or partial.",
    )


# Structured output schema for the second LLM step: one instruction-response training example.
# (Same note as above: `#` comments keep the generated JSON schema unchanged.)
class AgentRolloutFinetuningRecord(BaseModel):
    instruction: str = Field(
        ...,
        description="A standalone user request suitable for supervised fine-tuning of a coding assistant.",
    )
    response: str = Field(
        ...,
        description="A grounded assistant response that helps with the instruction without inventing unsupported details.",
    )
    skill_tags: list[str] = Field(
        ...,
        min_length=1,
        max_length=6,
        description="Short tags describing the skills demonstrated in the example.",
    )
    difficulty: Literal["easy", "medium", "hard"] = Field(
        ...,
        description="Approximate difficulty of the resulting training example.",
    )


TRACE_DIGEST_SYSTEM_PROMPT = """\
You are curating real coding-assistant traces into training data for supervised fine-tuning.
Extract the practical substance of the task without copying long code blocks, logs, or markdown verbatim.
Prefer concrete repo work over generic chatter. If the trace is a sidechain, capture the delegated subtask accurately.
"""


TRACE_DIGEST_PROMPT = """\
Create a compact trace digest from this agent rollout seed row.

<trace_metadata>
trace_id: {{ trace_id }}
source_kind: {{ source_kind }}
root_session_id: {{ root_session_id }}
agent_id: {{ agent_id }}
is_sidechain: {{ is_sidechain }}
project_path: {{ project_path }}
cwd: {{ cwd }}
git_branch: {{ git_branch }}
message_count: {{ message_count }}
tool_call_count: {{ tool_call_count }}
source_meta: {{ source_meta }}
</trace_metadata>

<trace_opening_messages>
{{ messages[:4] }}
</trace_opening_messages>

<trace_closing_messages>
{{ messages[-4:] }}
</trace_closing_messages>

<final_assistant_message>
{{ final_assistant_message }}
</final_assistant_message>

Requirements:
- Summarize; do not paste long code, logs, or markdown sections.
- Focus on the actual task, the repo context, the key actions, and the useful outcome.
- Mark `training_value` as `high` only when the trace teaches a concrete, reusable assistant behavior.
- Use `medium` when the trace is somewhat useful but noisy or partial.
- Use `low` when the trace is mostly bookkeeping, suggestion-mode filler, or too trace-specific to teach well.
"""


SFT_RECORD_SYSTEM_PROMPT = """\
You create high-quality supervised fine-tuning examples for coding assistants.
Produce standalone instruction-response pairs that teach useful technical behavior.
The trace digest is authoritative. Do not invent file paths, commands, config keys, package names, APIs, or code that are not clearly supported by it.
If the digest suggests there was a strong implementation example but does not provide its exact contents, give grounded guidance and structure rather than fabricated snippets.
Prefer plain-language implementation guidance over code blocks, config fragments, or shell commands.
"""


SFT_RECORD_PROMPT = """\
Transform this trace digest into one strong supervised fine-tuning example for a coding assistant.

<trace_digest>
{{ trace_digest }}
</trace_digest>

Requirements:
- The instruction must be self-contained and realistic.
- Do not mention the trace, session, seed row, or that this was distilled from prior work.
- Preserve repo context only when it materially helps the task.
- The response should answer the instruction as a strong assistant would, not narrate what happened in the trace.
- Prefer actionable technical help over retrospective summaries.
- Avoid placeholders like TODO, <path>, or "I would".
- If the original trace was partial or blocked, write the best next-step assistant response to move the task forward.
- Do not fabricate commands, file paths, config blocks, code, package names, or API names unless they are explicitly justified by the digest.
- If the digest only supports high-level guidance, return a high-level answer with concrete checks, structure, and cautions rather than made-up implementation details.
- Prefer short numbered or bulleted steps in plain language. Avoid code fences and command examples unless the digest explicitly contains those exact details.
- If the digest mentions that a preview or validation run happened but does not provide the exact invocation, describe that step generically instead of inventing the command.
- Keep the response concise and high-signal, ideally under 220 words.
"""


SFT_JUDGE_SYSTEM_PROMPT = """\
You are a strict curator for coding-assistant supervised fine-tuning data.
Use the trace digest as the source of truth and score whether the candidate example is worth keeping.
Invented implementation details are a serious defect. If the response fabricates commands, code, config keys, file names, APIs, or package details not supported by the digest, score it harshly.
"""


SFT_JUDGE_PROMPT = """\
Evaluate this candidate supervised fine-tuning example for a coding assistant.

Trace digest:
{{ trace_digest }}

Candidate instruction:
{{ sft_record.instruction }}

Candidate response:
{{ sft_record.response }}

Hard rules:
- Penalize invented commands, code, config keys, file names, APIs, or package details that are not explicitly justified by the digest.
- Prefer grounded advisory answers over fabricated implementation snippets.
"""


SFT_JUDGE_SCORES = [
    dd.Score(
        name="groundedness",
        description="Is the candidate example clearly grounded in the trace digest rather than generic filler?",
        options={
            4: "Strongly grounded in the trace digest with concrete task fidelity.",
            3: "Mostly grounded but slightly generic or overgeneralized.",
            2: "Partially grounded but missing important trace-specific substance.",
            1: "Weakly grounded and mostly generic.",
            0: "Not grounded in the trace digest.",
        },
    ),
    dd.Score(
        name="standalone_task",
        description="Would a new reader understand the instruction without seeing the underlying trace?",
        options={
            4: "Fully standalone and immediately understandable.",
            3: "Mostly standalone with minor missing context.",
            2: "Understandable but noticeably dependent on hidden trace context.",
            1: "Hard to understand without the trace.",
            0: "Not standalone.",
        },
    ),
    dd.Score(
        name="response_quality",
        description="How helpful, technically specific, and instruction-following is the assistant response?",
        options={
            4: "Highly useful, technically specific, and directly responsive.",
            3: "Useful overall with minor omissions or verbosity.",
            2: "Partially helpful but shallow, vague, or uneven.",
            1: "Low-quality response with major gaps.",
            0: "Unhelpful or incorrect response.",
        },
    ),
    dd.Score(
        name="faithfulness",
        description="Does the candidate avoid inventing unsupported implementation details beyond what the trace digest justifies?",
        options={
            4: "Faithful to the digest; no meaningful unsupported details are invented.",
            3: "Mostly faithful with minor speculative details.",
            2: "Noticeable invented details or overconfident extrapolation.",
            1: "Many unsupported implementation details are fabricated.",
            0: "Severely unfaithful to the digest.",
        },
    ),
    dd.Score(
        name="training_utility",
        description="Would this example be worth keeping in an SFT dataset for a coding assistant?",
        options={
            4: "Very strong SFT example worth keeping.",
            3: "Reasonably useful SFT example.",
            2: "Marginal example; probably not worth the tokens.",
            1: "Poor SFT example.",
            0: "Should not be kept.",
        },
    ),
]

# Rubric names, in the same order as SFT_JUDGE_SCORES. Each one is flattened into a
# `<name>_score` integer column. Keep this tuple in sync with SFT_JUDGE_SCORES.
SFT_JUDGE_SCORE_NAMES: tuple[str, ...] = (
    "groundedness",
    "standalone_task",
    "response_quality",
    "faithfulness",
    "training_utility",
)

# Every rubric must reach this score (the top of the 0-4 scale used above) for a row to be
# flagged `recommended_for_sft`.
RECOMMENDED_MIN_SCORE = 4

MODEL_NAME = "nvidia/nemotron-3-super-120b-a12b"

# Shared between `build_seed_source` (programmatic use) and `main` (CLI usage error).
_ATIF_TRACE_DIR_REQUIRED_MESSAGE = "--trace-dir is required when --format atif."


def _score_column_expr(score_name: str) -> str:
    """Return the template expression that extracts one judge score, using 0 when it is none."""
    score_ref = f"sft_quality_judge_result.{score_name}.score"
    return "{{ " + f"{score_ref} if {score_ref} is not none else 0" + " }}"


def _recommended_for_sft_expr() -> str:
    """Return the template expression that is true only for top-scoring, high-value rows."""
    clauses = [f"{score_name}_score >= {RECOMMENDED_MIN_SCORE}" for score_name in SFT_JUDGE_SCORE_NAMES]
    clauses.append("trace_training_value == 'high'")
    return "{{ " + " and ".join(clauses) + " }}"


def build_config(
    trace_dir: Path | None,
    rollout_format: dd.AgentRolloutFormat,
    model_alias: str,
    *,
    sampling_strategy: dd.SamplingStrategy,
    selection_strategy: dd.PartitionBlock | None,
) -> dd.DataDesignerConfigBuilder:
    """Assemble the full distillation pipeline configuration.

    The builder registers one model config, attaches the rollout seed dataset, and then adds,
    in order: the `trace_digest` and `sft_record` structured columns, the
    `sft_quality_judge_result` judge column, and the flattened expression columns
    (`sft_instruction`, `sft_response`, `sft_skill_tags`, one `<rubric>_score` column per
    judge rubric, `trace_training_value`, and `recommended_for_sft`).

    Args:
        trace_dir: Directory with rollout traces, or None to let the seed source pick its own location.
        rollout_format: Which built-in rollout format to read.
        model_alias: Alias under which `MODEL_NAME` is registered and referenced by the LLM columns.
        sampling_strategy: Passed through to `with_seed_dataset`.
        selection_strategy: Optional partition selection passed through to `with_seed_dataset`.

    Returns:
        The populated config builder.

    Raises:
        ValueError: If `rollout_format` is ATIF and `trace_dir` is None (raised by `build_seed_source`).
    """
    config_builder = dd.DataDesignerConfigBuilder()
    config_builder.add_model_config(
        dd.ModelConfig(
            alias=model_alias,
            model=MODEL_NAME,
            provider="nvidia",
        )
    )
    seed_source = build_seed_source(trace_dir=trace_dir, rollout_format=rollout_format)
    config_builder.with_seed_dataset(
        seed_source, sampling_strategy=sampling_strategy, selection_strategy=selection_strategy
    )

    # LLM steps: digest the trace, derive an SFT example from the digest, then judge it.
    config_builder.add_column(
        dd.LLMStructuredColumnConfig(
            name="trace_digest",
            model_alias=model_alias,
            output_format=AgentRolloutTraceDigest,
            system_prompt=TRACE_DIGEST_SYSTEM_PROMPT,
            prompt=TRACE_DIGEST_PROMPT,
        )
    )
    config_builder.add_column(
        dd.LLMStructuredColumnConfig(
            name="sft_record",
            model_alias=model_alias,
            output_format=AgentRolloutFinetuningRecord,
            system_prompt=SFT_RECORD_SYSTEM_PROMPT,
            prompt=SFT_RECORD_PROMPT,
        )
    )
    config_builder.add_column(
        dd.LLMJudgeColumnConfig(
            name="sft_quality_judge_result",
            model_alias=model_alias,
            system_prompt=SFT_JUDGE_SYSTEM_PROMPT,
            prompt=SFT_JUDGE_PROMPT,
            scores=SFT_JUDGE_SCORES,
        )
    )

    # Flatten the nested structured outputs into plain top-level columns.
    for column_name, record_field in (
        ("sft_instruction", "instruction"),
        ("sft_response", "response"),
        ("sft_skill_tags", "skill_tags"),
    ):
        config_builder.add_column(
            dd.ExpressionColumnConfig(
                name=column_name,
                expr="{{ sft_record." + record_field + " }}",
            )
        )

    # One integer column per judge rubric.
    for score_name in SFT_JUDGE_SCORE_NAMES:
        config_builder.add_column(
            dd.ExpressionColumnConfig(
                name=f"{score_name}_score",
                expr=_score_column_expr(score_name),
                dtype="int",
            )
        )

    config_builder.add_column(
        dd.ExpressionColumnConfig(
            name="trace_training_value",
            expr="{{ trace_digest.training_value }}",
        )
    )
    # Must come after the score columns and `trace_training_value`, which it references.
    config_builder.add_column(
        dd.ExpressionColumnConfig(
            name="recommended_for_sft",
            expr=_recommended_for_sft_expr(),
            dtype="bool",
        )
    )

    return config_builder


def run_recipe(
    config_builder: dd.DataDesignerConfigBuilder,
    *,
    num_records: int,
    artifact_path: Path | str | None = None,
    dataset_name: str = "agent_rollout_trace_workflows",
    preview: bool = False,
) -> DatasetCreationResults | PreviewResults:
    """Run the pipeline, either as a preview or as a full dataset creation.

    Args:
        config_builder: Pipeline configuration, typically from `build_config`.
        num_records: Number of records requested from the run.
        artifact_path: Passed to `DataDesigner(artifact_path=...)`.
        dataset_name: Dataset name used for a full creation run; ignored in preview mode.
        preview: When true, call `DataDesigner.preview` instead of `DataDesigner.create`.

    Returns:
        The result of `preview(...)` when `preview` is true, otherwise the result of `create(...)`.
    """
    data_designer = DataDesigner(artifact_path=artifact_path)
    if preview:
        return data_designer.preview(config_builder, num_records=num_records)
    return data_designer.create(config_builder, num_records=num_records, dataset_name=dataset_name)


def build_arg_parser() -> ArgumentParser:
    """Build the command-line parser for this recipe."""
    parser = ArgumentParser()
    parser.add_argument(
        "--format",
        type=str,
        required=True,
        choices=[rollout_format.value for rollout_format in dd.AgentRolloutFormat],
        help="Built-in rollout format to read.",
    )
    parser.add_argument(
        "--trace-dir",
        type=Path,
        default=None,
        help=(
            "Optional directory containing rollout trace files. `atif` expects standalone JSON trajectory files "
            "and requires `--trace-dir`. When omitted, `claude_code` defaults to ~/.claude/projects, "
            "`codex` defaults to ~/.codex/sessions, and `hermes_agent` defaults to ~/.hermes/sessions."
        ),
    )
    parser.add_argument("--model-alias", type=str, default="nvidia-super")
    parser.add_argument("--num-records", type=int, default=5)
    parser.add_argument("--artifact-path", type=str, default=None)
    parser.add_argument("--dataset-name", type=str, default="agent_rollout_trace_workflows")
    parser.add_argument(
        "--preview",
        action="store_true",
        help="Run the recipe in preview mode and keep the generated dataset in memory.",
    )
    parser.add_argument(
        "--shuffle",
        action="store_true",
        help="Shuffle the normalized trace rows before sampling.",
    )
    parser.add_argument(
        "--partition-index",
        type=int,
        default=None,
        help="Optional partition index for large trace corpora.",
    )
    parser.add_argument(
        "--num-partitions",
        type=int,
        default=None,
        help="Optional total number of partitions for large trace corpora.",
    )
    return parser


def resolve_selection_strategy(
    partition_index: int | None,
    num_partitions: int | None,
) -> dd.PartitionBlock | None:
    """Turn the two partition CLI values into a `PartitionBlock`, or None when neither is set.

    Raises:
        ValueError: If exactly one of the two values is provided.
    """
    if partition_index is None and num_partitions is None:
        return None
    if partition_index is None or num_partitions is None:
        raise ValueError("--partition-index and --num-partitions must be provided together.")
    return dd.PartitionBlock(index=partition_index, num_partitions=num_partitions)


def build_seed_source(
    trace_dir: Path | None,
    rollout_format: dd.AgentRolloutFormat,
) -> dd.AgentRolloutSeedSource:
    """Create the rollout seed source for the given format.

    `path` is only passed to `AgentRolloutSeedSource` when `trace_dir` is given, so that the
    seed source can otherwise apply its own handling for a missing path.

    Raises:
        ValueError: If `rollout_format` is ATIF and `trace_dir` is None.
    """
    if rollout_format == dd.AgentRolloutFormat.ATIF and trace_dir is None:
        raise ValueError(_ATIF_TRACE_DIR_REQUIRED_MESSAGE)
    seed_source_kwargs: dict[str, str | dd.AgentRolloutFormat] = {"format": rollout_format}
    if trace_dir is not None:
        seed_source_kwargs["path"] = str(trace_dir)
    return dd.AgentRolloutSeedSource(**seed_source_kwargs)


def main() -> None:
    """Parse CLI arguments, run the recipe, and print where the results ended up.

    Invalid flag combinations are reported through `parser.error`, which prints the usage
    line plus the message and exits with status 2, instead of surfacing as a traceback.
    """
    parser = build_arg_parser()
    args = parser.parse_args()

    rollout_format = dd.AgentRolloutFormat(args.format)
    trace_dir = args.trace_dir.expanduser().resolve() if args.trace_dir is not None else None
    sampling_strategy = dd.SamplingStrategy.SHUFFLE if args.shuffle else dd.SamplingStrategy.ORDERED

    # Validate flag combinations up front so mistakes are reported as usage errors.
    if args.num_records < 1:
        parser.error("--num-records must be a positive integer.")
    if rollout_format == dd.AgentRolloutFormat.ATIF and trace_dir is None:
        parser.error(_ATIF_TRACE_DIR_REQUIRED_MESSAGE)
    try:
        selection_strategy = resolve_selection_strategy(args.partition_index, args.num_partitions)
    except ValueError as exc:
        parser.error(str(exc))

    config_builder = build_config(
        trace_dir=trace_dir,
        rollout_format=rollout_format,
        model_alias=args.model_alias,
        sampling_strategy=sampling_strategy,
        selection_strategy=selection_strategy,
    )
    results = run_recipe(
        config_builder,
        num_records=args.num_records,
        artifact_path=args.artifact_path,
        dataset_name=args.dataset_name,
        preview=args.preview,
    )

    if args.preview:
        print(f"Preview generated {len(results.dataset)} rows in memory.")
    else:
        print(f"Dataset saved to: {results.artifact_storage.final_dataset_path}")
    results.display_sample_record()


if __name__ == "__main__":
    main()