TAO Python SDK and CLI#

TAO Toolkit provides two integrated tools for interacting with the TAO API v2:

Python SDK - The nvidia-tao-client package for programmatic access
CLI - The tao command-line interface for terminal-based workflows

Both tools use environment variables for authentication (set automatically by the login method) and provide unified access to the TAO API v2’s job-centric architecture.

Note

Datasets provided in these examples are subject to the following license: Dataset License.

Installation#

Setting up Your Python Environment#

We recommend setting up a Python environment using Miniconda. The following instructions show how to set up a Python Conda environment.

Follow the instructions to set up a Conda environment using Miniconda.
After you have installed miniconda, create a new environment and set the Python version to 3.12.
conda create -n tao python=3.12
Activate the conda environment that you have just created.
conda activate tao
Verify that the command prompt shows the name of your Conda environment.
(tao) desktop:

When you are done with your session, you can deactivate your conda environment using the deactivate command:

conda deactivate

You can reactivate this Conda environment at any time using the following command:

conda activate tao

Install the TAO SDK and CLI#

After you set up and activate the Python environment, install the TAO client using the following command:

pip install nvidia-tao-client

This installs both the Python SDK and the CLI tool.

Python SDK Usage#

The TAO Python SDK provides programmatic access to TAO Toolkit using the TAO API v2. The SDK uses environment variables for authentication (set by the login method) and offers a unified, job-centric interface for all TAO operations.

Quick Start with SDK#

Import and Initialize#

from tao_sdk.client import TaoClient

# Initialize client (loads credentials from environment variables set by login)
client = TaoClient()

# Or login directly to obtain credentials
client = TaoClient()
client.login(
    ngc_key="your_ngc_key",
    ngc_org_name="your_org"
)

Authentication#

Option A: Login programmatically

# Login and save credentials to environment variables (Python process only)
credentials = client.login(
    ngc_key="your_ngc_key",
    ngc_org_name="your_org"
)

# Login with custom TAO base URL
credentials = client.login(
    ngc_key="your_ngc_key",
    ngc_org_name="your_org",
    tao_base_url="https://custom.tao.endpoint.com/api/v2"
)

print(f"Logged in as: {client.org_name}")

Option B: Logout

# Clear credentials from current Python process
result = client.logout()
print(result["message"])

SDK Core Operations#

Workspace Management#

# List workspaces
workspaces = client.list_workspaces()
print(f"Found {len(workspaces)} workspaces")

# Create workspace
workspace_config = {
    "bucket_name": "my-tao-bucket",
    "access_key": "AKIAIOSFODNN7EXAMPLE",
    "secret_key": "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY",
    "region": "us-west-2"
}

workspace = client.create_workspace(
    name="my_workspace",
    cloud_type="aws",
    cloud_specific_details=workspace_config
)
workspace_id = workspace["id"]

# Get workspace metadata
metadata = client.get_workspace_metadata(workspace_id)

# Delete workspace
client.delete_workspace(workspace_id)

Dataset Operations#

# List datasets
datasets = client.list_datasets()

# Create dataset
dataset = client.create_dataset(
    dataset_type="object_detection",
    dataset_format="coco",
    workspace=workspace_id,
    cloud_file_path="/path/to/dataset"
)
dataset_id = dataset["id"]

# Get dataset metadata
dataset_metadata = client.get_dataset_metadata(dataset_id)

# Delete dataset
client.delete_dataset(dataset_id)

Job Management (Unified v2 API)#

# Create experiment job
job = client.create_job(
    kind="experiment",
    name="image_classification_job",
    network_arch="classification_pyt",
    encryption_key="my_encryption_key",
    workspace=workspace_id,
    action="train",
    specs={
        "epochs": 100,
        "batch_size": 32,
        "learning_rate": 0.001,
        "model": {
            "backbone": "resnet18"
        }
    },
    train_dataset_uris=["<scheme>://my-bucket/train-data"],
    eval_dataset_uri="<scheme>://my-bucket/eval-data",
    automl_settings={
        "automl_enabled": True,
        "automl_algorithm": "bayesian"
    }
)

job_id = job["id"]

# Get job status
status = client.get_job_status(job_id)

# Job control operations
client.pause_job(job_id)
client.resume_job(job_id, parent_job_id="", specs={})
client.cancel_job(job_id)

# Delete completed job
client.delete_job(job_id)

Job Files and Artifacts#

# List job files
files = client.list_job_files(
    job_id=job_id,
    retrieve_logs=True,
    retrieve_specs=True
)

# Download selective files
client.download_job_files(
    job_id=job_id,
    workdir="./downloads",
    best_model=True,
    latest_model=False
)

# Download entire job
client.download_entire_job(
    job_id=job_id,
    workdir="./downloads"
)

# Get job logs
logs = client.get_job_logs(job_id)

Inference Microservices#

# Start inference microservice
microservice = client.start_inference_microservice(
    docker_image="nvcr.io/nvidia/tao/tao-toolkit:5.0.0-tf2.11.0",
    gpu_type="a100",
    num_gpus=1,
    parent_job_id="job_12345",
    kind="experiment",
    model_path="/workspace/models/best_model.pth",
    workspace="workspace_789",
    checkpoint_choose_method="best_model",
    network_arch="classification_pyt"
)

microservice_id = microservice["id"]

# Make inference request
result = client.inference_request(
    microservice_job_id=microservice_id,
    input=["data:image/jpeg;base64,/9j/4AAQSkZJRgABAQEA..."],
    model="my_classification_model"
)

# Stop microservice
client.stop_inference_microservice(microservice_id)

Complete SDK Workflow Example#

from tao_sdk.client import TaoClient
import time

def complete_classification_workflow():
    """Run an image classification training job from login to model download.

    Returns the training job ID once the job reports "Done" and the best
    model has been downloaded, or None if the job reports "Error".
    """

    # 1. Log in only when the client reports it is not authenticated
    client = TaoClient()
    already_logged_in = client.is_authenticated()
    if not already_logged_in:
        client.login(ngc_key="your_ngc_key", ngc_org_name="your_org")

    # 2. Use the first workspace returned by the API
    workspace_id = client.list_workspaces()[0]["id"]

    # 3. Describe the training run, then submit it
    train_specs = {
        "epochs": 100,
        "batch_size": 32,
        "learning_rate": 0.001,
        "model": {"backbone": "resnet18"},
    }
    job = client.create_job(
        kind="experiment",
        name="production_classifier",
        network_arch="classification_pyt",
        encryption_key="prod_key",
        workspace=workspace_id,
        action="train",
        specs=train_specs,
        train_dataset_uris=["<scheme>://my-bucket/train-data"],
        eval_dataset_uri="<scheme>://my-bucket/eval-data",
    )

    job_id = job["id"]
    print(f"Created training job: {job_id}")

    # 4. Poll once a minute until the job reports "Done" or "Error"
    while True:
        status = client.get_job_status(job_id)["status"]
        if status == "Done":
            print("Training completed!")
            break
        if status == "Error":
            print("Training failed!")
            return None
        print(f"Training status: {status}")
        time.sleep(60)

    # 5. Fetch the best checkpoint produced by the job
    client.download_job_files(
        job_id=job_id,
        workdir="./production_model",
        best_model=True,
    )

    return job_id

# Run the workflow
if __name__ == "__main__":
    result = complete_classification_workflow()
    print(f"Workflow completed: {result}")

Command-Line Interface (CLI)#

The TAO CLI provides command-line access to all TAO Toolkit functionality using the TAO API v2. The CLI uses environment variables for authentication and is organized around network architectures.

CLI Authentication#

The TAO CLI uses environment variables for authentication, which are set automatically by the login command.

Login Command

# Interactive login (prompts for credentials and TAO base URL if not set)
tao login --ngc-key YOUR_NGC_KEY --ngc-org-name YOUR_ORG

# Login with custom base URL
tao login --ngc-key YOUR_NGC_KEY --ngc-org-name YOUR_ORG --tao-base-url https://custom.tao.endpoint.com/api/v2

Check Authentication Status#

# Check if you're authenticated and which org you're using
tao whoami

# Clear authentication (logout)
tao logout

CLI Architecture#

The TAO CLI is organized around network architectures. Each network provides a consistent set of commands:

tao <network_name> <command> [options]

Supported Networks

The CLI supports 39+ network architectures including:

Classification: classification_pyt, nvdinov2
Object Detection: rtdetr, deformable_detr, grounding_dino
Segmentation: mask2former, segformer, oneformer
Pose Estimation: centerpose, pose_classification
Action Recognition: action_recognition
OCR: ocdnet, ocrnet
Autonomous Driving: bevfusion, pointpillars, sparse4d
3D Reconstruction: nvpanoptix3d
Embedding Model: clip
VLM Fine-Tuning: cosmos_rl
Data Services: annotations, auto_label, augmentation, analytics

CLI Command Organization#

Within each network architecture, commands are logically organized into groups:

JOB-RELATED Commands

create-job - Create experiment or dataset jobs
list-jobs - List jobs with filtering options
delete-job - Delete jobs (with confirmation)
get-job-status - Get job execution status
get-job-metadata - Get job metadata
get-job-schema - Get job specifications schema
get-job-logs - Download job logs and files
list-base-experiments - List available base experiments

DATASET-RELATED Commands

dataset-create - Create datasets
list-datasets - List available datasets
dataset-delete - Delete datasets
get-dataset-metadata - Get dataset metadata

WORKSPACE-RELATED Commands

workspace-create - Create workspaces
list-workspaces - List available workspaces
delete-workspace - Delete workspaces
get-workspace-metadata - Get workspace metadata

INFERENCE MICROSERVICE Commands

start-inference-microservice - Start inference microservice
inference-request - Make inference requests to running microservice
get-inference-microservice-status - Get microservice status
stop-inference-microservice - Stop running microservice

Output Formats#

Commands that produce output support an --output option to control format and destination.

Supported Values

text - Human-readable output (default for most commands). For list-style data, a console table is shown.
json - Output as JSON (for piping to jq or scripting).
yaml - Output as YAML.

Writing to a File

Use @path/to/file with an extension to choose format and destination:

--output @results.json - JSON written to results.json
--output @results.yaml or @results.yml - YAML written to that file
Other extensions or no extension - text written to the file

Defaults

Most commands (list-workspaces, list-jobs, get-job-metadata, get-dataset-metadata, get-gpu-types, and others): default is text.
get-job-schema, get-automl-defaults, get-automl-param-details: default is yaml so specifications and AutoML defaults are easy to edit and save to file.

# Default text (table for list commands)
tao classification_pyt list-workspaces
tao classification_pyt list-jobs

# JSON for scripting / jq
tao get-gpu-types --output json | jq .
tao classification_pyt list-workspaces --output json
tao classification_pyt list-datasets --output json | jq -r '.[] | "\(.id)\t\(.type)\t\(.format)\t\(.name)"'
tao classification_pyt get-job-metadata --job-id "$JOB_ID" --output json | jq -r '.status'

# Write schema to file (YAML default for get-job-schema), then use with create-job
tao classification_pyt get-job-schema --action train --base-experiment-id "$SELECTED_PTM_ID" --output @train_spec.yaml

Input Formats#

Specifications (--specs)

Used by create-job and resume-job. Accepts:

Inline JSON string: --specs '{"epochs": 50}'
File with @ prefix: --specs @path/to/specs.yaml or --specs @path/to/specs.json
Path to an existing file: If the value is a path to an existing .json, .yaml, or .yml file (without @), the CLI loads from that file.

Recommended workflow: Get the schema with get-job-schema --output @path/job_specs.yaml, edit the YAML if needed, then pass it with create-job ... --specs @path/job_specs.yaml.

AutoML Settings (--automl-settings)

Used by create-job (optional). Accepts:

Multiple key=value pairs: --automl-settings key1=value1 --automl-settings key2=value2
Single file with @ prefix: --automl-settings @path/to/automl_defaults.json or @path/to/automl_defaults.yaml

Recommended workflow: Run get-automl-defaults --base-experiment-id ID --action train --output @automl_defaults.json, then use create-job ... --automl-settings @automl_defaults.json.

Filter Parameters (--filter-param)

Used by list-datasets and list-base-experiments. Accepts key=value pairs; repeat the option once for each pair:

tao classification_pyt list-base-experiments --filter-param network_arch=classification_pyt --output json

Common CLI Commands#

Workspace Management#

# List workspaces (default text/table output)
tao classification_pyt list-workspaces

# List workspaces with JSON output for scripting
tao classification_pyt list-workspaces --output json

# Create workspace
tao classification_pyt workspace-create \
  --name "my_workspace" \
  --cloud_type aws \
  --cloud_details '{"bucket_name": "my-bucket", "access_key": "key", "secret_key": "secret", "region": "us-west-2"}'

# Get workspace metadata
tao classification_pyt get-workspace-metadata --workspace-id "workspace_id"

# Delete workspace
tao classification_pyt delete-workspace --workspace-id "workspace_id" --confirm

Dataset Operations#

# List datasets
tao classification_pyt list-datasets

# Create dataset
tao classification_pyt dataset-create \
  --dataset-type object_detection \
  --dataset-format coco \
  --workspace workspace_id \
  --cloud-file-path "/path/to/dataset"

# Get dataset metadata
tao classification_pyt get-dataset-metadata --dataset-id "dataset_id"

# Delete dataset
tao classification_pyt dataset-delete --id "dataset_id"

Job Management (Unified v2 API)#

# List all jobs
tao classification_pyt list-jobs

# Create experiment job (get schema first, then create job)
tao classification_pyt get-job-schema --action train --output @train_spec.yaml
# Edit train_spec.yaml as needed

tao classification_pyt create-job \
  --kind experiment \
  --name "my_experiment" \
  --encryption-key "my_key" \
  --workspace "workspace_id" \
  --action train \
  --specs @train_spec.yaml \
  --train-dataset-uri "<scheme>://my-bucket/train-data" \
  --eval-dataset-uri "<scheme>://my-bucket/eval-data" \
  --automl-settings automl_enabled=true \
  --automl-settings automl_algorithm=bayesian

# Get job status
tao classification_pyt get-job-status --job-id "job_id"

# Job control operations
tao classification_pyt job-pause --job-id "job_id"
tao classification_pyt resume-job --job-id "job_id" --specs @resume_spec.yaml
tao classification_pyt job-cancel --job-id "job_id"
tao classification_pyt delete-job --job-id "job_id" --confirm

Job Files and Logs#

# Get job logs
tao classification_pyt get-job-logs --job-id "job_id"

# Download job files
tao classification_pyt download-job-files \
  --job-id "job_id" \
  --workdir "./downloads" \
  --best-model true \
  --latest-model false

# Download entire job
tao classification_pyt download-entire-job \
  --job-id "job_id" \
  --workdir "./downloads"

CLI Workflow Examples#

Classification Workflow#

# Complete image classification workflow with v2 API
export WORKSPACE_ID="workspace_123"
export DATASET_URI="<scheme>://my-bucket/classification-data"  # aws://, azure://, lustre://, file://, seaweedfs://

# 1. Get base experiment (PTM) and training schema
tao classification_pyt list-base-experiments --filter-param network_arch=classification_pyt --output json | jq .
export SELECTED_PTM_ID="ptm_id_from_list"

# Get training schema and save to file for editing
tao classification_pyt get-job-schema --action train --base-experiment-id "$SELECTED_PTM_ID" --output @train_spec.yaml
# Edit train_spec.yaml as needed. For example, update "epochs": 100 and "learning_rate": 0.001
# Then copy to train_spec_filled.yaml

# 2. Create classification experiment job using specs from file
JOB_ID=$(tao classification_pyt create-job \
  --kind experiment \
  --name "image_classification_v1" \
  --encryption-key "my_encryption_key" \
  --workspace $WORKSPACE_ID \
  --action train \
  --base-experiment-id "$SELECTED_PTM_ID" \
  --specs @train_spec_filled.yaml \
  --train-dataset-uri "$DATASET_URI" \
  --eval-dataset-uri "$DATASET_URI" \
  --automl-settings automl_enabled=true \
  --automl-settings automl_algorithm=bayesian \
  --output json | jq -r '.id')

# 3. Monitor training progress
tao classification_pyt get-job-status --job-id $JOB_ID
tao classification_pyt get-job-metadata --job-id $JOB_ID --output json | jq -r '.status'
tao classification_pyt get-job-logs --job-id $JOB_ID

# 4. Run evaluation after training completes
tao classification_pyt get-job-schema --action evaluate --output @eval_spec.yaml
# Edit eval_spec.yaml as needed
EVAL_JOB_ID=$(tao classification_pyt create-job \
  --kind experiment \
  --name "evaluation_job" \
  --encryption-key "my_encryption_key" \
  --workspace $WORKSPACE_ID \
  --action evaluate \
  --parent-job-id $JOB_ID \
  --eval-dataset-uri "$DATASET_URI" \
  --specs @eval_spec.yaml \
  --output json | jq -r '.id')

# 5. Export model for deployment
tao classification_pyt get-job-schema --action export --output @export_spec.yaml
# Edit export_spec.yaml as needed
EXPORT_JOB_ID=$(tao classification_pyt create-job \
  --kind experiment \
  --name "export_job" \
  --encryption-key "my_encryption_key" \
  --workspace $WORKSPACE_ID \
  --action export \
  --parent-job-id $JOB_ID \
  --train-dataset-uri "$DATASET_URI" \
  --specs @export_spec.yaml \
  --output json | jq -r '.id')

Object Detection Workflow#

# Complete object detection workflow with v2 API
export WORKSPACE_ID="workspace_789"
export DATASET_URI="<scheme>://my-bucket/coco-dataset"  # aws://, azure://, lustre://, file://, seaweedfs://

# 1. Get base experiment and training schema
tao rtdetr list-base-experiments --filter-param network_arch=rtdetr --output json | jq .
export SELECTED_PTM_ID="ptm_id_from_list"

# Get training schema and save to file
tao rtdetr get-job-schema --action train --base-experiment-id "$SELECTED_PTM_ID" --output @train_spec.yaml
# Edit train_spec.yaml as needed. For example, update "epochs": 100 and "learning_rate": 0.001

# 2. Create object detection experiment using specs from file
JOB_ID=$(tao rtdetr create-job \
  --kind experiment \
  --name "rtdetr_detection_v1" \
  --encryption-key "my_encryption_key" \
  --workspace $WORKSPACE_ID \
  --action train \
  --base-experiment-id "$SELECTED_PTM_ID" \
  --train-dataset-uri "$DATASET_URI" \
  --eval-dataset-uri "$DATASET_URI" \
  --specs @train_spec.yaml \
  --output json | jq -r '.id')

# 3. Monitor and manage training
tao rtdetr get-job-status --job-id $JOB_ID
tao rtdetr get-job-metadata --job-id $JOB_ID --output json | jq -r '.status'

# 4. Download trained models
tao rtdetr download-job-files \
  --job-id $JOB_ID \
  --workdir "./models" \
  --best-model true \
  --latest-model true

Inference Microservices#

# Complete inference microservice workflow
export JOB_ID="job_12345"  # ID of trained model job
export WORKSPACE_ID="workspace_789"

# 1. Start inference microservice
MICROSERVICE_ID=$(tao classification_pyt start-inference-microservice \
  --docker-image "nvcr.io/nvidia/tao/tao-toolkit:5.0.0-tf2.11.0" \
  --gpu-type "a100" \
  --num-gpus 1 \
  --parent-job-id "$JOB_ID" \
  --kind "experiment" \
  --model-path "/workspace/models/best_model.pth" \
  --workspace "$WORKSPACE_ID" \
  --checkpoint-choose-method "best_model" \
  --network-arch "classification_pyt" \
  --output json | jq -r '.id')

# 2. Wait for microservice to be ready
while true; do
  STATUS=$(tao classification_pyt get-inference-microservice-status \
    --microservice-job-id "$MICROSERVICE_ID" --output json | jq -r '.status')
  if [ "$STATUS" = "Running" ]; then
    echo "Microservice is ready for inference"
    break
  fi
  echo "Waiting for microservice... Status: $STATUS"
  sleep 10
done

# 3. Make inference request
tao classification_pyt inference-request \
  --microservice-job-id "$MICROSERVICE_ID" \
  --input '["data:image/jpeg;base64,/9j/4AAQSkZJRgABAQEA..."]' \
  --model "my_classification_model"

# 4. Stop microservice when done
tao classification_pyt stop-inference-microservice \
  --microservice-job-id "$MICROSERVICE_ID"

Environment Variables Reference#

The SDK and CLI use these environment variables for configuration:

TAO_BASE_URL - TAO API base URL (example: https://api.tao.ngc.nvidia.com/api/v2)
TAO_ORG - NGC organization name (example: my_org)
TAO_TOKEN - JWT authentication token (set by tao login or client.login())

Legacy Support

For backward compatibility, these legacy variables are also supported:

BASE_URL → TAO_BASE_URL
ORG → TAO_ORG
TOKEN → TAO_TOKEN

Migration from v1 to v2#

If you’re migrating from TAO SDK/CLI v1, note these key changes:

Command Changes

experiment-create → create-job --kind experiment
dataset-run-action → create-job --kind dataset --action <action>
get-action-status → get-job-status
get-spec → get-job-schema
get-metadata → get-dataset-metadata, get-workspace-metadata, get-job-metadata
job-resume → resume-job

Authentication Changes

File-based configuration (~/.tao/config) → Environment variables (TAO_*)
Automatic login on CLI usage → Explicit login with tao login

New Features in v2

Unified job management with create-job --kind parameter
Environment variable authentication
Job deletion capabilities
Resource-specific metadata commands
Improved job control (pause/resume/cancel/delete)

Troubleshooting#

Authentication Issues#

# Check if authenticated
tao whoami

# If not authenticated, login again
tao login --ngc-key YOUR_KEY --ngc-org-name YOUR_ORG

# Clear credentials if having issues
tao logout

Job Management#

# List all jobs to find stuck jobs
tao classification_pyt list-jobs

# Cancel problematic jobs
tao classification_pyt job-cancel --job-id "problematic_job_id"

# Delete jobs that are no longer needed
tao classification_pyt delete-job --job-id "old_job_id" --confirm

Getting Help#

# Get help for any command
tao --help
tao classification_pyt --help
tao classification_pyt create-job --help

Best Practices#

1. Authentication Management

# Always check authentication before operations
def safe_operation(client):
    """Return the workspace list, refusing to call the API when logged out.

    Raises TaoAuthenticationError if the client is not authenticated.
    """
    if client.is_authenticated():
        return client.list_workspaces()

    raise TaoAuthenticationError("Please authenticate first")

2. Resource Cleanup

# Clean up completed jobs regularly
def cleanup_old_jobs(client, keep_days=7):
    """Delete finished jobs that were created more than ``keep_days`` days ago.

    Args:
        client: Client object exposing ``list_jobs()`` and
            ``delete_job(job_id)``.
        keep_days: Finished jobs created within this many days are kept.

    Deletion is best-effort: a job that cannot be deleted, or whose
    ``created_date`` cannot be interpreted, is reported and skipped so the
    remaining jobs are still processed.
    """
    import datetime

    # The job-status API reports failures as "Error"; "Failed" is kept so
    # jobs matched by the previous filter are still cleaned up.
    finished_statuses = {"Done", "Error", "Failed"}

    # Keep a timezone-aware and a naive form of the same cutoff: Python
    # refuses to compare naive and aware datetimes, and the timestamp may
    # arrive in either form.
    cutoff_aware = (
        datetime.datetime.now().astimezone()
        - datetime.timedelta(days=keep_days)
    )
    cutoff_naive = cutoff_aware.replace(tzinfo=None)

    for job in client.list_jobs():
        if job["status"] not in finished_statuses:
            continue

        created = job["created_date"]
        if isinstance(created, str):
            # Presumably an ISO-8601 timestamp from the JSON response --
            # confirm against the API schema. A trailing "Z" is rewritten
            # because fromisoformat() only accepts it on Python 3.11+.
            iso_text = created[:-1] + "+00:00" if created.endswith("Z") else created
            try:
                created = datetime.datetime.fromisoformat(iso_text)
            except ValueError:
                created = None

        if not isinstance(created, datetime.datetime):
            # Never delete a job whose age cannot be determined.
            print(f"Skipping job {job['id']}: unrecognized created_date {job['created_date']!r}")
            continue

        cutoff = cutoff_naive if created.tzinfo is None else cutoff_aware
        if created < cutoff:
            try:
                client.delete_job(job["id"])
            except Exception as e:
                print(f"Failed to delete job {job['id']}: {e}")

3. Error Handling

from tao_sdk.exceptions import TaoAPIError, TaoAuthenticationError

try:
    client.require_authentication()
    workspaces = client.list_workspaces()

except TaoAuthenticationError as e:
    print(f"Authentication error: {e}")
    client.login(ngc_key="...", ngc_org_name="...")

except TaoAPIError as e:
    print(f"API error: {e.status_code} - {e.message}")

Resources#

API Documentation: REST API
API Reference: OpenAPI Specifications
Inference Microservices: Inference Microservices
AutoML: AutoML

For detailed technical documentation, consult the SDK source code and API v2 OpenAPI specifications.