TAO Python SDK and CLI#

TAO Toolkit provides two integrated tools for interacting with the TAO API v2:

  1. Python SDK - The nvidia-tao-client package for programmatic access

  2. CLI - The tao command-line interface for terminal-based workflows

Both tools use environment variables for authentication (set automatically by the SDK’s login method or the CLI’s login command) and provide unified access to the TAO API v2’s job-centric architecture.

Note

Datasets provided in these examples are subject to the following license: Dataset License.

Installation#

Setting up Your Python Environment#

We recommend setting up a Python environment using Miniconda. The following instructions show how to set up a conda environment.

  1. Follow the instructions to set up a Conda environment using Miniconda.

  2. After you have installed Miniconda, create a new environment and set the Python version to 3.12.

    conda create -n tao python=3.12
    
  3. Activate the conda environment that you have just created.

    conda activate tao
    
  4. Verify that the command prompt shows the name of your Conda environment.

    (tao) desktop:
    

When you are done with your session, you can deactivate your conda environment using the conda deactivate command:

conda deactivate

You may reactivate this conda environment later using the following command:

conda activate tao

Install the TAO SDK and CLI#

After you set up and activate the Python environment, install the TAO client using the following command:

pip install nvidia-tao-client

This installs both the Python SDK and the CLI tool.
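
To verify the installation from Python, you can query the installed distribution version with the standard library (a quick sanity check; the distribution name matches the pip command above):

from importlib.metadata import version

# Print the installed nvidia-tao-client version
print(version("nvidia-tao-client"))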

Python SDK Usage#

The TAO Python SDK provides programmatic access to TAO Toolkit using the TAO API v2. The SDK uses environment variables for authentication (set by the login method) and offers a unified, job-centric interface for all TAO operations.

Quick Start with SDK#

Import and Initialize#

from tao_sdk.client import TaoClient

# Initialize client (loads credentials from environment variables set by login)
client = TaoClient()

# Or login directly to obtain credentials
client = TaoClient()
client.login(
    ngc_key="your_ngc_key",
    ngc_org_name="your_org"
)

Authentication#

Login programmatically

# Login and save credentials to environment variables (Python process only)
credentials = client.login(
    ngc_key="your_ngc_key",
    ngc_org_name="your_org"
)

# Login with custom TAO base URL
credentials = client.login(
    ngc_key="your_ngc_key",
    ngc_org_name="your_org",
    tao_base_url="https://custom.tao.endpoint.com/api/v2"
)

print(f"Logged in as: {client.org_name}")

Logout

# Clear credentials from current Python process
result = client.logout()
print(result["message"])
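
At any point, you can check whether the client currently holds credentials by combining the is_authenticated() method and org_name attribute used elsewhere in this guide (a minimal sketch):

# Check whether credentials are loaded before making API calls
if client.is_authenticated():
    print(f"Authenticated as org: {client.org_name}")
else:
    print("Not authenticated; call client.login() first")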

SDK Core Operations#

Workspace Management#

# List workspaces
workspaces = client.list_workspaces()
print(f"Found {len(workspaces)} workspaces")

# Create workspace
workspace_config = {
    "bucket_name": "my-tao-bucket",
    "access_key": "AKIAIOSFODNN7EXAMPLE",
    "secret_key": "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY",
    "region": "us-west-2"
}

workspace = client.create_workspace(
    name="my_workspace",
    cloud_type="aws",
    cloud_specific_details=workspace_config
)
workspace_id = workspace["id"]

# Get workspace metadata
metadata = client.get_workspace_metadata(workspace_id)

# Delete workspace
client.delete_workspace(workspace_id)

Dataset Operations#

# List datasets
datasets = client.list_datasets()

# Create dataset
dataset = client.create_dataset(
    dataset_type="object_detection",
    dataset_format="coco",
    workspace=workspace_id,
    cloud_file_path="/path/to/dataset"
)
dataset_id = dataset["id"]

# Get dataset metadata
dataset_metadata = client.get_dataset_metadata(dataset_id)

# Delete dataset
client.delete_dataset(dataset_id)

Job Management (Unified v2 API)#

# Create experiment job
job = client.create_job(
    kind="experiment",
    name="image_classification_job",
    network_arch="classification_pyt",
    encryption_key="my_encryption_key",
    workspace=workspace_id,
    action="train",
    specs={
        "epochs": 100,
        "batch_size": 32,
        "learning_rate": 0.001,
        "model": {
            "backbone": "resnet18"
        }
    },
    train_datasets=[dataset_id],
    eval_dataset=dataset_id,
    automl_settings={
        "automl_enabled": True,
        "automl_algorithm": "bayesian"
    }
)

job_id = job["id"]

# Get job status
status = client.get_job_status(job_id)

# Job control operations
client.pause_job(job_id)
client.resume_job(job_id, parent_job_id="", specs={})
client.cancel_job(job_id)

# Delete completed job
client.delete_job(job_id)

Job Files and Artifacts#

# List job files
files = client.list_job_files(
    job_id=job_id,
    retrieve_logs=True,
    retrieve_specs=True
)

# Download selective files
client.download_job_files(
    job_id=job_id,
    workdir="./downloads",
    best_model=True,
    latest_model=False
)

# Download entire job
client.download_entire_job(
    job_id=job_id,
    workdir="./downloads"
)

# Get job logs
logs = client.get_job_logs(job_id)

Inference Microservices#

# Start inference microservice
microservice = client.start_inference_microservice(
    docker_image="nvcr.io/nvidia/tao/tao-toolkit:5.0.0-tf2.11.0",
    gpu_type="a100",
    num_gpus=1,
    parent_job_id="job_12345",
    kind="experiment",
    model_path="/workspace/models/best_model.pth",
    workspace="workspace_789",
    checkpoint_choose_method="best_model",
    network_arch="classification_pyt"
)

microservice_id = microservice["id"]

# Make inference request
result = client.inference_request(
    microservice_job_id=microservice_id,
    input=["data:image/jpeg;base64,/9j/4AAQSkZJRgABAQEA..."],
    model="my_classification_model"
)

# Stop microservice
client.stop_inference_microservice(microservice_id)
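
Before sending inference requests, you typically need to wait for the microservice to report that it is ready. The following is a minimal polling sketch; the get_inference_microservice_status method name is an assumption mirroring the get-inference-microservice-status CLI command, and the "Running" status value mirrors the CLI workflow later in this guide:

import time

# Poll until the microservice is ready to serve requests
# (status method name assumed from the corresponding CLI command)
while True:
    status = client.get_inference_microservice_status(microservice_id)["status"]
    if status == "Running":
        print("Microservice is ready for inference")
        break
    print(f"Waiting for microservice... status: {status}")
    time.sleep(10)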

Complete SDK Workflow Example#

from tao_sdk.client import TaoClient
import time

def complete_classification_workflow():
    """Complete image classification workflow example"""

    # 1. Initialize and authenticate
    client = TaoClient()
    if not client.is_authenticated():
        client.login(
            ngc_key="your_ngc_key",
            ngc_org_name="your_org"
        )

    # 2. Setup resources
    workspaces = client.list_workspaces()
    workspace_id = workspaces[0]["id"]

    datasets = client.list_datasets()
    dataset_id = datasets[0]["id"]

    # 3. Create training job
    job = client.create_job(
        kind="experiment",
        name="production_classifier",
        network_arch="classification_pyt",
        encryption_key="prod_key",
        workspace=workspace_id,
        action="train",
        specs={
            "epochs": 100,
            "batch_size": 32,
            "learning_rate": 0.001,
            "model": {
                "backbone": "resnet18"
            }
        },
        train_datasets=[dataset_id],
        eval_dataset=dataset_id
    )

    job_id = job["id"]
    print(f"Created training job: {job_id}")

    # 4. Monitor training
    while True:
        status = client.get_job_status(job_id)["status"]
        if status == "Done":
            print("Training completed!")
            break
        elif status == "Error":
            print("Training failed!")
            return None
        else:
            print(f"Training status: {status}")
            time.sleep(60)

    # 5. Download results
    client.download_job_files(
        job_id=job_id,
        workdir="./production_model",
        best_model=True
    )

    return job_id

# Run the workflow
if __name__ == "__main__":
    result = complete_classification_workflow()
    print(f"Workflow completed: {result}")

Command-Line Interface (CLI)#

The TAO CLI provides command-line access to all TAO Toolkit functionality using the TAO API v2. The CLI uses environment variables for authentication and is organized around network architectures.

CLI Authentication#

The TAO CLI uses environment variables for authentication, which are set automatically by the login command.

Login Command

# Interactive login (prompts for credentials and TAO base URL if not set)
tao login --ngc-key YOUR_NGC_KEY --ngc-org-name YOUR_ORG

# Login with custom base URL
tao login --ngc-key YOUR_NGC_KEY --ngc-org-name YOUR_ORG --tao-base-url https://custom.tao.endpoint.com/api/v2

Check Authentication Status#

# Check if you're authenticated and which org you're using
tao whoami

# Clear authentication (logout)
tao logout

CLI Architecture#

The TAO CLI is organized around network architectures. Each network provides a consistent set of commands:

tao <network_name> <command> [options]

Supported Networks

The CLI supports 36+ network architectures, including:

  • Classification: classification_pyt, nvdinov2

  • Object Detection: rtdetr, deformable_detr, grounding_dino

  • Segmentation: mask2former, segformer, oneformer

  • Pose Estimation: centerpose, pose_classification

  • Action Recognition: action_recognition

  • OCR: ocdnet, ocrnet

  • Autonomous Driving: bevfusion, pointpillars, sparse4d

  • Data Services: annotations, auto_label, augmentation, analytics

CLI Command Organization#

Within each network architecture, commands are logically organized into groups:

JOB-RELATED Commands
  • create-job - Create experiment or dataset jobs

  • list-jobs - List jobs with filtering options

  • delete-job - Delete jobs (with confirmation)

  • get-job-status - Get job execution status

  • get-job-metadata - Get job metadata

  • get-job-schema - Get job specifications schema

  • get-job-logs - Download job logs and files

  • list-base-experiments - List available base experiments

DATASET-RELATED Commands
  • dataset-create - Create datasets

  • list-datasets - List available datasets

  • dataset-delete - Delete datasets

  • get-dataset-metadata - Get dataset metadata

WORKSPACE-RELATED Commands
  • workspace-create - Create workspaces

  • list-workspaces - List available workspaces

  • delete-workspace - Delete workspaces

  • get-workspace-metadata - Get workspace metadata

INFERENCE MICROSERVICE Commands
  • start-inference-microservice - Start inference microservice

  • inference-request - Make inference requests to running microservice

  • get-inference-microservice-status - Get microservice status

  • stop-inference-microservice - Stop running microservice

Common CLI Commands#

Workspace Management#

# List workspaces
tao classification_pyt list-workspaces

# Create workspace
tao classification_pyt workspace-create \
  --name "my_workspace" \
  --cloud_type aws \
  --cloud_details '{"bucket_name": "my-bucket", "access_key": "key", "secret_key": "secret", "region": "us-west-2"}'

# Get workspace metadata
tao classification_pyt get-workspace-metadata --workspace-id "workspace_id"

# Delete workspace
tao classification_pyt delete-workspace --workspace-id "workspace_id" --confirm

Dataset Operations#

# List datasets
tao classification_pyt list-datasets

# Create dataset
tao classification_pyt dataset-create \
  --dataset-type object_detection \
  --dataset-format coco \
  --workspace workspace_id \
  --cloud-file-path "/path/to/dataset"

# Get dataset metadata
tao classification_pyt get-dataset-metadata --dataset-id "dataset_id"

# Delete dataset
tao classification_pyt dataset-delete --id "dataset_id"

Job Management (Unified v2 API)#

# List all jobs
tao classification_pyt list-jobs

# Create experiment job
tao classification_pyt create-job \
  --kind experiment \
  --name "my_experiment" \
  --encryption-key "my_key" \
  --workspace "workspace_id" \
  --action train \
  --specs '{"epochs": 100, "learning_rate": 0.001, "model": {"backbone": "resnet18"}}' \
  --train-datasets '["train-dataset-id"]' \
  --eval-dataset "eval-dataset-id" \
  --automl-settings '{"automl_enabled": true, "automl_algorithm": "bayesian"}'

# Get job status
tao classification_pyt get-job-status --job-id "job_id"

# Job control operations
tao classification_pyt job-pause --job-id "job_id"
tao classification_pyt job-resume --job-id "job_id" --parent_job_id "" --specs '{}'
tao classification_pyt job-cancel --job-id "job_id"
tao classification_pyt delete-job --job-id "job_id" --confirm

Job Files and Logs#

# Get job logs
tao classification_pyt get-job-logs --job-id "job_id"

# Download job files
tao classification_pyt download-job-files \
  --job-id "job_id" \
  --workdir "./downloads" \
  --best-model true \
  --latest-model false

# Download entire job
tao classification_pyt download-entire-job \
  --job-id "job_id" \
  --workdir "./downloads"

CLI Workflow Examples#

Classification Workflow#

# Complete image classification workflow with v2 API
export WORKSPACE_ID="workspace_123"
export DATASET_ID="dataset_456"

# 1. Create classification experiment job
JOB_ID=$(tao classification_pyt create-job \
  --kind experiment \
  --name "image_classification_v1" \
  --encryption-key "my_encryption_key" \
  --workspace $WORKSPACE_ID \
  --action train \
  --specs '{
    "epochs": 100,
    "batch_size": 32,
    "learning_rate": 0.001,
    "model": {
      "backbone": "resnet18"
    }
  }' \
  --train-datasets '["'$DATASET_ID'"]' \
  --eval-dataset "$DATASET_ID" \
  --automl-settings '{
    "automl_enabled": true,
    "automl_algorithm": "bayesian",
    "max_iterations": 10
  }' | jq -r '.id')

# 2. Monitor training progress
tao classification_pyt get-job-status --job-id $JOB_ID
tao classification_pyt get-job-logs --job-id $JOB_ID

# 3. Run evaluation after training completes
EVAL_JOB_ID=$(tao classification_pyt create-job \
  --kind experiment \
  --name "evaluation_job" \
  --encryption-key "my_encryption_key" \
  --workspace $WORKSPACE_ID \
  --action evaluate \
  --parent-job-id $JOB_ID \
  --eval-dataset "$DATASET_ID" \
  --specs '{
    "checkpoint_path": "/workspace/models/latest.pth",
    "batch_size": 64
  }' | jq -r '.id')

# 4. Export model for deployment
EXPORT_JOB_ID=$(tao classification_pyt create-job \
  --kind experiment \
  --name "export_job" \
  --encryption-key "my_encryption_key" \
  --workspace $WORKSPACE_ID \
  --action export \
  --parent-job-id $JOB_ID \
  --specs '{
    "output_format": "onnx",
    "batch_size": 1
  }' | jq -r '.id')

Object Detection Workflow#

# Complete object detection workflow with v2 API
export WORKSPACE_ID="workspace_789"
export DATASET_ID="coco_dataset_001"

# Create object detection experiment
JOB_ID=$(tao rtdetr create-job \
  --kind experiment \
  --name "rtdetr_detection_v1" \
  --encryption-key "my_encryption_key" \
  --workspace $WORKSPACE_ID \
  --action train \
  --train-datasets '["'$DATASET_ID'"]' \
  --eval-dataset "$DATASET_ID" \
  --specs '{
    "epochs": 150,
    "batch_size": 16,
    "learning_rate": 0.0001,
    "model": {
      "backbone": "resnet50"
    },
    "augmentation": {
      "horizontal_flip": true,
      "rotation": 15
    }
  }' | jq -r '.id')

# Monitor and manage training
tao rtdetr get-job-status --job-id $JOB_ID

# Download trained models
tao rtdetr download-job-files \
  --job-id $JOB_ID \
  --workdir "./models" \
  --best-model true \
  --latest-model true

Inference Microservices#

# Complete inference microservice workflow
export JOB_ID="job_12345"  # ID of trained model job
export WORKSPACE_ID="workspace_789"

# 1. Start inference microservice
MICROSERVICE_ID=$(tao classification_pyt start-inference-microservice \
  --docker-image "nvcr.io/nvidia/tao/tao-toolkit:5.0.0-tf2.11.0" \
  --gpu-type "a100" \
  --num-gpus 1 \
  --parent-job-id "$JOB_ID" \
  --kind "experiment" \
  --model-path "/workspace/models/best_model.pth" \
  --workspace "$WORKSPACE_ID" \
  --checkpoint-choose-method "best_model" \
  --network-arch "classification_pyt" | jq -r '.id')

# 2. Wait for microservice to be ready
while true; do
  STATUS=$(tao classification_pyt get-inference-microservice-status \
    --microservice-job-id "$MICROSERVICE_ID" | jq -r '.status')
  if [ "$STATUS" = "Running" ]; then
    echo "Microservice is ready for inference"
    break
  fi
  echo "Waiting for microservice... Status: $STATUS"
  sleep 10
done

# 3. Make inference request
tao classification_pyt inference-request \
  --microservice-job-id "$MICROSERVICE_ID" \
  --input '["data:image/jpeg;base64,/9j/4AAQSkZJRgABAQEA..."]' \
  --model "my_classification_model"

# 4. Stop microservice when done
tao classification_pyt stop-inference-microservice \
  --microservice-job-id "$MICROSERVICE_ID"

Environment Variables Reference#

The SDK and CLI use these environment variables for configuration:

  • TAO_BASE_URL - TAO API base URL (example: https://api.tao.ngc.nvidia.com/api/v2)

  • TAO_ORG - NGC organization name (example: my_org)

  • TAO_TOKEN - JWT authentication token (set by tao login or client.login())

Legacy Support

For backward compatibility, these legacy variables are also supported:

  • BASE_URL → TAO_BASE_URL

  • ORG → TAO_ORG

  • TOKEN → TAO_TOKEN
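
From Python, these variables can be read with the standard os module. A minimal sketch that prefers the TAO_* names and falls back to the legacy names listed above:

import os

# Prefer the TAO_* variables, falling back to the legacy names
base_url = os.environ.get("TAO_BASE_URL") or os.environ.get("BASE_URL")
org = os.environ.get("TAO_ORG") or os.environ.get("ORG")
token = os.environ.get("TAO_TOKEN") or os.environ.get("TOKEN")

print(f"TAO endpoint: {base_url}, org: {org}, token set: {token is not None}")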

Migration from v1 to v2#

If you’re migrating from TAO SDK/CLI v1, note these key changes:

Command Changes

  • experiment-create → create-job --kind experiment

  • dataset-run-action → create-job --kind dataset --action <action>

  • get-action-status → get-job-status

  • get-spec → get-job-schema

  • get-metadata → get-dataset-metadata, get-workspace-metadata, get-job-metadata

Authentication Changes

  • File-based configuration (~/.tao/config) → Environment variables (TAO_*)

  • Automatic login on CLI usage → Explicit login with tao login

New Features in v2

  • Unified job management with create-job --kind parameter

  • Environment variable authentication

  • Job deletion capabilities

  • Resource-specific metadata commands

  • Improved job control (pause/resume/cancel/delete)

Troubleshooting#

Authentication Issues#

# Check if authenticated
tao whoami

# If not authenticated, login again
tao login --ngc-key YOUR_KEY --ngc-org-name YOUR_ORG

# Clear credentials if having issues
tao logout

Job Management#

# List all jobs to find stuck jobs
tao classification_pyt list-jobs

# Cancel problematic jobs
tao classification_pyt job-cancel --job-id "problematic_job_id"

# Delete jobs that are no longer needed
tao classification_pyt delete-job --job-id "old_job_id" --confirm

Getting Help#

# Get help for any command
tao --help
tao classification_pyt --help
tao classification_pyt create-job --help

Best Practices#

1. Authentication Management

# Always check authentication before operations
from tao_sdk.exceptions import TaoAuthenticationError

def safe_operation(client):
    if not client.is_authenticated():
        raise TaoAuthenticationError("Please authenticate first")

    return client.list_workspaces()

2. Resource Cleanup

# Clean up completed jobs regularly
import datetime

def cleanup_old_jobs(client, keep_days=7):
    jobs = client.list_jobs()
    cutoff_date = datetime.datetime.now() - datetime.timedelta(days=keep_days)

    for job in jobs:
        # Assumes "created_date" is a datetime; parse it first if your
        # deployment returns it as an ISO-8601 string
        if job["status"] in ["Done", "Failed"] and job["created_date"] < cutoff_date:
            try:
                client.delete_job(job["id"])
            except Exception as e:
                print(f"Failed to delete job {job['id']}: {e}")

3. Error Handling

from tao_sdk.exceptions import TaoAPIError, TaoAuthenticationError

try:
    client.require_authentication()
    workspaces = client.list_workspaces()

except TaoAuthenticationError as e:
    print(f"Authentication error: {e}")
    client.login(ngc_key="...", ngc_org_name="...")

except TaoAPIError as e:
    print(f"API error: {e.status_code} - {e.message}")

Resources#

For detailed technical documentation, consult the SDK source code and API v2 OpenAPI specifications.