TAO Python SDK and CLI#
TAO Toolkit provides two integrated tools for interacting with the TAO API v2:
- Python SDK - The nvidia-tao-client package for programmatic access
- CLI - The tao command-line interface for terminal-based workflows
Both tools use environment variables for authentication (set automatically by the login method) and provide unified access to the TAO API v2’s job-centric architecture.
Note
Datasets provided in these examples are subject to the following license: Dataset License.
Installation#
Setting up Your Python Environment#
We recommend setting up a Python environment using miniconda. The following instructions
show how to set up a Python conda environment.
Follow the instructions to set up a Conda environment using Miniconda.
After you have installed Miniconda, create a new environment and set the Python version to 3.12.
conda create -n tao python=3.12
Activate the conda environment that you have just created.
conda activate tao
Verify that the command prompt shows the name of your Conda environment.
(tao) desktop:
When you are done with your session, you can deactivate your conda environment using the
deactivate command:
conda deactivate
You may re-instantiate this conda environment using the following command:
conda activate tao
Install the TAO SDK and CLI#
After you set up and activate the Python environment, install the TAO client using the following command:
pip install nvidia-tao-client
This installs both the Python SDK and the CLI tool.
Python SDK Usage#
The TAO Python SDK provides programmatic access to TAO Toolkit using the TAO API v2. The SDK uses environment variables for authentication (set by the login method) and offers a unified, job-centric interface for all TAO operations.
Quick Start with SDK#
Import and Initialize#
from tao_sdk.client import TaoClient
# Initialize client (loads credentials from environment variables set by login)
client = TaoClient()
# Or login directly to obtain credentials
client = TaoClient()
client.login(
ngc_key="your_ngc_key",
ngc_org_name="your_org"
)
Authentication#
Option A: Login programmatically
# Login and save credentials to environment variables (Python process only)
credentials = client.login(
ngc_key="your_ngc_key",
ngc_org_name="your_org"
)
# Login with custom TAO base URL
credentials = client.login(
ngc_key="your_ngc_key",
ngc_org_name="your_org",
tao_base_url="https://custom.tao.endpoint.com/api/v2"
)
print(f"Logged in as: {client.org_name}")
Option B: Logout
# Clear credentials from current Python process
result = client.logout()
print(result["message"])
SDK Core Operations#
Workspace Management#
# List workspaces
workspaces = client.list_workspaces()
print(f"Found {len(workspaces)} workspaces")
# Create workspace
workspace_config = {
"bucket_name": "my-tao-bucket",
"access_key": "AKIAIOSFODNN7EXAMPLE",
"secret_key": "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY",
"region": "us-west-2"
}
workspace = client.create_workspace(
name="my_workspace",
cloud_type="aws",
cloud_specific_details=workspace_config
)
workspace_id = workspace["id"]
# Get workspace metadata
metadata = client.get_workspace_metadata(workspace_id)
# Delete workspace
client.delete_workspace(workspace_id)
Dataset Operations#
# List datasets
datasets = client.list_datasets()
# Create dataset
dataset = client.create_dataset(
dataset_type="object_detection",
dataset_format="coco",
workspace=workspace_id,
cloud_file_path="/path/to/dataset"
)
dataset_id = dataset["id"]
# Get dataset metadata
dataset_metadata = client.get_dataset_metadata(dataset_id)
# Delete dataset
client.delete_dataset(dataset_id)
Job Management (Unified v2 API)#
# Create experiment job
job = client.create_job(
kind="experiment",
name="image_classification_job",
network_arch="classification_pyt",
encryption_key="my_encryption_key",
workspace=workspace_id,
action="train",
specs={
"epochs": 100,
"batch_size": 32,
"learning_rate": 0.001,
"model": {
"backbone": "resnet18"
}
},
train_dataset_uris=["<scheme>://my-bucket/train-data"],
eval_dataset_uri="<scheme>://my-bucket/eval-data",
automl_settings={
"automl_enabled": True,
"automl_algorithm": "bayesian"
}
)
job_id = job["id"]
# Get job status
status = client.get_job_status(job_id)
# Job control operations
client.pause_job(job_id)
client.resume_job(job_id, parent_job_id="", specs={})
client.cancel_job(job_id)
# Delete completed job
client.delete_job(job_id)
Job Files and Artifacts#
# List job files
files = client.list_job_files(
job_id=job_id,
retrieve_logs=True,
retrieve_specs=True
)
# Download selective files
client.download_job_files(
job_id=job_id,
workdir="./downloads",
best_model=True,
latest_model=False
)
# Download entire job
client.download_entire_job(
job_id=job_id,
workdir="./downloads"
)
# Get job logs
logs = client.get_job_logs(job_id)
Inference Microservices#
# Start inference microservice
microservice = client.start_inference_microservice(
docker_image="nvcr.io/nvidia/tao/tao-toolkit:5.0.0-tf2.11.0",
gpu_type="a100",
num_gpus=1,
parent_job_id="job_12345",
kind="experiment",
model_path="/workspace/models/best_model.pth",
workspace="workspace_789",
checkpoint_choose_method="best_model",
network_arch="classification_pyt"
)
microservice_id = microservice["id"]
# Make inference request
result = client.inference_request(
microservice_job_id=microservice_id,
input=["data:image/jpeg;base64,/9j/4AAQSkZJRgABAQEA..."],
model="my_classification_model"
)
# Stop microservice
client.stop_inference_microservice(microservice_id)
Complete SDK Workflow Example#
from tao_sdk.client import TaoClient
import time
def complete_classification_workflow():
"""Complete image classification workflow example"""
# 1. Initialize and authenticate
client = TaoClient()
if not client.is_authenticated():
client.login(
ngc_key="your_ngc_key",
ngc_org_name="your_org"
)
# 2. Setup resources
workspaces = client.list_workspaces()
workspace_id = workspaces[0]["id"]
# 3. Create training job
job = client.create_job(
kind="experiment",
name="production_classifier",
network_arch="classification_pyt",
encryption_key="prod_key",
workspace=workspace_id,
action="train",
specs={
"epochs": 100,
"batch_size": 32,
"learning_rate": 0.001,
"model": {
"backbone": "resnet18"
}
},
train_dataset_uris=["<scheme>://my-bucket/train-data"],
eval_dataset_uri="<scheme>://my-bucket/eval-data"
)
job_id = job["id"]
print(f"Created training job: {job_id}")
# 4. Monitor training
while True:
status = client.get_job_status(job_id)["status"]
if status == "Done":
print("Training completed!")
break
elif status == "Error":
print("Training failed!")
return None
else:
print(f"Training status: {status}")
time.sleep(60)
# 5. Download results
client.download_job_files(
job_id=job_id,
workdir="./production_model",
best_model=True
)
return job_id
# Run the workflow
if __name__ == "__main__":
result = complete_classification_workflow()
print(f"Workflow completed: {result}")
Command-Line Interface (CLI)#
The TAO CLI provides command-line access to all TAO Toolkit functionality using the TAO API v2. The CLI uses environment variables for authentication and is organized around network architectures.
CLI Authentication#
The TAO CLI uses environment variables for authentication, which are set automatically by the login command.
Login Command
# Interactive login (prompts for credentials and TAO base URL if not set)
tao login --ngc-key YOUR_NGC_KEY --ngc-org-name YOUR_ORG
# Login with custom base URL
tao login --ngc-key YOUR_NGC_KEY --ngc-org-name YOUR_ORG --tao-base-url https://custom.tao.endpoint.com/api/v2
Check Authentication Status#
# Check if you're authenticated and which org you're using
tao whoami
# Clear authentication (logout)
tao logout
CLI Architecture#
The TAO CLI is organized around network architectures. Each network provides a consistent set of commands:
tao <network_name> <command> [options]
Supported Networks
The CLI supports 39+ network architectures including:
- Classification: classification_pyt, nvdinov2
- Object Detection: rtdetr, deformable_detr, grounding_dino
- Segmentation: mask2former, segformer, oneformer
- Pose Estimation: centerpose, pose_classification
- Action Recognition: action_recognition
- OCR: ocdnet, ocrnet
- Autonomous Driving: bevfusion, pointpillars, sparse4d
- 3D Reconstruction: nvpanoptix3d
- Multimodal: clip
- VLM Fine-Tuning: cosmos_rl
- Data Services: annotations, auto_label, augmentation, analytics
CLI Command Organization#
Within each network architecture, commands are logically organized into groups:
- JOB-RELATED Commands
  - create-job - Create experiment or dataset jobs
  - list-jobs - List jobs with filtering options
  - delete-job - Delete jobs (with confirmation)
  - get-job-status - Get job execution status
  - get-job-metadata - Get job metadata
  - get-job-schema - Get job specifications schema
  - get-job-logs - Download job logs and files
  - list-base-experiments - List available base experiments
- DATASET-RELATED Commands
  - dataset-create - Create datasets
  - list-datasets - List available datasets
  - dataset-delete - Delete datasets
  - get-dataset-metadata - Get dataset metadata
- WORKSPACE-RELATED Commands
  - workspace-create - Create workspaces
  - list-workspaces - List available workspaces
  - delete-workspace - Delete workspaces
  - get-workspace-metadata - Get workspace metadata
- INFERENCE MICROSERVICE Commands
  - start-inference-microservice - Start inference microservice
  - inference-request - Make inference requests to running microservice
  - get-inference-microservice-status - Get microservice status
  - stop-inference-microservice - Stop running microservice
Output Formats#
Commands that produce output support an --output option to control format and destination.
Supported Values
- text - Human-readable output (default for most commands). For list-style data, a console table is shown.
- json - Output as JSON (for piping to jq or scripting).
- yaml - Output as YAML.
Writing to a File
Use @path/to/file with an extension to choose format and destination:
- --output @results.json - JSON written to results.json
- --output @results.yaml or @results.yml - YAML written to that file
- Other extensions or no extension - text written to the file
Defaults
- Most commands (list-workspaces, list-jobs, get-job-metadata, get-dataset-metadata, get-gpu-types, and others): default is text.
- get-job-schema, get-automl-defaults, get-automl-param-details: default is yaml, so specifications and AutoML defaults are easy to edit and save to a file.
# Default text (table for list commands)
tao classification_pyt list-workspaces
tao classification_pyt list-jobs
# JSON for scripting / jq
tao get-gpu-types --output json | jq .
tao classification_pyt list-workspaces --output json
tao classification_pyt list-datasets --output json | jq -r '.[] | "\(.id)\t\(.type)\t\(.format)\t\(.name)"'
tao classification_pyt get-job-metadata --job-id "$JOB_ID" --output json | jq -r '.status'
# Write schema to file (YAML default for get-job-schema), then use with create-job
tao classification_pyt get-job-schema --action train --base-experiment-id "$SELECTED_PTM_ID" --output @train_spec.yaml
Input Formats#
Specifications (--specs)
Used by create-job and resume-job. Accepts:
- Inline JSON string: --specs '{"epochs": 50}'
- File with @ prefix: --specs @path/to/specs.yaml or --specs @path/to/specs.json
- Path to an existing file: If the value is a path to an existing .json, .yaml, or .yml file (without @), the CLI loads from that file.
Recommended workflow: Get the schema with get-job-schema --output @path/job_specs.yaml, edit the YAML if needed, then pass it with create-job ... --specs @path/job_specs.yaml.
AutoML Settings (--automl-settings)
Used by create-job (optional). Accepts:
- Multiple key=value pairs: --automl-settings key1=value1 --automl-settings key2=value2
- Single file with @ prefix: --automl-settings @path/to/automl_defaults.json or @path/to/automl_defaults.yaml
Recommended workflow: Run get-automl-defaults --base-experiment-id ID --action train --output @automl_defaults.json, then use create-job ... --automl-settings @automl_defaults.json.
Filter Parameters (--filter-param)
Used by list-datasets and list-base-experiments. Accepts key=value pairs, repeated per option:
tao classification_pyt list-base-experiments --filter-param network_arch=classification_pyt --output json
Common CLI Commands#
Workspace Management#
# List workspaces (default text/table output)
tao classification_pyt list-workspaces
# List workspaces with JSON output for scripting
tao classification_pyt list-workspaces --output json
# Create workspace
tao classification_pyt workspace-create \
--name "my_workspace" \
--cloud_type aws \
--cloud_details '{"bucket_name": "my-bucket", "access_key": "key", "secret_key": "secret", "region": "us-west-2"}'
# Get workspace metadata
tao classification_pyt get-workspace-metadata --workspace-id "workspace_id"
# Delete workspace
tao classification_pyt delete-workspace --workspace-id "workspace_id" --confirm
Dataset Operations#
# List datasets
tao classification_pyt list-datasets
# Create dataset
tao classification_pyt dataset-create \
--dataset-type object_detection \
--dataset-format coco \
--workspace workspace_id \
--cloud-file-path "/path/to/dataset"
# Get dataset metadata
tao classification_pyt get-dataset-metadata --dataset-id "dataset_id"
# Delete dataset
tao classification_pyt dataset-delete --id "dataset_id"
Job Management (Unified v2 API)#
# List all jobs
tao classification_pyt list-jobs
# Create experiment job (get schema first, then create job)
tao classification_pyt get-job-schema --action train --output @train_spec.yaml
# Edit train_spec.yaml as needed
tao classification_pyt create-job \
--kind experiment \
--name "my_experiment" \
--encryption-key "my_key" \
--workspace "workspace_id" \
--action train \
--specs @train_spec.yaml \
--train-dataset-uri "<scheme>://my-bucket/train-data" \
--eval-dataset-uri "<scheme>://my-bucket/eval-data" \
--automl-settings automl_enabled=true \
--automl-settings automl_algorithm=bayesian
# Get job status
tao classification_pyt get-job-status --job-id "job_id"
# Job control operations
tao classification_pyt job-pause --job-id "job_id"
tao classification_pyt resume-job --job-id "job_id" --specs @resume_spec.yaml
tao classification_pyt job-cancel --job-id "job_id"
tao classification_pyt delete-job --job-id "job_id" --confirm
Job Files and Logs#
# Get job logs
tao classification_pyt get-job-logs --job-id "job_id"
# Download job files
tao classification_pyt download-job-files \
--job-id "job_id" \
--workdir "./downloads" \
--best-model true \
--latest-model false
# Download entire job
tao classification_pyt download-entire-job \
--job-id "job_id" \
--workdir "./downloads"
CLI Workflow Examples#
Classification Workflow#
# Complete image classification workflow with v2 API
export WORKSPACE_ID="workspace_123"
export DATASET_URI="<scheme>://my-bucket/classification-data" # aws://, azure://, lustre://, file://, seaweedfs://
# 1. Get base experiment (PTM) and training schema
tao classification_pyt list-base-experiments --filter-param network_arch=classification_pyt --output json | jq .
export SELECTED_PTM_ID="ptm_id_from_list"
# Get training schema and save to file for editing
tao classification_pyt get-job-schema --action train --base-experiment-id "$SELECTED_PTM_ID" --output @train_spec.yaml
# Edit train_spec.yaml as needed. For example, update "epochs": 100 and "learning_rate": 0.001
# Then copy to train_spec_filled.yaml
# 2. Create classification experiment job using specs from file
JOB_ID=$(tao classification_pyt create-job \
--kind experiment \
--name "image_classification_v1" \
--encryption-key "my_encryption_key" \
--workspace $WORKSPACE_ID \
--action train \
--base-experiment-id "$SELECTED_PTM_ID" \
--specs @train_spec_filled.yaml \
--train-dataset-uri "$DATASET_URI" \
--eval-dataset-uri "$DATASET_URI" \
--automl-settings automl_enabled=true \
--automl-settings automl_algorithm=bayesian \
--output json | jq -r '.id')
# 3. Monitor training progress
tao classification_pyt get-job-status --job-id $JOB_ID
tao classification_pyt get-job-metadata --job-id $JOB_ID --output json | jq -r '.status'
tao classification_pyt get-job-logs --job-id $JOB_ID
# 4. Run evaluation after training completes
tao classification_pyt get-job-schema --action evaluate --output @eval_spec.yaml
# Edit eval_spec.yaml as needed
EVAL_JOB_ID=$(tao classification_pyt create-job \
--kind experiment \
--name "evaluation_job" \
--encryption-key "my_encryption_key" \
--workspace $WORKSPACE_ID \
--action evaluate \
--parent-job-id $JOB_ID \
--eval-dataset-uri "$DATASET_URI" \
--specs @eval_spec.yaml \
--output json | jq -r '.id')
# 5. Export model for deployment
tao classification_pyt get-job-schema --action export --output @export_spec.yaml
# Edit export_spec.yaml as needed
EXPORT_JOB_ID=$(tao classification_pyt create-job \
--kind experiment \
--name "export_job" \
--encryption-key "my_encryption_key" \
--workspace $WORKSPACE_ID \
--action export \
--parent-job-id $JOB_ID \
--train-dataset-uri "$DATASET_URI" \
--specs @export_spec.yaml \
--output json | jq -r '.id')
Object Detection Workflow#
# Complete object detection workflow with v2 API
export WORKSPACE_ID="workspace_789"
export DATASET_URI="<scheme>://my-bucket/coco-dataset" # aws://, azure://, lustre://, file://, seaweedfs://
# 1. Get base experiment and training schema
tao rtdetr list-base-experiments --filter-param network_arch=rtdetr --output json | jq .
export SELECTED_PTM_ID="ptm_id_from_list"
# Get training schema and save to file
tao rtdetr get-job-schema --action train --base-experiment-id "$SELECTED_PTM_ID" --output @train_spec.yaml
# Edit train_spec.yaml as needed. For example, update "epochs": 100 and "learning_rate": 0.001
# 2. Create object detection experiment using specs from file
JOB_ID=$(tao rtdetr create-job \
--kind experiment \
--name "rtdetr_detection_v1" \
--encryption-key "my_encryption_key" \
--workspace $WORKSPACE_ID \
--action train \
--base-experiment-id "$SELECTED_PTM_ID" \
--train-dataset-uri "$DATASET_URI" \
--eval-dataset-uri "$DATASET_URI" \
--specs @train_spec.yaml \
--output json | jq -r '.id')
# 3. Monitor and manage training
tao rtdetr get-job-status --job-id $JOB_ID
tao rtdetr get-job-metadata --job-id $JOB_ID --output json | jq -r '.status'
# 4. Download trained models
tao rtdetr download-job-files \
--job-id $JOB_ID \
--workdir "./models" \
--best-model true \
--latest-model true
Inference Microservices#
# Complete inference microservice workflow
export JOB_ID="job_12345" # ID of trained model job
export WORKSPACE_ID="workspace_789"
# 1. Start inference microservice
MICROSERVICE_ID=$(tao classification_pyt start-inference-microservice \
--docker-image "nvcr.io/nvidia/tao/tao-toolkit:5.0.0-tf2.11.0" \
--gpu-type "a100" \
--num-gpus 1 \
--parent-job-id "$JOB_ID" \
--kind "experiment" \
--model-path "/workspace/models/best_model.pth" \
--workspace "$WORKSPACE_ID" \
--checkpoint-choose-method "best_model" \
--network-arch "classification_pyt" \
--output json | jq -r '.id')
# 2. Wait for microservice to be ready
while true; do
STATUS=$(tao classification_pyt get-inference-microservice-status \
--microservice-job-id "$MICROSERVICE_ID" --output json | jq -r '.status')
if [ "$STATUS" = "Running" ]; then
echo "Microservice is ready for inference"
break
fi
echo "Waiting for microservice... Status: $STATUS"
sleep 10
done
# 3. Make inference request
tao classification_pyt inference-request \
--microservice-job-id "$MICROSERVICE_ID" \
--input '["data:image/jpeg;base64,/9j/4AAQSkZJRgABAQEA..."]' \
--model "my_classification_model"
# 4. Stop microservice when done
tao classification_pyt stop-inference-microservice \
--microservice-job-id "$MICROSERVICE_ID"
Environment Variables Reference#
The SDK and CLI use these environment variables for configuration:
- TAO_BASE_URL - TAO API base URL (example: https://api.tao.ngc.nvidia.com/api/v2)
- TAO_ORG - NGC organization name (example: my_org)
- TAO_TOKEN - JWT authentication token (set by tao login or client.login())
Legacy Support
For backward compatibility, these legacy variables are also supported:
- BASE_URL → TAO_BASE_URL
- ORG → TAO_ORG
- TOKEN → TAO_TOKEN
Migration from v1 to v2#
If you’re migrating from TAO SDK/CLI v1, note these key changes:
Command Changes
- experiment-create → create-job --kind experiment
- dataset-run-action → create-job --kind dataset --action <action>
- get-action-status → get-job-status
- get-spec → get-job-schema
- get-metadata → get-dataset-metadata, get-workspace-metadata, get-job-metadata
- job-resume → resume-job
Authentication Changes
- File-based configuration (~/.tao/config) → Environment variables (TAO_*)
- Automatic login on CLI usage → Explicit login with tao login
New Features in v2
- Unified job management with the create-job --kind parameter
- Environment variable authentication
- Job deletion capabilities
- Resource-specific metadata commands
- Improved job control (pause/resume/cancel/delete)
Troubleshooting#
Authentication Issues#
# Check if authenticated
tao whoami
# If not authenticated, login again
tao login --ngc-key YOUR_KEY --ngc-org-name YOUR_ORG
# Clear credentials if having issues
tao logout
Job Management#
# List all jobs to find stuck jobs
tao classification_pyt list-jobs
# Cancel problematic jobs
tao classification_pyt job-cancel --job-id "problematic_job_id"
# Delete jobs that are no longer needed
tao classification_pyt delete-job --job-id "old_job_id" --confirm
Getting Help#
# Get help for any command
tao --help
tao classification_pyt --help
tao classification_pyt create-job --help
Best Practices#
1. Authentication Management
# Always check authentication before operations
def safe_operation(client):
if not client.is_authenticated():
raise TaoAuthenticationError("Please authenticate first")
return client.list_workspaces()
2. Resource Cleanup
# Clean up completed jobs regularly
def cleanup_old_jobs(client, keep_days=7):
import datetime
jobs = client.list_jobs()
cutoff_date = datetime.datetime.now() - datetime.timedelta(days=keep_days)
for job in jobs:
if job["status"] in ["Done", "Failed"] and job["created_date"] < cutoff_date:
try:
client.delete_job(job["id"])
except Exception as e:
print(f"Failed to delete job {job['id']}: {e}")
3. Error Handling
from tao_sdk.exceptions import TaoAPIError, TaoAuthenticationError
try:
client.require_authentication()
workspaces = client.list_workspaces()
except TaoAuthenticationError as e:
print(f"Authentication error: {e}")
client.login(ngc_key="...", ngc_org_name="...")
except TaoAPIError as e:
print(f"API error: {e.status_code} - {e.message}")
Resources#
API Documentation: REST API
API Reference: OpenAPI Specifications
Inference Microservices: Inference Microservices
AutoML: AutoML
For detailed technical documentation, consult the SDK source code and API v2 OpenAPI specifications.