TAO Python SDK and CLI#
TAO Toolkit provides two integrated tools for interacting with the TAO API v2:
- Python SDK - The nvidia-tao package for programmatic access
- CLI - The tao command-line interface for terminal-based workflows
Both tools use environment variables for authentication (set automatically by the login method) and provide unified access to the TAO API v2’s job-centric architecture.
Note
Datasets provided in these examples are subject to the Dataset License.
Installation#
Setting up Your Python Environment#
We recommend setting up a Python environment using Miniconda. Follow the instructions to set up a Conda environment using Miniconda.
After you have installed Miniconda, create a new environment and set the Python version to 3.12:
conda create -n tao python=3.12
Activate the conda environment that you have just created:
conda activate tao
Verify that the command prompt shows the name of your Conda environment.
(tao) desktop:
When you are done with your session, you can deactivate your conda environment using the conda deactivate command:
conda deactivate
You can reactivate this conda environment later using the following command:
conda activate tao
Install the TAO SDK and CLI#
After you set up and activate the Python environment, install the TAO client using the following command:
pip install nvidia-tao
This installs both the Python SDK and the CLI tool.
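To verify the installation, you can import the SDK client from a Python session. This is a minimal sanity check, assuming the tao_sdk import path used in the SDK examples below:
# Run inside the activated conda environment
from tao_sdk.client import TaoClient
print("TAO SDK available:", TaoClient.__name__)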
Python SDK Usage#
The TAO Python SDK provides programmatic access to TAO Toolkit using the TAO API v2. The SDK uses environment variables for authentication (set by the login method) and offers a unified, job-centric interface for all TAO operations.
Quick Start with SDK#
Import and Initialize#
from tao_sdk.client import TaoClient
# Initialize client (loads credentials from environment variables set by login)
client = TaoClient()
# Or login directly to obtain credentials
client = TaoClient()
client.login(
    ngc_key="your_ngc_key",
    ngc_org_name="your_org"
)
Authentication#
Option A: Login programmatically
# Login and save credentials to environment variables (Python process only)
credentials = client.login(
    ngc_key="your_ngc_key",
    ngc_org_name="your_org"
)

# Login with custom TAO base URL
credentials = client.login(
    ngc_key="your_ngc_key",
    ngc_org_name="your_org",
    tao_base_url="https://custom.tao.endpoint.com/api/v2"
)
print(f"Logged in as: {client.org_name}")
Option B: Logout
# Clear credentials from current Python process
result = client.logout()
print(result["message"])
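You can also check the authentication state before making API calls. A small sketch using the is_authenticated() helper that also appears in the complete workflow example below:
# Check whether credentials are already present in the environment
client = TaoClient()
if client.is_authenticated():
    print(f"Authenticated as: {client.org_name}")
else:
    print("Not authenticated - call client.login() first")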
SDK Core Operations#
Workspace Management#
# List workspaces
workspaces = client.list_workspaces()
print(f"Found {len(workspaces)} workspaces")
# Create workspace
workspace_config = {
    "bucket_name": "my-tao-bucket",
    "access_key": "AKIAIOSFODNN7EXAMPLE",
    "secret_key": "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY",
    "region": "us-west-2"
}
workspace = client.create_workspace(
    name="my_workspace",
    cloud_type="aws",
    cloud_specific_details=workspace_config
)
workspace_id = workspace["id"]
# Get workspace metadata
metadata = client.get_workspace_metadata(workspace_id)
# Delete workspace
client.delete_workspace(workspace_id)
Dataset Operations#
# List datasets
datasets = client.list_datasets()
# Create dataset
dataset = client.create_dataset(
    dataset_type="object_detection",
    dataset_format="coco",
    workspace=workspace_id,
    cloud_file_path="/path/to/dataset"
)
dataset_id = dataset["id"]
# Get dataset metadata
dataset_metadata = client.get_dataset_metadata(dataset_id)
# Delete dataset
client.delete_dataset(dataset_id)
Job Management (Unified v2 API)#
# Create experiment job
job = client.create_job(
    kind="experiment",
    name="image_classification_job",
    network_arch="classification_pyt",
    encryption_key="my_encryption_key",
    workspace=workspace_id,
    action="train",
    specs={
        "epochs": 100,
        "batch_size": 32,
        "learning_rate": 0.001,
        "model": {
            "backbone": "resnet18"
        }
    },
    train_datasets=[dataset_id],
    eval_dataset=dataset_id,
    automl_settings={
        "automl_enabled": True,
        "automl_algorithm": "bayesian"
    }
)
job_id = job["id"]
# Get job status
status = client.get_job_status(job_id)
# Job control operations
client.pause_job(job_id)
client.resume_job(job_id, parent_job_id="", specs={})
client.cancel_job(job_id)
# Delete completed job
client.delete_job(job_id)
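Because jobs run asynchronously, a common pattern is to poll get_job_status until the job reaches a terminal state. A minimal sketch, assuming the "Done" and "Error" status strings used in the complete workflow example later on this page:
import time

def wait_for_job(client, job_id, poll_seconds=60):
    """Poll a job until it reaches a terminal status ("Done" or "Error")."""
    while True:
        status = client.get_job_status(job_id)["status"]
        if status in ("Done", "Error"):
            return status
        print(f"Job {job_id} status: {status}")
        time.sleep(poll_seconds)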
Job Files and Artifacts#
# List job files
files = client.list_job_files(
    job_id=job_id,
    retrieve_logs=True,
    retrieve_specs=True
)

# Download selective files
client.download_job_files(
    job_id=job_id,
    workdir="./downloads",
    best_model=True,
    latest_model=False
)

# Download entire job
client.download_entire_job(
    job_id=job_id,
    workdir="./downloads"
)
# Get job logs
logs = client.get_job_logs(job_id)
Inference Microservices#
# Start inference microservice
microservice = client.start_inference_microservice(
    docker_image="nvcr.io/nvidia/tao/tao-toolkit:5.0.0-tf2.11.0",
    gpu_type="a100",
    num_gpus=1,
    parent_job_id="job_12345",
    kind="experiment",
    model_path="/workspace/models/best_model.pth",
    workspace="workspace_789",
    checkpoint_choose_method="best_model",
    network_arch="classification_pyt"
)
microservice_id = microservice["id"]
# Make inference request
result = client.inference_request(
    microservice_job_id=microservice_id,
    input=["data:image/jpeg;base64,/9j/4AAQSkZJRgABAQEA..."],
    model="my_classification_model"
)
# Stop microservice
client.stop_inference_microservice(microservice_id)
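The input field expects base64-encoded data URIs, as in the truncated example above. A small stdlib-only helper for building one from a local image file (the file path and MIME type here are illustrative):
import base64
from pathlib import Path

def to_data_uri(image_path, mime="image/jpeg"):
    """Encode a local image as a base64 data URI for inference_request()."""
    encoded = base64.b64encode(Path(image_path).read_bytes()).decode("ascii")
    return f"data:{mime};base64,{encoded}"

# Example: result = client.inference_request(
#     microservice_job_id=microservice_id,
#     input=[to_data_uri("sample.jpg")],
#     model="my_classification_model"
# )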
Complete SDK Workflow Example#
from tao_sdk.client import TaoClient
import time
def complete_classification_workflow():
    """Complete image classification workflow example"""
    # 1. Initialize and authenticate
    client = TaoClient()
    if not client.is_authenticated():
        client.login(
            ngc_key="your_ngc_key",
            ngc_org_name="your_org"
        )

    # 2. Setup resources
    workspaces = client.list_workspaces()
    workspace_id = workspaces[0]["id"]
    datasets = client.list_datasets()
    dataset_id = datasets[0]["id"]

    # 3. Create training job
    job = client.create_job(
        kind="experiment",
        name="production_classifier",
        network_arch="classification_pyt",
        encryption_key="prod_key",
        workspace=workspace_id,
        action="train",
        specs={
            "epochs": 100,
            "batch_size": 32,
            "learning_rate": 0.001,
            "model": {
                "backbone": "resnet18"
            }
        },
        train_datasets=[dataset_id],
        eval_dataset=dataset_id
    )
    job_id = job["id"]
    print(f"Created training job: {job_id}")

    # 4. Monitor training
    while True:
        status = client.get_job_status(job_id)["status"]
        if status == "Done":
            print("Training completed!")
            break
        elif status == "Error":
            print("Training failed!")
            return None
        else:
            print(f"Training status: {status}")
            time.sleep(60)

    # 5. Download results
    client.download_job_files(
        job_id=job_id,
        workdir="./production_model",
        best_model=True
    )
    return job_id

# Run the workflow
if __name__ == "__main__":
    result = complete_classification_workflow()
    print(f"Workflow completed: {result}")
Command-Line Interface (CLI)#
The TAO CLI provides command-line access to all TAO Toolkit functionality using the TAO API v2. The CLI uses environment variables for authentication and is organized around network architectures.
CLI Authentication#
The TAO CLI uses environment variables for authentication, which are set automatically by the login command.
Login Command
# Interactive login (prompts for credentials and TAO base URL if not set)
tao login --ngc-key YOUR_NGC_KEY --ngc-org-name YOUR_ORG
# Login with custom base URL
tao login --ngc-key YOUR_NGC_KEY --ngc-org-name YOUR_ORG --tao-base-url https://custom.tao.endpoint.com/api/v2
Check Authentication Status#
# Check if you're authenticated and which org you're using
tao whoami
# Clear authentication (logout)
tao logout
CLI Architecture#
The TAO CLI is organized around network architectures. Each network provides a consistent set of commands:
tao <network_name> <command> [options]
Supported Networks
The CLI supports 36+ network architectures, including:
- Classification: classification_pyt, nvdinov2
- Object Detection: rtdetr, deformable_detr, grounding_dino
- Segmentation: mask2former, segformer, oneformer
- Pose Estimation: centerpose, pose_classification
- Action Recognition: action_recognition
- OCR: ocdnet, ocrnet
- Autonomous Driving: bevfusion, pointpillars, sparse4d
- Data Services: annotations, auto_label, augmentation, analytics
CLI Command Organization#
Within each network architecture, commands are logically organized into groups:
- JOB-RELATED Commands
  - create-job - Create experiment or dataset jobs
  - list-jobs - List jobs with filtering options
  - delete-job - Delete jobs (with confirmation)
  - get-job-status - Get job execution status
  - get-job-metadata - Get job metadata
  - get-job-schema - Get job specifications schema
  - get-job-logs - Download job logs and files
  - list-base-experiments - List available base experiments
- DATASET-RELATED Commands
  - dataset-create - Create datasets
  - list-datasets - List available datasets
  - dataset-delete - Delete datasets
  - get-dataset-metadata - Get dataset metadata
- WORKSPACE-RELATED Commands
  - workspace-create - Create workspaces
  - list-workspaces - List available workspaces
  - delete-workspace - Delete workspaces
  - get-workspace-metadata - Get workspace metadata
- INFERENCE MICROSERVICE Commands
  - start-inference-microservice - Start inference microservice
  - inference-request - Make inference requests to running microservice
  - get-inference-microservice-status - Get microservice status
  - stop-inference-microservice - Stop running microservice
Common CLI Commands#
Workspace Management#
# List workspaces
tao classification_pyt list-workspaces
# Create workspace
tao classification_pyt workspace-create \
--name "my_workspace" \
--cloud_type aws \
--cloud_details '{"bucket_name": "my-bucket", "access_key": "key", "secret_key": "secret", "region": "us-west-2"}'
# Get workspace metadata
tao classification_pyt get-workspace-metadata --workspace-id "workspace_id"
# Delete workspace
tao classification_pyt delete-workspace --workspace-id "workspace_id" --confirm
Dataset Operations#
# List datasets
tao classification_pyt list-datasets
# Create dataset
tao classification_pyt dataset-create \
--dataset-type object_detection \
--dataset-format coco \
--workspace workspace_id \
--cloud-file-path "/path/to/dataset"
# Get dataset metadata
tao classification_pyt get-dataset-metadata --dataset-id "dataset_id"
# Delete dataset
tao classification_pyt dataset-delete --id "dataset_id"
Job Management (Unified v2 API)#
# List all jobs
tao classification_pyt list-jobs
# Create experiment job
tao classification_pyt create-job \
--kind experiment \
--name "my_experiment" \
--encryption-key "my_key" \
--workspace "workspace_id" \
--action train \
--specs '{"epochs": 100, "learning_rate": 0.001, "model": {"backbone": "resnet18"}}' \
--train-datasets '["train-dataset-id"]' \
--eval-dataset "eval-dataset-id" \
--automl-settings '{"automl_enabled": true, "automl_algorithm": "bayesian"}'
# Get job status
tao classification_pyt get-job-status --job-id "job_id"
# Job control operations
tao classification_pyt job-pause --job-id "job_id"
tao classification_pyt job-resume --job-id "job_id" --parent_job_id "" --specs '{}'
tao classification_pyt job-cancel --job-id "job_id"
tao classification_pyt delete-job --job-id "job_id" --confirm
Job Files and Logs#
# Get job logs
tao classification_pyt get-job-logs --job-id "job_id"
# Download job files
tao classification_pyt download-job-files \
--job-id "job_id" \
--workdir "./downloads" \
--best-model true \
--latest-model false
# Download entire job
tao classification_pyt download-entire-job \
--job-id "job_id" \
--workdir "./downloads"
CLI Workflow Examples#
Classification Workflow#
# Complete image classification workflow with v2 API
export WORKSPACE_ID="workspace_123"
export DATASET_ID="dataset_456"
# 1. Create classification experiment job
JOB_ID=$(tao classification_pyt create-job \
--kind experiment \
--name "image_classification_v1" \
--encryption-key "my_encryption_key" \
--workspace $WORKSPACE_ID \
--action train \
--specs '{
"epochs": 100,
"batch_size": 32,
"learning_rate": 0.001,
"model": {
"backbone": "resnet18"
}
}' \
--train-datasets '["'$DATASET_ID'"]' \
--eval-dataset "$DATASET_ID" \
--automl-settings '{
"automl_enabled": true,
"automl_algorithm": "bayesian",
"max_iterations": 10
}' | jq -r '.id')
# 2. Monitor training progress
tao classification_pyt get-job-status --job-id $JOB_ID
tao classification_pyt get-job-logs --job-id $JOB_ID
# 3. Run evaluation after training completes
EVAL_JOB_ID=$(tao classification_pyt create-job \
--kind experiment \
--name "evaluation_job" \
--encryption-key "my_encryption_key" \
--workspace $WORKSPACE_ID \
--action evaluate \
--parent-job-id $JOB_ID \
--eval-dataset "$DATASET_ID" \
--specs '{
"checkpoint_path": "/workspace/models/latest.pth",
"batch_size": 64
}' | jq -r '.id')
# 4. Export model for deployment
EXPORT_JOB_ID=$(tao classification_pyt create-job \
--kind experiment \
--name "export_job" \
--encryption-key "my_encryption_key" \
--workspace $WORKSPACE_ID \
--action export \
--parent-job-id $JOB_ID \
--specs '{
"output_format": "onnx",
"batch_size": 1
}' | jq -r '.id')
Object Detection Workflow#
# Complete object detection workflow with v2 API
export WORKSPACE_ID="workspace_789"
export DATASET_ID="coco_dataset_001"
# Create object detection experiment
JOB_ID=$(tao rtdetr create-job \
--kind experiment \
--name "rtdetr_detection_v1" \
--encryption-key "my_encryption_key" \
--workspace $WORKSPACE_ID \
--action train \
--train-datasets '["'$DATASET_ID'"]' \
--eval-dataset "$DATASET_ID" \
--specs '{
"epochs": 150,
"batch_size": 16,
"learning_rate": 0.0001,
"model": {
"backbone": "resnet50"
},
"augmentation": {
"horizontal_flip": true,
"rotation": 15
}
}' | jq -r '.id')
# Monitor and manage training
tao rtdetr get-job-status --job-id $JOB_ID
# Download trained models
tao rtdetr download-job-files \
--job-id $JOB_ID \
--workdir "./models" \
--best-model true \
--latest-model true
Inference Microservices#
# Complete inference microservice workflow
export JOB_ID="job_12345" # ID of trained model job
export WORKSPACE_ID="workspace_789"
# 1. Start inference microservice
MICROSERVICE_ID=$(tao classification_pyt start-inference-microservice \
--docker-image "nvcr.io/nvidia/tao/tao-toolkit:5.0.0-tf2.11.0" \
--gpu-type "a100" \
--num-gpus 1 \
--parent-job-id "$JOB_ID" \
--kind "experiment" \
--model-path "/workspace/models/best_model.pth" \
--workspace "$WORKSPACE_ID" \
--checkpoint-choose-method "best_model" \
--network-arch "classification_pyt" | jq -r '.id')
# 2. Wait for microservice to be ready
while true; do
    STATUS=$(tao classification_pyt get-inference-microservice-status \
        --microservice-job-id "$MICROSERVICE_ID" | jq -r '.status')
    if [ "$STATUS" = "Running" ]; then
        echo "Microservice is ready for inference"
        break
    fi
    echo "Waiting for microservice... Status: $STATUS"
    sleep 10
done
# 3. Make inference request
tao classification_pyt inference-request \
--microservice-job-id "$MICROSERVICE_ID" \
--input '["data:image/jpeg;base64,/9j/4AAQSkZJRgABAQEA..."]' \
--model "my_classification_model"
# 4. Stop microservice when done
tao classification_pyt stop-inference-microservice \
--microservice-job-id "$MICROSERVICE_ID"
Environment Variables Reference#
The SDK and CLI use these environment variables for configuration:
- TAO_BASE_URL - TAO API base URL (example: https://api.tao.ngc.nvidia.com/api/v2)
- TAO_ORG - NGC organization name (example: my_org)
- TAO_TOKEN - JWT authentication token (set by tao login or client.login())
Legacy Support
For backward compatibility, these legacy variables are also supported:
- BASE_URL → TAO_BASE_URL
- ORG → TAO_ORG
- TOKEN → TAO_TOKEN
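These are plain process environment variables, so you can inspect them directly. A minimal sketch that reads each TAO_* variable with its legacy fallback (variable names as listed above):
import os

def tao_env(name, legacy_name):
    """Read a TAO_* variable, falling back to its legacy name."""
    return os.environ.get(name) or os.environ.get(legacy_name)

base_url = tao_env("TAO_BASE_URL", "BASE_URL")
org = tao_env("TAO_ORG", "ORG")
token = tao_env("TAO_TOKEN", "TOKEN")
print(f"Base URL: {base_url}, Org: {org}, Token set: {bool(token)}")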
Migration from v1 to v2#
If you’re migrating from TAO SDK/CLI v1, note these key changes:
Command Changes
- experiment-create → create-job --kind experiment
- dataset-run-action → create-job --kind dataset --action <action>
- get-action-status → get-job-status
- get-spec → get-job-schema
- get-metadata → get-dataset-metadata, get-workspace-metadata, get-job-metadata
Authentication Changes
- File-based configuration (~/.tao/config) → Environment variables (TAO_*)
- Automatic login on CLI usage → Explicit login with tao login
New Features in v2
- Unified job management with the create-job --kind parameter
- Environment variable authentication
- Job deletion capabilities
- Resource-specific metadata commands
- Improved job control (pause/resume/cancel/delete)
Troubleshooting#
Authentication Issues#
# Check if authenticated
tao whoami
# If not authenticated, login again
tao login --ngc-key YOUR_KEY --ngc-org-name YOUR_ORG
# Clear credentials if having issues
tao logout
Job Management#
# List all jobs to find stuck jobs
tao classification_pyt list-jobs
# Cancel problematic jobs
tao classification_pyt job-cancel --job-id "problematic_job_id"
# Delete jobs that are no longer needed
tao classification_pyt delete-job --job-id "old_job_id" --confirm
Getting Help#
# Get help for any command
tao --help
tao classification_pyt --help
tao classification_pyt create-job --help
Best Practices#
1. Authentication Management
# Always check authentication before operations
from tao_sdk.exceptions import TaoAuthenticationError

def safe_operation(client):
    if not client.is_authenticated():
        raise TaoAuthenticationError("Please authenticate first")
    return client.list_workspaces()
2. Resource Cleanup
# Clean up completed jobs regularly
import datetime

def cleanup_old_jobs(client, keep_days=7):
    jobs = client.list_jobs()
    cutoff_date = datetime.datetime.now() - datetime.timedelta(days=keep_days)
    for job in jobs:
        if job["status"] in ["Done", "Failed"] and job["created_date"] < cutoff_date:
            try:
                client.delete_job(job["id"])
            except Exception as e:
                print(f"Failed to delete job {job['id']}: {e}")
3. Error Handling
from tao_sdk.exceptions import TaoAPIError, TaoAuthenticationError
try:
    client.require_authentication()
    workspaces = client.list_workspaces()
except TaoAuthenticationError as e:
    print(f"Authentication error: {e}")
    client.login(ngc_key="...", ngc_org_name="...")
except TaoAPIError as e:
    print(f"API error: {e.status_code} - {e.message}")
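For transient server-side failures, a simple retry wrapper can build on the same exceptions. A hedged sketch, assuming TaoAPIError exposes the status_code attribute shown above:
import time
from tao_sdk.exceptions import TaoAPIError

def with_retries(fn, attempts=3, backoff_seconds=5):
    """Retry a callable when the API returns a server-side (5xx) error."""
    for attempt in range(1, attempts + 1):
        try:
            return fn()
        except TaoAPIError as e:
            if attempt == attempts or e.status_code < 500:
                raise
            print(f"Attempt {attempt} failed ({e.status_code}); retrying...")
            time.sleep(backoff_seconds * attempt)

# Example: workspaces = with_retries(client.list_workspaces)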
Resources#
API Documentation: REST API
API Reference: OpenAPI Specifications
Inference Microservices: Inference Microservices
AutoML: AutoML
For detailed technical documentation, consult the SDK source code and API v2 OpenAPI specifications.