Storage & Credentials Configuration#
Configure storage access, API keys, and security credentials for NeMo Curator deployments. This guide focuses on operational setup for different storage backends and credential management.
Tip
Using These Credentials: After configuring storage access, you can use these credentials in your deployments:
Kubernetes Deployment: Apply credentials via Kubernetes secrets
Slurm Deployment: Mount credential files in Slurm containers
Deployment Environment Configuration: Environment-specific credential patterns
Cloud Storage Configuration#
Amazon S3 Configuration#
AWS Credentials Setup#
# ~/.aws/credentials
[default]
aws_access_key_id = YOUR_ACCESS_KEY_ID
aws_secret_access_key = YOUR_SECRET_ACCESS_KEY
[production]
aws_access_key_id = PROD_ACCESS_KEY_ID
aws_secret_access_key = PROD_SECRET_ACCESS_KEY
# AWS credentials via environment variables
export AWS_ACCESS_KEY_ID="your-access-key-id"
export AWS_SECRET_ACCESS_KEY="your-secret-access-key"
export AWS_DEFAULT_REGION="us-west-2"
# AWS_PROFILE names a [section] of ~/.aws/credentials (for example [production]).
export AWS_PROFILE="production" # Optional: use specific profile
# Use IAM roles for EC2/EKS deployments
# NOTE(review): the token path below lives under eks.amazonaws.com/serviceaccount,
# so this pair looks specific to EKS service accounts - confirm whether EC2
# deployments need either variable.
export AWS_ROLE_ARN="arn:aws:iam::123456789012:role/NemoCuratorRole"
export AWS_WEB_IDENTITY_TOKEN_FILE="/var/run/secrets/eks.amazonaws.com/serviceaccount/token"
S3 Configuration Options#
# S3-specific settings
export AWS_S3_ENDPOINT_URL="https://s3.amazonaws.com" # Custom endpoint
export AWS_S3_USE_SSL="true"
export AWS_S3_VERIFY_SSL="true"
export AWS_S3_ADDRESSING_STYLE="virtual" # or "path"
# Performance tuning
export AWS_S3_MAX_CONCURRENT_REQUESTS="10"
export AWS_S3_MAX_BANDWIDTH="100MB/s"
export AWS_S3_MULTIPART_THRESHOLD="64MB"
export AWS_S3_MULTIPART_CHUNKSIZE="16MB"
Azure Blob Storage Configuration#
Azure Credentials Setup#
# The groups below are alternative authentication methods; configure only the
# one that matches your deployment.

# Option 1: service principal (client ID + client secret)
export AZURE_CLIENT_ID="your-client-id"
export AZURE_CLIENT_SECRET="your-client-secret"
export AZURE_TENANT_ID="your-tenant-id"
export AZURE_SUBSCRIPTION_ID="your-subscription-id"

# Option 2: managed identity
# AZURE_CLIENT_ID is assigned again here. If both groups are run in the same
# shell, this value replaces the service-principal client ID set above.
export AZURE_USE_MSI="true"
export AZURE_CLIENT_ID="managed-identity-client-id" # Optional

# Option 3: storage account connection string
# The string embeds the account key (AccountKey=...), so treat all of it as a secret.
export AZURE_STORAGE_CONNECTION_STRING="DefaultEndpointsProtocol=https;AccountName=myaccount;AccountKey=mykey;EndpointSuffix=core.windows.net"
Azure Storage Configuration#
# Azure Blob Storage settings
export AZURE_STORAGE_ACCOUNT="your-storage-account"
export AZURE_STORAGE_CONTAINER="nemo-curator-data"
export AZURE_STORAGE_ENDPOINT="https://myaccount.blob.core.windows.net/"
# Performance settings
export AZURE_STORAGE_MAX_CONCURRENCY="10"
export AZURE_STORAGE_BLOCK_SIZE="4MB"
Google Cloud Storage Configuration#
GCS Credentials Setup#
# The two groups below are alternative authentication methods; configure only one.

# Option 1: service account key file
export GOOGLE_APPLICATION_CREDENTIALS="/path/to/service-account-key.json"
export GOOGLE_CLOUD_PROJECT="your-project-id"

# Option 2: Workload Identity - only the project ID is exported, no key file
export GOOGLE_CLOUD_PROJECT="your-project-id"
# Workload Identity automatically handles authentication
GCS Configuration Options#
# GCS-specific settings
export GCS_PROJECT_ID="your-project-id"
export GCS_BUCKET="nemo-curator-bucket"
export GCS_DEFAULT_LOCATION="US"
# Performance tuning
export GCS_MAX_RETRY_DELAY="60"
export GCS_TOTAL_TIMEOUT="300"
API Keys and Model Access#
Hugging Face Configuration#
# Hugging Face Hub authentication
export HUGGINGFACE_HUB_TOKEN="hf_your_token_here"
# The huggingface_hub library reads its token from HF_TOKEN. Mirror the value
# so that library calls such as whoami() authenticate with the same token.
export HF_TOKEN="${HUGGINGFACE_HUB_TOKEN}"
export HF_HOME="/shared/cache/huggingface" # Cache directory
export HF_HUB_CACHE="/shared/cache/huggingface/hub"
export HF_DATASETS_CACHE="/shared/cache/huggingface/datasets"
# Offline mode (for air-gapped environments)
# Set these only when models and datasets are already in the cache directories
# above; leave them unset when the Hub must be reachable (for example when
# validating the token).
export HF_HUB_OFFLINE="1"
export TRANSFORMERS_OFFLINE="1"
OpenAI API Configuration#
# OpenAI API credentials
export OPENAI_API_KEY="sk-your-openai-api-key"
export OPENAI_ORGANIZATION="org-your-organization-id" # Optional
export OPENAI_BASE_URL="https://api.openai.com/v1" # Custom endpoint
# Rate limiting and timeouts
export OPENAI_MAX_RETRIES="3"
export OPENAI_TIMEOUT="60"
NVIDIA API Configuration#
# NVIDIA NGC API
export NGC_API_KEY="your-ngc-api-key"
export NGC_ORG="your-organization"
export NGC_TEAM="your-team" # Optional
# NVIDIA NIM (NVIDIA Inference Microservices)
export NVIDIA_API_KEY="nvapi-your-api-key"
export NIM_BASE_URL="https://integrate.api.nvidia.com/v1"
Anthropic API Configuration#
# Anthropic Claude API
export ANTHROPIC_API_KEY="sk-ant-your-api-key"
export ANTHROPIC_BASE_URL="https://api.anthropic.com"
File System Configuration#
Local Storage Optimization#
# Local SSD optimization
export LOCAL_SSD_DIR="/mnt/local-ssd"
export TEMP_DIR="${LOCAL_SSD_DIR}/tmp"
export SCRATCH_DIR="${LOCAL_SSD_DIR}/scratch"
# I/O optimization
export IO_THREADS="8"
export BUFFER_SIZE="64MB"
Security Configuration#
SSL/TLS Configuration#
# SSL certificate paths
# All four variables point at the same system CA store so that different
# tools and libraries agree on which certificates to trust.
export SSL_CERT_FILE="/etc/ssl/certs/ca-certificates.crt"
export SSL_CERT_DIR="/etc/ssl/certs"
export REQUESTS_CA_BUNDLE="/etc/ssl/certs/ca-certificates.crt"
export CURL_CA_BUNDLE="/etc/ssl/certs/ca-certificates.crt"
# Disable SSL verification (NOT recommended for production)
# WARNING: do not copy the two lines below together with the certificate paths
# above unless you really intend to turn verification off; they are an
# alternative for debugging, not part of the normal setup.
# NOTE(review): PYTHONHTTPSVERIFY originates from PEP 493 (Python 2.7
# backports) and CURL_INSECURE is not a variable documented by curl - confirm
# that the runtime in use honors either before relying on them.
export PYTHONHTTPSVERIFY="0"
export CURL_INSECURE="1"
Proxy Configuration#
# HTTP/HTTPS proxy settings
export HTTP_PROXY="http://proxy.company.com:8080"
export HTTPS_PROXY="http://proxy.company.com:8080"
# Hosts and domain suffixes listed in NO_PROXY bypass the proxy.
export NO_PROXY="localhost,127.0.0.1,.company.com"
# Proxy authentication
# These two lines reassign HTTP_PROXY and HTTPS_PROXY. Use them INSTEAD of the
# unauthenticated pair above; if both pairs run in the same shell, these win.
# The password is stored in plain text in the environment, so prefer injecting
# it from a secret store rather than committing it to a script.
export HTTP_PROXY="http://username:password@proxy.company.com:8080"
export HTTPS_PROXY="http://username:password@proxy.company.com:8080"
Secrets Management#
Kubernetes Secrets#
# kubernetes-secrets.yaml
apiVersion: v1
kind: Secret
metadata:
  name: nemo-curator-secrets
type: Opaque
stringData:
  AWS_ACCESS_KEY_ID: "your-access-key"
  AWS_SECRET_ACCESS_KEY: "your-secret-key"
  HUGGINGFACE_HUB_TOKEN: "hf_your_token"
  OPENAI_API_KEY: "sk-your-openai-key"
HashiCorp Vault Integration#
# Vault configuration
export VAULT_ADDR="https://vault.company.com:8200"
export VAULT_TOKEN="your-vault-token"
export VAULT_NAMESPACE="nemo-curator"
# Vault secret paths
export VAULT_AWS_PATH="secret/nemo-curator/aws"
export VAULT_OPENAI_PATH="secret/nemo-curator/openai"
Credential Validation#
Storage Access Validation#
# Validate access to each storage backend.
# Every check is independent: a failure (including a missing client library)
# is reported and the remaining checks still run.
import os  # needed for os.environ in the Azure check below

# Validate S3 access
try:
    # Imported inside the try block, like the other backends, so that a
    # missing boto3 install is reported instead of aborting the whole script.
    import boto3
    s3 = boto3.client('s3')
    buckets = s3.list_buckets()
    print("✓ S3 access configured correctly")
    print(f"Available buckets: {[b['Name'] for b in buckets['Buckets']]}")
except Exception as e:
    print(f"✗ S3 access failed: {e}")

# Validate Azure access
try:
    from azure.storage.blob import BlobServiceClient
    # Raises KeyError (reported below) when the connection string is not set.
    blob_service = BlobServiceClient.from_connection_string(
        os.environ['AZURE_STORAGE_CONNECTION_STRING']
    )
    # Listing containers forces a real request, so bad credentials surface here.
    containers = list(blob_service.list_containers())
    print("✓ Azure Blob Storage access configured correctly")
except Exception as e:
    print(f"✗ Azure access failed: {e}")

# Validate GCS access
try:
    from google.cloud import storage
    client = storage.Client()
    # Listing buckets forces a real request, so bad credentials surface here.
    buckets = list(client.list_buckets())
    print("✓ GCS access configured correctly")
except Exception as e:
    print(f"✗ GCS access failed: {e}")
API Key Validation#
# Validate API credentials by making one authenticated request per provider.
# Every check is independent: a failure (including a missing client library)
# is reported and the next check still runs.

# Validate Hugging Face access
try:
    from huggingface_hub import whoami
    user_info = whoami()
    print(f"✓ Hugging Face authenticated as: {user_info['name']}")
except Exception as e:
    print(f"✗ Hugging Face authentication failed: {e}")

# Validate OpenAI access
try:
    import openai
    client = openai.OpenAI()
    # Listing models requires a valid key, so an invalid one fails here.
    models = client.models.list()
    print("✓ OpenAI API access configured correctly")
except Exception as e:
    print(f"✗ OpenAI API access failed: {e}")
Deployment-Specific Configurations#
Development Environment#
# Development storage configuration
export AWS_PROFILE="development"
export NEMO_CURATOR_CACHE_DIR="./cache"
export NEMO_CURATOR_DATA_DIR="./data"
export NEMO_CURATOR_OUTPUT_DIR="./output"
# Use local storage for development
export USE_LOCAL_STORAGE="true"
Staging Environment#
# Staging environment configuration
export AWS_PROFILE="staging"
export NEMO_CURATOR_CACHE_DIR="/shared/staging/cache"
export NEMO_CURATOR_DATA_DIR="s3://staging-bucket/data"
export NEMO_CURATOR_OUTPUT_DIR="s3://staging-bucket/output"
# Reduced performance settings for cost optimization
export AWS_S3_MAX_CONCURRENT_REQUESTS="5"
Production Environment#
# Production storage configuration
export AWS_PROFILE="production"
export NEMO_CURATOR_CACHE_DIR="/shared/prod/cache"
export NEMO_CURATOR_DATA_DIR="s3://prod-data-bucket/input"
export NEMO_CURATOR_OUTPUT_DIR="s3://prod-data-bucket/output"
# Optimized performance settings
export AWS_S3_MAX_CONCURRENT_REQUESTS="20"
export AWS_S3_MULTIPART_THRESHOLD="128MB"
export AWS_S3_MULTIPART_CHUNKSIZE="32MB"
# Security settings
export SSL_VERIFY="true"
export ENABLE_AUDIT_LOGGING="true"