NeMo Platform configuration reference#

This document describes the structure and defaults for the global config file for the NeMo Platform. All sections are shown in YAML format with inline comments for description, default, and possible values.

Configuration sections#

platform#

Platform-wide configuration settings.

platform:
  # Control plane used by the platform. Used to auto-detect default backends to use. | default: 'docker' | values: 'kubernetes' | 'docker'
  control_plane: docker
  # Base URL for the NeMo Platform API. Used as the default URL for all services. | default: 'http://localhost:8080'
  base_url: http://localhost:8080
  # Optional loopback address override for job containers to reach platform services. If not specified, automatically determined based on platform: macOS uses 'host.docker.internal', Docker containers use container hostname, Linux host network uses no override. Can be set via config file or NMP_LOOPBACK_ADDRESS env var.
  loopback_address:
  # Global image pull secrets for the platform
  image_pull_secrets: []
  # Docker registry for NMP platform images (e.g., 'nvcr.io/nvidia/nemo-microservices'). | default: 'my-registry'
  image_registry: my-registry
  # Default tag for NMP platform images. | default: 'local'
  image_tag: local
  # Shared Docker configuration for services using Docker backends.
  docker:
    # GPU device IDs to reserve for the Docker GPU pool. Use 'all' to auto-detect and use all available GPUs, 'none' or empty string to disable GPU support, or a comma-separated list of device IDs (e.g., '0,1,2,3'). | default: 'all'
    reserved_gpu_device_ids: all
  # Name of the secret containing the default NGC API key. Defaults to 'system/ngc-api-key'. | default: 'system/ngc-api-key'
  ngc_api_key_secret: system/ngc-api-key
  # Environment variable name to source the NGC API key from. | default: 'NGC_API_KEY'
  ngc_api_key_env_var: NGC_API_KEY
  # When true, GET / returns a 301 redirect to /studio. When false, GET / returns 404. This is used to redirect the root URL to the Studio UI. | default: True
  redirect_root_to_studio: true

service#

Common configuration shared by all services.

service:
  # Format for logs generated by the service | default: 'plain' | values: 'json' | 'plain'
  log_format: plain
  # Logging level for the NeMo Platform. | default: 'INFO' | values: 'DEBUG' | 'INFO' | 'WARN' | 'ERROR'
  log_level: INFO
  # Scheme for the NeMo Platform service. | default: 'http'
  scheme: http
  # Host for the NeMo Platform service. | default: '0.0.0.0'
  host: 0.0.0.0
  # Port for the NeMo Platform service. | default: 8080
  port: 8080

auth#

Configuration for the Auth Service.

auth:
  # Master switch for authorization. If False, all requests are allowed. | default: False
  enabled: false
  # Base URL for the Policy Decision Point (auth service or external OPA). | default: 'http://localhost:8080'
  policy_decision_point_base_url: http://localhost:8080
  # Policy Decision Point provider: 'embedded' for auth service's built-in WASM engine, 'opa' for external OPA sidecar. | default: 'embedded' | values: 'embedded' | 'opa'
  policy_decision_point_provider: embedded
  # Auto-authorize /internal/ endpoints without checking PDP. | default: True
  auto_authorize_internal: true
  # Strip /internal prefix from versioned API paths (e.g., /internal/v1/... -> /v1/...). | default: True
  strip_internal_prefix: true
  # Allow unsigned JWTs (`alg=none`) for local development/testing. Disabled by default and should not be enabled in production. | default: False
  allow_unsigned_jwt: false
  # OIDC configuration for native token validation.
  oidc:
    # Enable native OIDC token validation. | default: False
    enabled: false
    # OIDC issuer URL (e.g., https://sso.nvidia.com). Used for token validation and .well-known discovery. | default: ''
    issuer: ''
    # Additional valid issuers for token validation. Useful for Azure AD where access tokens use v1.0 issuer format (https://sts.windows.net/{tenant}/) while endpoints use v2.0.
    additional_issuers: []
    # OAuth client ID for this NMP deployment. Used for device flow and token audience validation. | default: ''
    client_id: ''
    # Override authorization endpoint (defaults to discovery).
    authorization_endpoint:
    # Override token endpoint (defaults to discovery).
    token_endpoint:
    # Override device authorization endpoint (defaults to discovery).
    device_authorization_endpoint:
    # Override JWKS URI for token validation (defaults to discovery).
    jwks_uri:
    # Expected token audience. When set, tokens must include this value in their 'aud' claim. When not set, audience validation is skipped entirely.
    audience:
    # JWT claim containing user email. Defaults to 'upn' for Microsoft/Azure AD issuers. | default: 'email'
    email_claim: email
    # JWT claim containing user groups. Supports 'groups' (standard) and 'cognito:groups' (AWS Cognito). | default: 'groups'
    groups_claim: groups
    # JWT claim to use as principal ID. Defaults to 'oid' for Microsoft/Azure AD issuers. | default: 'sub'
    subject_claim: sub
    # Space-separated OAuth scopes to request during authentication. For Azure AD with custom API, use: 'api://{app-id}/.default openid profile email' | default: 'openid profile email offline_access'
    default_scopes: openid profile email offline_access
    # Prefix to strip from token scopes before authorization. For example, if IdP returns 'api://my-app/models:read', set prefix to 'api://my-app/' to normalize to 'models:read'. If not set, scopes are used as-is.
    scope_prefix:
    # TTL in seconds for caching IdP discovery document responses. Used by the discovery endpoint to avoid per-request IdP calls. Set to 0 to disable caching. | default: 300
    discovery_cache_ttl: 300
  # Port to run the service on | default: 8000
  port: 8000
  # Refresh interval for policy data in seconds | default: 30
  policy_data_refresh_interval: 30
  # Seconds to cache the OPA bundle | default: 5
  bundle_cache_seconds: 5
  # Bootstrap admin email for platform setup
  admin_email:
  # Name of the default workspace where all authenticated users get Editor role | default: 'default'
  default_workspace: default
  # CPU budget for embedded PDP policy evaluation, in millions of WASM fuel units. Typical evaluations use 20-25; default of 100 provides ~4x headroom. | default: 100
  embedded_pdp_cpu_limit: 100
  # Maximum linear memory (MB) the embedded PDP WASM runtime can consume. | default: 32
  embedded_pdp_memory_limit_mb: 32

entities#

Configuration for the Entities Service.

entities:
  # Run Alembic migrations (upgrade head) at startup. | default: True
  run_migrations: true
  database_config:
    # Full database URL (overrides other settings)
    url:
    # Database dialect - either sqlite or postgresql | default: 'postgresql'
    dialect: postgresql
    # Database hostname | default: ''
    host: ''
    # Database path | default: ''
    path: ''
    # Database name | default: ''
    name: ''
    # Optional database port
    port:
    # Optional database username
    user:
    # Optional database password
    password:
    # Maximum number of connections in the database connection pool | default: 10
    connections_limit: 10
    # Connection timeout in seconds. For PostgreSQL (asyncpg) and SQLite, how long to wait when connecting or acquiring a lock. | default: 30
    connect_timeout_seconds: 30
    # Enable SQLAlchemy echo for the database connection | default: False
    echo: false
  # Interval in seconds to run the workspace cleanup routine. | default: 10
  workspace_cleanup_interval: 10
  # Interval in seconds for the background database health ping (used for readiness). | default: 10.0
  db_health_check_interval_seconds: 10.0

files#

Configuration for the Files Service.

files:
  default_storage_config:
    # Chunk size in bytes for reading/streaming files. Larger chunks reduce async overhead but increase memory per concurrent download. Default: 1MB. | default: 1048576
    read_chunk_size: 1048576
    # Storage backend type. | default: 'local' | values: 'local'
    type: local
    # Filesystem path used for local file storage. | default: '/data/files_storage'
    path: /data/files_storage
    # How many bytes to buffer before flushing to disk | default: 16777216
    write_buffer_size: 16777216
  # Comma-separated list of external hosts the Files service is allowed to access. | default: 'https://api.ngc.nvidia.com,https://huggingface.co'
  allowed_external_hosts: https://api.ngc.nvidia.com,https://huggingface.co
  # Allow users to explicitly create filesets with local storage config. Security-sensitive: enable only in trusted deployments. | default: False
  allow_user_local_storage: false
  # TTL for file locks in seconds (default 5 minutes) | default: 300
  file_lock_ttl_seconds: 300
  # Maximum concurrent downloads during cache warming | default: 3
  cache_warming_max_concurrent: 3

inference_gateway#

Configuration for the Inference Gateway Service.

inference_gateway:
  # How frequently (in seconds) to refresh the internal model cache from the Models service. If set to 0, disable automatic refreshing. | default: 3
  refresh_model_cache_interval_sec: 3
  # Time-to-live (in seconds) for cached secrets before they expire and need refreshing. If set to 0, we will refresh this each time the model providers are refreshed. | default: 0
  secrets_ttl_sec: 0

jobs#

Configuration for the Jobs Service.

jobs:
  # List of executor profiles for the Jobs service
  executors: []
  # Default executor profile configurations
  executor_defaults:
    # Default Docker execution profile configuration
    docker:
      # default: 1800
      ttl_seconds_before_active: 1800
      # default: 86400
      ttl_seconds_active: 86400
      # default: 3600
      ttl_seconds_after_finished: 3600
      # default: True
      cleanup_completed_jobs_immediately: true
      # Path to the jobs launcher tool | default: '/tools/jobs-launcher'
      launcher_tool_path: /tools/jobs-launcher
      # Optional env vars applied to all jobs (e.g. HOME=/tmp). Keys must not conflict with platform-reserved names. Job steps may override these variables.
      env: {}
      # Docker storage configuration
      storage:
        # Name of the Docker volume for persistent storage | default: 'nemo-jobs-storage'
        volume_name: nemo-jobs-storage
        # Docker image used to set permissions on the volume | default: 'busybox'
        volume_permissions_image: busybox
        # List of additional Docker volume mounts for the job
        additional_volume_mounts: []
      # Docker networking configuration
      networking:
        # Docker network for the job container | default: 'host'
        job_container_network: host
    # Default Kubernetes execution profile configuration
    kubernetes_job:
      # default: 1800
      ttl_seconds_before_active: 1800
      # default: 86400
      ttl_seconds_active: 86400
      # default: 3600
      ttl_seconds_after_finished: 3600
      # default: True
      cleanup_completed_jobs_immediately: true
      # Path to the jobs launcher tool | default: '/tools/jobs-launcher'
      launcher_tool_path: /tools/jobs-launcher
      # Optional env vars applied to all jobs (e.g. HOME=/tmp). Keys must not conflict with platform-reserved names. Job steps may override these variables.
      env: {}
      # Kubernetes namespace to submit the job to. If not set, it will be determined from the environment.
      namespace:
      # Kubernetes service account name for job pods. Uses the Kubernetes default service account when set to 'default'. | default: 'default'
      service_account_name: default
      # Tolerations for the Kubernetes job pods.
      tolerations: []
      # Node selector for the Kubernetes job pods.
      node_selector: {}
      # Affinity for the Kubernetes job pods.
      affinity: {}
      # Resource requests and limits for the Kubernetes job pods.
      resources:
        # Minimum resources requested for the container.
        requests:
          # CPU specification (e.g., '250m', '1', '2.5').
          cpu:
          # Memory specification (e.g., '128Mi', '1Gi', '512M').
          memory:
        # Maximum resources the container can use.
        limits:
          # CPU specification (e.g., '250m', '1', '2.5').
          cpu:
          # Memory specification (e.g., '128Mi', '1Gi', '512M').
          memory:
        # Number of nodes to use. | default: 1
        num_nodes: 1
        # Number of GPUs requested by the step.
        num_gpus:
        # Shared memory (/dev/shm) size as a Kubernetes quantity (e.g. '1Gi', '4Gi'). Used for GPU and distributed-GPU job executors. When unset, defaults to 1Gi per allocated GPU.
        shm_size:
      # Pod security context for the Kubernetes job pods.
      pod_security_context: {}
      # Image pull secrets for the Kubernetes job pods.
      image_pull_secrets: []
      # Metadata to add to each job object in the Kubernetes job.
      job_metadata:
        labels: {}
        annotations: {}
      # Metadata to add to each pod in the Kubernetes job.
      pod_metadata:
        labels: {}
        annotations: {}
      # Storage configuration for the Kubernetes job pods.
      storage:
        # Persistent Volume Claim Name to use for job storage. | default: ''
        pvc_name: ''
        # Image used to set volume permissions | default: 'busybox'
        volume_permissions_image: busybox
        # Additional volumes to mount
        additional_volumes: []
        # Additional volume mounts
        additional_volume_mounts: []
      # Number of GPUs to request for the job | default: 1
      num_gpus: 1
      # Container image that contains the jobs-launcher binary. | default: 'nvcr.io/nvidia/nemo-microservices/jobs-launcher:latest'
      launcher_image: nvcr.io/nvidia/nemo-microservices/jobs-launcher:latest
    # Default Volcano execution profile configuration
    volcano_job:
      # default: 1800
      ttl_seconds_before_active: 1800
      # default: 86400
      ttl_seconds_active: 86400
      # default: 3600
      ttl_seconds_after_finished: 3600
      # default: True
      cleanup_completed_jobs_immediately: true
      # Path to the jobs launcher tool | default: '/tools/jobs-launcher'
      launcher_tool_path: /tools/jobs-launcher
      # Optional env vars applied to all jobs (e.g. HOME=/tmp). Keys must not conflict with platform-reserved names. Job steps may override these variables.
      env: {}
      # Kubernetes namespace to submit the job to. If not set, it will be determined from the environment.
      namespace:
      # Kubernetes service account name for job pods. Uses the Kubernetes default service account when set to 'default'. | default: 'default'
      service_account_name: default
      # Tolerations for the Kubernetes job pods.
      tolerations: []
      # Node selector for the Kubernetes job pods.
      node_selector: {}
      # Affinity for the Kubernetes job pods.
      affinity: {}
      # Resource requests and limits for the Kubernetes job pods.
      resources:
        # Minimum resources requested for the container.
        requests:
          # CPU specification (e.g., '250m', '1', '2.5').
          cpu:
          # Memory specification (e.g., '128Mi', '1Gi', '512M').
          memory:
        # Maximum resources the container can use.
        limits:
          # CPU specification (e.g., '250m', '1', '2.5').
          cpu:
          # Memory specification (e.g., '128Mi', '1Gi', '512M').
          memory:
        # Number of nodes to use. | default: 1
        num_nodes: 1
        # Number of GPUs requested by the step.
        num_gpus:
        # Shared memory (/dev/shm) size as a Kubernetes quantity (e.g. '1Gi', '4Gi'). Used for GPU and distributed-GPU job executors. When unset, defaults to 1Gi per allocated GPU.
        shm_size:
      # Pod security context for the Kubernetes job pods.
      pod_security_context: {}
      # Image pull secrets for the Kubernetes job pods.
      image_pull_secrets: []
      # Metadata to add to each job object in the Kubernetes job.
      job_metadata:
        labels: {}
        annotations: {}
      # Metadata to add to each pod in the Kubernetes job.
      pod_metadata:
        labels: {}
        annotations: {}
      # Storage configuration for the Kubernetes job pods.
      storage:
        # Persistent Volume Claim Name to use for job storage. | default: ''
        pvc_name: ''
        # Image used to set volume permissions | default: 'busybox'
        volume_permissions_image: busybox
        # Additional volumes to mount
        additional_volumes: []
        # Additional volume mounts
        additional_volume_mounts: []
      # Number of GPUs to request for the job | default: 1
      num_gpus: 1
      # Container image that contains the jobs-launcher binary. | default: 'nvcr.io/nvidia/nemo-microservices/jobs-launcher:latest'
      launcher_image: nvcr.io/nvidia/nemo-microservices/jobs-launcher:latest
      # The Volcano queue to submit the job to. | default: 'default'
      queue: default
      # The scheduler name to use for the Volcano job. | default: 'volcano'
      scheduler_name: volcano
      # Maximum number of retries allowed for the job (Volcano `maxRetry`). | default: 0
      max_retry: 0
      # Plugins used by Volcano when the job is scheduled. The pytorch plugin is always added when the job uses more than one node.
      plugins: {}
      # Enable multi-node networking injection. Sets annotations to trigger Kyverno policy mutations. | default: True
      enable_multi_node_networking: true
  # Interval in seconds for the job reconciler to run | default: 2
  reconcile_interval_seconds: 2
  # Interval in seconds for the job scheduler to run | default: 5
  schedule_interval_seconds: 5

models#

Consolidated configuration for the Models service.

models:
  # HuggingFace model puller image for weights in data store or huggingface | default: 'nvcr.io/nvidia/nemo-microservices/nds-v2-huggingface-cli:25.10'
  huggingface_model_puller: nvcr.io/nvidia/nemo-microservices/nds-v2-huggingface-cli:25.10
  # Controller service configuration
  controller:
    # Interval in seconds for the Models Controller to run its control loop | default: 5
    interval_seconds: 5
    # Dict of custom backend configurations for the Models Controller
    backends:
      docker:
        # Default NIM image when none is specified (multi-LLM image) | default: 'nvcr.io/nim/nvidia/llm-nim'
        default_nimservice_image: nvcr.io/nim/nvidia/llm-nim
        # Default NIM image tag when none is specified | default: '1.13.1'
        default_nimservice_image_tag: 1.13.1
        # NIM guided decoding backend | default: 'outlines'
        nim_guided_decoding_backend: outlines
        # PEFT/LoRA source URL for models service | default: ''
        peft_source: ''
        # PEFT/LoRA refresh interval in seconds | default: 30
        peft_refresh_interval: 30
        # Secret name for Files service authentication | default: 'files-hf-token'
        files_auth_secret: files-hf-token
        # Docker client timeout in seconds for long-running operations (default: 10 minutes) | default: 600
        docker_timeout: 600
        # Networking mode for NIM containers: 'local' (port forwarding to localhost for local dev), 'dond' (container names on shared network for quickstart), 'dind' (port forwarding to docker service for DinD setups). | default: 'local' | values: 'local' | 'dond' | 'dind'
        models_docker_networking_mode: local
        # Docker network name for 'dond' mode. NIMs will join this network to communicate with the NMP container. Required when MODELS_DOCKER_NETWORKING_MODE='dond'. Quickstart sets this automatically via MODELS_DOCKER_NETWORK env var. | default: ''
        models_docker_network: ''
        # Container name for 'dond' mode. Used to replace localhost in URLs passed to NIMs so they can reach services via the Docker network. Quickstart sets this automatically via MODELS_DOCKER_CONTAINER_NAME env var. | default: ''
        models_docker_container_name: ''
        # Hostname for port forwarding in 'dind' mode. Typically 'localhost' or the DinD service name. Used to construct URLs like http://{hostname}:{port}. | default: 'localhost'
        models_docker_host_service_name: localhost
        # Start of port range for port forwarding (inclusive). Defaults to start of IANA dynamic/ephemeral port range. | default: 49152
        models_docker_port_range_start: 49152
        # End of port range for port forwarding (inclusive). Defaults to 500-port range within IANA dynamic/ephemeral range. | default: 49652
        models_docker_port_range_end: 49652
        # HuggingFace model puller image for downloading SFT model weights from Files service | default: 'nvcr.io/nvidia/nemo-microservices/nds-v2-huggingface-cli:25.10'
        huggingface_model_puller: nvcr.io/nvidia/nemo-microservices/nds-v2-huggingface-cli:25.10
        # Timeout in seconds for the model puller container to complete (default: 30 minutes) | default: 1800
        model_puller_timeout: 1800
        # Per-file HTTP download timeout in seconds for the model puller (HF_HUB_DOWNLOAD_TIMEOUT). Increase if large model files fail with IncompleteRead/ChunkedEncodingError (default: 2 hours). | default: 7200
        model_puller_download_timeout: 7200
        # Max concurrent file downloads in the model puller (hf download --max-workers). Default 1 (sequential) reduces IncompleteRead/ChunkedEncodingError on slow or flaky links; increase for speed. | default: 1
        model_puller_max_workers: 1
        # Number of retries when the puller fails with a transient error (IncompleteRead, ChunkedEncodingError, connection broken). Same volume is reused so partial downloads can be completed. | default: 3
        model_puller_retries: 3
        # BusyBox image repository used for helper containers (permissions/find/chown). | default: 'busybox'
        busybox_image: busybox
        # BusyBox image tag used for helper containers. | default: 'latest'
        busybox_image_tag: latest
        # Maximum time (in seconds) a deployment may stay in PENDING before being transitioned to ERROR. Default: 7200 (2 hours). | default: 7200
        pending_timeout_seconds: 7200
        # Maximum number of container restarts before a PENDING deployment is transitioned to ERROR (crash loop detection). Default: 5. | default: 5
        max_restart_count: 5
        # Optional fixed shared memory size (/dev/shm) for multi-GPU NIM containers. If set, overrides the per-GPU calculation. Leave empty to use shm_size_per_gpu x GPU count. Override via MODELS_DOCKER_NIM_MULTI_GPU_SHM_SIZE env. Format: e.g. '2g', '4g'. | default: ''
        nim_multi_gpu_shm_size: ''
        # Shared memory size (/dev/shm) per GPU in megabytes for multi-GPU NIM containers. Total shm = this value x GPU count (e.g. 1024 x 2 GPUs = 2048m). Override via MODELS_DOCKER_NIM_MULTI_GPU_SHM_SIZE_PER_GPU env (integer string). | default: 1024
        nim_multi_gpu_shm_size_per_gpu: 1024
        # Whether this backend is enabled | default: False
        enabled: false
      nim_operator:
        # Default storage class for PVCs. If not set, the cluster's default StorageClass is used.
        default_storage_class:
        # Default PVC size for model storage (used if not specified in deployment config) | default: '200Gi'
        default_pvc_size: 200Gi
        # LoRA/PEFT source endpoint (only used when lora_enabled is true) | default: 'http://nemo-entity-store:8000'
        peft_source: http://nemo-entity-store:8000
        # PEFT refresh interval in seconds (only used when lora_enabled is true) | default: 30
        peft_refresh_interval: 30
        # Default user ID for NIM containers (security context)
        default_user_id:
        # Default group ID for NIM containers (security context)
        default_group_id:
        # Kubernetes secret name for Files service authentication (HF_TOKEN) | default: 'nemo-models-files-token'
        files_auth_secret: nemo-models-files-token
        # The name of the image pull secret for the modelPuller image | default: 'nvcrimagepullsecret'
        huggingface_model_puller_image_pull_secret: nvcrimagepullsecret
        # BusyBox image repository used by plugin init containers. | default: 'busybox'
        busybox_image: busybox
        # BusyBox image tag used by plugin init containers. | default: 'latest'
        busybox_image_tag: latest
        # NGC API key secret name for pulling NIM images | default: 'ngc-api'
        auth_secret: ngc-api
        # Default NIMService image repository (used if not specified in deployment config) | default: 'nvcr.io/nim/nvidia/llm-nim'
        default_nimservice_image: nvcr.io/nim/nvidia/llm-nim
        # Default NIMService image tag (used if not specified in deployment config) | default: '1.13.1'
        default_nimservice_image_tag: 1.13.1
        # Default guided decoding backend for NIM (e.g., 'outlines', 'auto', 'lm-format-enforcer') | default: 'outlines'
        nim_guided_decoding_backend: outlines
        # Kubernetes namespace for NIM deployments (defaults to controller's namespace if not set)
        namespace:
        # Default Kubernetes resource requirements for all NIM deployments. Can be overridden per-deployment via k8s_nim_operator_config. Example: {'requests': {'cpu': '2', 'memory': '8Gi'}, 'limits': {'memory': '16Gi'}}
        default_resources:
        # Default Kubernetes tolerations for all NIM deployments. Can be overridden per-deployment via k8s_nim_operator_config. Example: [{'key': 'nvidia.com/gpu', 'operator': 'Exists', 'effect': 'NoSchedule'}]
        default_tolerations:
        # Default Kubernetes node selector for all NIM deployments. Can be overridden per-deployment via k8s_nim_operator_config. Example: {'node-type': 'gpu-node', 'zone': 'us-west1-a'}
        default_node_selector:
        # Default Kubernetes labels applied to NIMService and NIMCache resources and their child resources (e.g. pods). Merged with controller-managed labels; controller labels take precedence on conflict. Example: {'team': 'ml-platform', 'environment': 'prod'}
        default_labels:
        # Default Kubernetes annotations applied to NIMService and NIMCache resources and their child resources (e.g. pods, PVCs). Merged with controller-managed annotations; controller annotations take precedence on conflict. Example: {'prometheus.io/scrape': 'true'}
        default_annotations:
        # Default grace period in seconds for NIM startup. Can be overridden per-deployment via k8s_nim_operator_config. Determines how long Kubernetes will wait for the NIM to become ready before restarting it. If not set, defaults to 600 seconds (10 minutes). Example: 600 (10 minutes)
        default_startup_probe_grace_period_seconds:
        # Maximum time in seconds a deployment may stay in PENDING before being transitioned to ERROR. Default: 7200 (2 hours). | default: 7200
        pending_timeout_seconds: 7200
        # Maximum number of pod container restarts before a PENDING deployment is transitioned to ERROR (crash loop detection). Default: 5. | default: 5
        max_restart_count: 5
        # Whether this backend is enabled | default: False
        enabled: false
    # Time-to-live in seconds for DELETED deployments before they are permanently removed from the database | default: 30
    model_deployment_garbage_collection_ttl_seconds: 30
    # Time-to-live in seconds for ERROR deployments before backend resources are garbage collected | default: 10800
    error_deployment_ttl_seconds: 10800
    # Maximum number of attempts to recover a deployment when backend resources are lost | default: 5
    drift_recovery_max_attempts: 5
    # Base delay in seconds between drift recovery attempts (used for exponential backoff) | default: 30
    drift_recovery_base_delay_seconds: 30
    # Maximum delay in seconds between drift recovery attempts (caps exponential backoff) | default: 300
    drift_recovery_max_delay_seconds: 300
  # Parallelism estimation heuristics (model size thresholds, TP/PP/DP/CP/EP costs, balance bonuses)
  parallelism:
    # Standard node configuration (e.g., DGX H100) | default: 8
    gpus_per_node_default: 8
    # Standard GPU memory in GB (e.g., H100 80GB) | default: 80
    gpu_memory_gb_default: 80
    model_size_thresholds:
      # >300B: Very large models | default: 300.0
      very_large: 300.0
      # 100-300B: Large models | default: 100.0
      large: 100.0
      # 50-100B: Medium models | default: 50.0
      medium: 50.0
      # <70B: Small models for TP cost | default: 70.0
      small_tp: 70.0
      # <40B: Small MoE models | default: 40.0
      small_moe: 40.0
    memory:
      # Start penalizing above 60% memory usage | default: 0.6
      pressure_threshold: 0.6
      # Moderate memory pressure | default: 0.5
      pressure_moderate: 0.5
      # Low memory pressure | default: 0.45
      pressure_low: 0.45
      # Base penalty for exceeding threshold | default: 1000000000.0
      base_penalty: 1000000000.0
      # Divisor for quadratic penalty: (excess / divisor) ** 2 | default: 0.1
      scale_divisor: 0.1
      # Maximum discount factor (70% off) | default: 0.7
      pp_discount_max: 0.7
      # Scale factor for discount calculation | default: 2.0
      pp_discount_scale: 2.0
    tensor_parallelism:
      # For models > 300B | default: 50.0
      base_cost_very_large_model: 50.0
      # For models <= 300B | default: 100.0
      base_cost_standard_model: 100.0
      # TP=8 is standard for 340B+ | default: 8
      excessive_very_large: 8
      # TP=4 is standard for 70B and below | default: 4
      excessive_standard: 4
      # Penalty for excessive TP on large models (>70B) | default: 100000000.0
      penalty_large_model: 100000000.0
      # Penalty for excessive TP on small models (<70B) | default: 300000000.0
      penalty_small_model: 300000000.0
    data_parallelism:
      # total_parallelism >= 64 | default: -10000.0
      bonus_minimal: -10000.0
      # total_parallelism >= 32 | default: -50000.0
      bonus_small: -50000.0
      # total_parallelism <= 2 | default: -50000000.0
      bonus_very_strong: -50000000.0
      # total_parallelism <= 4 | default: -30000000.0
      bonus_strong: -30000000.0
      # total_parallelism <= 8 | default: -15000000.0
      bonus_moderate: -15000000.0
      # total_parallelism > 8 | default: -20000000.0
      bonus_medium: -20000000.0
      # Smaller MoE models | default: -1000000.0
      bonus_small_moe: -1000000.0
      # Total parallelism threshold | default: 64
      total_parallelism_very_high: 64
      # Total parallelism threshold | default: 32
      total_parallelism_high: 32
      # Total parallelism threshold | default: 2
      total_parallelism_very_low: 2
      # Total parallelism threshold | default: 4
      total_parallelism_low: 4
      # Total parallelism threshold | default: 8
      total_parallelism_medium: 8
      # Double DP bonus for pure DP+CP configurations | default: 2.0
      cp_bonus_multiplier: 2.0
    pipeline_parallelism:
      # MoE models (lower cost due to higher compute per stage) | default: 500000.0
      cost_moe: 500000.0
      # param_count_b > 300 | default: 5000000.0
      cost_very_large_model: 5000000.0
      # param_count_b > 100 | default: 10000000.0
      cost_large_model: 10000000.0
      # param_count_b > 50 | default: 30000000.0
      cost_medium_model: 30000000.0
      # param_count_b <= 50 and tp > 1 | default: 100000000.0
      cost_small_with_tp: 100000000.0
      # param_count_b <= 50 and tp == 1 | default: 300000000.0
      cost_small_without_tp: 300000000.0
    context_parallelism:
      # Strong bonus for optimal CP | default: -300000000.0
      bonus_optimal: -300000000.0
      # Bonus for CP=2 in medium sequences | default: -200000000.0
      bonus_good: -200000000.0
      # Penalty: not using enough CP or using CP when not needed | default: 200000000.0
      penalty_suboptimal: 200000000.0
      # Penalty: too much CP | default: 100000000.0
      penalty_too_much: 100000000.0
      # Penalty: should use CP but using CP=1 | default: 300000000.0
      penalty_should_use: 300000000.0
      # Very long sequences | default: 1.0
      seq_to_param_ratio_high: 1.0
      # Medium sequences | default: 0.3
      seq_to_param_ratio_medium: 0.3
      # Maximum CP value to consider | default: 8
      max_value: 8
      # CP=2 is optimal for medium sequences | default: 2
      optimal_value: 2
      # Multiplier for parameter memory calculation | default: 6.0
      param_memory_multiplier: 6.0
      # Multiplier for layers in parameter calculation | default: 12.0
      param_layers_multiplier: 12.0
      # Multiplier for sequence memory calculation | default: 38.0
      seq_memory_multiplier: 38.0
      # Enable CP for sequences >= 8K | default: 8192
      seq_threshold_enable: 8192
      # Enable CP=4 for sequences >= 16K | default: 16384
      seq_threshold_cp4: 16384
      # Enable CP=8 for sequences >= 32K | default: 32768
      seq_threshold_cp8: 32768
      # Enable CP=16 for sequences >= 128K | default: 131072
      seq_threshold_cp16: 131072
    expert_parallelism:
      # Huge penalty for EP > 1 on non-MoE models | default: 1000000000.0
      penalty_non_moe: 1000000000.0
      # Perfect: 1 routed expert per GPU | default: -500000000.0
      bonus_perfect: -500000000.0
      # Very efficient: <= 8 experts per GPU | default: -400000000.0
      bonus_very_efficient: -400000000.0
      # Good: <= 32 experts per GPU | default: -300000000.0
      bonus_good: -300000000.0
      # Acceptable: <= 64 experts per GPU | default: -200000000.0
      bonus_acceptable: -200000000.0
      # High expert count per GPU (>64) | default: -100000000.0
      bonus_high_count: -100000000.0
      # Huge penalty: no EP on MoE | default: 800000000.0
      penalty_no_sharding: 800000000.0
      # Penalty for non-divisor EP | default: 300000000.0
      penalty_non_divisor: 300000000.0
      # Experts per GPU threshold | default: 8
      experts_per_gpu_very_efficient: 8
      # Experts per GPU threshold | default: 32
      experts_per_gpu_good: 32
      # Experts per GPU threshold | default: 64
      experts_per_gpu_acceptable: 64
    balance:
      # TP == PP (perfect balance) | default: 1.0
      ratio_perfect: 1.0
      # max(TP, PP) / min(TP, PP) <= 2.0 | default: 2.0
      ratio_good: 2.0
      # Perfect balance for very large dense models (>300B) | default: -500000000.0
      bonus_perfect_very_large: -500000000.0
      # Good balance for very large dense models | default: -300000000.0
      bonus_good_very_large: -300000000.0
      # Perfect balance for large models (>100B) | default: -400000000.0
      bonus_perfect_large: -400000000.0
      # Good balance for large models | default: -200000000.0
      bonus_good_large: -200000000.0
      # Perfect balance for smaller models | default: -300000000.0
      bonus_perfect_small: -300000000.0
      # Good balance for smaller models | default: -100000000.0
      bonus_good_small: -100000000.0
      # Very large MoE with tight memory | default: -500000000.0
      bonus_strong_moe: -500000000.0
      # Quadratic TP penalty factor | default: 5000000.0
      tp_squared_multiplier: 5000000.0
      # PP >= 4 is significant | default: 4
      pp_significant_threshold: 4
      # EP >= 4 is significant | default: 4
      ep_significant_threshold: 4
  # Configuration for trust_remote_code in the models service
  trust_remote_code:
    # List of repo IDs or regex patterns trusted for model loading from HF (direct match or fullmatch).
    hf_allow_list:
    - nvidia/.*
    # List of org/team/name strings or regex patterns trusted for model loading from NGC (direct match or fullmatch).
    ngc_allow_list:
    - nvidia/.*
    # Whether to allow trust_remote_code anywhere in the platform | default: True
    enabled: true
  # Configuration for tool_call_plugin in the models service
  tool_call_plugin:
    # Whether to allow custom tool-call parser plugins. When disabled (default), any tool_call_plugin value supplied via API or fileset metadata is rejected or stripped. Enable with caution — plugins execute arbitrary Python code inside the inference container. | default: False
    enabled: false

secrets#

Configuration for the Secrets service.

secrets:
  # Encryption configuration for the Secrets service.
  encryption:
    # Name of the encryption provider used to encrypt/decrypt secrets. | default: ''
    current_provider: ''
    # Configuration for all available encryption providers.
    providers:
      # Mapping of secret key encryptor names to their configurations.
      secret_key:
        default:
          # Base64-encoded encryption key
          value:
          # Indicates if the key should be loaded from an environment variable. If set, 'value' is ignored.
          from_env:
      # Mapping of vault encryptor names to their configurations.
      vault:
        default:
          # Name of the key in Vault to use for encryption/decryption. | default: 'nemo-platform-key'
          key_name: nemo-platform-key
          # Address of the Vault server. If not specified, the value of the VAULT_ADDR env variable will be used.
          address:
          # Authentication token for Vault. If not specified, the value of the VAULT_TOKEN env variable or the content of the file located at ~/.vault-token will be used.
          token:

customizer#

Configuration for the Customizer service.

customizer:
  # Port to run the service on | default: 8000
  port: 8000
  # Enable debug mode | default: False
  debug: false
  # Override container image for Automodel training. If not set, uses platform defaults.
  training_automodel_image:
  # Override container image for RL training (e.g., DPO). If not set, uses platform defaults.
  training_rl_image:
  # Default CPU request for customization job containers. | default: '1'
  default_job_resource_cpu_request: '1'
  # Default memory request for customization job containers. | default: '8Gi'
  default_job_resource_memory_request: 8Gi
  # Default CPU limit for customization job containers. | default: '4'
  default_job_resource_cpu_limit: '4'
  # Default memory limit for customization job containers. | default: '16Gi'
  default_job_resource_memory_limit: 16Gi
  # Terminate a training step if no task reports progress within this many seconds. 0 disables the check. | default: 3600
  training_staleness_timeout_seconds: 3600

data_designer#

Configuration for the Data Designer service.

data_designer:
  preview_num_records:
    # default: 10
    max: 10
    # default: 10
    default: 10
  # default: 'default'
  job_executor_profile: default

evaluator#

Configuration for the Evaluator service.

evaluator:
  # Configuration for jobs created with Evaluator service.
  jobs:
    # Directory path in job container for evaluation configuration. | default: '/configs'
    configs_dir: /configs
    # Directory path of the shared volume mount for job steps to persist artifacts for a job. | default: '/jobs'
    volume_path: /jobs
    # Directory path in the job container for results to be output. | default: '/jobs/results'
    results_dir: /jobs/results
    # Directory path in the job container for dataset files to be downloaded to and loaded from. | default: '/jobs/datasets'
    dataset_dir: /jobs/datasets
  # Configuration for EvalFactory integration with NeMo Platform.
  evalfactory:
    # default: 'nvcr.io/nvidia/eval-factory/agentic_eval:25.10.1'
    agentic_eval: nvcr.io/nvidia/eval-factory/agentic_eval:25.10.1
    # default: 'nvcr.io/nvidia/eval-factory/bfcl:25.11'
    bfcl: nvcr.io/nvidia/eval-factory/bfcl:25.11
    # default: 'nvcr.io/nvidia/eval-factory/lm-evaluation-harness:25.09'
    lm_eval_harness: nvcr.io/nvidia/eval-factory/lm-evaluation-harness:25.09
    # default: 'nvcr.io/nvidia/eval-factory/bigcode-evaluation-harness:25.09'
    bigcode_evaluation_harness: nvcr.io/nvidia/eval-factory/bigcode-evaluation-harness:25.09
    # default: 'nvcr.io/nvidia/eval-factory/rag_retriever_eval:25.11.1'
    rag_retriever: nvcr.io/nvidia/eval-factory/rag_retriever_eval:25.11.1
    # default: 'nvcr.io/nvidia/eval-factory/safety-harness:25.09'
    safety_harness: nvcr.io/nvidia/eval-factory/safety-harness:25.09
    # default: 'nvcr.io/nvidia/eval-factory/simple-evals:25.09'
    simple_evals: nvcr.io/nvidia/eval-factory/simple-evals:25.09
    # URL of a hosted Milvus server to connect to for retrieval evaluations
    milvus_url:
  # Upsert system metrics and benchmarks on app startup | default: False
  recreate_existing_system_entities: false

safe_synthesizer#

Configuration for the Safe Synthesizer service.

safe_synthesizer:
  # default: '0.0.0.0'
  host: 0.0.0.0
  # default: 8000
  port: 8000
  entrypoint:
  - python
  - -m
  - nmp.safe_synthesizer.tasks.safe_synthesizer
  # default: 'default'
  job_executor_profile: default
  # default: '16G'
  default_job_resource_memory_request: 16G
  # default: '4'
  default_job_resource_cpu_request: '4'
  # default: '16G'
  default_job_resource_memory_limit: 16G
  # default: '4'
  default_job_resource_cpu_limit: '4'

studio#

Configuration for the Studio service.

studio:
  # Path to the directory containing the built static UI assets.
  static_files_path: /static/studio
  # Base URL of the platform. This is used by the Studio UI to make API calls. | default: ''
  platform_base_url: ''