# NeMo Platform Helm Chart

Type: application

For deployment guide, see Admin Setup in the NeMo Platform documentation.

## Values

The following is the complete values.yaml file for the NeMo Platform Helm Chart. All configuration options are documented inline with comments.

# Default values for NeMo Microservices Platform Helm chart

## Helm global configuration settings

# -- Overrides the name template.
nameOverride: ""
# -- Overrides the fullname template.
fullnameOverride: ""

# -- NVIDIA GPU Cloud (NGC) API key used to authenticate against, and pull images from, the NGC container registry. A secret supplied through the `existingSecret` key takes precedence over this value.
ngcAPIKey: "YOUR-NGC-API-KEY"

# -- Environment variables applied to every deployment pod, written as a simple key/value map (for example MY_ENV_VAR: the-key). `valueFrom` entries work as well.
env: {}

# -- Optional. Name of an existing Kubernetes Secret that is loaded into the API pod as environment variables (envFrom).
# When set, the chart does not create the default api-env secret; supply your own secret instead (e.g. from Vault or sealed-secrets).
# When unset, the chart creates a default secret holding the environment variable NMP_SECRETS_DEFAULT_ENCRYPTION_KEY for a default installation.
# See the NeMo Platform documentation for more details on secrets encryption.
envFromSecret: ""

# -- Existing Kubernetes secret used when communicating with the NGC API to download models. Set this to an empty string to have the chart generate the secret from the `ngcAPIKey` value.
existingSecret: "ngc-api"

# -- Existing Kubernetes image pull secret used to pull images from the NGC container registry. Set this to an empty string to have the chart generate the secret from the `ngcAPIKey` value.
existingImagePullSecret: "nvcrimagepullsecret"

# -- List of additional image pull secrets to use for pulling container images. Can be used when multiple image pull secrets are required in your environment.
# The default is an empty list (not an empty map) so that the declared type matches the list that users are expected to supply as an override.
additionalImagePullSecrets: []

# -- RBAC configuration settings for optional dependencies
rbac:
  # -- Specifies whether the core Controller is granted RBAC permissions on Volcano, which it uses to schedule distributed jobs.
  volcanoEnabled: true
  # -- Specifies whether the core Controller is granted RBAC permissions on the k8s-nim-operator NIMService, which it uses to schedule NIMs.
  k8sNimOperatorEnabled: true

# -- Multi-node networking configuration for distributed GPU training.
# These settings control Kyverno policies that inject cloud-specific networking and NCCL configurations.
#
# Requirements:
# - Kyverno policy engine must be installed in your cluster (required for multi-node networking)
# - Kyverno is NOT included as a subchart dependency and must be installed separately
#
# To install Kyverno (add the chart repository first so that `kyverno/kyverno` resolves):
#   helm repo add kyverno https://kyverno.github.io/kyverno/
#   helm repo update
#   helm install kyverno kyverno/kyverno --namespace kyverno --create-namespace --version 3.2.0
#
# Documentation: https://kyverno.io/docs/installation/
# Helm chart: https://kyverno.github.io/kyverno/
#
# Note: Only enable ONE cloud provider per cluster deployment.
multinodeNetworking:
  # -- AWS-specific configuration for EFA device injection
  aws:
    # -- Enable AWS-specific Kyverno policy for EFA device injection
    enabled: false
    # -- Number of EFA devices to request per GPU (typically 1 or 4)
    efaDevicesPerGPU: 1

  # -- Azure-specific configuration for InfiniBand/RDMA
  azure:
    # -- Enable Azure-specific Kyverno policy for InfiniBand/RDMA configuration
    enabled: false
    # -- Number of RDMA devices to request per GPU
    rdmaDevicesPerGPU: 1
    # -- RDMA device plugin resource name
    rdmaDeviceName: "hca_shared_devices_a"

  # -- GCP-specific configuration for TCP-X/TCP-XO
  gcp:
    # -- Enable GCP-specific Kyverno policy for TCP-X/TCP-XO configuration
    enabled: false

  # -- OCI-specific configuration for InfiniBand/SR-IOV
  oci:
    # -- Enable OCI-specific Kyverno policy for InfiniBand/SR-IOV configuration
    enabled: false
    # -- Number of RDMA devices (mlnxnics) to request per GPU
    rdmaDevicesPerGPU: 8

## Optional dependencies configuration. For production deployments, it is recommended to use existing installations of these dependencies.
k8s-nim-operator:
  # -- Specifies whether to enable the default NIM Operator installation. To learn more, see [Install NIM Operator](https://docs.nvidia.com/nim-operator/latest/install.html).
  # If you are using an existing NIM Operator installation, set this to false.
  enabled: true
  # NOTE(review): `nfd` presumably configures the operator's Node Feature Discovery component; confirm against the NIM Operator chart values.
  nfd:
    nodeFeatureRules:
      # -- Specifies whether to enable device ID feature rules.
      deviceID: false

# -- Local PostgreSQL configuration for the NeMo Platform.
# @default -- This object has the following default values for the PostgreSQL configuration.
postgresql:
  # -- Whether to deploy the embedded PostgreSQL. If enabled, the chart deploys a single-replica PostgreSQL instance using the official Postgres image.
  # It is NOT recommended to use the built-in PostgreSQL for production deployments. It is enabled in the chart by default for ease of getting started with the platform.
  # If you are using an existing PostgreSQL installation, set this to false and use the "externalDatabase" configuration section.
  enabled: true
  # -- Container image used for the embedded PostgreSQL instance.
  image:
    repository: docker.io/library/postgres
    tag: "18"
    pullPolicy: IfNotPresent
  # -- PostgreSQL authentication configuration.
  # NOTE: the `password` default below is a well-known value intended for getting started only;
  # override it or set `existingSecret` for anything beyond evaluation.
  auth:
    username: nemo
    password: nemo
    database: nemoplatform
    # -- Name of an existing secret containing a "password" key (or use existingSecretPasswordKey). If set, the chart does not create a secret.
    # NOTE(review): no `existingSecretPasswordKey` default is declared in this section; confirm the key name the chart templates read before relying on it.
    existingSecret: ""


  # -- PostgreSQL service configuration.
  service:
    port: 5432

  # -- PostgreSQL persistence configuration.
  persistence:
    enabled: true
    size: 5Gi
    # -- Storage class for the PostgreSQL PVC. If unset, the cluster default is used.
    storageClass: ""
  # -- Optional resource limits/requests for the PostgreSQL container.
  resources: {}
  # -- Optional pod security context for the PostgreSQL pod (e.g. for OpenShift SCC).
  podSecurityContext: {}
  # -- Optional container security context for the PostgreSQL container.
  securityContext: {}
  # -- Service account for the PostgreSQL pod.
  # @default -- This object has the following default values for the service account configuration.
  serviceAccount:
    # -- Specifies whether a service account should be created for the PostgreSQL pod.
    create: true
    # -- Automatically mount the ServiceAccount's API credentials.
    automount: true
    # -- Annotations to add to the service account.
    annotations: {}
    # -- The name of the service account to use. If not set and create is true, a name is generated from the release fullname.
    name: ""
  # -- Node selector for the PostgreSQL pod.
  nodeSelector: {}
  # -- Affinity for the PostgreSQL pod.
  affinity: {}
  # -- Tolerations for the PostgreSQL pod.
  tolerations: []

# -- Settings for an external PostgreSQL instance. They are read only when postgresql.enabled is set to false.
# @default -- This object has the following default values for the external PostgreSQL configuration.
externalDatabase:
  # -- Host address of the external database.
  host: "localhost"
  # -- Port number of the external database.
  port: 5432
  # -- Username used to connect to the database.
  user: "nemo"
  # -- Name of the database to connect to.
  database: "nemoplatform"
  # -- Name of an existing secret resource that holds the database credentials.
  existingSecret: ""
  # -- Key inside the existing secret that holds the database credentials.
  existingSecretPasswordKey: ""
  # -- Secret reference for supplying the external database connection as a URI.
  # @default -- This object has the following default values for the URI secret configuration.
  uriSecret:
    # -- Name of the secret that stores the URI.
    name: ""
    # -- Key within that secret whose value is the database URI.
    key: ""

# -- Platform-wide configuration settings
# Set configuration here to apply custom, structured configuration across all services.
# Applied after the base platform config is evaluated for templates. Enables adding / overriding YAML-based elements in the evaluated platform config.
# It is usually recommended to use this config section instead of `basePlatformConfig` unless you need to use templating features.
# For example, you can set the NIM default StorageClass via models.controller.backends.k8s-nim-operator.config.default_storage_class.
# NOTE(review): the default `basePlatformConfig` in this file names this backend `nim_operator`
# (models.controller.backends.nim_operator), not `k8s-nim-operator`; confirm the correct key against
# the config reference before copying the example path above.
# For full configuration reference, see the NeMo Platform's config reference:
# https://docs.nvidia.com/nemo/microservices/latest/set-up/config-reference.html
platformConfig: {}

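# -- Base platform configuration settings. The value is a template string: its `{{ ... }}` expressions are evaluated first, and the result is then used as the platform config.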
# @default -- This object has the following default values for the base platform configuration.
basePlatformConfig: |
  # -- platform is the service discovery configuration for services across the platform
  platform:
    # -- control_plane specifies the type of control plane the platform is running on.
    # Always set to 'kubernetes' for NeMo Platform when deploying with Helm.
    control_plane: kubernetes

    # Base URLs for various platform services
    base_url: "{{ printf "http://%s:%s" (include "nmp-api.api-servicename" . ) (toString .Values.api.service.port) }}"

    # Image configuration for launching containers via the platform
    image_registry: nvcr.io/nvidia/nemo-microservices
    image_tag: {{ .Chart.AppVersion | quote }}
    image_pull_secrets:
      {{ include "nemo-common.imagepullsecrets" . | nindent 8 }}

  studio:
    # -- platform_base_url is the base URL used to access the platform.
    # This is the URL that NeMo Studio will use in the browser to communicate with the platform backend services.
    # An empty string means the Studio UI will reference its own host for API calls.
    platform_base_url: ""

  auth:
    enabled: false
    policy_decision_point_provider: embedded
    policy_decision_point_base_url: "http://localhost:8080"
    policy_data_refresh_interval: 5
    bundle_cache_seconds: 5
    admin_email: "admin@example.com"

  # -- service is the common configuration for service settings on the platform
  service:
    host: "0.0.0.0"
    port: {{ toString .Values.api.service.port }}
    log_format: json

  # -- entities is the configuration specific to entity management on the platform
  entities:
    backend: sqlalchemy

  # -- jobs is the configuration specific to executing jobs on the platform
  jobs:
    # -- executor_defaults is the default configuration applied to all executor profiles
    executor_defaults:
      # Defaults for jobs launched as plain Kubernetes Jobs.
      kubernetes_job:
        service_account_name: {{ include "nmp-core.jobsServiceAccountName" . | quote }}
        launcher_image: {{ include "nmp-core.image" . | quote }}
        storage:
          # Quoted like the other templated strings so the rendered name is always read as a YAML string.
          pvc_name: {{ include "nmp-core.persistentVolumeClaim" . | quote }}
          volume_permissions_image: {{ .Values.core.storage.volumePermissionsImage | quote }}
        pod_security_context: {{ .Values.podSecurityContext | toYaml | nindent 10 }}
      # Defaults for jobs launched through Volcano.
      volcano_job:
        service_account_name: {{ include "nmp-core.jobsServiceAccountName" . | quote }}
        launcher_image: {{ include "nmp-core.image" . | quote }}
        storage:
          # Quoted like the other templated strings so the rendered name is always read as a YAML string.
          pvc_name: {{ include "nmp-core.persistentVolumeClaim" . | quote }}
          volume_permissions_image: {{ .Values.core.storage.volumePermissionsImage | quote }}
        pod_security_context: {{ .Values.podSecurityContext | toYaml | nindent 10 }}
        {{- if include "nemo-platform.multinodeNetworkingEnabled" . }}
        # Enable multi-node networking (triggers Kyverno policies for cloud-specific configuration)
        enable_multi_node_networking: true
        {{- end }}

  # -- secrets is the configuration specific to storing secrets on the platform
  secrets:
    encryption:
      current_provider: local_v1
      providers:
        secret_key:
          local_v1:
            from_env: "NMP_SECRETS_DEFAULT_ENCRYPTION_KEY"

  # -- models is the configuration specific to model management on the platform
  models:
    controller:
      backends:
        nim_operator:
          enabled: true
          files_auth_secret: {{ include "nemo-platform.modelsFilesAuthSecretName" . | quote }}

  # -- inference_gateway is the configuration specific to inference request routing
  inference_gateway: {}

  # -- files is the configuration specific to file management on the platform
  files:
    default_storage_config:
      type: local
      path: /vol/files

  # -- auditor is the configuration specific to the Auditor service
  auditor: {}

  # -- data_designer is the configuration specific to the Data Designer service
  data_designer:
    model_provider_registry:
      default: "mock"
      providers:
        - name: "mock"
          endpoint: "http://localhost:8000"

  # -- customizer is the configuration specific to the Customizer service
  customizer: {}

  # -- evaluator is the configuration specific to the Evaluator service
  evaluator: {}

  # -- guardrails is the configuration specific to the Guardrails service
  guardrails: {}


# -- Ingress configuration for exposing the platform through a Kubernetes Ingress resource.
ingress:
  # -- Specifies whether to enable the ingress.
  enabled: false
  # -- Annotations for the ingress resource.
  annotations: {}
  # -- The ingress class to use if your cluster has more than one class.
  className: ""
  # -- Optional default hostname. When set, one rule is generated with this host and paths from the first entry in ingress.hosts.
  defaultHost: ""
  # -- TLS configurations.
  tls: []
  # -- Host rules. Each entry lists the paths routed for that host; `service` and `port` hold template
  # expressions that select the backend (presumably rendered by the chart templates; confirm).
  hosts:
    # -- Hostname used by ingress. If blank, use path-only routing.
    - name: ""
      paths:
        - path: /
          pathType: Exact
          service: '{{ include "nemo-platform.ingressBackendService" . }}'
          port: '{{ include "nemo-platform.ingressBackendPort" . }}'
        - path: /apis
          pathType: Prefix
          service: '{{ include "nemo-platform.ingressBackendService" . }}'
          port: '{{ include "nemo-platform.ingressBackendPort" . }}'
        - path: /studio
          pathType: Prefix
          service: '{{ include "nemo-platform.ingressBackendService" . }}'
          port: '{{ include "nemo-platform.ingressBackendPort" . }}'
        - path: /cluster-info
          pathType: Exact
          service: '{{ include "nemo-platform.ingressBackendService" . }}'
          port: '{{ include "nemo-platform.ingressBackendPort" . }}'
        - path: /status
          pathType: Exact
          service: '{{ include "nemo-platform.ingressBackendService" . }}'
          port: '{{ include "nemo-platform.ingressBackendPort" . }}'

# -- Gateway API HTTPRoute configuration for exposing the platform.
httpRoute:
  # -- Specifies whether to enable a Gateway API HTTP Route for the service.
  enabled: false
  # -- Extra labels for the HTTP Route object.
  labels: {}
  # -- Extra annotations for the HTTP Route object.
  annotations: {}
  # -- A list of Gateways to enable this route on. This is required if httpRoute.enabled is true.
  parentRefs: []
  # -- If this has a specific hostname, add the name or names here in an array.
  hostnames: []
  # -- Path matches to route queries.
  pathRules:
    - matches:
        - path: /
          type: Exact
        - path: /apis
          type: PathPrefix
        - path: /studio
          type: PathPrefix
        - path: /cluster-info
          type: Exact
        - path: /status
          type: Exact
      backends:
        - service: '{{ include "nemo-platform.ingressBackendService" . }}'
          port: '{{ include "nemo-platform.ingressBackendPort" . }}'
  # -- This is a list of filters for the objects, such as CORS settings.
  filters: []

# -- OpenShift Route (route.openshift.io/v1). Use on OpenShift to expose the API via a Route instead of Ingress.
openshiftRoute:
  # -- Specifies whether to create an OpenShift Route for the API service.
  enabled: false
  # -- Hostname for the route. If empty, the OpenShift router may assign a default hostname.
  host: ""
  # -- Service name to route to. Defaults to Envoy when both auth and Envoy are enabled, otherwise the API service. The value is evaluated as a template (tpl).
  service: '{{ include "nemo-platform.ingressBackendService" . }}'
  # -- Target port on the service. Defaults to the Envoy or API port depending on auth. The value is evaluated as a template (tpl).
  targetPort: '{{ include "nemo-platform.ingressBackendPort" . }}'
  # -- Optional TLS configuration (termination, certificate, key, etc.). See OpenShift Route spec.
  tls: {}
  # -- Annotations for the route resource.
  annotations: {}
  # -- Labels for the route resource.
  labels: {}

# -- OpenTelemetry configuration settings for all services.
# @default -- This object has the following default values for the OpenTelemetry configuration.
telemetry:
  # -- Disable OpenTelemetry instrumentation and exporting for all services.
  OTEL_SDK_DISABLED: false
  # -- The OpenTelemetry gRPC collector endpoint to export traces and metrics to.
  OTEL_EXPORTER_OTLP_ENDPOINT: ""
  # -- Whether to use an insecure connection (no TLS) to the OpenTelemetry collector endpoint.
  OTEL_EXPORTER_OTLP_INSECURE: true
  # -- The OpenTelemetry traces exporter to use. Options are "otlp" or "none" to disable export.
  OTEL_TRACES_EXPORTER: "none"
  # -- The OpenTelemetry metrics exporter to use. Options are "otlp", "prometheus" or "none" to disable export.
  OTEL_METRICS_EXPORTER: "none"
  # -- The OpenTelemetry traces exporter endpoint to use. Defaults to `OTEL_EXPORTER_OTLP_ENDPOINT` if not set.
  OTEL_EXPORTER_OTLP_TRACES_ENDPOINT: null
  # -- Whether to use an insecure connection (HTTP) to the OpenTelemetry traces exporter endpoint. Defaults to `OTEL_EXPORTER_OTLP_INSECURE` if not set.
  OTEL_EXPORTER_OTLP_TRACES_INSECURE: true
  # -- The OpenTelemetry metrics exporter endpoint to use. Defaults to `OTEL_EXPORTER_OTLP_ENDPOINT` if not set.
  OTEL_EXPORTER_OTLP_METRICS_ENDPOINT: null
  # -- Whether to use an insecure connection (HTTP) to the OpenTelemetry metrics exporter endpoint. Defaults to `OTEL_EXPORTER_OTLP_INSECURE` if not set.
  OTEL_EXPORTER_OTLP_METRICS_INSECURE: true

# -- Pod security context settings applied to all services by default.
# These can be overridden in individual service configurations.
# @default -- Empty; no chart-wide pod security context values are set here.
podSecurityContext: {}

# -- Container security context settings applied to all services by default.
# These can be overridden in individual service configurations.
# @default -- Empty; no chart-wide container security context values are set here.
securityContext: {}

# -- API configuration settings for the api deployment
# @default -- This object has the following default values for the API configuration.
api:
  # -- Specifies whether to enable the api deployment.
  enabled: true

  # -- Container image configuration for the api deployment.
  # @default -- This object has the following default values for the image configuration.
  image:
    # -- The registry where the NeMo Platform image is located.
    repository: nvcr.io/nvidia/nemo-microservices/nmp-api
    # -- The image pull policy determining when to pull new images.
    pullPolicy: IfNotPresent
    # -- The image tag to use.
    tag: ""

  # -- OpenTelemetry configuration overrides for the api deployment.
  telemetry: {}

  # -- Number of replicas for the API service.
  replicaCount: 1
  # -- Additional arguments to pass to the Platform API service
  extraArgs: []
  # -- Service account configuration for the API service.
  # @default -- This object has the following default values for the service account configuration.
  serviceAccount:
    # -- Specifies whether a service account should be created.
    create: true
    # -- Automatically mount a ServiceAccount's API credentials.
    automount: true
    # -- Annotations to add to the service account.
    annotations: {}
    # -- The name of the service account to use. If not set and create is true, a name is generated using the fullname template.
    name: ""
  # -- Annotations to add to the API service deployment.
  annotations: {}
  # -- Annotations to add to the API service pod.
  podAnnotations: {}
  # -- Labels for the API service pod.
  podLabels: {}
  # -- Pod-level security context settings for the API service.
  # @default -- This object has the following default values for the pod security context.
  podSecurityContext:
    # -- The file system group ID to use for all containers.
    fsGroup: 1000
  # -- Container-level security context settings for the API service.
  securityContext: {}
  # -- Service configuration for the API service.
  # @default -- This object has the following default values for the service configuration.
  service:
    # -- The Kubernetes service type to create.
    type: ClusterIP
    # -- The port number to expose for the service.
    port: 8080
    # -- Annotations for the API service.
    annotations: {}
  # -- Kubernetes deployment resources configuration for the API service.
  resources: {}

  # -- Startup probe configuration for the api service.
  # @default -- This object has the following default values for the startup probe configuration.
  startupProbe:
    # -- Number of seconds to wait before the first startup probe. Allows time for DB connection retries (e.g. Postgres pod booting).
    initialDelaySeconds: 10
    # -- The HTTP GET request to use for the startup probe.
    httpGet:
      path: /health/ready
      port: http
    # -- The frequency in seconds to perform the startup probe.
    periodSeconds: 15
    # -- The timeout in seconds for the startup probe.
    timeoutSeconds: 5
    # -- The failure threshold for the startup probe.
    failureThreshold: 24

  # -- Liveness probe configuration for the api service.
  # @default -- This object has the following default values for the liveness probe configuration.
  livenessProbe:
    # -- The HTTP GET request to use for the liveness probe.
    httpGet:
      path: /health/live
      port: http
    # -- The frequency in seconds to perform the liveness probe.
    periodSeconds: 10
    # -- The timeout in seconds for the liveness probe.
    timeoutSeconds: 5
    # -- The failure threshold for the liveness probe.
    failureThreshold: 3

  # -- Readiness probe configuration for the api service.
  # @default -- This object has the following default values for the readiness probe configuration.
  readinessProbe:
    # -- The HTTP GET request to use for the readiness probe.
    httpGet:
      path: /health/ready
      port: http
    # -- The frequency in seconds to perform the readiness probe.
    periodSeconds: 10
    # -- The timeout in seconds for the readiness probe.
    timeoutSeconds: 5
    # -- The failure threshold for the readiness probe.
    failureThreshold: 3

  # -- PodDisruptionBudget configuration for the API service.
  # @default -- This object has the following default values for the pod disruption budget configuration.
  podDisruptionBudget:
    # -- Whether to create a PodDisruptionBudget for the API pods.
    enabled: false
    # -- Minimum number of API pods that must remain available during voluntary disruptions.
    # Only one of minAvailable or maxUnavailable may be set.
    minAvailable: 1
    # -- Maximum number of API pods that can be unavailable during voluntary disruptions.
    # Only one of minAvailable or maxUnavailable may be set.
    # maxUnavailable: 0
    # -- Annotations for the PodDisruptionBudget.
    annotations: {}

  # -- Specifies autoscaling configurations for the deployment.
  autoscaling:
    # -- Whether to enable horizontal pod autoscaler.
    enabled: false
    # -- The minimum number of replicas for the deployment.
    minReplicas: 1
    # -- The maximum number of replicas for the deployment.
    maxReplicas: 10
    # -- The target CPU utilization percentage.
    targetCPUUtilizationPercentage: 80
    # targetMemoryUtilizationPercentage: 80
    # -- Annotations for the HorizontalPodAutoscaler.
    annotations: {}

  # -- Environment variables to pass to containers. This is an object formatted like NAME: value or NAME: valueFrom: {object}.
  env: {}
  # -- Node selector configuration for the API service.
  nodeSelector: {}
  # -- Affinity configuration for the API service.
  affinity: {}
  # -- Tolerations configuration for the API service.
  tolerations: []
  # -- Topology spread constraints for the API service pods. See https://kubernetes.io/docs/concepts/scheduling-eviction/topology-spread-constraints/
  topologySpreadConstraints: []

  # ServiceMonitor configuration for Prometheus Operator
  serviceMonitor:
    # -- Enable ServiceMonitor resources for Prometheus Operator
    enabled: false
    # -- Scrape interval for the ServiceMonitor
    interval: "30s"
    # -- Scheme to use for scraping metrics (http or https)
    scheme: "http"
    # -- Additional labels to add to the ServiceMonitor
    labels: {}
    # -- Additional annotations to add to the ServiceMonitor
    annotations: {}

# -- Platform seed Job (Helm hook: runs after install/upgrade)
# Runs the platform-seed task (guardrails configs, evaluator system entities, data designer filesets).
# Uses post-install,post-upgrade hooks so it runs on fresh installs and can be re-triggered on no-op upgrade.
# @default -- This object has the following default values for the platform seed Job configuration.
platformSeedJob:
  # -- Specifies whether to enable the platform-seed Job.
  enabled: true
  # -- Seconds after the Job finishes (success or failure) before it is eligible for automatic deletion.
  ttlSecondsAfterFinished: 86400
  # -- Number of retries before considering the Job failed.
  backoffLimit: 6
  # -- Maximum time in seconds the Job can run.
  activeDeadlineSeconds: 600
  # -- Pod-level security context for the platform-seed Job pod.
  podSecurityContext: {}
  # -- Container-level security context for the platform-seed container.
  securityContext: {}
  # -- Resource requests/limits for the platform-seed container.
  resources: {}
  # -- Extra environment variables for the platform-seed container (e.g. CONFIG_STORE_PATH, NMP_PLATFORM_SEED_*).
  extraEnv: []
  # -- Node selector for the platform-seed Job pod.
  nodeSelector: {}
  # -- Affinity for the platform-seed Job pod.
  affinity: {}
  # -- Tolerations for the platform-seed Job pod.
  tolerations: []
  # -- Additional labels for the platform-seed Job pod.
  podLabels: {}

# -- Core deployment configuration settings
# @default -- This object has the following default values for the core deployment configuration.
core:
  # -- Specifies whether to enable the core deployment.
  enabled: true

  # -- Container image configuration for the core deployment.
  # @default -- This object has the following default values for the image configuration.
  image:
    # -- The registry where the NeMo Platform image is located.
    repository: nvcr.io/nvidia/nemo-microservices/nmp-api
    # -- The image pull policy determining when to pull new images.
    pullPolicy: IfNotPresent
    # -- The image tag to use.
    tag: ""

  # -- Persistent storage configuration for job-scoped storage used by the core deployment.
  storage:
    # -- If set, pods will mount this persistent volume for job-scoped storage
    # and we will not create a new persistent volume claim.
    existingPersistentVolumeName: ""
    # -- Which storageClass to use when creating a new persistent volume claim. Empty string uses the cluster's default StorageClass.
    storageClass: ""
    # -- accessModes for the persistent volume claim. This should include `ReadWriteMany` to ensure
    # multiple job pods can write to the volume concurrently.
    accessModes:
      - ReadWriteMany
    # -- size of the persistent volume claim used for persistent storage
    size: 200Gi
    # -- volumePermissionsImage is the image used to set permissions on the volume
    # NOTE(review): "busybox" names no registry or tag, so the image that is pulled depends on the
    # container runtime's defaults; consider pinning a fully qualified image reference.
    volumePermissionsImage: "busybox"
    # -- Annotations to add to the persistent volume claim
    annotations: {}

  # -- OpenTelemetry configuration overrides for the platform deployment.
  telemetry: {}

  # -- Service account configuration for pods created by the jobs controller (Kubernetes/Volcano job pods).
  # @default -- This object has the following default values for the jobs service account configuration.
  jobs:
    serviceAccount:
      # -- Specifies whether a service account should be created for job pods.
      create: true
      # -- Automatically mount a ServiceAccount's API credentials.
      automount: true
      # -- Annotations to add to the service account.
      annotations: {}
      # -- The name of the service account to use. If not set and create is true, a name is generated with a '-jobs' suffix.
      name: ""

  # @default -- This object has the following default values for the controller configuration.
  controller:
    # -- Service account configuration for the controller service.
    # @default -- This object has the following default values for the service account configuration.
    serviceAccount:
      # -- Specifies whether a service account should be created.
      create: true
      # -- Automatically mount a ServiceAccount's API credentials.
      automount: true
      # -- Annotations to add to the service account.
      annotations: {}
      # -- The name of the service account to use. If not set and create is true, a name is generated using the fullname template.
      name: ""
    # -- Additional arguments to pass to the Core Controller service
    extraArgs: []

    # -- Service configuration for the controller service. This only configures a headless service for DNS resolution.
    # @default -- This object has the following default values for the service configuration.
    service:
      # -- The port for the service.
      port: 8080
      # -- Annotations for the headless controller service.
      annotations: {}
    # -- Annotations to add to the controller service deployment.
    annotations: {}
    # -- Annotations to add to the controller service pod.
    podAnnotations: {}
    # -- Labels for the controller service pod.
    podLabels: {}
    # -- Pod-level security context settings for the controller service.
    # @default -- This object has the following default values for the pod security context.
    podSecurityContext:
      # -- The file system group ID to use for all containers.
      fsGroup: 1000
    # -- Container-level security context settings for the controller service.
    securityContext: {}
    # -- Kubernetes deployment resources configuration for the controller service.
    resources: {}

    # -- Startup probe configuration for the controller service.
    # @default -- This object has the following default values for the startup probe configuration.
    startupProbe:
      # -- Number of seconds to wait before the first startup probe. Allows time for DB connection retries (e.g. Postgres pod booting).
      initialDelaySeconds: 10
      # -- The HTTP GET request to use for the startup probe.
      httpGet:
        path: /health/ready
        port: http
      # -- The frequency in seconds to perform the startup probe.
      periodSeconds: 15
      # -- The timeout in seconds for the startup probe.
      timeoutSeconds: 5
      # -- The failure threshold for the startup probe.
      failureThreshold: 24

    # -- Liveness probe configuration for the controller service.
    # @default -- This object has the following default values for the liveness probe configuration.
    livenessProbe:
      # -- The HTTP GET request to use for the liveness probe.
      httpGet:
        path: /health/live
        port: http
      # -- The frequency in seconds to perform the liveness probe.
      periodSeconds: 10
      # -- The timeout in seconds for the liveness probe.
      timeoutSeconds: 5
      # -- The failure threshold for the liveness probe.
      failureThreshold: 3

    # -- Readiness probe configuration for the controller service.
    # @default -- This object has the following default values for the readiness probe configuration.
    readinessProbe:
      # -- The HTTP GET request to use for the readiness probe.
      httpGet:
        path: /health/ready
        port: http
      # -- The frequency in seconds to perform the readiness probe.
      periodSeconds: 10
      # -- The timeout in seconds for the readiness probe.
      timeoutSeconds: 5
      # -- The failure threshold for the readiness probe.
      failureThreshold: 3
    # -- Additional environment variables to pass to containers. This is an object formatted like NAME: value or NAME: valueFrom: {object}.
    env: {}
    # -- Node selector configuration for the controller service.
    nodeSelector: {}
    # -- Affinity configuration for the controller service.
    affinity: {}
    # -- Tolerations configuration for the controller service.
    tolerations: []
    # -- Topology spread constraints for the controller service pods. See https://kubernetes.io/docs/concepts/scheduling-eviction/topology-spread-constraints/
    topologySpreadConstraints: []

  # ServiceMonitor configuration for Prometheus Operator
  serviceMonitor:
    # -- Enable ServiceMonitor resources for Prometheus Operator
    enabled: false
    # -- Scrape interval for the ServiceMonitor
    interval: "30s"
    # -- Scheme to use for scraping metrics (http or https)
    scheme: "http"
    # -- Additional labels to add to the ServiceMonitor
    labels: {}
    # -- Additional annotations to add to the ServiceMonitor
    annotations: {}


# -- Envoy proxy configuration settings. Resources are created only when platform config has auth.enabled: true (see platformConfig.auth.enabled).
# @default -- This object has the following default values for the envoy proxy configuration.
envoyProxy:
  # -- Specifies whether to enable the Envoy proxy deployment. Rendered only when platform config has auth.enabled: true.
  enabled: true

  # -- List of request header names that are considered internal-only.
  trustedHeaders:
    - x-nmp-principal-id
    - x-nmp-principal-email
    - x-nmp-principal-groups
    - x-nmp-principal-filters
    - x-nmp-principal-roles

  # -- Number of Envoy proxy replicas.
  replicaCount: 2

  # -- Container image settings for the Envoy proxy.
  # @default -- This object has the following default values for the Envoy image.
  image:
    # -- The repository of the Envoy container image.
    repository: envoyproxy/envoy
    # -- The tag of the Envoy container image.
    tag: v1.37.0
    # -- The image pull policy for the Envoy container.
    pullPolicy: IfNotPresent

  # -- Service account configuration for the Envoy service.
  # @default -- This object has the following default values for the service account configuration.
  serviceAccount:
    # -- Specifies whether a service account should be created.
    create: true
    # -- Automatically mount a ServiceAccount's API credentials.
    automount: true
    # -- Annotations to add to the service account.
    annotations: {}
    # -- The name of the service account to use. If not set and create is true, a name is generated using the fullname template.
    name: ""
  # -- Annotations to add to the Envoy service deployment.
  annotations: {}
  # -- Annotations to add to the Envoy service pod.
  podAnnotations: {}
  # -- Labels for the Envoy service pod.
  podLabels: {}
  # -- Pod-level security context settings for the Envoy service.
  # @default -- This object has the following default values for the pod security context.
  podSecurityContext:
    # -- The file system group ID to use for all containers.
    fsGroup: 1000
  # -- Container-level security context settings for the Envoy service.
  securityContext: {}
  # -- Service configuration for the Envoy service.
  # @default -- This object has the following default values for the service configuration.
  service:
    # -- The Kubernetes service type to create.
    type: ClusterIP
    # -- The port number to expose for the service.
    port: 8080
    # -- Annotations for the Envoy service.
    annotations: {}

  # -- Envoy Admin port
  adminPort: 9901

  # -- Timeouts for proxying to long-lived streams (e.g. inference gateway). Use "0s" to disable a timeout.
  # @default -- Tuned for streaming; increase or set to "0s" if requests are cut off.
  timeouts:
    # -- Stream idle timeout. Time with no activity before the stream is closed. "0s" = disabled (required for long-lived streams).
    streamIdle: "0s"
    # -- Time allowed to receive the full request headers. "0s" = disabled.
    requestHeaders: "60s"
    # -- Total request timeout. "0s" = disabled (required for streaming; any non-zero value is not compatible with streaming).
    request: "0s"
    # -- Per-route timeout for the passthrough to the backend. "0s" = disabled.
    route: "0s"
    # -- Cluster connect timeout (time to establish a connection to the backend).
    connect: "30s"

  # -- Number of Envoy worker threads. Defaults to 2, which is sufficient for the platform proxy workload.
  # Envoy's own default (0) means one worker per CPU core, which on large nodes can exhaust the
  # container's RLIMIT_NOFILE (commonly 1024 under containerd CRI) during startup, causing
  # "Too many open files" errors from libevent. Set to 0 to restore Envoy's auto-detect behavior
  # (only safe when the node's nofile rlimit has been raised, e.g. via containerd base_runtime_spec).
  concurrency: 2

  # -- Kubernetes deployment resources configuration for the Envoy service.
  resources: {}

  # -- Liveness probe for the Envoy container (admin interface /ready).
  livenessProbe:
    httpGet:
      path: /ready
      port: admin
    periodSeconds: 10
    timeoutSeconds: 5
    failureThreshold: 3
  # -- Readiness probe for the Envoy container (admin interface /ready).
  readinessProbe:
    httpGet:
      path: /ready
      port: admin
    periodSeconds: 10
    timeoutSeconds: 5
    failureThreshold: 3
  # -- Startup probe for the Envoy container (admin interface /ready).
  startupProbe:
    httpGet:
      path: /ready
      port: admin
    periodSeconds: 5
    timeoutSeconds: 3
    failureThreshold: 12

  # -- PodDisruptionBudget configuration for the Envoy service.
  # @default -- This object has the following default values for the pod disruption budget configuration.
  podDisruptionBudget:
    # -- Whether to create a PodDisruptionBudget for the Envoy pods.
    enabled: false
    # -- Minimum number of Envoy pods that must remain available during voluntary disruptions.
    # Only one of minAvailable or maxUnavailable may be set.
    minAvailable: 1
    # -- Maximum number of Envoy pods that can be unavailable during voluntary disruptions.
    # Only one of minAvailable or maxUnavailable may be set.
    # maxUnavailable: 0
    # -- Annotations for the PodDisruptionBudget.
    annotations: {}

  # -- Specifies autoscaling configurations for the deployment.
  autoscaling:
    # -- Whether to enable horizontal pod autoscaler.
    enabled: false
    # -- The minimum number of replicas for the deployment.
    minReplicas: 1
    # -- The maximum number of replicas for the deployment.
    maxReplicas: 10
    # -- The target CPU utilization percentage.
    targetCPUUtilizationPercentage: 80
    # Optional target memory utilization percentage; commented out, so no value is set by default.
    # targetMemoryUtilizationPercentage: 80
    # -- Annotations for the HorizontalPodAutoscaler.
    annotations: {}

  # -- Environment variables to pass to containers. This is an object formatted like NAME: value or NAME: valueFrom: {object}.
  env: {}
  # -- Node selector configuration for the Envoy pods.
  nodeSelector: {}
  # -- Affinity configuration for the Envoy pods.
  affinity: {}
  # -- Tolerations configuration for the Envoy pods.
  tolerations: []
  # -- Topology spread constraints for the Envoy pods. See https://kubernetes.io/docs/concepts/scheduling-eviction/topology-spread-constraints/
  topologySpreadConstraints: []

  # ServiceMonitor configuration for Prometheus Operator
  serviceMonitor:
    # -- Enable ServiceMonitor resources for Prometheus Operator
    enabled: false
    # -- Scrape interval for the ServiceMonitor
    interval: "30s"
    # -- Scheme to use for scraping metrics (http or https)
    scheme: "http"
    # -- Additional labels to add to the ServiceMonitor
    labels: {}
    # -- Additional annotations to add to the ServiceMonitor
    annotations: {}