NeMo Platform Helm Chart
For the deployment guide, see Admin Setup in the NeMo Platform documentation.
Values
The following is the complete values.yaml file for the NeMo Platform Helm Chart.
All configuration options are documented inline with comments.
# Default values for NeMo Microservices Platform Helm chart
## Helm global configuration settings
# -- Overrides for name and fullname templates
nameOverride: ""
fullnameOverride: ""
# -- Your NVIDIA GPU Cloud (NGC) API key authenticates and enables pulling images from the NGC container registry. The existing secret overrides this key if you provide one to the `existingSecret` key.
ngcAPIKey: YOUR-NGC-API-KEY
# -- Environment variables that will be applied to every deployment pod. Uses a simple key value map structure like MY_ENV_VAR: the-key and works with valueFrom as well.
env: {}
# -- Optional. Name of an existing Kubernetes Secret to load as env vars (envFrom) for the API pod.
# When set, the chart does not create the default api-env secret; use your own secret (e.g. from Vault, sealed-secrets).
# When unset, the chart creates a default secret with the environment variable NMP_SECRETS_DEFAULT_ENCRYPTION_KEY for default installation.
# See the NeMo Platform documentation for more details on secrets encryption.
envFromSecret: ""
# -- You can use an existing Kubernetes secret for communicating with the NGC API for downloading models. The chart uses the `ngcAPIKey` value to generate the secret if you set this to an empty string.
existingSecret: ngc-api
# -- You can specify an existing Kubernetes image pull secret for pulling images from the NGC container registry. The chart uses the `ngcAPIKey` value to generate the secret if you set this to an empty string.
existingImagePullSecret: nvcrimagepullsecret
# -- List of additional image pull secrets to use for pulling container images. Can be used when multiple image pull secrets are required in your environment.
# The default is an empty list (not an empty map) so the default type matches the documented list type and a user-supplied list overrides it cleanly.
additionalImagePullSecrets: []
# -- RBAC configuration settings for optional dependencies
rbac:
# -- Specifies whether to enable the core Controller to have RBAC permissions to Volcano for scheduling distributed jobs.
volcanoEnabled: true
# -- Specifies whether to enable the core Controller to have RBAC permissions to k8s-nim-operator's NIMService for scheduling NIMs.
k8sNimOperatorEnabled: true
# -- Multi-node networking configuration for distributed GPU training.
# These settings control Kyverno policies that inject cloud-specific networking and NCCL configurations.
#
# Requirements:
# - Kyverno policy engine must be installed in your cluster (required for multi-node networking)
# - Kyverno is NOT included as a subchart dependency and must be installed separately
#
# To install Kyverno:
# helm install kyverno kyverno/kyverno --namespace kyverno --create-namespace --version 3.2.0
#
# Documentation: https://kyverno.io/docs/installation/
# Helm chart: https://kyverno.github.io/kyverno/
#
# Note: Only enable ONE cloud provider per cluster deployment.
multinodeNetworking:
# -- AWS-specific configuration for EFA device injection
aws:
# -- Enable AWS-specific Kyverno policy for EFA device injection
enabled: false
# -- Number of EFA devices to request per GPU (typically 1 or 4)
efaDevicesPerGPU: 1
# -- Azure-specific configuration for InfiniBand/RDMA
azure:
# -- Enable Azure-specific Kyverno policy for InfiniBand/RDMA configuration
enabled: false
# -- Number of RDMA devices to request per GPU
rdmaDevicesPerGPU: 1
# -- RDMA device plugin resource name
rdmaDeviceName: "hca_shared_devices_a"
# -- GCP-specific configuration for TCP-X/TCP-XO
gcp:
# -- Enable GCP-specific Kyverno policy for TCP-X/TCP-XO configuration
enabled: false
# -- OCI-specific configuration for InfiniBand/SR-IOV
oci:
# -- Enable OCI-specific Kyverno policy for InfiniBand/SR-IOV configuration
enabled: false
# -- Number of RDMA devices (mlnxnics) to request per GPU
rdmaDevicesPerGPU: 8
## Optional dependencies configuration. For production deployments, it is recommended to use existing installations of these dependencies.
k8s-nim-operator:
# -- Specifies whether to enable the default NIM Operator installation. To learn more, see [Install NIM Operator](https://docs.nvidia.com/nim-operator/latest/install.html).
# If you are using an existing NIM Operator installation, set this to false.
enabled: true
nfd:
nodeFeatureRules:
# -- Specifies whether to enable device ID feature rules.
deviceID: false
# -- Local PostgreSQL configuration for the NeMo Platform.
# @default -- This object has the following default values for the PostgreSQL configuration.
postgresql:
# -- Whether to deploy the embedded PostgreSQL. If enabled, the chart deploys a single-replica PostgreSQL instance using the official Postgres image.
# It is NOT recommended to use the built-in PostgreSQL for production deployments. It is enabled in the chart by default for ease of getting started with the platform.
# If you are using an existing PostgreSQL installation, set this to false and use the "externalDatabase" configuration section.
enabled: true
image:
repository: docker.io/library/postgres
tag: "18"
pullPolicy: IfNotPresent
# -- PostgreSQL authentication configuration.
auth:
username: nemo
password: nemo
database: nemoplatform
# -- Name of an existing secret containing a "password" key (or use existingSecretPasswordKey). If set, the chart does not create a secret.
existingSecret: ""
# -- PostgreSQL service configuration.
service:
port: 5432
# -- PostgreSQL persistence configuration.
persistence:
enabled: true
size: 5Gi
# -- Storage class for the PostgreSQL PVC. If unset, the cluster default is used.
storageClass: ""
# -- Optional resource limits/requests for the PostgreSQL container.
resources: {}
# -- Optional pod security context for the PostgreSQL pod (e.g. for OpenShift SCC).
podSecurityContext: {}
# -- Optional container security context for the PostgreSQL container.
securityContext: {}
# -- Service account for the PostgreSQL pod.
# @default -- This object has the following default values for the service account configuration.
serviceAccount:
# -- Specifies whether a service account should be created for the PostgreSQL pod.
create: true
# -- Automatically mount the ServiceAccount's API credentials.
automount: true
# -- Annotations to add to the service account.
annotations: {}
# -- The name of the service account to use. If not set and create is true, a name is generated from the release fullname.
name: ""
# -- Node selector for the PostgreSQL pod.
nodeSelector: {}
# -- Affinity for the PostgreSQL pod.
affinity: {}
# -- Tolerations for the PostgreSQL pod.
tolerations: []
# -- External PostgreSQL configuration settings. These values are only used when postgresql.enabled is set to false.
# @default -- This object has the following default values for the external PostgreSQL configuration.
externalDatabase:
# -- External database host address.
host: localhost
# -- External database port number.
port: 5432
# -- Database username
user: nemo
# -- Database name.
database: nemoplatform
# -- Name of an existing secret resource containing the database credentials.
existingSecret: ""
# -- Key within the existing secret that contains the database password.
existingSecretPasswordKey: ""
# -- URI secret configuration for external database.
# @default -- This object has the following default values for the URI secret configuration.
uriSecret:
# -- Name of the URI secret.
name: ""
# -- Key in the URI secret containing the database URI.
key: ""
# -- Platform-wide configuration settings
# Set configuration here to apply custom, structured configuration across all services.
# Applied after the base platform config is evaluated for templates. Enables adding / overriding YAML-based elements in the evaluated platform config.
# It is usually recommended to use this config section instead of `basePlatformConfig` unless you need to use templating features.
# For example, you can set the NIM default StorageClass via models.controller.backends.k8s-nim-operator.config.default_storage_class.
# For full configuration reference, see the NeMo Platform's config reference:
# https://docs.nvidia.com/nemo/microservices/latest/set-up/config-reference.html
platformConfig: {}
# -- Base platform configuration settings
# @default -- This object has the following default values for the base platform configuration.
basePlatformConfig: |
# -- platform is the service discovery configuration for services across the platform
platform:
# -- control_plane specifies the type of control plane the platform is running on.
# Always set to 'kubernetes' for NeMo Platform when deploying with Helm.
control_plane: kubernetes
# Base URLs for various platform services
base_url: "{{ printf "http://%s:%s" (include "nmp-api.api-servicename" . ) (toString .Values.api.service.port) }}"
# Image configuration for launching containers via the platform
image_registry: nvcr.io/nvidia/nemo-microservices
image_tag: {{ .Chart.AppVersion | quote }}
image_pull_secrets:
{{ include "nemo-common.imagepullsecrets" . | nindent 8 }}
studio:
# -- platform_base_url is the base URL used to access the platform.
# This is the URL that NeMo Studio will use in the browser to communicate with the platform backend services.
# An empty string means the Studio UI will reference its own host for API calls.
platform_base_url: ""
auth:
enabled: false
policy_decision_point_provider: embedded
policy_decision_point_base_url: "http://localhost:8080"
policy_data_refresh_interval: 5
bundle_cache_seconds: 5
admin_email: "admin@example.com"
# -- service is the common configuration for service settings on the platform
service:
host: "0.0.0.0"
port: {{ toString .Values.api.service.port }}
log_format: json
# -- entities is the configuration specific to entity management on the platform
entities:
backend: sqlalchemy
# -- jobs is the configuration specific to executing jobs on the platform
jobs:
# -- executor_defaults is the default configuration applied to all executor profiles
executor_defaults:
kubernetes_job:
service_account_name: {{ include "nmp-core.jobsServiceAccountName" . | quote }}
launcher_image: {{ include "nmp-core.image" . | quote }}
storage:
pvc_name: {{ (include "nmp-core.persistentVolumeClaim" . ) }}
volume_permissions_image: {{ .Values.core.storage.volumePermissionsImage | quote }}
pod_security_context: {{ .Values.podSecurityContext | toYaml | nindent 10 }}
volcano_job:
service_account_name: {{ include "nmp-core.jobsServiceAccountName" . | quote }}
launcher_image: {{ include "nmp-core.image" . | quote }}
storage:
pvc_name: {{ (include "nmp-core.persistentVolumeClaim" . ) }}
volume_permissions_image: {{ .Values.core.storage.volumePermissionsImage | quote }}
pod_security_context: {{ .Values.podSecurityContext | toYaml | nindent 10 }}
{{- if include "nemo-platform.multinodeNetworkingEnabled" . }}
# Enable multi-node networking (triggers Kyverno policies for cloud-specific configuration)
enable_multi_node_networking: true
{{- end }}
# -- secrets is the configuration specific to storing secrets on the platform
secrets:
encryption:
current_provider: local_v1
providers:
secret_key:
local_v1:
from_env: "NMP_SECRETS_DEFAULT_ENCRYPTION_KEY"
# -- models is the configuration specific to model management on the platform
models:
controller:
backends:
nim_operator:
enabled: true
files_auth_secret: {{ include "nemo-platform.modelsFilesAuthSecretName" . | quote }}
# -- inference_gateway is the configuration specific to inference request routing
inference_gateway: {}
# -- files is the configuration specific to file management on the platform
files:
default_storage_config:
type: local
path: /vol/files
# -- auditor is the configuration specific to the Auditor service
auditor: {}
# -- data_designer is the configuration specific to the Data Designer service
data_designer:
model_provider_registry:
default: "mock"
providers:
- name: "mock"
endpoint: "http://localhost:8000"
# -- customizer is the configuration specific to the Customizer service
customizer: {}
# -- evaluator is the configuration specific to the Evaluator service
evaluator: {}
# -- guardrails is the configuration specific to the Guardrails service
guardrails: {}
ingress:
# -- Specifies whether to enable the ingress.
enabled: false
# -- Annotations for the ingress resource.
annotations: {}
# -- The ingress class to use if your cluster has more than one class.
className: ""
# -- Optional default hostname. When set, one rule is generated with this host and paths from the first entry in ingress.hosts.
defaultHost: ""
# -- TLS configurations.
tls: []
hosts:
# -- Hostname used by ingress. If blank, use path-only routing.
- name: ""
paths:
- path: /
pathType: Exact
service: '{{ include "nemo-platform.ingressBackendService" . }}'
port: '{{ include "nemo-platform.ingressBackendPort" . }}'
- path: /apis
pathType: Prefix
service: '{{ include "nemo-platform.ingressBackendService" . }}'
port: '{{ include "nemo-platform.ingressBackendPort" . }}'
- path: /studio
pathType: Prefix
service: '{{ include "nemo-platform.ingressBackendService" . }}'
port: '{{ include "nemo-platform.ingressBackendPort" . }}'
- path: /cluster-info
pathType: Exact
service: '{{ include "nemo-platform.ingressBackendService" . }}'
port: '{{ include "nemo-platform.ingressBackendPort" . }}'
- path: /status
pathType: Exact
service: '{{ include "nemo-platform.ingressBackendService" . }}'
port: '{{ include "nemo-platform.ingressBackendPort" . }}'
httpRoute:
# -- Specifies whether to enable a Gateway API HTTP Route for the service.
enabled: false
# -- Extra labels for the HTTP Route object.
labels: {}
# -- Extra annotations for the HTTP Route object.
annotations: {}
# -- A list of Gateways to enable this route on. This is required if httpRoute.enabled is true.
parentRefs: []
# -- If this has a specific hostname, add the name or names here in an array.
hostnames: []
# -- Path matches to route queries.
pathRules:
- matches:
- path: /
type: Exact
- path: /apis
type: PathPrefix
- path: /studio
type: PathPrefix
- path: /cluster-info
type: Exact
- path: /status
type: Exact
backends:
- service: '{{ include "nemo-platform.ingressBackendService" . }}'
port: '{{ include "nemo-platform.ingressBackendPort" . }}'
# -- This is a list of filters for the objects, such as CORS settings.
filters: []
# -- OpenShift Route (route.openshift.io/v1). Use on OpenShift to expose the API via a Route instead of Ingress.
openshiftRoute:
# -- Specifies whether to create an OpenShift Route for the API service.
enabled: false
# -- Hostname for the route. If empty, the OpenShift router may assign a default hostname.
host: ""
# -- Service name to route to. Defaults to Envoy when auth+envoy enabled, otherwise API (tpl-evaluated).
service: '{{ include "nemo-platform.ingressBackendService" . }}'
# -- Target port on the service. Defaults to Envoy or API port depending on auth (tpl-evaluated).
targetPort: '{{ include "nemo-platform.ingressBackendPort" . }}'
# -- Optional TLS configuration (termination, certificate, key, etc.). See OpenShift Route spec.
tls: {}
# -- Annotations for the route resource.
annotations: {}
# -- Labels for the route resource.
labels: {}
# -- OpenTelemetry configuration settings for all services.
# @default -- This object has the following default values for the OpenTelemetry configuration.
telemetry:
# -- Disable OpenTelemetry instrumentation and exporting for all services.
OTEL_SDK_DISABLED: false
# -- The OpenTelemetry gRPC collector endpoint to export traces and metrics to.
OTEL_EXPORTER_OTLP_ENDPOINT: ""
# -- Whether to use an insecure connection (no TLS) to the OpenTelemetry collector endpoint.
OTEL_EXPORTER_OTLP_INSECURE: true
# -- The OpenTelemetry traces exporter to use. Options are "otlp" or "none" to disable export.
OTEL_TRACES_EXPORTER: "none"
# -- The OpenTelemetry metrics exporter to use. Options are "otlp", "prometheus" or "none" to disable export.
OTEL_METRICS_EXPORTER: "none"
# -- The OpenTelemetry traces exporter endpoint to use. Defaults to `OTEL_EXPORTER_OTLP_ENDPOINT` if not set.
OTEL_EXPORTER_OTLP_TRACES_ENDPOINT: null
# -- Whether to use an insecure connection (HTTP) to the OpenTelemetry traces exporter endpoint. Defaults to `OTEL_EXPORTER_OTLP_INSECURE` if not set.
OTEL_EXPORTER_OTLP_TRACES_INSECURE: true
# -- The OpenTelemetry metrics exporter endpoint to use. Defaults to `OTEL_EXPORTER_OTLP_ENDPOINT` if not set.
OTEL_EXPORTER_OTLP_METRICS_ENDPOINT: null
# -- Whether to use an insecure connection (HTTP) to the OpenTelemetry metrics exporter endpoint. Defaults to `OTEL_EXPORTER_OTLP_INSECURE` if not set.
OTEL_EXPORTER_OTLP_METRICS_INSECURE: true
# -- Pod security context settings applied to all services by default.
# These can be overridden in individual service configurations.
# @default -- This object has the following default values for the pod security context.
podSecurityContext: {}
# -- Container security context settings applied to all services by default.
# These can be overridden in individual service configurations.
# @default -- This object has the following default values for the container security context.
securityContext: {}
# -- API configuration settings for the api deployment
# @default -- This object has the following default values for the API configuration.
api:
# -- Specifies whether to enable the api deployment.
enabled: true
# -- Container image configuration for the api deployment.
# @default -- This object has the following default values for the image configuration.
image:
# -- The registry where the NeMo Platform image is located.
repository: nvcr.io/nvidia/nemo-microservices/nmp-api
# -- The image pull policy determining when to pull new images.
pullPolicy: IfNotPresent
# -- The image tag to use.
tag: ""
# -- OpenTelemetry configuration overrides for the api deployment.
telemetry: {}
# -- Number of replicas for the API service.
replicaCount: 1
# -- Additional arguments to pass to the Platform API service
extraArgs: []
# -- Service account configuration for the API service.
# @default -- This object has the following default values for the service account configuration.
serviceAccount:
# -- Specifies whether a service account should be created.
create: true
# -- Automatically mount a ServiceAccount's API credentials.
automount: true
# -- Annotations to add to the service account.
annotations: {}
# -- The name of the service account to use. If not set and create is true, a name is generated using the fullname template.
name: ""
# -- Annotations to add to the API service deployment.
annotations: {}
# -- Annotations to add to the API service pod.
podAnnotations: {}
# -- Labels for the API service pod.
podLabels: {}
# -- Pod-level security context settings for the API service.
# @default -- This object has the following default values for the pod security context.
podSecurityContext:
# -- The file system group ID to use for all containers.
fsGroup: 1000
# -- Container-level security context settings for the API service.
securityContext: {}
# -- Service configuration for the API service.
# @default -- This object has the following default values for the service configuration.
service:
# -- The Kubernetes service type to create.
type: ClusterIP
# -- The port number to expose for the service.
port: 8080
# -- Annotations for the API service.
annotations: {}
# -- Kubernetes deployment resources configuration for the API service.
resources: {}
# -- Startup probe configuration for the api service.
# @default -- This object has the following default values for the startup probe configuration.
startupProbe:
# -- Number of seconds to wait before the first startup probe. Allows time for DB connection retries (e.g. Postgres pod booting).
initialDelaySeconds: 10
# -- The HTTP GET request to use for the startup probe.
httpGet:
path: /health/ready
port: http
# -- The frequency in seconds to perform the startup probe.
periodSeconds: 15
# -- The timeout in seconds for the startup probe.
timeoutSeconds: 5
# -- The failure threshold for the startup probe.
failureThreshold: 24
# -- Liveness probe configuration for the api service.
# @default -- This object has the following default values for the liveness probe configuration.
livenessProbe:
# -- The HTTP GET request to use for the liveness probe.
httpGet:
path: /health/live
port: http
# -- The frequency in seconds to perform the liveness probe.
periodSeconds: 10
# -- The timeout in seconds for the liveness probe.
timeoutSeconds: 5
# -- The failure threshold for the liveness probe.
failureThreshold: 3
# -- Readiness probe configuration for the api service.
# @default -- This object has the following default values for the readiness probe configuration.
readinessProbe:
# -- The HTTP GET request to use for the readiness probe.
httpGet:
path: /health/ready
port: http
# -- The frequency in seconds to perform the readiness probe.
periodSeconds: 10
# -- The timeout in seconds for the readiness probe.
timeoutSeconds: 5
# -- The failure threshold for the readiness probe.
failureThreshold: 3
# -- PodDisruptionBudget configuration for the API service.
# @default -- This object has the following default values for the pod disruption budget configuration.
podDisruptionBudget:
# -- Whether to create a PodDisruptionBudget for the API pods.
enabled: false
# -- Minimum number of API pods that must remain available during voluntary disruptions.
# Only one of minAvailable or maxUnavailable may be set.
minAvailable: 1
# -- Maximum number of API pods that can be unavailable during voluntary disruptions.
# Only one of minAvailable or maxUnavailable may be set.
# maxUnavailable: 0
# -- Annotations for the PodDisruptionBudget.
annotations: {}
# -- Specifies autoscaling configurations for the deployment.
autoscaling:
# -- Whether to enable horizontal pod autoscaler.
enabled: false
# -- The minimum number of replicas for the deployment.
minReplicas: 1
# -- The maximum number of replicas for the deployment.
maxReplicas: 10
# -- The target CPU utilization percentage.
targetCPUUtilizationPercentage: 80
# targetMemoryUtilizationPercentage: 80
# -- Annotations for the HorizontalPodAutoscaler.
annotations: {}
# -- Environment variables to pass to containers. This is an object formatted like NAME: value or NAME: valueFrom: {object}.
env: {}
# -- Node selector configuration for the API service.
nodeSelector: {}
# -- Affinity configuration for the API service.
affinity: {}
# -- Tolerations configuration for the API service.
tolerations: []
# -- Topology spread constraints for the API service pods. See https://kubernetes.io/docs/concepts/scheduling-eviction/topology-spread-constraints/
topologySpreadConstraints: []
# -- ServiceMonitor configuration for Prometheus Operator.
serviceMonitor:
# -- Enable ServiceMonitor resources for Prometheus Operator
enabled: false
# -- Scrape interval for the ServiceMonitor
interval: "30s"
# -- Scheme to use for scraping metrics (http or https)
scheme: "http"
# -- Additional labels to add to the ServiceMonitor
labels: {}
# -- Additional annotations to add to the ServiceMonitor
annotations: {}
# -- Platform seed Job (Helm hook: runs after install/upgrade)
# Runs the platform-seed task (guardrails configs, evaluator system entities, data designer filesets).
# Uses post-install,post-upgrade hooks so it runs on fresh installs and can be re-triggered on no-op upgrade.
# @default -- This object has the following default values for the platform seed Job configuration.
platformSeedJob:
# -- Specifies whether to enable the platform-seed Job.
enabled: true
# -- Seconds after the Job finishes (success or failure) before it is eligible for automatic deletion.
ttlSecondsAfterFinished: 86400
# -- Number of retries before considering the Job failed.
backoffLimit: 6
# -- Maximum time in seconds the Job can run.
activeDeadlineSeconds: 600
# -- Pod-level security context for the platform seeding Job pod.
podSecurityContext: {}
# -- Container-level security context for the platform-seed container.
securityContext: {}
# -- Resource requests/limits for the platform-seed container.
resources: {}
# -- Extra environment variables for the platform-seed container (e.g. CONFIG_STORE_PATH, NMP_PLATFORM_SEED_*).
extraEnv: []
# -- Node selector for the platform seeding Job pod.
nodeSelector: {}
# -- Affinity for the platform seeding Job pod.
affinity: {}
# -- Tolerations for the platform seeding Job pod.
tolerations: []
# -- Additional labels for the platform seeding Job pod.
podLabels: {}
# -- Core deployment configuration settings
# @default -- This object has the following default values for the core deployment configuration.
core:
# -- Specifies whether to enable the core deployment.
enabled: true
# -- Container image configuration for the core deployment.
# @default -- This object has the following default values for the image configuration.
image:
# -- The registry where the NeMo Platform image is located.
repository: nvcr.io/nvidia/nemo-microservices/nmp-api
# -- The image pull policy determining when to pull new images.
pullPolicy: IfNotPresent
# -- The image tag to use.
tag: ""
storage:
# -- If set, pods will mount this persistent volume for job-scoped storage
# and we will not create a new persistent volume claim.
existingPersistentVolumeName: ""
# -- Which storageClass to use when creating a new persistent volume claim. Empty string uses the cluster's default StorageClass.
storageClass: ""
# -- accessModes for the persistent volume claim. This should include `ReadWriteMany` to ensure
# multiple job pods can write to the volume concurrently.
accessModes:
- ReadWriteMany
# -- size of the persistent volume claim used for persistent storage
size: 200Gi
# -- volumePermissionsImage is the image used to set permissions on the volume
volumePermissionsImage: "busybox"
# -- Annotations to add to the persistent volume claim
annotations: {}
# -- OpenTelemetry configuration overrides for the core deployment.
telemetry: {}
# -- Service account configuration for pods created by the jobs controller (Kubernetes/Volcano job pods).
# @default -- This object has the following default values for the jobs service account configuration.
jobs:
serviceAccount:
# -- Specifies whether a service account should be created for job pods.
create: true
# -- Automatically mount a ServiceAccount's API credentials.
automount: true
# -- Annotations to add to the service account.
annotations: {}
# -- The name of the service account to use. If not set and create is true, a name is generated with a '-jobs' suffix.
name: ""
# -- Controller configuration settings for the core deployment.
# @default -- This object has the following default values for the controller configuration.
controller:
# -- Service account configuration for the controller service.
# @default -- This object has the following default values for the service account configuration.
serviceAccount:
# -- Specifies whether a service account should be created.
create: true
# -- Automatically mount a ServiceAccount's API credentials.
automount: true
# -- Annotations to add to the service account.
annotations: {}
# -- The name of the service account to use. If not set and create is true, a name is generated using the fullname template.
name: ""
# -- Additional arguments to pass to the Core Controller service
extraArgs: []
# -- Service configuration for the controller service. This only configures a headless service for DNS resolution.
# @default -- This object has the following default values for the service configuration.
service:
# -- The port for the service.
port: 8080
# -- Annotations for the headless controller service.
annotations: {}
# -- Annotations to add to the controller service deployment.
annotations: {}
# -- Annotations to add to the controller service pod.
podAnnotations: {}
# -- Labels for the controller service pod.
podLabels: {}
# -- Pod-level security context settings for the controller service.
# @default -- This object has the following default values for the pod security context.
podSecurityContext:
# -- The file system group ID to use for all containers.
fsGroup: 1000
# -- Container-level security context settings for the controller service.
securityContext: {}
# -- Kubernetes deployment resources configuration for the controller service.
resources: {}
# -- Startup probe configuration for the controller service.
# @default -- This object has the following default values for the startup probe configuration.
startupProbe:
# -- Number of seconds to wait before the first startup probe. Allows time for DB connection retries (e.g. Postgres pod booting).
initialDelaySeconds: 10
# -- The HTTP GET request to use for the startup probe.
httpGet:
path: /health/ready
port: http
# -- The frequency in seconds to perform the startup probe.
periodSeconds: 15
# -- The timeout in seconds for the startup probe.
timeoutSeconds: 5
# -- The failure threshold for the startup probe.
failureThreshold: 24
# -- Liveness probe configuration for the controller service.
# @default -- This object has the following default values for the liveness probe configuration.
livenessProbe:
# -- The HTTP GET request to use for the liveness probe.
httpGet:
path: /health/live
port: http
# -- The frequency in seconds to perform the liveness probe.
periodSeconds: 10
# -- The timeout in seconds for the liveness probe.
timeoutSeconds: 5
# -- The failure threshold for the liveness probe.
failureThreshold: 3
# -- Readiness probe configuration for the controller service.
# @default -- This object has the following default values for the readiness probe configuration.
readinessProbe:
# -- The HTTP GET request to use for the readiness probe.
httpGet:
path: /health/ready
port: http
# -- The frequency in seconds to perform the readiness probe.
periodSeconds: 10
# -- The timeout in seconds for the readiness probe.
timeoutSeconds: 5
# -- The failure threshold for the readiness probe.
failureThreshold: 3
# -- Additional environment variables to pass to containers. This is an object formatted like NAME: value or NAME: valueFrom: {object}.
env: {}
# -- Node selector configuration for the controller service.
nodeSelector: {}
# -- Affinity configuration for the controller service.
affinity: {}
# -- Tolerations configuration for the controller service.
tolerations: []
# -- Topology spread constraints for the controller service pods. See https://kubernetes.io/docs/concepts/scheduling-eviction/topology-spread-constraints/
topologySpreadConstraints: []
# -- ServiceMonitor configuration for Prometheus Operator.
serviceMonitor:
# -- Enable ServiceMonitor resources for Prometheus Operator
enabled: false
# -- Scrape interval for the ServiceMonitor
interval: "30s"
# -- Scheme to use for scraping metrics (http or https)
scheme: "http"
# -- Additional labels to add to the ServiceMonitor
labels: {}
# -- Additional annotations to add to the ServiceMonitor
annotations: {}
# -- Envoy proxy configuration settings. Resources are created only when platform config has auth.enabled: true (see platformConfig.auth.enabled).
# @default -- This object has the following default values for the envoy proxy configuration.
envoyProxy:
# -- Specifies whether to enable the Envoy proxy deployment. Rendered only when platform config has auth.enabled: true.
enabled: true
# -- Headers considered internal-only.
trustedHeaders:
- x-nmp-principal-id
- x-nmp-principal-email
- x-nmp-principal-groups
- x-nmp-principal-filters
- x-nmp-principal-roles
# -- Number of Envoy proxy replicas.
replicaCount: 2
# -- Container image configuration for the Envoy proxy.
image:
repository: envoyproxy/envoy
tag: v1.37.0
pullPolicy: IfNotPresent
# -- Service account configuration for the Envoy service.
# @default -- This object has the following default values for the service account configuration.
serviceAccount:
# -- Specifies whether a service account should be created.
create: true
# -- Automatically mount a ServiceAccount's API credentials.
automount: true
# -- Annotations to add to the service account.
annotations: {}
# -- The name of the service account to use. If not set and create is true, a name is generated using the fullname template.
name: ""
# -- Annotations to add to the Envoy service deployment.
annotations: {}
# -- Annotations to add to the Envoy service pod.
podAnnotations: {}
# -- Labels for the Envoy service pod.
podLabels: {}
# -- Pod-level security context settings for the Envoy service.
# @default -- This object has the following default values for the pod security context.
podSecurityContext:
  # -- The file system group ID to use for all containers.
  fsGroup: 1000
# -- Container-level security context settings for the Envoy service.
securityContext: {}
# -- Service configuration for the Envoy service.
# @default -- This object has the following default values for the service configuration.
service:
# -- The Kubernetes service type to create.
type: ClusterIP
# -- The port number to expose for the service.
port: 8080
# -- Annotations for the Envoy service.
annotations: {}
# NOTE(review): nesting is not visible in this copy of the file; confirm whether adminPort
# belongs under `service` or directly under `envoyProxy`. The probes in this section query
# /ready on a port named `admin`, which presumably maps to this value; verify in the templates.
# -- Port of the Envoy admin interface.
adminPort: 9901
# -- Timeouts for proxying to long-lived streams (e.g. inference gateway). Use "0s" to disable a timeout.
# @default -- Tuned for streaming; increase or set to "0s" if requests are cut off.
timeouts:
  # -- Stream idle timeout: how long a stream may have no activity before it is closed. "0s" = disabled (required for long-lived streams).
  streamIdle: "0s"
  # -- Maximum time to receive the full request headers. "0s" = disabled.
  requestHeaders: "60s"
  # -- Total request timeout. "0s" = disabled, which is required for streaming; any non-zero value is not compatible with streaming.
  request: "0s"
  # -- Per-route timeout for the passthrough to the backend. "0s" = disabled.
  route: "0s"
  # -- Cluster connect timeout (time to establish a connection to the backend).
  connect: "30s"
# -- Number of Envoy worker threads. Defaults to 2, which is sufficient for the platform proxy workload.
# Envoy's own default (0) means one worker per CPU core, which on large nodes can exhaust the
# container's RLIMIT_NOFILE (commonly 1024 under containerd CRI) during startup, causing
# "Too many open files" errors from libevent. Set to 0 to restore Envoy's auto-detect behavior
# (only safe when the node's nofile rlimit has been raised, e.g. via containerd base_runtime_spec).
concurrency: 2
# -- Kubernetes deployment resources configuration for the Envoy service.
resources: {}
# -- Liveness probe for the Envoy container (admin interface /ready).
livenessProbe:
  # HTTP GET against /ready on the container port named "admin".
  httpGet:
    path: /ready
    port: admin
  # Probe every 10s, allow 5s per attempt, and fail after 3 consecutive failures.
  periodSeconds: 10
  timeoutSeconds: 5
  failureThreshold: 3
# -- Readiness probe for the Envoy container (admin interface /ready).
readinessProbe:
  # HTTP GET against /ready on the container port named "admin".
  httpGet:
    path: /ready
    port: admin
  # Probe every 10s, allow 5s per attempt, and fail after 3 consecutive failures.
  periodSeconds: 10
  timeoutSeconds: 5
  failureThreshold: 3
# -- Startup probe for the Envoy container (admin interface /ready).
startupProbe:
  # HTTP GET against /ready on the container port named "admin".
  httpGet:
    path: /ready
    port: admin
  # Probe every 5s with a 3s timeout; 12 allowed failures give Envoy roughly 60s to start.
  periodSeconds: 5
  timeoutSeconds: 3
  failureThreshold: 12
# -- PodDisruptionBudget configuration for the Envoy service.
# @default -- This object has the following default values for the pod disruption budget configuration.
podDisruptionBudget:
  # -- Whether to create a PodDisruptionBudget for the Envoy pods.
  enabled: false
  # -- Minimum number of Envoy pods that must remain available during voluntary disruptions.
  # Only one of minAvailable or maxUnavailable may be set.
  minAvailable: 1
  # -- Maximum number of Envoy pods that can be unavailable during voluntary disruptions.
  # Only one of minAvailable or maxUnavailable may be set, so this stays commented out while minAvailable is set.
  # maxUnavailable: 0
  # -- Annotations for the PodDisruptionBudget.
  annotations: {}
# -- Specifies autoscaling configurations for the deployment.
autoscaling:
  # -- Whether to enable horizontal pod autoscaler.
  enabled: false
  # -- The minimum number of replicas for the deployment.
  minReplicas: 1
  # -- The maximum number of replicas for the deployment.
  maxReplicas: 10
  # -- The target CPU utilization percentage.
  targetCPUUtilizationPercentage: 80
  # Memory utilization target; commented out by default.
  # targetMemoryUtilizationPercentage: 80
  # -- Annotations for the HorizontalPodAutoscaler.
  annotations: {}
# Environment variables to pass to containers. This is an object formatted like NAME: value or NAME: valueFrom: {object}
env: {}
# -- Node selector configuration for the Envoy pods.
nodeSelector: {}
# -- Affinity configuration for the Envoy pods.
affinity: {}
# -- Tolerations configuration for the Envoy pods.
tolerations: []
# -- Topology spread constraints for the Envoy pods. See https://kubernetes.io/docs/concepts/scheduling-eviction/topology-spread-constraints/
topologySpreadConstraints: []
# ServiceMonitor configuration for Prometheus Operator
serviceMonitor:
# -- Enable ServiceMonitor resources for Prometheus Operator
enabled: false
# -- Scrape interval for the ServiceMonitor
interval: "30s"
# -- Scheme to use for scraping metrics (http or https)
scheme: "http"
# -- Additional labels to add to the ServiceMonitor
labels: {}
# -- Additional annotations to add to the ServiceMonitor
annotations: {}