improvement(helm): update GPU device plugin and add cert-manager issuers (#3036)

* improvement(helm): update GPU device plugin and add cert-manager issuers

* fix(helm): address code review feedback for GPU plugin and cert-manager

* fix(helm): remove duplicate nodeSelector, add hook for CA issuer ordering

* fix(helm): remove incorrect hook, CA issuer auto-reconciles
This commit is contained in:
Waleed
2026-01-27 18:25:08 -08:00
committed by GitHub
parent 65bc21608c
commit b4a389a71f
3 changed files with 191 additions and 37 deletions

View File

@@ -0,0 +1,84 @@
{{- if .Values.certManager.enabled }}
{{- /*
cert-manager issuer bootstrap.

PREREQUISITE: cert-manager must already be installed in the cluster.
Install cert-manager: https://cert-manager.io/docs/installation/

Rendered only when certManager.enabled is true. Three resources are emitted:
  1. A self-signed ClusterIssuer, used only to bootstrap the root CA.
  2. A root CA Certificate (isCA: true) issued by (1). It is created in
     certManager.rootCA.namespace (falls back to "cert-manager"), which must
     match cert-manager's cluster-resource-namespace so that the CA
     ClusterIssuer can read the resulting secret.
  3. A CA ClusterIssuer that signs certificates with the secret produced by (2).
Reference: https://cert-manager.io/docs/configuration/selfsigned/

Every string value is piped through `quote` so that user-supplied values
(for example an all-digit release name, or a commonName containing ": " or
" #") cannot be retyped or mis-parsed by YAML. privateKey.size is left
unquoted on purpose: it has to render as an integer.
*/ -}}
{{- $rootCA := .Values.certManager.rootCA }}
---
# 1. Self-signed ClusterIssuer (bootstrap only)
# Used ONLY to issue the root CA certificate below.
# Do NOT reference it directly for application certificates.
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata:
  name: {{ .Values.certManager.selfSignedIssuer.name | quote }}
  labels:
    {{- include "sim.labels" . | nindent 4 }}
    app.kubernetes.io/component: cert-manager
spec:
  selfSigned: {}
---
# 2. Root CA Certificate
# Issued by the self-signed issuer; it becomes the root of trust.
# The secret written here is what the CA issuer uses to sign certificates.
# NOTE: the namespace must match cert-manager's cluster-resource-namespace
# (certManager.rootCA.namespace, "cert-manager" when unset). cert-manager has
# to be installed there first.
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  name: {{ $rootCA.certificateName | quote }}
  namespace: {{ $rootCA.namespace | default "cert-manager" | quote }}
  labels:
    {{- include "sim.labels" . | nindent 4 }}
    app.kubernetes.io/component: cert-manager
spec:
  isCA: true
  commonName: {{ $rootCA.commonName | quote }}
  secretName: {{ $rootCA.secretName | quote }}
  duration: {{ $rootCA.duration | default "87600h" | quote }}
  renewBefore: {{ $rootCA.renewBefore | default "2160h" | quote }}
  privateKey:
    algorithm: {{ $rootCA.privateKey.algorithm | default "RSA" | quote }}
    # Integer field: intentionally not quoted.
    size: {{ $rootCA.privateKey.size | default 4096 }}
  subject:
    organizations:
      {{- if $rootCA.subject.organizations }}
      {{- toYaml $rootCA.subject.organizations | nindent 6 }}
      {{- else }}
      # No organizations configured: fall back to the release name.
      - {{ .Release.Name | quote }}
      {{- end }}
  issuerRef:
    name: {{ .Values.certManager.selfSignedIssuer.name | quote }}
    kind: ClusterIssuer
    group: cert-manager.io
---
# 3. CA ClusterIssuer
# This is the issuer applications should reference to obtain certificates.
# It signs with the root CA secret created by the Certificate above.
# NOTE: on first install it may briefly report "not ready" until cert-manager
# has processed the Certificate and created the secret; it then auto-reconciles.
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata:
  name: {{ .Values.certManager.caIssuer.name | quote }}
  labels:
    {{- include "sim.labels" . | nindent 4 }}
    app.kubernetes.io/component: cert-manager
spec:
  ca:
    secretName: {{ $rootCA.secretName | quote }}
{{- end }}

View File

@@ -1,6 +1,36 @@
{{- if and .Values.ollama.enabled .Values.ollama.gpu.enabled }}
---
# NVIDIA Device Plugin DaemonSet for GPU support
{{- $gpu := .Values.ollama.gpu }}
# 1. NVIDIA device plugin configuration file, delivered as a ConfigMap
apiVersion: v1
kind: ConfigMap
metadata:
  name: {{ printf "%s-nvidia-device-plugin-config" (include "sim.fullname" .) }}
  namespace: {{ .Release.Namespace }}
  labels:
    {{- include "sim.labels" . | nindent 4 }}
    app.kubernetes.io/component: nvidia-device-plugin
data:
  config.yaml: |
    version: v1
    flags:
      {{- /* strategy "mig" selects the "single" MIG strategy; any other value renders "none". */}}
      migStrategy: {{ ternary "single" "none" (eq $gpu.strategy "mig") | quote }}
      failOnInitError: false
      plugin:
        passDeviceSpecs: true
        deviceListStrategy: envvar
    {{- /* The sharing section is rendered only for the "time-slicing" strategy. */}}
    {{- if eq $gpu.strategy "time-slicing" }}
    sharing:
      timeSlicing:
        resources:
          - name: nvidia.com/gpu
            replicas: {{ $gpu.timeSlicingReplicas | default 5 }}
    {{- end }}
---
# 2. NVIDIA Device Plugin DaemonSet for GPU support
apiVersion: apps/v1
kind: DaemonSet
metadata:
@@ -35,9 +65,6 @@ spec:
# Only schedule on nodes with NVIDIA GPUs
accelerator: nvidia
priorityClassName: system-node-critical
runtimeClassName: nvidia
hostNetwork: true
hostPID: true
volumes:
- name: device-plugin
hostPath:
@@ -48,22 +75,21 @@ spec:
- name: sys
hostPath:
path: /sys
- name: proc-driver-nvidia
hostPath:
path: /proc/driver/nvidia
# Volume to mount the ConfigMap
- name: nvidia-device-plugin-config
configMap:
name: {{ include "sim.fullname" . }}-nvidia-device-plugin-config
containers:
- name: nvidia-device-plugin
image: nvcr.io/nvidia/k8s-device-plugin:v0.14.5
image: nvcr.io/nvidia/k8s-device-plugin:v0.18.2
imagePullPolicy: Always
args:
- --mig-strategy=single
- --pass-device-specs=true
- --fail-on-init-error=false
- --device-list-strategy=envvar
- --nvidia-driver-root=/host-sys/fs/cgroup
- "--config-file=/etc/device-plugin/config.yaml"
{{- if eq .Values.ollama.gpu.strategy "mig" }}
env:
- name: NVIDIA_MIG_MONITOR_DEVICES
value: all
{{- end }}
securityContext:
allowPrivilegeEscalation: false
capabilities:
@@ -74,29 +100,16 @@ spec:
- name: dev
mountPath: /dev
- name: sys
mountPath: /host-sys
mountPath: /sys
readOnly: true
- name: proc-driver-nvidia
mountPath: /proc/driver/nvidia
- name: nvidia-device-plugin-config
mountPath: /etc/device-plugin/
readOnly: true
resources:
requests:
cpu: 50m
memory: 10Mi
memory: 20Mi
limits:
cpu: 50m
memory: 20Mi
{{- if .Values.nodeSelector }}
nodeSelector:
{{- toYaml .Values.nodeSelector | nindent 8 }}
{{- end }}
---
# RuntimeClass for NVIDIA Container Runtime
apiVersion: node.k8s.io/v1
kind: RuntimeClass
metadata:
name: {{ include "sim.fullname" . }}-nvidia
labels:
{{- include "sim.labels" . | nindent 4 }}
handler: nvidia
{{- end }}
memory: 50Mi
{{- end }}

View File

@@ -400,8 +400,10 @@ postgresql:
algorithm: RSA # RSA or ECDSA
size: 4096 # Key size in bits
# Issuer reference (REQUIRED if tls.enabled is true)
# By default, references the CA issuer created by certManager.caIssuer
# Make sure certManager.enabled is true, or provide your own issuer
issuerRef:
name: selfsigned-cluster-issuer # Name of your cert-manager Issuer/ClusterIssuer
name: sim-ca-issuer # Name of your cert-manager Issuer/ClusterIssuer
kind: ClusterIssuer # ClusterIssuer or Issuer
group: "" # Optional: cert-manager.io (leave empty for default)
# Additional DNS names (optional)
@@ -463,20 +465,26 @@ externalDatabase:
ollama:
# Enable/disable Ollama deployment
enabled: false
# Image configuration
image:
repository: ollama/ollama
tag: latest
pullPolicy: Always
# Number of replicas
replicaCount: 1
# GPU configuration
gpu:
enabled: false
count: 1
# GPU sharing strategy: "mig" (Multi-Instance GPU) or "time-slicing"
# - mig: Hardware-level GPU partitioning (requires supported GPUs like A100)
# - time-slicing: Software-level GPU sharing (works with most NVIDIA GPUs)
strategy: "time-slicing"
# Number of time-slicing replicas (only used when strategy is "time-slicing")
timeSlicingReplicas: 5
# Node selector for GPU workloads (adjust labels based on your cluster configuration)
nodeSelector:
@@ -1185,4 +1193,53 @@ externalSecrets:
# External database password (when using managed database services)
externalDatabase:
# Path to external database password in external store
password: ""
password: ""
# cert-manager configuration
# Prerequisite: cert-manager must already be installed in your cluster.
# See: https://cert-manager.io/docs/installation/
#
# When enabled, the chart renders the CA bootstrap chain recommended by cert-manager:
#   1. Self-signed ClusterIssuer (bootstrap only - issues the root CA)
#   2. Root CA Certificate (isCA: true, becomes the trust anchor)
#   3. CA ClusterIssuer (signs application certificates using the root CA secret)
#
# Reference: https://cert-manager.io/docs/configuration/selfsigned/
certManager:
  # Enable/disable rendering of the three cert-manager resources listed above
  enabled: false
  # Self-signed ClusterIssuer (used ONLY to bootstrap the root CA)
  # Do not reference this issuer directly for application certificates
  selfSignedIssuer:
    # Name of the bootstrap ClusterIssuer; also used as the root CA's issuerRef
    name: "sim-selfsigned-bootstrap-issuer"
  # Root CA Certificate configuration
  # This certificate is issued by the self-signed issuer and used as the trust anchor
  rootCA:
    # Name of the Certificate resource
    certificateName: "sim-root-ca"
    # Namespace where the root CA Certificate and its secret are created
    # Must match cert-manager's cluster-resource-namespace
    # (the template falls back to "cert-manager" when this is unset)
    namespace: "cert-manager"
    # Common name for the root CA certificate
    commonName: "sim-root-ca"
    # Secret that stores the root CA certificate and key; the CA ClusterIssuer reads it
    secretName: "sim-root-ca-secret"
    # Certificate validity duration (87600h = 10 years)
    duration: "87600h"
    # How long before expiry the certificate is renewed (2160h = 90 days)
    renewBefore: "2160h"
    # Private key configuration
    privateKey:
      # RSA or ECDSA
      # NOTE(review): if you switch to ECDSA, adjust size as well - confirm the
      # accepted sizes in the cert-manager Certificate API reference
      algorithm: RSA
      # Key size in bits
      size: 4096
    # Subject configuration
    subject:
      # Organizations placed in the certificate subject
      # If left empty, the template uses the release name as the single organization
      organizations: []
  # CA ClusterIssuer configuration
  # This is the issuer that applications should reference for obtaining certificates
  caIssuer:
    # Keep this in sync with any issuerRef.name that points at the CA issuer
    # (the chart's PostgreSQL TLS issuerRef defaults to "sim-ca-issuer")
    name: "sim-ca-issuer"