mirror of
https://github.com/simstudioai/sim.git
synced 2026-01-28 00:08:21 -05:00
improvement(helm): update GPU device plugin and add cert-manager issuers (#3036)
* improvement(helm): update GPU device plugin and add cert-manager issuers * fix(helm): address code review feedback for GPU plugin and cert-manager * fix(helm): remove duplicate nodeSelector, add hook for CA issuer ordering * fix(helm): remove incorrect hook, CA issuer auto-reconciles
This commit is contained in:
84
helm/sim/templates/cert-manager-issuers.yaml
Normal file
84
helm/sim/templates/cert-manager-issuers.yaml
Normal file
@@ -0,0 +1,84 @@
|
||||
{{- if .Values.certManager.enabled }}
{{- /*
cert-manager Issuer Bootstrap Pattern

PREREQUISITE: cert-manager must be installed in your cluster before enabling this.
The root CA Certificate is created in the namespace specified by certManager.rootCA.namespace
(defaults to "cert-manager"). Ensure this namespace exists and cert-manager is running there.

Install cert-manager: https://cert-manager.io/docs/installation/

This implements the recommended pattern from cert-manager documentation:
1. A self-signed ClusterIssuer (for bootstrapping the root CA only)
2. A root CA Certificate (self-signed, used to sign other certificates)
3. A CA ClusterIssuer (uses the root CA to sign certificates)

Rendering notes:
- Every string scalar taken from values (and the release name) is piped through `quote`,
  so values such as "123", "on" or "null" remain strings once the output is parsed as YAML.
- privateKey.size is piped through `int` so it is always rendered as an integer.

Reference: https://cert-manager.io/docs/configuration/selfsigned/
*/ -}}

---
# 1. Self-Signed ClusterIssuer (Bootstrap Only)
# This issuer is used ONLY to create the root CA certificate.
# It should NOT be used directly for application certificates.
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata:
  name: {{ .Values.certManager.selfSignedIssuer.name | quote }}
  labels:
    {{- include "sim.labels" . | nindent 4 }}
    app.kubernetes.io/component: cert-manager
spec:
  selfSigned: {}

---
# 2. Root CA Certificate
# This certificate is signed by the self-signed issuer and becomes the root of trust.
# The secret created here will be used by the CA issuer to sign certificates.
# NOTE: This must be created in the cert-manager namespace (or the namespace specified
# in certManager.rootCA.namespace). Ensure cert-manager is installed there first.
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  name: {{ .Values.certManager.rootCA.certificateName | quote }}
  # Must match cert-manager's cluster-resource-namespace
  namespace: {{ .Values.certManager.rootCA.namespace | default "cert-manager" | quote }}
  labels:
    {{- include "sim.labels" . | nindent 4 }}
    app.kubernetes.io/component: cert-manager
spec:
  isCA: true
  commonName: {{ .Values.certManager.rootCA.commonName | quote }}
  secretName: {{ .Values.certManager.rootCA.secretName | quote }}
  duration: {{ .Values.certManager.rootCA.duration | default "87600h" | quote }}
  renewBefore: {{ .Values.certManager.rootCA.renewBefore | default "2160h" | quote }}
  privateKey:
    algorithm: {{ .Values.certManager.rootCA.privateKey.algorithm | default "RSA" | quote }}
    size: {{ .Values.certManager.rootCA.privateKey.size | default 4096 | int }}
  subject:
    organizations:
      {{- /* Use the configured organizations; fall back to the release name when the list is empty. */}}
      {{- with .Values.certManager.rootCA.subject.organizations }}
      {{- toYaml . | nindent 6 }}
      {{- else }}
      - {{ $.Release.Name | quote }}
      {{- end }}
  issuerRef:
    name: {{ .Values.certManager.selfSignedIssuer.name | quote }}
    kind: ClusterIssuer
    group: cert-manager.io

---
# 3. CA ClusterIssuer
# This is the issuer that should be used by applications to obtain certificates.
# It signs certificates using the root CA created above.
# NOTE: This issuer may briefly show "not ready" on first install while cert-manager
# processes the Certificate above and creates the secret. It will auto-reconcile.
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata:
  name: {{ .Values.certManager.caIssuer.name | quote }}
  labels:
    {{- include "sim.labels" . | nindent 4 }}
    app.kubernetes.io/component: cert-manager
spec:
  ca:
    # Same secret the root CA Certificate above writes to.
    secretName: {{ .Values.certManager.rootCA.secretName | quote }}
{{- end }}
|
||||
@@ -1,6 +1,36 @@
|
||||
{{- if and .Values.ollama.enabled .Values.ollama.gpu.enabled }}
|
||||
---
|
||||
# NVIDIA Device Plugin DaemonSet for GPU support
|
||||
# 1. ConfigMap holding the NVIDIA device plugin configuration file (config.yaml).
# The rendered content depends on .Values.ollama.gpu.strategy:
#   - "mig"          -> migStrategy "single"
#   - any other      -> migStrategy "none"
#   - "time-slicing" -> additionally emits a sharing.timeSlicing entry for nvidia.com/gpu
# NOTE(review): nesting inside config.yaml was reconstructed from a flattened view;
# confirm against the device plugin's config file schema.
apiVersion: v1
kind: ConfigMap
metadata:
  name: {{ include "sim.fullname" . }}-nvidia-device-plugin-config
  namespace: {{ .Release.Namespace }}
  labels:
    {{- include "sim.labels" . | nindent 4 }}
    app.kubernetes.io/component: nvidia-device-plugin
data:
  config.yaml: |
    version: v1
    flags:
      migStrategy: {{ ternary "single" "none" (eq .Values.ollama.gpu.strategy "mig") | quote }}
      failOnInitError: false
      plugin:
        passDeviceSpecs: true
        deviceListStrategy: envvar
    {{- if eq .Values.ollama.gpu.strategy "time-slicing" }}
    sharing:
      timeSlicing:
        resources:
          - name: nvidia.com/gpu
            replicas: {{ .Values.ollama.gpu.timeSlicingReplicas | default 5 }}
    {{- end }}
|
||||
---
|
||||
# 2. NVIDIA Device Plugin DaemonSet for GPU support
|
||||
apiVersion: apps/v1
|
||||
kind: DaemonSet
|
||||
metadata:
|
||||
@@ -35,9 +65,6 @@ spec:
|
||||
# Only schedule on nodes with NVIDIA GPUs
|
||||
accelerator: nvidia
|
||||
priorityClassName: system-node-critical
|
||||
runtimeClassName: nvidia
|
||||
hostNetwork: true
|
||||
hostPID: true
|
||||
volumes:
|
||||
- name: device-plugin
|
||||
hostPath:
|
||||
@@ -48,22 +75,21 @@ spec:
|
||||
- name: sys
|
||||
hostPath:
|
||||
path: /sys
|
||||
- name: proc-driver-nvidia
|
||||
hostPath:
|
||||
path: /proc/driver/nvidia
|
||||
# Volume to mount the ConfigMap
|
||||
- name: nvidia-device-plugin-config
|
||||
configMap:
|
||||
name: {{ include "sim.fullname" . }}-nvidia-device-plugin-config
|
||||
containers:
|
||||
- name: nvidia-device-plugin
|
||||
image: nvcr.io/nvidia/k8s-device-plugin:v0.14.5
|
||||
image: nvcr.io/nvidia/k8s-device-plugin:v0.18.2
|
||||
imagePullPolicy: Always
|
||||
args:
|
||||
- --mig-strategy=single
|
||||
- --pass-device-specs=true
|
||||
- --fail-on-init-error=false
|
||||
- --device-list-strategy=envvar
|
||||
- --nvidia-driver-root=/host-sys/fs/cgroup
|
||||
- "--config-file=/etc/device-plugin/config.yaml"
|
||||
{{- if eq .Values.ollama.gpu.strategy "mig" }}
|
||||
env:
|
||||
- name: NVIDIA_MIG_MONITOR_DEVICES
|
||||
value: all
|
||||
{{- end }}
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
capabilities:
|
||||
@@ -74,29 +100,16 @@ spec:
|
||||
- name: dev
|
||||
mountPath: /dev
|
||||
- name: sys
|
||||
mountPath: /host-sys
|
||||
mountPath: /sys
|
||||
readOnly: true
|
||||
- name: proc-driver-nvidia
|
||||
mountPath: /proc/driver/nvidia
|
||||
- name: nvidia-device-plugin-config
|
||||
mountPath: /etc/device-plugin/
|
||||
readOnly: true
|
||||
resources:
|
||||
requests:
|
||||
cpu: 50m
|
||||
memory: 10Mi
|
||||
memory: 20Mi
|
||||
limits:
|
||||
cpu: 50m
|
||||
memory: 20Mi
|
||||
{{- if .Values.nodeSelector }}
|
||||
nodeSelector:
|
||||
{{- toYaml .Values.nodeSelector | nindent 8 }}
|
||||
{{- end }}
|
||||
---
|
||||
# RuntimeClass for NVIDIA Container Runtime
|
||||
apiVersion: node.k8s.io/v1
|
||||
kind: RuntimeClass
|
||||
metadata:
|
||||
name: {{ include "sim.fullname" . }}-nvidia
|
||||
labels:
|
||||
{{- include "sim.labels" . | nindent 4 }}
|
||||
handler: nvidia
|
||||
{{- end }}
|
||||
memory: 50Mi
|
||||
{{- end }}
|
||||
|
||||
@@ -400,8 +400,10 @@ postgresql:
|
||||
algorithm: RSA # RSA or ECDSA
|
||||
size: 4096 # Key size in bits
|
||||
# Issuer reference (REQUIRED if tls.enabled is true)
|
||||
# By default, references the CA issuer created by certManager.caIssuer
|
||||
# Make sure certManager.enabled is true, or provide your own issuer
|
||||
issuerRef:
|
||||
name: selfsigned-cluster-issuer # Name of your cert-manager Issuer/ClusterIssuer
|
||||
name: sim-ca-issuer # Name of your cert-manager Issuer/ClusterIssuer
|
||||
kind: ClusterIssuer # ClusterIssuer or Issuer
|
||||
group: "" # Optional: cert-manager.io (leave empty for default)
|
||||
# Additional DNS names (optional)
|
||||
@@ -463,20 +465,26 @@ externalDatabase:
|
||||
ollama:
|
||||
# Enable/disable Ollama deployment
|
||||
enabled: false
|
||||
|
||||
|
||||
# Image configuration
|
||||
image:
|
||||
repository: ollama/ollama
|
||||
tag: latest
|
||||
pullPolicy: Always
|
||||
|
||||
|
||||
# Number of replicas
|
||||
replicaCount: 1
|
||||
|
||||
|
||||
# GPU configuration
|
||||
gpu:
|
||||
enabled: false
|
||||
count: 1
|
||||
# GPU sharing strategy: "mig" (Multi-Instance GPU) or "time-slicing"
|
||||
# - mig: Hardware-level GPU partitioning (requires supported GPUs like A100)
|
||||
# - time-slicing: Software-level GPU sharing (works with most NVIDIA GPUs)
|
||||
strategy: "time-slicing"
|
||||
# Number of time-slicing replicas (only used when strategy is "time-slicing")
|
||||
timeSlicingReplicas: 5
|
||||
|
||||
# Node selector for GPU workloads (adjust labels based on your cluster configuration)
|
||||
nodeSelector:
|
||||
@@ -1185,4 +1193,53 @@ externalSecrets:
|
||||
# External database password (when using managed database services)
|
||||
externalDatabase:
|
||||
# Path to external database password in external store
|
||||
password: ""
|
||||
password: ""
|
||||
|
||||
# cert-manager issuer resources
# Prerequisite: cert-manager must already be installed in the cluster.
# Installation guide: https://cert-manager.io/docs/installation/
#
# Enabling this renders the CA bootstrap chain recommended by cert-manager:
#   1. Self-signed ClusterIssuer - bootstrap only, issues the root CA
#   2. Root CA Certificate       - self-signed, acts as the trust anchor
#   3. CA ClusterIssuer          - signs application certificates with the root CA
#
# Reference: https://cert-manager.io/docs/configuration/selfsigned/
certManager:
  # Toggle for all cert-manager issuer resources
  enabled: false

  # Bootstrap issuer: exists ONLY to sign the root CA.
  # Application certificates should not reference this issuer directly.
  selfSignedIssuer:
    name: "sim-selfsigned-bootstrap-issuer"

  # Root CA Certificate
  # Signed by the self-signed issuer above and used as the trust anchor
  rootCA:
    # Certificate resource name
    certificateName: "sim-root-ca"
    # Namespace that receives the root CA certificate and its secret
    # Must match cert-manager's cluster-resource-namespace (default: cert-manager)
    namespace: "cert-manager"
    # Common name written into the root CA certificate
    commonName: "sim-root-ca"
    # Secret that stores the root CA certificate and key
    secretName: "sim-root-ca-secret"
    # Validity period of the certificate (default: 10 years)
    duration: "87600h"
    # How long before expiry to renew (default: 90 days)
    renewBefore: "2160h"
    # Private key settings
    privateKey:
      algorithm: "RSA"
      size: 4096
    # Subject settings
    subject:
      # An empty list falls back to the release name
      organizations: []

  # CA ClusterIssuer
  # The issuer applications should reference when requesting certificates
  caIssuer:
    name: "sim-ca-issuer"
|
||||
Reference in New Issue
Block a user