diff --git a/helm/sim/templates/cert-manager-issuers.yaml b/helm/sim/templates/cert-manager-issuers.yaml
new file mode 100644
index 000000000..aef2a61a0
--- /dev/null
+++ b/helm/sim/templates/cert-manager-issuers.yaml
@@ -0,0 +1,84 @@
+{{- if .Values.certManager.enabled }}
+{{- /*
+  cert-manager Issuer Bootstrap Pattern
+
+  PREREQUISITE: cert-manager must be installed in your cluster before enabling this.
+  The root CA Certificate is created in the namespace specified by certManager.rootCA.namespace
+  (defaults to "cert-manager"). Ensure this namespace exists and cert-manager is running there.
+
+  Install cert-manager: https://cert-manager.io/docs/installation/
+
+  This implements the recommended pattern from cert-manager documentation:
+  1. A self-signed ClusterIssuer (for bootstrapping the root CA only)
+  2. A root CA Certificate (self-signed, used to sign other certificates)
+  3. A CA ClusterIssuer (uses the root CA to sign certificates)
+
+  Reference: https://cert-manager.io/docs/configuration/selfsigned/
+*/ -}}
+
+---
+# 1. Self-Signed ClusterIssuer (Bootstrap Only)
+# This issuer is used ONLY to create the root CA certificate.
+# It should NOT be used directly for application certificates.
+apiVersion: cert-manager.io/v1
+kind: ClusterIssuer
+metadata:
+  name: {{ .Values.certManager.selfSignedIssuer.name | quote }}
+  labels:
+    {{- include "sim.labels" . | nindent 4 }}
+    app.kubernetes.io/component: cert-manager
+spec:
+  selfSigned: {}
+
+---
+# 2. Root CA Certificate
+# This certificate is signed by the self-signed issuer and becomes the root of trust.
+# The secret created here will be used by the CA issuer to sign certificates.
+# NOTE: This must be created in the cert-manager namespace (or the namespace specified
+# in certManager.rootCA.namespace). Ensure cert-manager is installed there first.
+apiVersion: cert-manager.io/v1
+kind: Certificate
+metadata:
+  name: {{ .Values.certManager.rootCA.certificateName | quote }}
+  namespace: {{ .Values.certManager.rootCA.namespace | default "cert-manager" | quote }}  # Must match cert-manager's cluster-resource-namespace
+  labels:
+    {{- include "sim.labels" . | nindent 4 }}
+    app.kubernetes.io/component: cert-manager
+spec:
+  isCA: true
+  commonName: {{ .Values.certManager.rootCA.commonName | quote }}
+  secretName: {{ .Values.certManager.rootCA.secretName | quote }}
+  duration: {{ .Values.certManager.rootCA.duration | default "87600h" | quote }}
+  renewBefore: {{ .Values.certManager.rootCA.renewBefore | default "2160h" | quote }}
+  privateKey:
+    algorithm: {{ .Values.certManager.rootCA.privateKey.algorithm | default "RSA" | quote }}
+    size: {{ .Values.certManager.rootCA.privateKey.size | default 4096 }}
+  subject:
+    organizations:
+      {{- if .Values.certManager.rootCA.subject.organizations }}
+      {{- toYaml .Values.certManager.rootCA.subject.organizations | nindent 6 }}
+      {{- else }}
+      - {{ .Release.Name | quote }}
+      {{- end }}
+  issuerRef:
+    name: {{ .Values.certManager.selfSignedIssuer.name | quote }}
+    kind: ClusterIssuer
+    group: cert-manager.io
+
+---
+# 3. CA ClusterIssuer
+# This is the issuer that should be used by applications to obtain certificates.
+# It signs certificates using the root CA created above.
+# NOTE: This issuer may briefly show "not ready" on first install while cert-manager
+# processes the Certificate above and creates the secret. It will auto-reconcile.
+apiVersion: cert-manager.io/v1
+kind: ClusterIssuer
+metadata:
+  name: {{ .Values.certManager.caIssuer.name | quote }}
+  labels:
+    {{- include "sim.labels" . | nindent 4 }}
+    app.kubernetes.io/component: cert-manager
+spec:
+  ca:
+    secretName: {{ .Values.certManager.rootCA.secretName | quote }}
+{{- end }}
diff --git a/helm/sim/templates/gpu-device-plugin.yaml b/helm/sim/templates/gpu-device-plugin.yaml
index df9a30b3d..b7bb9a628 100644
--- a/helm/sim/templates/gpu-device-plugin.yaml
+++ b/helm/sim/templates/gpu-device-plugin.yaml
@@ -1,6 +1,36 @@
 {{- if and .Values.ollama.enabled .Values.ollama.gpu.enabled }}
 ---
-# NVIDIA Device Plugin DaemonSet for GPU support
+# 1. ConfigMap for NVIDIA Device Plugin Configuration
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: {{ include "sim.fullname" . }}-nvidia-device-plugin-config
+  namespace: {{ .Release.Namespace | quote }}
+  labels:
+    {{- include "sim.labels" . | nindent 4 }}
+    app.kubernetes.io/component: nvidia-device-plugin
+data:
+  config.yaml: |
+    version: v1
+    flags:
+      {{- if eq .Values.ollama.gpu.strategy "mig" }}
+      migStrategy: "single"
+      {{- else }}
+      migStrategy: "none"
+      {{- end }}
+      failOnInitError: false
+      plugin:
+        passDeviceSpecs: true
+        deviceListStrategy: envvar
+    {{- if eq .Values.ollama.gpu.strategy "time-slicing" }}
+    sharing:
+      timeSlicing:
+        resources:
+          - name: nvidia.com/gpu
+            replicas: {{ .Values.ollama.gpu.timeSlicingReplicas | default 5 }}
+    {{- end }}
+---
+# 2. NVIDIA Device Plugin DaemonSet for GPU support
 apiVersion: apps/v1
 kind: DaemonSet
 metadata:
@@ -35,9 +65,6 @@ spec:
         # Only schedule on nodes with NVIDIA GPUs
         accelerator: nvidia
       priorityClassName: system-node-critical
-      runtimeClassName: nvidia
-      hostNetwork: true
-      hostPID: true
       volumes:
         - name: device-plugin
           hostPath:
@@ -48,22 +75,21 @@ spec:
         - name: sys
           hostPath:
             path: /sys
-        - name: proc-driver-nvidia
-          hostPath:
-            path: /proc/driver/nvidia
+        # Volume to mount the ConfigMap
+        - name: nvidia-device-plugin-config
+          configMap:
+            name: {{ include "sim.fullname" . }}-nvidia-device-plugin-config
       containers:
         - name: nvidia-device-plugin
-          image: nvcr.io/nvidia/k8s-device-plugin:v0.14.5
+          image: nvcr.io/nvidia/k8s-device-plugin:v0.18.2
-          imagePullPolicy: Always
+          imagePullPolicy: IfNotPresent  # tag is pinned, so skip the registry round-trip on every pod start
           args:
-            - --mig-strategy=single
-            - --pass-device-specs=true
-            - --fail-on-init-error=false
-            - --device-list-strategy=envvar
-            - --nvidia-driver-root=/host-sys/fs/cgroup
+            - "--config-file=/etc/device-plugin/config.yaml"
+          {{- if eq .Values.ollama.gpu.strategy "mig" }}
           env:
             - name: NVIDIA_MIG_MONITOR_DEVICES
-              value: all
+              value: all  # NOTE(review): NVIDIA's chart appears to add CAP_SYS_ADMIN when MIG monitoring is on; confirm this securityContext allows it
+          {{- end }}
           securityContext:
             allowPrivilegeEscalation: false
             capabilities:
@@ -74,29 +100,16 @@ spec:
             - name: dev
               mountPath: /dev
             - name: sys
-              mountPath: /host-sys
+              mountPath: /sys
               readOnly: true
-            - name: proc-driver-nvidia
-              mountPath: /proc/driver/nvidia
+            - name: nvidia-device-plugin-config
+              mountPath: /etc/device-plugin/
               readOnly: true
           resources:
             requests:
               cpu: 50m
-              memory: 10Mi
+              memory: 20Mi
             limits:
               cpu: 50m
-              memory: 20Mi
-      {{- if .Values.nodeSelector }}
-      nodeSelector:
-        {{- toYaml .Values.nodeSelector | nindent 8 }}
-      {{- end }}
----
-# RuntimeClass for NVIDIA Container Runtime
-apiVersion: node.k8s.io/v1
-kind: RuntimeClass
-metadata:
-  name: {{ include "sim.fullname" . }}-nvidia
-  labels:
-    {{- include "sim.labels" . | nindent 4 }}
-handler: nvidia
-{{- end }}
\ No newline at end of file
+              memory: 50Mi
+{{- end }}
diff --git a/helm/sim/values.yaml b/helm/sim/values.yaml
index dc09a9ce2..e78e0f917 100644
--- a/helm/sim/values.yaml
+++ b/helm/sim/values.yaml
@@ -400,8 +400,10 @@ postgresql:
       algorithm: RSA  # RSA or ECDSA
       size: 4096  # Key size in bits
     # Issuer reference (REQUIRED if tls.enabled is true)
+    # By default, references the CA issuer created by certManager.caIssuer
+    # Make sure certManager.enabled is true, or provide your own issuer
     issuerRef:
-      name: selfsigned-cluster-issuer  # Name of your cert-manager Issuer/ClusterIssuer
+      name: sim-ca-issuer  # Name of your cert-manager Issuer/ClusterIssuer
       kind: ClusterIssuer  # ClusterIssuer or Issuer
       group: ""  # Optional: cert-manager.io (leave empty for default)
     # Additional DNS names (optional)
@@ -463,20 +465,26 @@ externalDatabase:
 ollama:
   # Enable/disable Ollama deployment
   enabled: false
-  
+
   # Image configuration
   image:
     repository: ollama/ollama
     tag: latest
     pullPolicy: Always
-  
+
   # Number of replicas
   replicaCount: 1
-  
+
   # GPU configuration
   gpu:
     enabled: false
     count: 1
+    # GPU sharing strategy: "mig" (Multi-Instance GPU) or "time-slicing"; any other value disables sharing
+    # - mig: Hardware-level GPU partitioning (requires supported GPUs like A100)
+    # - time-slicing: Software-level GPU sharing (works with most NVIDIA GPUs)
+    strategy: "time-slicing"
+    # Number of time-slicing replicas (only used when strategy is "time-slicing")
+    timeSlicingReplicas: 5
 
   # Node selector for GPU workloads (adjust labels based on your cluster configuration)
   nodeSelector:
@@ -1185,4 +1193,53 @@ externalSecrets:
     # External database password (when using managed database services)
     externalDatabase:
       # Path to external database password in external store
-      password: ""
\ No newline at end of file
+      password: ""
+
+# cert-manager configuration
+# Prerequisites: Install cert-manager in your cluster first
+# See: https://cert-manager.io/docs/installation/
+#
+# This implements the recommended CA bootstrap pattern from cert-manager:
+# 1. Self-signed ClusterIssuer (bootstrap only - creates root CA)
+# 2. Root CA Certificate (self-signed, becomes the trust anchor)
+# 3. CA ClusterIssuer (signs application certificates using root CA)
+#
+# Reference: https://cert-manager.io/docs/configuration/selfsigned/
+certManager:
+  # Enable/disable cert-manager issuer resources
+  enabled: false
+
+  # Self-signed ClusterIssuer (used ONLY to bootstrap the root CA)
+  # Do not reference this issuer directly for application certificates
+  selfSignedIssuer:
+    name: "sim-selfsigned-bootstrap-issuer"
+
+  # Root CA Certificate configuration
+  # This certificate is signed by the self-signed issuer and used as the trust anchor
+  rootCA:
+    # Name of the Certificate resource
+    certificateName: "sim-root-ca"
+    # Namespace where the root CA certificate and secret will be created
+    # Must match cert-manager's cluster-resource-namespace (default: cert-manager)
+    namespace: "cert-manager"
+    # Common name for the root CA certificate
+    commonName: "sim-root-ca"
+    # Secret name where the root CA certificate and key will be stored
+    secretName: "sim-root-ca-secret"
+    # Certificate validity duration (default: 10 years)
+    duration: "87600h"
+    # Renew before expiry (default: 90 days)
+    renewBefore: "2160h"
+    # Private key configuration
+    privateKey:
+      algorithm: RSA
+      size: 4096
+    # Subject configuration
+    subject:
+      organizations: []
+      # If empty, defaults to the release name
+
+  # CA ClusterIssuer configuration
+  # This is the issuer that applications should reference for obtaining certificates
+  caIssuer:
+    name: "sim-ca-issuer"