diff --git a/.gitignore b/.gitignore index 08dedb867..d48610097 100644 --- a/.gitignore +++ b/.gitignore @@ -65,4 +65,7 @@ start-collector.sh .turbo # VSCode -.vscode \ No newline at end of file +.vscode + +## Helm Chart Tests +helm/sim/test \ No newline at end of file diff --git a/apps/sim/components/emails/batch-invitation-email.tsx b/apps/sim/components/emails/batch-invitation-email.tsx index 0fa5bbfe6..aa4eecc0e 100644 --- a/apps/sim/components/emails/batch-invitation-email.tsx +++ b/apps/sim/components/emails/batch-invitation-email.tsx @@ -73,7 +73,7 @@ export const BatchInvitationEmail = ({ src='https://simstudio.ai/logo.png' width='120' height='36' - alt='SimStudio' + alt='Sim Studio' style={logo} /> @@ -82,7 +82,7 @@ export const BatchInvitationEmail = ({ {inviterName} has invited you to join{' '} - {organizationName} on SimStudio. + {organizationName} on Sim Studio. {/* Organization Invitation Details */} diff --git a/helm/sim/.helmignore b/helm/sim/.helmignore new file mode 100644 index 000000000..b7ed70ccb --- /dev/null +++ b/helm/sim/.helmignore @@ -0,0 +1,28 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. 
+.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ +# Examples directory (included in chart but ignored during packaging) +examples/ +# Test files +*_test.yaml +test/ \ No newline at end of file diff --git a/helm/sim/Chart.yaml b/helm/sim/Chart.yaml new file mode 100644 index 000000000..b21cdea25 --- /dev/null +++ b/helm/sim/Chart.yaml @@ -0,0 +1,23 @@ +apiVersion: v2 +name: sim +description: A Helm chart for Sim - AI agent workflow platform +type: application +version: 0.1.0 +appVersion: "1.0.0" +home: https://simstudio.ai +icon: https://raw.githubusercontent.com/simstudioai/sim/main/apps/sim/public/sim.svg +sources: + - https://github.com/simstudioai/sim +maintainers: + - name: Sim Team + email: help@simstudio.ai + url: https://simstudio.ai +keywords: + - ai + - workflow + - automation + - agents + - nextjs +annotations: + category: AI/ML Platform + licenses: Apache-2.0 \ No newline at end of file diff --git a/helm/sim/README.md b/helm/sim/README.md new file mode 100644 index 000000000..f5c480960 --- /dev/null +++ b/helm/sim/README.md @@ -0,0 +1,619 @@ +# Sim Helm Chart + +This Helm chart deploys Sim, a lightweight AI agent workflow platform, on Kubernetes. + +## Prerequisites + +- Kubernetes 1.19+ +- Helm 3.0+ +- PV provisioner support in the underlying infrastructure (for persistent storage) + +## Installation + +### Quick Start + +1. Add the chart repository (if using a separate repo): +```bash +helm repo add simstudio https://charts.simstudio.ai +helm repo update +``` + +2. 
Install the chart: +```bash +helm install sim simstudio/sim +``` + +### Local Installation + +If using the chart from this repository: + +```bash +# From the repository root +helm install sim ./helm/sim +``` + +### Custom Configuration + +Install with custom values: + +```bash +helm install sim ./helm/sim -f custom-values.yaml +``` + +## Configuration Examples + +The chart includes several pre-configured values files for different scenarios: + +| Example File | Description | Use Case | +|-------------|-------------|----------| +| `values-development.yaml` | Minimal resources, no SSL | Local development and testing | +| `values-production.yaml` | High availability, security-focused | Generic production deployment | +| `values-external-db.yaml` | External database configuration | Production with managed database | +| `values-azure.yaml` | Azure AKS optimized | Azure Kubernetes Service | +| `values-aws.yaml` | AWS EKS optimized | Amazon Elastic Kubernetes Service | +| `values-gcp.yaml` | GCP GKE optimized | Google Kubernetes Engine | + +### Development Environment + +```bash +helm install sim-dev ./helm/sim \ + --values ./helm/sim/examples/values-development.yaml \ + --namespace simstudio-dev --create-namespace +``` + +### Production Environment + +```bash +helm install sim-prod ./helm/sim \ + --values ./helm/sim/examples/values-production.yaml \ + --namespace simstudio-prod --create-namespace +``` + +### Azure Environment + +```bash +helm install sim-azure ./helm/sim \ + --values ./helm/sim/examples/values-azure.yaml \ + --namespace simstudio --create-namespace +``` + +### AWS Environment (EKS) + +```bash +helm install sim-aws ./helm/sim \ + --values ./helm/sim/examples/values-aws.yaml \ + --namespace simstudio --create-namespace +``` + +### GCP Environment (GKE) + +```bash +helm install sim-gcp ./helm/sim \ + --values ./helm/sim/examples/values-gcp.yaml \ + --namespace simstudio --create-namespace +``` + +### External Database (Managed Services) + +```bash +helm 
install sim-prod ./helm/sim \ + --values ./helm/sim/examples/values-external-db.yaml \ + --set externalDatabase.host="your-rds-endpoint.com" \ + --set externalDatabase.username="simstudio_user" \ + --set externalDatabase.password="secure-password" \ + --set externalDatabase.database="simstudio_prod" \ + --namespace simstudio --create-namespace +``` + +## Cloud-Specific Features + +Each cloud platform example includes optimized configurations: + +### Azure (AKS) +- **Storage**: Premium managed disks (`managed-csi-premium`) +- **Node Selectors**: Role-based node targeting (`node-role: application`, `node-role: datalake`) +- **GPU Support**: NVIDIA GPU nodes with tolerations +- **Ingress**: NGINX ingress controller with SSL redirect + +### AWS (EKS) +- **Storage**: EBS GP3 volumes for optimal performance +- **Node Selectors**: Instance type targeting (`t3.large`, `r5.large`, `g4dn.xlarge`) +- **GPU Support**: GPU-optimized instances (G4, P3 families) +- **Ingress**: Application Load Balancer (ALB) with AWS Certificate Manager +- **IAM**: Service Account annotations for IAM roles + +### GCP (GKE) +- **Storage**: Persistent Disk with standard and premium options +- **Node Selectors**: Node pool and machine family targeting +- **GPU Support**: Tesla T4/V100 GPUs with GKE accelerator labels +- **Ingress**: Google Cloud Load Balancer with managed certificates +- **Workload Identity**: Service Account annotations for GCP IAM + +## Configuration + +The following table lists the configurable parameters and their default values. 
+ +### Global Parameters + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `global.imageRegistry` | Global Docker image registry | `"ghcr.io"` | +| `global.useRegistryForAllImages` | Use custom registry for all images (not just simstudioai/*) | `false` | +| `global.imagePullSecrets` | Global Docker registry secret names | `[]` | +| `global.storageClass` | Global storage class for PVCs | `""` | +| `global.commonLabels` | Common labels to add to all resources | `{}` | + +### Application Parameters + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `app.enabled` | Enable the main application | `true` | +| `app.replicaCount` | Number of app replicas | `1` | +| `app.image.repository` | App image repository | `simstudioai/sim` | +| `app.image.tag` | App image tag | `latest` | +| `app.image.pullPolicy` | App image pull policy | `Always` | +| `app.resources` | App resource limits and requests | See values.yaml | +| `app.nodeSelector` | App node selector | `{}` | +| `app.podSecurityContext` | App pod security context | `fsGroup: 1001` | +| `app.securityContext` | App container security context | `runAsNonRoot: true, runAsUser: 1001` | +| `app.service.type` | App service type | `ClusterIP` | +| `app.service.port` | App service port | `3000` | +| `app.service.targetPort` | App service target port | `3000` | +| `app.livenessProbe` | App liveness probe configuration | See values.yaml | +| `app.readinessProbe` | App readiness probe configuration | See values.yaml | +| `app.env` | App environment variables | See values.yaml | + +### Realtime Service Parameters + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `realtime.enabled` | Enable the realtime service | `true` | +| `realtime.replicaCount` | Number of realtime replicas | `1` | +| `realtime.image.repository` | Realtime image repository | `simstudioai/realtime` | +| `realtime.image.tag` | Realtime image tag | `latest` | +| 
`realtime.image.pullPolicy` | Realtime image pull policy | `Always` | +| `realtime.resources` | Realtime resource limits and requests | See values.yaml | +| `realtime.nodeSelector` | Realtime node selector | `{}` | +| `realtime.podSecurityContext` | Realtime pod security context | `fsGroup: 1001` | +| `realtime.securityContext` | Realtime container security context | `runAsNonRoot: true, runAsUser: 1001` | +| `realtime.service.type` | Realtime service type | `ClusterIP` | +| `realtime.service.port` | Realtime service port | `3002` | +| `realtime.service.targetPort` | Realtime service target port | `3002` | +| `realtime.livenessProbe` | Realtime liveness probe configuration | See values.yaml | +| `realtime.readinessProbe` | Realtime readiness probe configuration | See values.yaml | +| `realtime.env` | Realtime environment variables | See values.yaml | + +### PostgreSQL Parameters + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `postgresql.enabled` | Enable internal PostgreSQL | `true` | +| `postgresql.image.repository` | PostgreSQL image repository | `pgvector/pgvector` | +| `postgresql.image.tag` | PostgreSQL image tag | `pg17` | +| `postgresql.image.pullPolicy` | PostgreSQL image pull policy | `IfNotPresent` | +| `postgresql.auth.username` | PostgreSQL username | `postgres` | +| `postgresql.auth.password` | PostgreSQL password | `""` (REQUIRED) | +| `postgresql.auth.database` | PostgreSQL database name | `sim` | +| `postgresql.nodeSelector` | PostgreSQL node selector | `{}` | +| `postgresql.resources` | PostgreSQL resource limits and requests | See values.yaml | +| `postgresql.podSecurityContext` | PostgreSQL pod security context | `fsGroup: 999` | +| `postgresql.securityContext` | PostgreSQL container security context | `runAsUser: 999` | +| `postgresql.persistence.enabled` | Enable PostgreSQL persistence | `true` | +| `postgresql.persistence.storageClass` | PostgreSQL storage class | `""` | +| `postgresql.persistence.size` | 
PostgreSQL PVC size | `10Gi` | +| `postgresql.persistence.accessModes` | PostgreSQL PVC access modes | `["ReadWriteOnce"]` | +| `postgresql.tls.enabled` | Enable PostgreSQL SSL/TLS | `false` | +| `postgresql.tls.certificatesSecret` | PostgreSQL TLS certificates secret | `postgres-tls-secret` | +| `postgresql.config.maxConnections` | PostgreSQL max connections | `1000` | +| `postgresql.config.sharedBuffers` | PostgreSQL shared buffers | `"1280MB"` | +| `postgresql.config.maxWalSize` | PostgreSQL max WAL size | `"4GB"` | +| `postgresql.config.minWalSize` | PostgreSQL min WAL size | `"80MB"` | +| `postgresql.service.type` | PostgreSQL service type | `ClusterIP` | +| `postgresql.service.port` | PostgreSQL service port | `5432` | +| `postgresql.service.targetPort` | PostgreSQL service target port | `5432` | +| `postgresql.livenessProbe` | PostgreSQL liveness probe configuration | See values.yaml | +| `postgresql.readinessProbe` | PostgreSQL readiness probe configuration | See values.yaml | + +### External Database Parameters + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `externalDatabase.enabled` | Use external database instead of internal PostgreSQL | `false` | +| `externalDatabase.host` | External database host | `"external-db.example.com"` | +| `externalDatabase.port` | External database port | `5432` | +| `externalDatabase.username` | External database username | `postgres` | +| `externalDatabase.password` | External database password | `""` | +| `externalDatabase.database` | External database name | `sim` | +| `externalDatabase.sslMode` | External database SSL mode | `require` | + +### Ollama Parameters + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `ollama.enabled` | Enable Ollama for local AI models | `false` | +| `ollama.image.repository` | Ollama image repository | `ollama/ollama` | +| `ollama.image.tag` | Ollama image tag | `latest` | +| `ollama.image.pullPolicy` | Ollama image pull 
policy | `Always` | +| `ollama.replicaCount` | Number of Ollama replicas | `1` | +| `ollama.gpu.enabled` | Enable GPU support for Ollama | `false` | +| `ollama.gpu.count` | Number of GPUs to allocate | `1` | +| `ollama.nodeSelector` | Ollama node selector | `accelerator: nvidia` | +| `ollama.tolerations` | Ollama tolerations for GPU nodes | See values.yaml | +| `ollama.resources` | Ollama resource limits and requests | See values.yaml | +| `ollama.env` | Ollama environment variables | See values.yaml | +| `ollama.persistence.enabled` | Enable Ollama persistence | `true` | +| `ollama.persistence.storageClass` | Ollama storage class | `""` | +| `ollama.persistence.size` | Ollama PVC size | `100Gi` | +| `ollama.persistence.accessModes` | Ollama PVC access modes | `["ReadWriteOnce"]` | +| `ollama.service.type` | Ollama service type | `ClusterIP` | +| `ollama.service.port` | Ollama service port | `11434` | +| `ollama.service.targetPort` | Ollama service target port | `11434` | +| `ollama.startupProbe` | Ollama startup probe configuration | See values.yaml | +| `ollama.livenessProbe` | Ollama liveness probe configuration | See values.yaml | +| `ollama.readinessProbe` | Ollama readiness probe configuration | See values.yaml | + +### Ingress Parameters + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `ingress.enabled` | Enable ingress | `false` | +| `ingress.className` | Ingress class name | `nginx` | +| `ingress.annotations` | Ingress annotations | See values.yaml | +| `ingress.app.host` | App ingress hostname | `sim.local` | +| `ingress.app.paths` | App ingress paths | `[{path: "/", pathType: "Prefix"}]` | +| `ingress.realtime.host` | Realtime ingress hostname | `sim-ws.local` | +| `ingress.realtime.paths` | Realtime ingress paths | `[{path: "/", pathType: "Prefix"}]` | +| `ingress.tls.enabled` | Enable TLS for ingress | `false` | +| `ingress.tls.secretName` | TLS secret name | `sim-tls-secret` | + +### Autoscaling Parameters + +| 
Parameter | Description | Default | +|-----------|-------------|---------| +| `autoscaling.enabled` | Enable Horizontal Pod Autoscaler | `false` | +| `autoscaling.minReplicas` | Minimum number of replicas | `1` | +| `autoscaling.maxReplicas` | Maximum number of replicas | `10` | +| `autoscaling.targetCPUUtilizationPercentage` | Target CPU utilization | `80` | +| `autoscaling.targetMemoryUtilizationPercentage` | Target memory utilization | `80` | +| `autoscaling.customMetrics` | Custom metrics for scaling | `[]` | +| `autoscaling.behavior` | Scaling behavior configuration | `{}` | + +### Monitoring Parameters + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `monitoring.serviceMonitor.enabled` | Enable ServiceMonitor for Prometheus | `false` | +| `monitoring.serviceMonitor.labels` | Additional labels for ServiceMonitor | `{}` | +| `monitoring.serviceMonitor.annotations` | Additional annotations for ServiceMonitor | `{}` | +| `monitoring.serviceMonitor.path` | Metrics endpoint path | `/metrics` | +| `monitoring.serviceMonitor.interval` | Scrape interval | `30s` | +| `monitoring.serviceMonitor.scrapeTimeout` | Scrape timeout | `10s` | +| `monitoring.serviceMonitor.targetLabels` | Target labels to add to scraped metrics | `[]` | +| `monitoring.serviceMonitor.metricRelabelings` | Metric relabeling configurations | `[]` | +| `monitoring.serviceMonitor.relabelings` | Relabeling configurations | `[]` | + +### Security Parameters + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `networkPolicy.enabled` | Enable network policies | `false` | +| `networkPolicy.ingress` | Custom ingress rules | `[]` | +| `networkPolicy.egress` | Custom egress rules | `[]` | +| `podDisruptionBudget.enabled` | Enable pod disruption budget | `false` | +| `podDisruptionBudget.minAvailable` | Minimum available pods | `1` | + +### Migration Parameters + +| Parameter | Description | Default | +|-----------|-------------|---------| +| 
`migrations.enabled` | Enable database migrations job | `true` | +| `migrations.image.repository` | Migrations image repository | `simstudioai/migrations` | +| `migrations.image.tag` | Migrations image tag | `latest` | +| `migrations.image.pullPolicy` | Migrations image pull policy | `Always` | +| `migrations.resources` | Migrations resource limits and requests | See values.yaml | +| `migrations.podSecurityContext` | Migrations pod security context | `fsGroup: 1001` | +| `migrations.securityContext` | Migrations container security context | `runAsNonRoot: true, runAsUser: 1001` | + +### Shared Storage Parameters + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `sharedStorage.enabled` | Enable shared storage for multi-pod data sharing | `false` | +| `sharedStorage.storageClass` | Storage class for shared volumes (must support ReadWriteMany) | `""` | +| `sharedStorage.defaultAccessModes` | Default access modes for shared volumes | `["ReadWriteMany"]` | +| `sharedStorage.volumes` | Array of shared volume definitions | `[]` | +| `sharedStorage.volumes[].name` | Shared volume name | Required | +| `sharedStorage.volumes[].size` | Shared volume size | Required | +| `sharedStorage.volumes[].accessModes` | Shared volume access modes | Uses default | +| `sharedStorage.volumes[].storageClass` | Shared volume storage class | Uses global | +| `sharedStorage.volumes[].annotations` | Shared volume annotations | `{}` | +| `sharedStorage.volumes[].selector` | Shared volume selector | `{}` | + +### Telemetry Parameters + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `telemetry.enabled` | Enable telemetry and observability collection | `false` | +| `telemetry.replicaCount` | Number of telemetry collector replicas | `1` | +| `telemetry.image.repository` | Telemetry collector image repository | `otel/opentelemetry-collector-contrib` | +| `telemetry.image.tag` | Telemetry collector image tag | `0.91.0` | +| 
`telemetry.image.pullPolicy` | Telemetry collector image pull policy | `IfNotPresent` | +| `telemetry.resources` | Telemetry collector resource limits and requests | See values.yaml | +| `telemetry.nodeSelector` | Telemetry collector node selector | `{}` | +| `telemetry.tolerations` | Telemetry collector tolerations | `[]` | +| `telemetry.affinity` | Telemetry collector affinity | `{}` | +| `telemetry.service.type` | Telemetry collector service type | `ClusterIP` | +| `telemetry.jaeger.enabled` | Enable Jaeger tracing backend | `false` | +| `telemetry.jaeger.endpoint` | Jaeger collector endpoint | `"http://jaeger-collector:14250"` | +| `telemetry.jaeger.tls.enabled` | Enable TLS for Jaeger connection | `false` | +| `telemetry.prometheus.enabled` | Enable Prometheus metrics backend | `false` | +| `telemetry.prometheus.endpoint` | Prometheus remote write endpoint | `"http://prometheus-server/api/v1/write"` | +| `telemetry.prometheus.auth` | Prometheus authentication header | `""` | +| `telemetry.otlp.enabled` | Enable generic OTLP backend | `false` | +| `telemetry.otlp.endpoint` | OTLP collector endpoint | `"http://otlp-collector:4317"` | +| `telemetry.otlp.tls.enabled` | Enable TLS for OTLP connection | `false` | + +### Service Account Parameters + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `serviceAccount.create` | Create a service account | `true` | +| `serviceAccount.annotations` | Service account annotations | `{}` | +| `serviceAccount.name` | Service account name (auto-generated if empty) | `""` | + +### Common Parameters + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `nameOverride` | Override the name of the chart | `""` | +| `fullnameOverride` | Override the fullname of the chart | `""` | +| `extraVolumes` | Additional volumes for all pods | `[]` | +| `extraVolumeMounts` | Additional volume mounts for all containers | `[]` | +| `extraEnvVars` | Additional environment variables for all 
containers | `[]` | +| `podAnnotations` | Additional annotations for all pods | `{}` | +| `podLabels` | Additional labels for all pods | `{}` | +| `affinity` | Affinity settings for all pods | `{}` | +| `tolerations` | Tolerations for all pods | `[]` | + +## Enterprise Features + +### Autoscaling + +Enable automatic horizontal scaling based on CPU and memory usage: + +```yaml +autoscaling: + enabled: true + minReplicas: 2 + maxReplicas: 20 + targetCPUUtilizationPercentage: 70 + targetMemoryUtilizationPercentage: 80 +``` + +### Shared Storage + +Enable shared storage for multi-pod data sharing and enterprise workflows: + +```yaml +sharedStorage: + enabled: true + storageClass: "managed-csi-premium" + volumes: + - name: output-share + size: 100Gi + accessModes: + - ReadWriteMany + - name: model-share + size: 200Gi + accessModes: + - ReadWriteMany + - name: logs-share + size: 50Gi + accessModes: + - ReadWriteMany +``` + +This creates persistent volume claims that can be shared across multiple pods for: +- Output data sharing between workflow steps +- Model storage and caching +- Centralized logging and audit trails +- Temporary data exchange + +### Telemetry and Observability + +Enable comprehensive telemetry collection with OpenTelemetry: + +```yaml +telemetry: + enabled: true + resources: + limits: + memory: "1Gi" + cpu: "500m" + requests: + memory: "512Mi" + cpu: "200m" + + # Enable Jaeger for distributed tracing + jaeger: + enabled: true + endpoint: "http://jaeger-collector:14250" + + # Enable Prometheus for metrics + prometheus: + enabled: true + endpoint: "http://prometheus-server/api/v1/write" + auth: "Bearer your-prometheus-token" + + # Enable generic OTLP for flexibility + otlp: + enabled: true + endpoint: "http://otlp-collector:4317" +``` + +This automatically configures: +- OpenTelemetry Collector for metrics, traces, and logs +- Automatic service discovery for Sim components +- Environment variable injection for applications +- Support for multiple 
observability backends + +### GPU Support + +Enable GPU device plugin support for AI workloads: + +```yaml +ollama: + enabled: true + gpu: + enabled: true + count: 1 + nodeSelector: + accelerator: nvidia + tolerations: + - key: "sku" + operator: "Equal" + value: "gpu" + effect: "NoSchedule" +``` + +This deploys: +- NVIDIA Device Plugin DaemonSet +- RuntimeClass for NVIDIA container runtime +- Proper node scheduling and resource allocation + +### Monitoring Integration + +Enable Prometheus monitoring with ServiceMonitor: + +```yaml +monitoring: + serviceMonitor: + enabled: true + labels: + monitoring: "prometheus" + interval: 15s +``` + +### Network Security + +Enable network policies for micro-segmentation: + +```yaml +networkPolicy: + enabled: true +``` + +This creates network policies that: +- Allow communication between Sim components +- Restrict unnecessary network access +- Permit DNS resolution and HTTPS egress +- Support custom ingress/egress rules + +### High Availability + +Configure pod disruption budgets and anti-affinity: + +```yaml +podDisruptionBudget: + enabled: true + minAvailable: 1 + +affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app.kubernetes.io/name + operator: In + values: ["simstudio"] + topologyKey: kubernetes.io/hostname +``` + +## Upgrading + +To upgrade your release: + +```bash +helm upgrade sim ./helm/sim +``` + +## Uninstalling + +To uninstall/delete the release: + +```bash +helm uninstall sim +``` + +## Security Considerations + +### Production Secrets + +For production deployments, make sure to: + +1. **Change default secrets**: Update `BETTER_AUTH_SECRET` and `ENCRYPTION_KEY` with secure, randomly generated values +2. **Use strong database passwords**: Set `postgresql.auth.password` to a strong password +3. **Enable TLS**: Configure `postgresql.tls.enabled=true` and provide proper certificates +4. 
**Configure ingress TLS**: Enable HTTPS with proper SSL certificates + +### Example secure values: + +```yaml +app: + env: + BETTER_AUTH_SECRET: "your-secure-random-string-here" + ENCRYPTION_KEY: "your-secure-encryption-key-here" + +postgresql: + auth: + password: "your-secure-database-password" + tls: + enabled: true + certificatesSecret: "postgres-tls-secret" + +ingress: + enabled: true + tls: + enabled: true + secretName: "simstudio-tls-secret" +``` + +## Troubleshooting + +### Common Issues + +1. **Database Connection Issues** + - Check if PostgreSQL pod is running: `kubectl get pods -l app.kubernetes.io/component=postgresql` + - Verify database credentials in the secret: `kubectl get secret <release-name>-postgresql-secret -o yaml` + +2. **Migration Issues** + - Check migration job logs: `kubectl logs job/<release-name>-migrations` + - Ensure database is accessible from the migration job + +3. **Image Pull Issues** + - Verify image names and tags in values.yaml + - Check if image pull secrets are configured correctly + +### Getting Logs + +```bash +# App logs +kubectl logs deployment/<release-name>-app + +# Realtime logs +kubectl logs deployment/<release-name>-realtime + +# PostgreSQL logs +kubectl logs statefulset/<release-name>-postgresql + +# Migration logs +kubectl logs job/<release-name>-migrations +``` + +## Support + +- Documentation: https://docs.simstudio.ai +- GitHub Issues: https://github.com/simstudioai/sim/issues +- Discord: https://discord.gg/Hr4UWYEcTT \ No newline at end of file diff --git a/helm/sim/examples/values-aws.yaml b/helm/sim/examples/values-aws.yaml new file mode 100644 index 000000000..14adf139f --- /dev/null +++ b/helm/sim/examples/values-aws.yaml @@ -0,0 +1,234 @@ +# AWS-specific values for Sim +# Example configuration for Amazon EKS deployment + +# Global configuration +global: + imageRegistry: "ghcr.io" + storageClass: "gp3" + +# Main application +app: + enabled: true + replicaCount: 2 + + # Node selector for application pods (customize based on your EKS node labels) + nodeSelector: + kubernetes.io/arch: amd64 
+ node.kubernetes.io/instance-type: "t3.large" + + resources: + limits: + memory: "4Gi" + cpu: "2000m" + requests: + memory: "2Gi" + cpu: "1000m" + + # Production URLs (REQUIRED - update with your actual domain names) + env: + NEXT_PUBLIC_APP_URL: "https://simstudio.acme.com" + BETTER_AUTH_URL: "https://simstudio.acme.com" + SOCKET_SERVER_URL: "https://simstudio-ws.acme.com" + NEXT_PUBLIC_SOCKET_URL: "https://simstudio-ws.acme.com" + + # Security settings (REQUIRED - replace with your own secure secrets) + BETTER_AUTH_SECRET: "your-secure-production-auth-secret-here" + ENCRYPTION_KEY: "your-secure-production-encryption-key-here" + + NODE_ENV: "production" + NEXT_TELEMETRY_DISABLED: "1" + + # AWS-specific environment variables + AWS_REGION: "us-west-2" + +# Realtime service +realtime: + enabled: true + replicaCount: 2 + + # Node selector for realtime pods (customize based on your EKS node labels) + nodeSelector: + kubernetes.io/arch: amd64 + node.kubernetes.io/instance-type: "t3.medium" + + resources: + limits: + memory: "4Gi" + cpu: "1000m" + requests: + memory: "2Gi" + cpu: "500m" + + env: + NEXT_PUBLIC_APP_URL: "https://simstudio.acme.com" + BETTER_AUTH_URL: "https://simstudio.acme.com" + NEXT_PUBLIC_SOCKET_URL: "https://simstudio-ws.acme.com" + BETTER_AUTH_SECRET: "your-secure-production-auth-secret-here" + ALLOWED_ORIGINS: "https://simstudio.acme.com" + NODE_ENV: "production" + +# Database migrations +migrations: + enabled: true + + resources: + limits: + memory: "2Gi" + cpu: "1000m" + requests: + memory: "1Gi" + cpu: "500m" + +# PostgreSQL database +postgresql: + enabled: true + + # Node selector for database pods (recommended: memory-optimized EC2 instances) + nodeSelector: + node.kubernetes.io/instance-type: "r5.large" + + # Database authentication (REQUIRED - set secure credentials) + auth: + username: postgres + password: "your-secure-postgres-password" + database: simstudio + + # Resource allocation optimized for AWS EKS + resources: + limits: + memory: 
"4Gi" + cpu: "2000m" + requests: + memory: "2Gi" + cpu: "1000m" + + # Persistent storage using AWS EBS GP3 volumes + persistence: + enabled: true + storageClass: "gp3" + size: 50Gi + accessModes: + - ReadWriteOnce + + # SSL/TLS configuration + tls: + enabled: true + certificatesSecret: postgres-tls-secret + + # PostgreSQL performance tuning for AWS infrastructure + config: + maxConnections: 1000 + sharedBuffers: "2GB" + maxWalSize: "8GB" + minWalSize: "160MB" + +# Ollama AI models with GPU acceleration (AWS EC2 GPU instances) +ollama: + enabled: true + replicaCount: 1 + + # GPU node targeting (recommended: g4dn.xlarge or p3.2xlarge instances) + nodeSelector: + node.kubernetes.io/instance-type: "g4dn.xlarge" + kubernetes.io/arch: amd64 + + tolerations: + - key: "nvidia.com/gpu" + operator: "Equal" + value: "true" + effect: "NoSchedule" + + # GPU resource allocation for AI model serving + gpu: + enabled: true + count: 1 + + resources: + limits: + memory: "16Gi" + cpu: "4000m" + nvidia.com/gpu: "1" + requests: + memory: "8Gi" + cpu: "2000m" + + # High-performance storage for AI models + persistence: + enabled: true + storageClass: "gp3" + size: 100Gi + accessModes: + - ReadWriteOnce + + env: + NVIDIA_DRIVER_CAPABILITIES: "all" + OLLAMA_LOAD_TIMEOUT: "-1" + OLLAMA_KEEP_ALIVE: "-1" + OLLAMA_DEBUG: "1" + +# Ingress using AWS Application Load Balancer (ALB) +ingress: + enabled: true + className: alb + + annotations: + kubernetes.io/ingress.class: alb + alb.ingress.kubernetes.io/scheme: internet-facing + alb.ingress.kubernetes.io/target-type: ip + alb.ingress.kubernetes.io/ssl-redirect: "443" + alb.ingress.kubernetes.io/certificate-arn: "arn:aws:acm:us-west-2:123456789012:certificate/your-cert-arn" + + # Main application + app: + host: simstudio.acme.com + paths: + - path: / + pathType: Prefix + + # Realtime service + realtime: + host: simstudio-ws.acme.com + paths: + - path: / + pathType: Prefix + + # TLS configuration + tls: + enabled: true + secretName: 
simstudio-tls-secret + +# Pod disruption budget for high availability +podDisruptionBudget: + enabled: true + minAvailable: 1 + +# Network policies +networkPolicy: + enabled: true + +# Pod anti-affinity for high availability across AWS Availability Zones +affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app.kubernetes.io/name + operator: In + values: ["simstudio"] + topologyKey: kubernetes.io/hostname + - weight: 50 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app.kubernetes.io/name + operator: In + values: ["simstudio"] + topologyKey: topology.kubernetes.io/zone + +# Service Account with IAM roles for service account (IRSA) integration +serviceAccount: + create: true + annotations: + eks.amazonaws.com/role-arn: "arn:aws:iam::123456789012:role/SimStudioServiceRole" \ No newline at end of file diff --git a/helm/sim/examples/values-azure.yaml b/helm/sim/examples/values-azure.yaml new file mode 100644 index 000000000..dd3653fae --- /dev/null +++ b/helm/sim/examples/values-azure.yaml @@ -0,0 +1,172 @@ +# Azure-specific values for Sim +# Example configuration for Azure AKS deployment + +# Global configuration +global: + imageRegistry: "ghcr.io" + storageClass: "managed-csi-premium" + +# Main application +app: + enabled: true + replicaCount: 1 + + # Node selector for application pods (customize based on your AKS node labels) + nodeSelector: + node-role: application + + resources: + limits: + memory: "4Gi" + requests: + memory: "2Gi" + cpu: "500m" + + # Production URLs (REQUIRED - update with your actual domain names) + env: + NEXT_PUBLIC_APP_URL: "https://simstudio.acme.com" + BETTER_AUTH_URL: "https://simstudio.acme.com" + SOCKET_SERVER_URL: "https://simstudio-ws.acme.com" + NEXT_PUBLIC_SOCKET_URL: "https://simstudio-ws.acme.com" + + # Security settings (REQUIRED - replace with your own secure secrets) + BETTER_AUTH_SECRET: 
"your-secure-production-auth-secret-here" + ENCRYPTION_KEY: "your-secure-production-encryption-key-here" + + NODE_ENV: "production" + NEXT_TELEMETRY_DISABLED: "1" + +# Realtime service +realtime: + enabled: true + replicaCount: 1 + + # Node selector for application pods (customize based on your AKS node labels) + nodeSelector: + node-role: application + + resources: + limits: + memory: "4Gi" + requests: + memory: "1Gi" + cpu: "250m" + + env: + NEXT_PUBLIC_APP_URL: "https://simstudio.acme.com" + BETTER_AUTH_URL: "https://simstudio.acme.com" + NEXT_PUBLIC_SOCKET_URL: "https://simstudio-ws.acme.com" + BETTER_AUTH_SECRET: "your-secure-production-auth-secret-here" + ALLOWED_ORIGINS: "https://simstudio.acme.com" + NODE_ENV: "production" + +# Database migrations +migrations: + enabled: true + +# PostgreSQL database +postgresql: + enabled: true + + # Node selector for database pods (recommended: memory-optimized VM sizes) + nodeSelector: + node-role: datalake + + # Database authentication (REQUIRED - set secure credentials) + auth: + username: postgres + password: "your-secure-postgres-password" + database: simstudio + + # Resource allocation for production workloads + resources: + limits: + memory: "2Gi" + requests: + memory: "1Gi" + cpu: "500m" + + # Persistent storage using Azure Premium SSD + persistence: + enabled: true + storageClass: "managed-csi-premium" + size: 10Gi + + # SSL/TLS configuration (recommended for production) + tls: + enabled: true + certificatesSecret: postgres-tls-secret + + # PostgreSQL performance tuning for Azure infrastructure + config: + maxConnections: 1000 + sharedBuffers: "1280MB" + maxWalSize: "4GB" + minWalSize: "80MB" + +# Ollama AI models with GPU acceleration (Azure NC-series VMs) +ollama: + enabled: true + replicaCount: 1 + + # GPU node targeting (recommended: NC6s_v3 or NC12s_v3 VMs) + nodeSelector: + accelerator: nvidia + + tolerations: + - key: "sku" + operator: "Equal" + value: "gpu" + effect: "NoSchedule" + + # GPU resource 
allocation for AI model serving + gpu: + enabled: true + count: 1 + + resources: + limits: + memory: "8Gi" + nvidia.com/gpu: "1" + requests: + memory: "4Gi" + cpu: "1000m" + + # High-performance storage for AI models + persistence: + enabled: true + storageClass: "managed-csi-premium" + size: 100Gi + + env: + NVIDIA_DRIVER_CAPABILITIES: "all" + OLLAMA_LOAD_TIMEOUT: "-1" + OLLAMA_KEEP_ALIVE: "-1" + OLLAMA_DEBUG: "1" + +# Ingress configuration (NGINX ingress controller on Azure AKS) +ingress: + enabled: true + className: nginx + + annotations: + nginx.ingress.kubernetes.io/force-ssl-redirect: "true" + + # Main application + app: + host: simstudio.acme.com + paths: + - path: / + pathType: Prefix + + # Realtime service + realtime: + host: simstudio-ws.acme.com + paths: + - path: / + pathType: Prefix + + # TLS configuration + tls: + enabled: true + secretName: simstudio-tls-secret \ No newline at end of file diff --git a/helm/sim/examples/values-development.yaml b/helm/sim/examples/values-development.yaml new file mode 100644 index 000000000..c2f20837e --- /dev/null +++ b/helm/sim/examples/values-development.yaml @@ -0,0 +1,111 @@ +# Development values for Sim +# This configuration is suitable for development and testing + +# Global configuration +global: + imageRegistry: "ghcr.io" + +# Main application +app: + enabled: true + replicaCount: 1 + + # Resource allocation for development environment + resources: + limits: + memory: "4Gi" + cpu: "2000m" + requests: + memory: "2Gi" + cpu: "1000m" + + # Development URLs + env: + NEXT_PUBLIC_APP_URL: "http://localhost:3000" + BETTER_AUTH_URL: "http://localhost:3000" + NEXT_PUBLIC_SOCKET_URL: "http://localhost:3002" + + # Example secrets for development (replace with secure values for production) + BETTER_AUTH_SECRET: "dev-32-char-auth-secret-not-secure-dev" + ENCRYPTION_KEY: "dev-32-char-encryption-key-not-secure" + +# Realtime service +realtime: + enabled: true + replicaCount: 1 + + # Resource allocation for realtime WebSocket 
service in development + resources: + limits: + memory: "2Gi" + cpu: "1000m" + requests: + memory: "1Gi" + cpu: "500m" + + env: + NEXT_PUBLIC_APP_URL: "http://localhost:3000" + BETTER_AUTH_URL: "http://localhost:3000" + NEXT_PUBLIC_SOCKET_URL: "http://localhost:3002" + BETTER_AUTH_SECRET: "dev-32-char-auth-secret-not-secure-dev" + ALLOWED_ORIGINS: "http://localhost:3000" + +# Database migrations +migrations: + enabled: true + +# PostgreSQL database +postgresql: + enabled: true + + # Simple authentication for development + auth: + username: postgres + password: "postgres" + database: simstudio + + # PostgreSQL with pgvector extension for vector operations + image: + repository: pgvector/pgvector + tag: pg17 + pullPolicy: IfNotPresent + + # Minimal resource allocation for development PostgreSQL + resources: + limits: + memory: "1Gi" + cpu: "500m" + requests: + memory: "512Mi" + cpu: "250m" + + # Persistence disabled for easier development (data will be lost on restart) + persistence: + enabled: false + + # SSL/TLS disabled for local development + tls: + enabled: false + + # Minimal PostgreSQL configuration for development + config: + maxConnections: 100 + sharedBuffers: "256MB" + maxWalSize: "1GB" + minWalSize: "80MB" + +# Ollama AI models (disabled by default for development) +ollama: + enabled: false + +# Ingress (disabled for development - use port-forward for local access) +ingress: + enabled: false + +# Pod disruption budget (disabled for development) +podDisruptionBudget: + enabled: false + +# Network policies (disabled for development) +networkPolicy: + enabled: false \ No newline at end of file diff --git a/helm/sim/examples/values-external-db.yaml b/helm/sim/examples/values-external-db.yaml new file mode 100644 index 000000000..c4dd7f754 --- /dev/null +++ b/helm/sim/examples/values-external-db.yaml @@ -0,0 +1,152 @@ +# External Database Example for Sim +# Use this configuration when connecting to a managed database service +# (AWS RDS, Azure Database, Google 
Cloud SQL, etc.) + +# Global configuration +global: + imageRegistry: "ghcr.io" + +# Main application +app: + enabled: true + replicaCount: 2 + + resources: + limits: + memory: "4Gi" + cpu: "2000m" + requests: + memory: "2Gi" + cpu: "1000m" + + env: + NEXT_PUBLIC_APP_URL: "https://simstudio.acme.com" + BETTER_AUTH_URL: "https://simstudio.acme.com" + SOCKET_SERVER_URL: "https://simstudio-ws.acme.com" + NEXT_PUBLIC_SOCKET_URL: "https://simstudio-ws.acme.com" + + # Security settings (REQUIRED - replace with your own secure secrets) + BETTER_AUTH_SECRET: "" # Set via --set flag or external secret manager + ENCRYPTION_KEY: "" # Set via --set flag or external secret manager + + NODE_ENV: "production" + NEXT_TELEMETRY_DISABLED: "1" + +# Realtime service +realtime: + enabled: true + replicaCount: 2 + + resources: + limits: + memory: "4Gi" + cpu: "1000m" + requests: + memory: "2Gi" + cpu: "500m" + + env: + NEXT_PUBLIC_APP_URL: "https://simstudio.acme.com" + BETTER_AUTH_URL: "https://simstudio.acme.com" + NEXT_PUBLIC_SOCKET_URL: "https://simstudio-ws.acme.com" + BETTER_AUTH_SECRET: "" # Must match main app secret - set via --set flag + ALLOWED_ORIGINS: "https://simstudio.acme.com" + NODE_ENV: "production" + +# Database migrations +migrations: + enabled: true + + resources: + limits: + memory: "2Gi" + cpu: "1000m" + requests: + memory: "1Gi" + cpu: "500m" + +# Disable internal PostgreSQL +postgresql: + enabled: false + +# Configure external database connection +externalDatabase: + enabled: true + + # Database connection details (REQUIRED - configure for your external database) + host: "" # Database hostname (e.g., "postgres.acme.com" or RDS endpoint) + port: 5432 + username: "" # Database username (e.g., "simstudio_user") + password: "" # Database password - set via --set flag or external secret + database: "" # Database name (e.g., "simstudio_production") + + # SSL mode for database connections (recommended: 'require' for production) + sslMode: "require" # Options: disable, 
allow, prefer, require, verify-ca, verify-full + +# Ollama (optional for AI models) +ollama: + enabled: false + +# Ingress configuration +ingress: + enabled: true + className: nginx + + annotations: + nginx.ingress.kubernetes.io/force-ssl-redirect: "true" + cert-manager.io/cluster-issuer: "letsencrypt-prod" + + app: + host: simstudio.acme.com + paths: + - path: / + pathType: Prefix + + realtime: + host: simstudio-ws.acme.com + paths: + - path: / + pathType: Prefix + + tls: + enabled: true + secretName: simstudio-tls-secret + +# Production-ready features (autoscaling, monitoring, etc.) +autoscaling: + enabled: true + minReplicas: 2 + maxReplicas: 20 + targetCPUUtilizationPercentage: 70 + targetMemoryUtilizationPercentage: 80 + +podDisruptionBudget: + enabled: true + minAvailable: 1 + +monitoring: + serviceMonitor: + enabled: true + labels: + monitoring: "prometheus" + interval: 15s + +networkPolicy: + enabled: true + # Custom egress rules to allow database connectivity + egress: + - to: [] # Allow outbound connections to external database + ports: + - protocol: TCP + port: 5432 + +# Example deployment command (first run: AUTH_SECRET="$(openssl rand -hex 32)" so that app and realtime share the same secret): +# helm install sim ./helm/sim \ +# --values ./helm/sim/examples/values-external-db.yaml \ +# --set externalDatabase.host="your-db-host.com" \ +# --set externalDatabase.username="your-db-user" \ +# --set externalDatabase.password="your-db-password" \ +# --set externalDatabase.database="your-db-name" \ +# --set app.env.BETTER_AUTH_SECRET="$AUTH_SECRET" \ +# --set app.env.ENCRYPTION_KEY="$(openssl rand -hex 32)" \ +# --set realtime.env.BETTER_AUTH_SECRET="$AUTH_SECRET" \ No newline at end of file diff --git a/helm/sim/examples/values-gcp.yaml b/helm/sim/examples/values-gcp.yaml new file mode 100644 index 000000000..987bad86f --- /dev/null +++ b/helm/sim/examples/values-gcp.yaml @@ -0,0 +1,251 @@ +# GCP-specific values for Sim +# Example configuration for Google Kubernetes Engine (GKE) deployment + +# Global 
configuration +global: + imageRegistry: "ghcr.io" + storageClass: "standard-rwo" + +# Main application +app: + enabled: true + replicaCount: 2 + + # Node selector for application pods (customize based on your GKE node labels) + nodeSelector: + kubernetes.io/arch: amd64 + cloud.google.com/gke-nodepool: "default-pool" + + resources: + limits: + memory: "4Gi" + cpu: "2000m" + requests: + memory: "2Gi" + cpu: "1000m" + + # Production URLs (REQUIRED - update with your actual domain names) + env: + NEXT_PUBLIC_APP_URL: "https://simstudio.acme.com" + BETTER_AUTH_URL: "https://simstudio.acme.com" + SOCKET_SERVER_URL: "https://simstudio-ws.acme.com" + NEXT_PUBLIC_SOCKET_URL: "https://simstudio-ws.acme.com" + + # Security settings (REQUIRED - replace with your own secure secrets) + BETTER_AUTH_SECRET: "your-secure-production-auth-secret-here" + ENCRYPTION_KEY: "your-secure-production-encryption-key-here" + + NODE_ENV: "production" + NEXT_TELEMETRY_DISABLED: "1" + + # GCP-specific environment variables + GOOGLE_CLOUD_PROJECT: "your-project-id" + GOOGLE_CLOUD_REGION: "us-central1" + +# Realtime service +realtime: + enabled: true + replicaCount: 2 + + # Node selector for realtime pods (customize based on your GKE node labels) + nodeSelector: + kubernetes.io/arch: amd64 + cloud.google.com/gke-nodepool: "default-pool" + + resources: + limits: + memory: "4Gi" + cpu: "1000m" + requests: + memory: "2Gi" + cpu: "500m" + + env: + NEXT_PUBLIC_APP_URL: "https://simstudio.acme.com" + BETTER_AUTH_URL: "https://simstudio.acme.com" + NEXT_PUBLIC_SOCKET_URL: "https://simstudio-ws.acme.com" + BETTER_AUTH_SECRET: "your-secure-production-auth-secret-here" + ALLOWED_ORIGINS: "https://simstudio.acme.com" + NODE_ENV: "production" + +# Database migrations +migrations: + enabled: true + + resources: + limits: + memory: "2Gi" + cpu: "1000m" + requests: + memory: "1Gi" + cpu: "500m" + +# PostgreSQL database +postgresql: + enabled: true + + # Node selector for database pods (recommended: 
memory-optimized machine types) + nodeSelector: + cloud.google.com/gke-nodepool: "database-pool" + cloud.google.com/machine-family: "n2" + + # Database authentication (REQUIRED - set secure credentials) + auth: + username: postgres + password: "your-secure-postgres-password" + database: simstudio + + # Resource allocation optimized for GKE + resources: + limits: + memory: "4Gi" + cpu: "2000m" + requests: + memory: "2Gi" + cpu: "1000m" + + # Persistent storage using Google Cloud Persistent Disk + persistence: + enabled: true + storageClass: "standard-rwo" + size: 50Gi + accessModes: + - ReadWriteOnce + + # SSL/TLS configuration + tls: + enabled: true + certificatesSecret: postgres-tls-secret + + # PostgreSQL performance tuning for GCP infrastructure + config: + maxConnections: 1000 + sharedBuffers: "2GB" + maxWalSize: "8GB" + minWalSize: "160MB" + +# Ollama AI models with GPU acceleration (GCP GPU instances) +ollama: + enabled: true + replicaCount: 1 + + # GPU node targeting (recommended: T4 or V100 GPU instances) + nodeSelector: + cloud.google.com/gke-nodepool: "gpu-pool" + cloud.google.com/gke-accelerator: "nvidia-tesla-t4" + + tolerations: + - key: "nvidia.com/gpu" + operator: "Equal" + value: "present" + effect: "NoSchedule" + + # GPU resource allocation for AI model serving + gpu: + enabled: true + count: 1 + + resources: + limits: + memory: "16Gi" + cpu: "4000m" + nvidia.com/gpu: "1" + requests: + memory: "8Gi" + cpu: "2000m" + + # High-performance SSD storage for AI models + persistence: + enabled: true + storageClass: "premium-rwo" + size: 100Gi + accessModes: + - ReadWriteOnce + + env: + NVIDIA_DRIVER_CAPABILITIES: "all" + OLLAMA_LOAD_TIMEOUT: "-1" + OLLAMA_KEEP_ALIVE: "-1" + OLLAMA_DEBUG: "1" + +# Ingress using Google Cloud Load Balancer +ingress: + enabled: true + className: gce + + annotations: + kubernetes.io/ingress.class: gce + kubernetes.io/ingress.global-static-ip-name: "simstudio-ip" + networking.gke.io/managed-certificates: "simstudio-ssl-cert" + 
kubernetes.io/ingress.allow-http: "false" + + # Main application + app: + host: simstudio.acme.com + paths: + - path: / + pathType: Prefix + + # Realtime service + realtime: + host: simstudio-ws.acme.com + paths: + - path: / + pathType: Prefix + + # TLS configuration + tls: + enabled: true + secretName: simstudio-tls-secret + +# Pod disruption budget for high availability +podDisruptionBudget: + enabled: true + minAvailable: 1 + +# Network policies +networkPolicy: + enabled: true + +# Pod anti-affinity for high availability across GCP zones +affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app.kubernetes.io/name + operator: In + values: ["simstudio"] + topologyKey: kubernetes.io/hostname + - weight: 50 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app.kubernetes.io/name + operator: In + values: ["simstudio"] + topologyKey: topology.gke.io/zone + +# Service Account with Workload Identity integration +serviceAccount: + create: true + annotations: + iam.gke.io/gcp-service-account: "simstudio@your-project-id.iam.gserviceaccount.com" + +# Additional environment variables for GCP service integration +extraEnvVars: + - name: GOOGLE_APPLICATION_CREDENTIALS + value: "/var/secrets/google/key.json" + +# Additional volumes for service account credentials +extraVolumes: + - name: google-cloud-key + secret: + secretName: google-service-account-key + +extraVolumeMounts: + - name: google-cloud-key + mountPath: /var/secrets/google + readOnly: true \ No newline at end of file diff --git a/helm/sim/examples/values-production.yaml b/helm/sim/examples/values-production.yaml new file mode 100644 index 000000000..dd43a078f --- /dev/null +++ b/helm/sim/examples/values-production.yaml @@ -0,0 +1,210 @@ +# Production values for Sim +# This configuration is suitable for production deployments + +# Global configuration +global: + imageRegistry: "ghcr.io" + 
storageClass: "managed-csi-premium" + +# Main application +app: + enabled: true + replicaCount: 2 + + resources: + limits: + memory: "6Gi" + cpu: "2000m" + requests: + memory: "4Gi" + cpu: "1000m" + + # Production URLs (REQUIRED - update with your actual domain names) + env: + NEXT_PUBLIC_APP_URL: "https://simstudio.acme.com" + BETTER_AUTH_URL: "https://simstudio.acme.com" + SOCKET_SERVER_URL: "https://simstudio-ws.acme.com" + NEXT_PUBLIC_SOCKET_URL: "https://simstudio-ws.acme.com" + + # Security settings (REQUIRED - replace with your own secure secrets) + BETTER_AUTH_SECRET: "your-production-auth-secret-here" + ENCRYPTION_KEY: "your-production-encryption-key-here" + + # Optional third-party service integrations (configure as needed) + RESEND_API_KEY: "your-resend-api-key" + GOOGLE_CLIENT_ID: "your-google-client-id" + GOOGLE_CLIENT_SECRET: "your-google-client-secret" + +# Realtime service +realtime: + enabled: true + replicaCount: 2 + + resources: + limits: + memory: "4Gi" + cpu: "1000m" + requests: + memory: "2Gi" + cpu: "500m" + + env: + NEXT_PUBLIC_APP_URL: "https://simstudio.acme.com" + BETTER_AUTH_URL: "https://simstudio.acme.com" + NEXT_PUBLIC_SOCKET_URL: "https://simstudio-ws.acme.com" + BETTER_AUTH_SECRET: "your-production-auth-secret-here" + ALLOWED_ORIGINS: "https://simstudio.acme.com" + +# Database migrations +migrations: + enabled: true + resources: + limits: + memory: "2Gi" + cpu: "1000m" + requests: + memory: "1Gi" + cpu: "500m" + +# PostgreSQL database +postgresql: + enabled: true + + # Database authentication (REQUIRED - set secure credentials) + auth: + username: postgres + password: "your-secure-postgres-password" + database: simstudio + + # Resource allocation for production workloads + resources: + limits: + memory: "4Gi" + cpu: "2000m" + requests: + memory: "2Gi" + cpu: "1000m" + + # Persistent storage configuration + persistence: + enabled: true + storageClass: "managed-csi-premium" + size: 50Gi + + # SSL/TLS configuration (recommended for 
production) + tls: + enabled: true + certificatesSecret: postgres-tls-secret + + # PostgreSQL performance configuration for production + config: + maxConnections: 1000 + sharedBuffers: "2GB" + maxWalSize: "8GB" + minWalSize: "160MB" + +# Ollama AI models (optional - enable if you need local AI model serving) +ollama: + enabled: false + +# Ingress configuration +ingress: + enabled: true + className: nginx + + annotations: + nginx.ingress.kubernetes.io/force-ssl-redirect: "true" + cert-manager.io/cluster-issuer: "letsencrypt-prod" + + # Main application + app: + host: simstudio.acme.com + paths: + - path: / + pathType: Prefix + + # Realtime service + realtime: + host: simstudio-ws.acme.com + paths: + - path: / + pathType: Prefix + + # TLS configuration + tls: + enabled: true + secretName: simstudio-tls-secret + +# Horizontal Pod Autoscaler (automatically scales pods based on CPU/memory usage) +autoscaling: + enabled: true + minReplicas: 2 + maxReplicas: 20 + targetCPUUtilizationPercentage: 70 + targetMemoryUtilizationPercentage: 80 + behavior: + scaleDown: + stabilizationWindowSeconds: 300 + policies: + - type: Percent + value: 50 + periodSeconds: 60 + scaleUp: + stabilizationWindowSeconds: 60 + policies: + - type: Percent + value: 100 + periodSeconds: 15 + - type: Pods + value: 2 + periodSeconds: 60 + +# Pod disruption budget (ensures minimum availability during cluster maintenance) +podDisruptionBudget: + enabled: true + minAvailable: 1 + +# Monitoring integration with Prometheus +monitoring: + serviceMonitor: + enabled: true + labels: + monitoring: "prometheus" + interval: 15s + scrapeTimeout: 10s + +# Network policies (restricts pod-to-pod communication for security) +networkPolicy: + enabled: true + +# Shared storage for data sharing between pods (enterprise feature) +sharedStorage: + enabled: true + storageClass: "managed-csi-premium" + volumes: + - name: output-share + size: 100Gi + accessModes: + - ReadWriteMany + - name: model-share + size: 200Gi + 
accessModes: + - ReadWriteMany + +# Telemetry and observability (comprehensive monitoring and tracing) +telemetry: + enabled: true + resources: + limits: + memory: "1Gi" + cpu: "500m" + requests: + memory: "512Mi" + cpu: "200m" + # Configure endpoints based on your observability infrastructure + prometheus: + enabled: true + endpoint: "http://prometheus-server/api/v1/write" + jaeger: + enabled: true + endpoint: "http://jaeger-collector:14250" \ No newline at end of file diff --git a/helm/sim/templates/NOTES.txt b/helm/sim/templates/NOTES.txt new file mode 100644 index 000000000..063c7b425 --- /dev/null +++ b/helm/sim/templates/NOTES.txt @@ -0,0 +1,67 @@ +1. Get the application URL by running these commands: +{{- if .Values.ingress.enabled }} + http{{ if .Values.ingress.tls.enabled }}s{{ end }}://{{ .Values.ingress.app.host }} +{{- else if contains "NodePort" .Values.app.service.type }} + export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ include "sim.fullname" . }}-app) + export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}") + echo http://$NODE_IP:$NODE_PORT +{{- else if contains "LoadBalancer" .Values.app.service.type }} + NOTE: It may take a few minutes for the LoadBalancer IP to be available. + You can watch its status by running 'kubectl get --namespace {{ .Release.Namespace }} svc -w {{ include "sim.fullname" . }}-app' + export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ include "sim.fullname" . }}-app --template "{{"{{ range (index .status.loadBalancer.ingress 0) }}{{.}}{{ end }}"}}") + echo http://$SERVICE_IP:{{ .Values.app.service.port }} +{{- else if contains "ClusterIP" .Values.app.service.type }} + export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "{{ include "sim.app.selectorLabels" . 
}}" -o jsonpath="{.items[0].metadata.name}") + export CONTAINER_PORT=$(kubectl get pod --namespace {{ .Release.Namespace }} $POD_NAME -o jsonpath="{.spec.containers[0].ports[0].containerPort}") + echo "Visit http://127.0.0.1:8080 to use your application" + kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 8080:$CONTAINER_PORT +{{- end }} + +{{- if .Values.realtime.enabled }} + +2. Realtime service is available at: +{{- if .Values.ingress.enabled }} + http{{ if .Values.ingress.tls.enabled }}s{{ end }}://{{ .Values.ingress.realtime.host }} +{{- else }} + Use port-forwarding: kubectl port-forward svc/{{ include "sim.fullname" . }}-realtime 3002:3002 +{{- end }} +{{- end }} + +{{- if .Values.postgresql.enabled }} + +3. PostgreSQL database is running internally and accessible to the application. + Database: {{ .Values.postgresql.auth.database }} + Username: {{ .Values.postgresql.auth.username }} +{{- end }} + +{{- if .Values.ollama.enabled }} + +4. Ollama service is available internally for AI model operations. +{{- end }} + +{{- if not .Values.postgresql.enabled }} + +WARNING: You have disabled the internal PostgreSQL database. +Make sure to configure an external database connection in your values.yaml file. +{{- end }} + +{{- if not .Values.app.env.BETTER_AUTH_SECRET }} + +⚠️ SECURITY WARNING: Required secrets are not configured! 
+ +Please set the following REQUIRED values for production use: + + helm upgrade {{ .Release.Name }} ./helm/sim \ + --set app.env.BETTER_AUTH_SECRET="your-secure-32-char-secret-here" \ + --set app.env.ENCRYPTION_KEY="your-secure-32-char-encryption-key" \ + --set realtime.env.BETTER_AUTH_SECRET="your-secure-32-char-secret-here" \ + --set postgresql.auth.password="your-secure-database-password" + +Generate secure secrets using: + openssl rand -hex 32 + +{{- end }} + +For more information and configuration options, see: +- Chart documentation: https://github.com/simstudioai/sim/tree/main/helm/sim +- Sim documentation: https://docs.sim.ai \ No newline at end of file diff --git a/helm/sim/templates/_helpers.tpl b/helm/sim/templates/_helpers.tpl new file mode 100644 index 000000000..dd61cb770 --- /dev/null +++ b/helm/sim/templates/_helpers.tpl @@ -0,0 +1,303 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "sim.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "sim.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "sim.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "sim.labels" -}} +helm.sh/chart: {{ include "sim.chart" . 
}} +{{ include "sim.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- with .Values.global.commonLabels }} +{{ toYaml . }} +{{- end }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "sim.selectorLabels" -}} +app.kubernetes.io/name: {{ include "sim.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +App specific labels +*/}} +{{- define "sim.app.labels" -}} +{{ include "sim.labels" . }} +app.kubernetes.io/component: app +{{- end }} + +{{/* +App selector labels +*/}} +{{- define "sim.app.selectorLabels" -}} +{{ include "sim.selectorLabels" . }} +app.kubernetes.io/component: app +{{- end }} + +{{/* +Realtime specific labels +*/}} +{{- define "sim.realtime.labels" -}} +{{ include "sim.labels" . }} +app.kubernetes.io/component: realtime +{{- end }} + +{{/* +Realtime selector labels +*/}} +{{- define "sim.realtime.selectorLabels" -}} +{{ include "sim.selectorLabels" . }} +app.kubernetes.io/component: realtime +{{- end }} + +{{/* +PostgreSQL specific labels +*/}} +{{- define "sim.postgresql.labels" -}} +{{ include "sim.labels" . }} +app.kubernetes.io/component: postgresql +{{- end }} + +{{/* +PostgreSQL selector labels +*/}} +{{- define "sim.postgresql.selectorLabels" -}} +{{ include "sim.selectorLabels" . }} +app.kubernetes.io/component: postgresql +{{- end }} + +{{/* +Ollama specific labels +*/}} +{{- define "sim.ollama.labels" -}} +{{ include "sim.labels" . }} +app.kubernetes.io/component: ollama +{{- end }} + +{{/* +Ollama selector labels +*/}} +{{- define "sim.ollama.selectorLabels" -}} +{{ include "sim.selectorLabels" . }} +app.kubernetes.io/component: ollama +{{- end }} + +{{/* +Migrations specific labels +*/}} +{{- define "sim.migrations.labels" -}} +{{ include "sim.labels" . 
}} +app.kubernetes.io/component: migrations +{{- end }} + +{{/* +Create the name of the service account to use +*/}} +{{- define "sim.serviceAccountName" -}} +{{- if .Values.serviceAccount.create }} +{{- default (include "sim.fullname" .) .Values.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.serviceAccount.name }} +{{- end }} +{{- end }} + +{{/* +Create image name with registry +Expects context with image object passed as second parameter +Usage: {{ include "sim.image" (dict "context" . "image" .Values.app.image) }} +*/}} +{{- define "sim.image" -}} +{{- $registry := "" -}} +{{- $repository := .image.repository -}} +{{- $tag := .image.tag | toString -}} +{{- /* Use global registry for simstudioai images or when explicitly set for all images */ -}} +{{- if .context.Values.global.imageRegistry -}} + {{- if or (hasPrefix "simstudioai/" $repository) .context.Values.global.useRegistryForAllImages -}} + {{- $registry = .context.Values.global.imageRegistry -}} + {{- end -}} +{{- end -}} +{{- if $registry -}} +{{- printf "%s/%s:%s" $registry $repository $tag }} +{{- else -}} +{{- printf "%s:%s" $repository $tag }} +{{- end -}} +{{- end }} + +{{/* +Database URL for internal PostgreSQL +*/}} +{{- define "sim.databaseUrl" -}} +{{- if .Values.postgresql.enabled }} +{{- $host := printf "%s-postgresql" (include "sim.fullname" .) 
}} +{{- $port := .Values.postgresql.service.port }} +{{- $username := .Values.postgresql.auth.username }} +{{- $database := .Values.postgresql.auth.database }} +{{- $sslMode := ternary "require" "disable" .Values.postgresql.tls.enabled }} +{{- printf "postgresql://%s:$(POSTGRES_PASSWORD)@%s:%v/%s?sslmode=%s" $username $host $port $database $sslMode }} +{{- else if .Values.externalDatabase.enabled }} +{{- $host := .Values.externalDatabase.host }} +{{- $port := .Values.externalDatabase.port }} +{{- $username := .Values.externalDatabase.username }} +{{- $database := .Values.externalDatabase.database }} +{{- $sslMode := .Values.externalDatabase.sslMode }} +{{- printf "postgresql://%s:$(EXTERNAL_DB_PASSWORD)@%s:%v/%s?sslmode=%s" $username $host $port $database $sslMode }} +{{- end }} +{{- end }} + +{{/* +Validate required secrets and reject default placeholder values +*/}} +{{- define "sim.validateSecrets" -}} +{{- if and .Values.app.enabled (not .Values.app.env.BETTER_AUTH_SECRET) }} +{{- fail "app.env.BETTER_AUTH_SECRET is required for production deployment" }} +{{- end }} +{{- if and .Values.app.enabled (eq .Values.app.env.BETTER_AUTH_SECRET "CHANGE-ME-32-CHAR-SECRET-FOR-PRODUCTION-USE") }} +{{- fail "app.env.BETTER_AUTH_SECRET must not use the default placeholder value. Generate a secure secret with: openssl rand -hex 32" }} +{{- end }} +{{- if and .Values.app.enabled (not .Values.app.env.ENCRYPTION_KEY) }} +{{- fail "app.env.ENCRYPTION_KEY is required for production deployment" }} +{{- end }} +{{- if and .Values.app.enabled (eq .Values.app.env.ENCRYPTION_KEY "CHANGE-ME-32-CHAR-ENCRYPTION-KEY-FOR-PROD") }} +{{- fail "app.env.ENCRYPTION_KEY must not use the default placeholder value. Generate a secure key with: openssl rand -hex 32" }} +{{- end }} +{{- if and .Values.realtime.enabled (eq .Values.realtime.env.BETTER_AUTH_SECRET "CHANGE-ME-32-CHAR-SECRET-FOR-PRODUCTION-USE") }} +{{- fail "realtime.env.BETTER_AUTH_SECRET must not use the default placeholder value. 
Generate a secure secret with: openssl rand -hex 32" }} +{{- end }} +{{- if and .Values.postgresql.enabled (not .Values.postgresql.auth.password) }} +{{- fail "postgresql.auth.password is required when using internal PostgreSQL" }} +{{- end }} +{{- if and .Values.postgresql.enabled (eq .Values.postgresql.auth.password "CHANGE-ME-SECURE-PASSWORD") }} +{{- fail "postgresql.auth.password must not use the default placeholder value. Set a secure password for production" }} +{{- end }} +{{- if and .Values.externalDatabase.enabled (not .Values.externalDatabase.password) }} +{{- fail "externalDatabase.password is required when using external database" }} +{{- end }} +{{- end }} + +{{/* +Ollama URL +*/}} +{{- define "sim.ollamaUrl" -}} +{{- if .Values.ollama.enabled }} +{{- $serviceName := printf "%s-ollama" (include "sim.fullname" .) }} +{{- $port := .Values.ollama.service.port }} +{{- printf "http://%s:%v" $serviceName $port }} +{{- else }} +{{- .Values.app.env.OLLAMA_URL | default "http://localhost:11434" }} +{{- end }} +{{- end }} + +{{/* +Socket Server URL (internal) +*/}} +{{- define "sim.socketServerUrl" -}} +{{- if .Values.realtime.enabled }} +{{- $serviceName := printf "%s-realtime" (include "sim.fullname" .) 
}} +{{- $port := .Values.realtime.service.port }} +{{- printf "http://%s:%v" $serviceName $port }} +{{- else }} +{{- .Values.app.env.SOCKET_SERVER_URL | default "http://localhost:3002" }} +{{- end }} +{{- end }} + +{{/* +Resource limits and requests +*/}} +{{- define "sim.resources" -}} +{{- if .resources }} +resources: + {{- if .resources.limits }} + limits: + {{- toYaml .resources.limits | nindent 4 }} + {{- end }} + {{- if .resources.requests }} + requests: + {{- toYaml .resources.requests | nindent 4 }} + {{- end }} +{{- end }} +{{- end }} + +{{/* +Security context +*/}} +{{- define "sim.securityContext" -}} +{{- if .securityContext }} +securityContext: + {{- toYaml .securityContext | nindent 2 }} +{{- end }} +{{- end }} + +{{/* +Pod security context +*/}} +{{- define "sim.podSecurityContext" -}} +{{- if .podSecurityContext }} +securityContext: + {{- toYaml .podSecurityContext | nindent 2 }} +{{- end }} +{{- end }} + +{{/* +Node selector +*/}} +{{- define "sim.nodeSelector" -}} +{{- if .nodeSelector }} +nodeSelector: + {{- toYaml .nodeSelector | nindent 2 }} +{{- end }} +{{- end }} + +{{/* +Tolerations +*/}} +{{- define "sim.tolerations" -}} +{{- if .tolerations }} +tolerations: + {{- toYaml .tolerations | nindent 2 }} +{{- end }} +{{- end }} + +{{/* +Affinity +*/}} +{{- define "sim.affinity" -}} +{{- if .affinity }} +affinity: + {{- toYaml .affinity | nindent 2 }} +{{- end }} +{{- end }} \ No newline at end of file diff --git a/helm/sim/templates/deployment-app.yaml b/helm/sim/templates/deployment-app.yaml new file mode 100644 index 000000000..92931632b --- /dev/null +++ b/helm/sim/templates/deployment-app.yaml @@ -0,0 +1,117 @@ +{{- if .Values.app.enabled }} +{{- include "sim.validateSecrets" . }} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "sim.fullname" . }}-app + namespace: {{ .Release.Namespace }} + labels: + {{- include "sim.app.labels" . 
| nindent 4 }} +spec: + replicas: {{ .Values.app.replicaCount }} + selector: + matchLabels: + {{- include "sim.app.selectorLabels" . | nindent 6 }} + template: + metadata: + annotations: + {{- with .Values.podAnnotations }} + {{- toYaml . | nindent 8 }} + {{- end }} + labels: + {{- include "sim.app.selectorLabels" . | nindent 8 }} + {{- with .Values.podLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + spec: + {{- with .Values.global.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "sim.serviceAccountName" . }} + {{- include "sim.podSecurityContext" .Values.app | nindent 6 }} + {{- include "sim.nodeSelector" .Values.app | nindent 6 }} + {{- include "sim.tolerations" .Values | nindent 6 }} + {{- include "sim.affinity" .Values | nindent 6 }} + {{- if .Values.migrations.enabled }} + initContainers: + - name: migrations + image: {{ include "sim.image" (dict "context" . "image" .Values.migrations.image) }} + imagePullPolicy: {{ .Values.migrations.image.pullPolicy }} + command: ["bun", "run", "db:migrate"] + env: + - name: DATABASE_URL + value: {{ include "sim.databaseUrl" . | quote }} + {{- if .Values.postgresql.enabled }} + envFrom: + - secretRef: + name: {{ include "sim.fullname" . }}-postgresql-secret + {{- else if .Values.externalDatabase.enabled }} + envFrom: + - secretRef: + name: {{ include "sim.fullname" . }}-external-db-secret + {{- end }} + {{- include "sim.resources" .Values.migrations | nindent 10 }} + {{- include "sim.securityContext" .Values.migrations | nindent 10 }} + {{- end }} + containers: + - name: app + image: {{ include "sim.image" (dict "context" . "image" .Values.app.image) }} + imagePullPolicy: {{ .Values.app.image.pullPolicy }} + ports: + - name: http + containerPort: {{ .Values.app.service.targetPort }} + protocol: TCP + env: + - name: DATABASE_URL + value: {{ include "sim.databaseUrl" . | quote }} + - name: SOCKET_SERVER_URL + value: {{ include "sim.socketServerUrl" . 
| quote }} + - name: OLLAMA_URL + value: {{ include "sim.ollamaUrl" . | quote }} + {{- range $key, $value := .Values.app.env }} + - name: {{ $key }} + value: {{ $value | quote }} + {{- end }} + {{- if .Values.telemetry.enabled }} + # OpenTelemetry configuration + - name: OTEL_EXPORTER_OTLP_ENDPOINT + value: "http://{{ include "sim.fullname" . }}-otel-collector:4318" + - name: OTEL_SERVICE_NAME + value: sim-app + - name: OTEL_SERVICE_VERSION + value: {{ .Chart.AppVersion | quote }} + - name: OTEL_RESOURCE_ATTRIBUTES + value: "service.name=sim-app,service.version={{ .Chart.AppVersion }},deployment.environment={{ .Values.app.env.NODE_ENV }}" + {{- end }} + {{- with .Values.extraEnvVars }} + {{- toYaml . | nindent 12 }} + {{- end }} + {{- if .Values.postgresql.enabled }} + envFrom: + - secretRef: + name: {{ include "sim.fullname" . }}-postgresql-secret + {{- else if .Values.externalDatabase.enabled }} + envFrom: + - secretRef: + name: {{ include "sim.fullname" . }}-external-db-secret + {{- end }} + {{- if .Values.app.livenessProbe }} + livenessProbe: + {{- toYaml .Values.app.livenessProbe | nindent 12 }} + {{- end }} + {{- if .Values.app.readinessProbe }} + readinessProbe: + {{- toYaml .Values.app.readinessProbe | nindent 12 }} + {{- end }} + {{- include "sim.resources" .Values.app | nindent 10 }} + {{- include "sim.securityContext" .Values.app | nindent 10 }} + {{- with .Values.extraVolumeMounts }} + volumeMounts: + {{- toYaml . | nindent 12 }} + {{- end }} + {{- with .Values.extraVolumes }} + volumes: + {{- toYaml . 
| nindent 8 }} + {{- end }} +{{- end }} \ No newline at end of file diff --git a/helm/sim/templates/deployment-ollama.yaml b/helm/sim/templates/deployment-ollama.yaml new file mode 100644 index 000000000..a46f2b661 --- /dev/null +++ b/helm/sim/templates/deployment-ollama.yaml @@ -0,0 +1,112 @@ +{{- if .Values.ollama.enabled }} +--- +# PersistentVolumeClaim for Ollama data +{{- if .Values.ollama.persistence.enabled }} +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{ include "sim.fullname" . }}-ollama-data + namespace: {{ .Release.Namespace }} + labels: + {{- include "sim.ollama.labels" . | nindent 4 }} +spec: + {{- if .Values.ollama.persistence.storageClass }} + {{- if (eq "-" .Values.ollama.persistence.storageClass) }} + storageClassName: "" + {{- else }} + storageClassName: {{ .Values.ollama.persistence.storageClass | quote }} + {{- end }} + {{- else if .Values.global.storageClass }} + storageClassName: {{ .Values.global.storageClass | quote }} + {{- end }} + accessModes: + {{- range .Values.ollama.persistence.accessModes }} + - {{ . | quote }} + {{- end }} + resources: + requests: + storage: {{ .Values.ollama.persistence.size | quote }} +{{- end }} + +--- +# Deployment for Ollama +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "sim.fullname" . }}-ollama + namespace: {{ .Release.Namespace }} + labels: + {{- include "sim.ollama.labels" . | nindent 4 }} +spec: + replicas: {{ .Values.ollama.replicaCount }} + selector: + matchLabels: + {{- include "sim.ollama.selectorLabels" . | nindent 6 }} + template: + metadata: + annotations: + {{- with .Values.podAnnotations }} + {{- toYaml . | nindent 8 }} + {{- end }} + labels: + {{- include "sim.ollama.selectorLabels" . | nindent 8 }} + {{- with .Values.podLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + spec: + {{- with .Values.global.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "sim.serviceAccountName" . 
}} + {{- include "sim.nodeSelector" .Values.ollama | nindent 6 }} + {{- include "sim.tolerations" .Values.ollama | nindent 6 }} + {{- include "sim.affinity" .Values | nindent 6 }} + containers: + - name: ollama + image: {{ include "sim.image" (dict "context" . "image" .Values.ollama.image) }} + imagePullPolicy: {{ .Values.ollama.image.pullPolicy }} + command: ["ollama", "serve"] + ports: + - name: http + containerPort: {{ .Values.ollama.service.targetPort }} + protocol: TCP + env: + {{- range $key, $value := .Values.ollama.env }} + - name: {{ $key }} + value: {{ $value | quote }} + {{- end }} + {{- with .Values.extraEnvVars }} + {{- toYaml . | nindent 12 }} + {{- end }} + {{- if .Values.ollama.startupProbe }} + startupProbe: + {{- toYaml .Values.ollama.startupProbe | nindent 12 }} + {{- end }} + {{- if .Values.ollama.livenessProbe }} + livenessProbe: + {{- toYaml .Values.ollama.livenessProbe | nindent 12 }} + {{- end }} + {{- if .Values.ollama.readinessProbe }} + readinessProbe: + {{- toYaml .Values.ollama.readinessProbe | nindent 12 }} + {{- end }} + {{- include "sim.resources" .Values.ollama | nindent 10 }} + volumeMounts: + {{- if .Values.ollama.persistence.enabled }} + - name: ollama-data + mountPath: /root/.ollama + {{- end }} + {{- with .Values.extraVolumeMounts }} + {{- toYaml . | nindent 12 }} + {{- end }} + volumes: # always rendered (like volumeMounts) so extraVolumes still work when persistence is disabled + {{- if .Values.ollama.persistence.enabled }} + - name: ollama-data + persistentVolumeClaim: + claimName: {{ include "sim.fullname" . }}-ollama-data + {{- end }} + {{- with .Values.extraVolumes }} + {{- toYaml . | nindent 8 }} + {{- end }} +{{- end }} \ No newline at end of file diff --git a/helm/sim/templates/deployment-realtime.yaml b/helm/sim/templates/deployment-realtime.yaml new file mode 100644 index 000000000..b951aee16 --- /dev/null +++ b/helm/sim/templates/deployment-realtime.yaml @@ -0,0 +1,92 @@ +{{- if .Values.realtime.enabled }} +{{- include "sim.validateSecrets" . 
}} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "sim.fullname" . }}-realtime + namespace: {{ .Release.Namespace }} + labels: + {{- include "sim.realtime.labels" . | nindent 4 }} +spec: + replicas: {{ .Values.realtime.replicaCount }} + selector: + matchLabels: + {{- include "sim.realtime.selectorLabels" . | nindent 6 }} + template: + metadata: + annotations: + {{- with .Values.podAnnotations }} + {{- toYaml . | nindent 8 }} + {{- end }} + labels: + {{- include "sim.realtime.selectorLabels" . | nindent 8 }} + {{- with .Values.podLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + spec: + {{- with .Values.global.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "sim.serviceAccountName" . }} + {{- include "sim.podSecurityContext" .Values.realtime | nindent 6 }} + {{- include "sim.nodeSelector" .Values.realtime | nindent 6 }} + {{- include "sim.tolerations" .Values | nindent 6 }} + {{- include "sim.affinity" .Values | nindent 6 }} + containers: + - name: realtime + image: {{ include "sim.image" (dict "context" . "image" .Values.realtime.image) }} + imagePullPolicy: {{ .Values.realtime.image.pullPolicy }} + ports: + - name: http + containerPort: {{ .Values.realtime.service.targetPort }} + protocol: TCP + env: + - name: DATABASE_URL + value: {{ include "sim.databaseUrl" . | quote }} + {{- range $key, $value := .Values.realtime.env }} + - name: {{ $key }} + value: {{ $value | quote }} + {{- end }} + {{- if .Values.telemetry.enabled }} + # OpenTelemetry configuration + - name: OTEL_EXPORTER_OTLP_ENDPOINT + value: "http://{{ include "sim.fullname" . 
}}-otel-collector:4318" + - name: OTEL_SERVICE_NAME + value: sim-realtime + - name: OTEL_SERVICE_VERSION + value: {{ .Chart.AppVersion | quote }} + - name: OTEL_RESOURCE_ATTRIBUTES + value: "service.name=sim-realtime,service.version={{ .Chart.AppVersion }},deployment.environment={{ .Values.realtime.env.NODE_ENV }}" + {{- end }} + {{- with .Values.extraEnvVars }} + {{- toYaml . | nindent 12 }} + {{- end }} + {{- if .Values.postgresql.enabled }} + envFrom: + - secretRef: + name: {{ include "sim.fullname" . }}-postgresql-secret + {{- else if .Values.externalDatabase.enabled }} + envFrom: + - secretRef: + name: {{ include "sim.fullname" . }}-external-db-secret + {{- end }} + {{- if .Values.realtime.livenessProbe }} + livenessProbe: + {{- toYaml .Values.realtime.livenessProbe | nindent 12 }} + {{- end }} + {{- if .Values.realtime.readinessProbe }} + readinessProbe: + {{- toYaml .Values.realtime.readinessProbe | nindent 12 }} + {{- end }} + {{- include "sim.resources" .Values.realtime | nindent 10 }} + {{- include "sim.securityContext" .Values.realtime | nindent 10 }} + {{- with .Values.extraVolumeMounts }} + volumeMounts: + {{- toYaml . | nindent 12 }} + {{- end }} + {{- with .Values.extraVolumes }} + volumes: + {{- toYaml . | nindent 8 }} + {{- end }} +{{- end }} \ No newline at end of file diff --git a/helm/sim/templates/gpu-device-plugin.yaml b/helm/sim/templates/gpu-device-plugin.yaml new file mode 100644 index 000000000..df9a30b3d --- /dev/null +++ b/helm/sim/templates/gpu-device-plugin.yaml @@ -0,0 +1,102 @@ +{{- if and .Values.ollama.enabled .Values.ollama.gpu.enabled }} +--- +# NVIDIA Device Plugin DaemonSet for GPU support +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: {{ include "sim.fullname" . }}-nvidia-device-plugin + namespace: {{ .Release.Namespace }} + labels: + {{- include "sim.labels" . | nindent 4 }} + app.kubernetes.io/component: nvidia-device-plugin +spec: + selector: + matchLabels: + {{- include "sim.selectorLabels" . 
| nindent 6 }} + app.kubernetes.io/component: nvidia-device-plugin + updateStrategy: + type: RollingUpdate + template: + metadata: + labels: + {{- include "sim.selectorLabels" . | nindent 8 }} + app.kubernetes.io/component: nvidia-device-plugin + spec: + tolerations: + # Allow scheduling on GPU nodes + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + - key: sku + operator: Equal + value: gpu + effect: NoSchedule + nodeSelector: + # Only schedule on nodes with NVIDIA GPUs + accelerator: nvidia + priorityClassName: system-node-critical + runtimeClassName: {{ include "sim.fullname" . }}-nvidia # must match the RuntimeClass defined at the end of this file + hostNetwork: true + hostPID: true + volumes: + - name: device-plugin + hostPath: + path: /var/lib/kubelet/device-plugins + - name: dev + hostPath: + path: /dev + - name: sys + hostPath: + path: /sys + - name: proc-driver-nvidia + hostPath: + path: /proc/driver/nvidia + containers: + - name: nvidia-device-plugin + image: nvcr.io/nvidia/k8s-device-plugin:v0.14.5 + imagePullPolicy: Always + args: + - --mig-strategy=single + - --pass-device-specs=true + - --fail-on-init-error=false + - --device-list-strategy=envvar + - --nvidia-driver-root=/host-sys/fs/cgroup + env: + - name: NVIDIA_MIG_MONITOR_DEVICES + value: all + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + volumeMounts: + - name: device-plugin + mountPath: /var/lib/kubelet/device-plugins + - name: dev + mountPath: /dev + - name: sys + mountPath: /host-sys + readOnly: true + - name: proc-driver-nvidia + mountPath: /proc/driver/nvidia + readOnly: true + resources: + requests: + cpu: 50m + memory: 10Mi + limits: + cpu: 50m + memory: 20Mi + {{- if .Values.nodeSelector }} + nodeSelector: + {{- toYaml .Values.nodeSelector | nindent 8 }} + {{- end }} +--- +# RuntimeClass for NVIDIA Container Runtime +apiVersion: node.k8s.io/v1 +kind: RuntimeClass +metadata: + name: {{ include "sim.fullname" . }}-nvidia + labels: + {{- include "sim.labels" . 
| nindent 4 }} +handler: nvidia +{{- end }} \ No newline at end of file diff --git a/helm/sim/templates/hpa.yaml b/helm/sim/templates/hpa.yaml new file mode 100644 index 000000000..01c20058f --- /dev/null +++ b/helm/sim/templates/hpa.yaml @@ -0,0 +1,85 @@ +{{- if .Values.autoscaling.enabled }} +--- +# HorizontalPodAutoscaler for main application +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: {{ include "sim.fullname" . }}-app + namespace: {{ .Release.Namespace }} + labels: + {{- include "sim.app.labels" . | nindent 4 }} +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: {{ include "sim.fullname" . }}-app + minReplicas: {{ .Values.autoscaling.minReplicas }} + maxReplicas: {{ .Values.autoscaling.maxReplicas }} + metrics: + {{- if .Values.autoscaling.targetCPUUtilizationPercentage }} + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: {{ .Values.autoscaling.targetCPUUtilizationPercentage }} + {{- end }} + {{- if .Values.autoscaling.targetMemoryUtilizationPercentage }} + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: {{ .Values.autoscaling.targetMemoryUtilizationPercentage }} + {{- end }} + {{- with .Values.autoscaling.customMetrics }} + {{- toYaml . | nindent 4 }} + {{- end }} + {{- if .Values.autoscaling.behavior }} + behavior: + {{- toYaml .Values.autoscaling.behavior | nindent 4 }} + {{- end }} +{{- end }} + +{{- if and .Values.autoscaling.enabled .Values.realtime.enabled }} +--- +# HorizontalPodAutoscaler for realtime service +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: {{ include "sim.fullname" . }}-realtime + namespace: {{ .Release.Namespace }} + labels: + {{- include "sim.realtime.labels" . | nindent 4 }} +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: {{ include "sim.fullname" . 
}}-realtime + minReplicas: {{ .Values.autoscaling.minReplicas }} + maxReplicas: {{ .Values.autoscaling.maxReplicas }} + metrics: + {{- if .Values.autoscaling.targetCPUUtilizationPercentage }} + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: {{ .Values.autoscaling.targetCPUUtilizationPercentage }} + {{- end }} + {{- if .Values.autoscaling.targetMemoryUtilizationPercentage }} + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: {{ .Values.autoscaling.targetMemoryUtilizationPercentage }} + {{- end }} + {{- with .Values.autoscaling.customMetrics }} + {{- toYaml . | nindent 4 }} + {{- end }} + {{- if .Values.autoscaling.behavior }} + behavior: + {{- toYaml .Values.autoscaling.behavior | nindent 4 }} + {{- end }} +{{- end }} \ No newline at end of file diff --git a/helm/sim/templates/ingress.yaml b/helm/sim/templates/ingress.yaml new file mode 100644 index 000000000..3195b300e --- /dev/null +++ b/helm/sim/templates/ingress.yaml @@ -0,0 +1,55 @@ +{{- if .Values.ingress.enabled }} +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: {{ include "sim.fullname" . }}-ingress + namespace: {{ .Release.Namespace }} + labels: + {{- include "sim.labels" . | nindent 4 }} + {{- with .Values.ingress.annotations }} + annotations: + {{- toYaml . 
| nindent 4 }} + {{- end }} +spec: + {{- if .Values.ingress.className }} + ingressClassName: {{ .Values.ingress.className }} + {{- end }} + {{- if .Values.ingress.tls.enabled }} + tls: + - hosts: + - {{ .Values.ingress.app.host }} + {{- if .Values.realtime.enabled }} + - {{ .Values.ingress.realtime.host }} + {{- end }} + secretName: {{ .Values.ingress.tls.secretName }} + {{- end }} + rules: + # Main application ingress rule + - host: {{ .Values.ingress.app.host }} + http: + paths: + {{- range .Values.ingress.app.paths }} + - path: {{ .path }} + pathType: {{ .pathType }} + backend: + service: + name: {{ include "sim.fullname" $ }}-app + port: + number: {{ $.Values.app.service.port }} + {{- end }} + {{- if .Values.realtime.enabled }} + # Realtime service ingress rule + - host: {{ .Values.ingress.realtime.host }} + http: + paths: + {{- range .Values.ingress.realtime.paths }} + - path: {{ .path }} + pathType: {{ .pathType }} + backend: + service: + name: {{ include "sim.fullname" $ }}-realtime + port: + number: {{ $.Values.realtime.service.port }} + {{- end }} + {{- end }} +{{- end }} \ No newline at end of file diff --git a/helm/sim/templates/networkpolicy.yaml b/helm/sim/templates/networkpolicy.yaml new file mode 100644 index 000000000..deac5a5db --- /dev/null +++ b/helm/sim/templates/networkpolicy.yaml @@ -0,0 +1,242 @@ +{{- if .Values.networkPolicy.enabled }} +--- +# Network Policy for main application +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: {{ include "sim.fullname" . }}-app + namespace: {{ .Release.Namespace }} + labels: + {{- include "sim.app.labels" . | nindent 4 }} +spec: + podSelector: + matchLabels: + {{- include "sim.app.selectorLabels" . | nindent 6 }} + policyTypes: + - Ingress + - Egress + ingress: + # Allow ingress from realtime service + {{- if .Values.realtime.enabled }} + - from: + - podSelector: + matchLabels: + {{- include "sim.realtime.selectorLabels" . 
| nindent 10 }} + ports: + - protocol: TCP + port: {{ .Values.app.service.targetPort }} + {{- end }} + # Allow ingress from ingress controller + {{- if .Values.ingress.enabled }} + - from: [] + ports: + - protocol: TCP + port: {{ .Values.app.service.targetPort }} + {{- end }} + # Allow custom ingress rules + {{- with .Values.networkPolicy.ingress }} + {{- toYaml . | nindent 2 }} + {{- end }} + egress: + # Allow egress to PostgreSQL + {{- if .Values.postgresql.enabled }} + - to: + - podSelector: + matchLabels: + {{- include "sim.postgresql.selectorLabels" . | nindent 10 }} + ports: + - protocol: TCP + port: {{ .Values.postgresql.service.targetPort }} + {{- end }} + # Allow egress to realtime service + {{- if .Values.realtime.enabled }} + - to: + - podSelector: + matchLabels: + {{- include "sim.realtime.selectorLabels" . | nindent 10 }} + ports: + - protocol: TCP + port: {{ .Values.realtime.service.targetPort }} + {{- end }} + # Allow egress to Ollama + {{- if .Values.ollama.enabled }} + - to: + - podSelector: + matchLabels: + {{- include "sim.ollama.selectorLabels" . | nindent 10 }} + ports: + - protocol: TCP + port: {{ .Values.ollama.service.targetPort }} + {{- end }} + # Allow DNS resolution + - to: [] + ports: + - protocol: UDP + port: 53 + - protocol: TCP + port: 53 + # Allow HTTPS egress for external APIs + - to: [] + ports: + - protocol: TCP + port: 443 + # Allow custom egress rules + {{- with .Values.networkPolicy.egress }} + {{- toYaml . | nindent 2 }} + {{- end }} + +{{- if .Values.realtime.enabled }} +--- +# Network Policy for realtime service +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: {{ include "sim.fullname" . }}-realtime + namespace: {{ .Release.Namespace }} + labels: + {{- include "sim.realtime.labels" . | nindent 4 }} +spec: + podSelector: + matchLabels: + {{- include "sim.realtime.selectorLabels" . 
| nindent 6 }} + policyTypes: + - Ingress + - Egress + ingress: + # Allow ingress from main application + - from: + - podSelector: + matchLabels: + {{- include "sim.app.selectorLabels" . | nindent 10 }} + ports: + - protocol: TCP + port: {{ .Values.realtime.service.targetPort }} + # Allow ingress from ingress controller + {{- if .Values.ingress.enabled }} + - from: [] + ports: + - protocol: TCP + port: {{ .Values.realtime.service.targetPort }} + {{- end }} + egress: + # Allow egress to PostgreSQL + {{- if .Values.postgresql.enabled }} + - to: + - podSelector: + matchLabels: + {{- include "sim.postgresql.selectorLabels" . | nindent 10 }} + ports: + - protocol: TCP + port: {{ .Values.postgresql.service.targetPort }} + {{- end }} + # Allow DNS resolution + - to: [] + ports: + - protocol: UDP + port: 53 + - protocol: TCP + port: 53 + # Allow HTTPS egress for external APIs + - to: [] + ports: + - protocol: TCP + port: 443 +{{- end }} + +{{- if .Values.postgresql.enabled }} +--- +# Network Policy for PostgreSQL +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: {{ include "sim.fullname" . }}-postgresql + namespace: {{ .Release.Namespace }} + labels: + {{- include "sim.postgresql.labels" . | nindent 4 }} +spec: + podSelector: + matchLabels: + {{- include "sim.postgresql.selectorLabels" . | nindent 6 }} + policyTypes: + - Ingress + - Egress + ingress: + # Allow ingress from main application + - from: + - podSelector: + matchLabels: + {{- include "sim.app.selectorLabels" . | nindent 10 }} + ports: + - protocol: TCP + port: {{ .Values.postgresql.service.targetPort }} + # Allow ingress from realtime service + {{- if .Values.realtime.enabled }} + - from: + - podSelector: + matchLabels: + {{- include "sim.realtime.selectorLabels" . 
| nindent 10 }} + ports: + - protocol: TCP + port: {{ .Values.postgresql.service.targetPort }} + {{- end }} + # Allow ingress from migrations job + {{- if .Values.migrations.enabled }} + - from: + - podSelector: + matchLabels: + {{- include "sim.migrations.labels" . | nindent 10 }} + ports: + - protocol: TCP + port: {{ .Values.postgresql.service.targetPort }} + {{- end }} + egress: + # Allow minimal egress (for health checks, etc.) + - to: [] + ports: + - protocol: UDP + port: 53 + - protocol: TCP + port: 53 +{{- end }} + +{{- if .Values.ollama.enabled }} +--- +# Network Policy for Ollama +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: {{ include "sim.fullname" . }}-ollama + namespace: {{ .Release.Namespace }} + labels: + {{- include "sim.ollama.labels" . | nindent 4 }} +spec: + podSelector: + matchLabels: + {{- include "sim.ollama.selectorLabels" . | nindent 6 }} + policyTypes: + - Ingress + - Egress + ingress: + # Allow ingress from main application + - from: + - podSelector: + matchLabels: + {{- include "sim.app.selectorLabels" . | nindent 10 }} + ports: + - protocol: TCP + port: {{ .Values.ollama.service.targetPort }} + egress: + # Allow DNS resolution + - to: [] + ports: + - protocol: UDP + port: 53 + - protocol: TCP + port: 53 + # Allow HTTPS egress for model downloads + - to: [] + ports: + - protocol: TCP + port: 443 +{{- end }} +{{- end }} \ No newline at end of file diff --git a/helm/sim/templates/serviceaccount.yaml b/helm/sim/templates/serviceaccount.yaml new file mode 100644 index 000000000..6a5a63521 --- /dev/null +++ b/helm/sim/templates/serviceaccount.yaml @@ -0,0 +1,13 @@ +{{- if .Values.serviceAccount.create -}} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "sim.serviceAccountName" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "sim.labels" . | nindent 4 }} + {{- with .Values.serviceAccount.annotations }} + annotations: + {{- toYaml . 
| nindent 4 }} + {{- end }} +{{- end }} \ No newline at end of file diff --git a/helm/sim/templates/servicemonitor.yaml b/helm/sim/templates/servicemonitor.yaml new file mode 100644 index 000000000..5afeca19f --- /dev/null +++ b/helm/sim/templates/servicemonitor.yaml @@ -0,0 +1,79 @@ +{{- if .Values.monitoring.serviceMonitor.enabled }} +--- +# ServiceMonitor for main application +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ include "sim.fullname" . }}-app + namespace: {{ .Release.Namespace }} + labels: + {{- include "sim.app.labels" . | nindent 4 }} + {{- with .Values.monitoring.serviceMonitor.labels }} + {{- toYaml . | nindent 4 }} + {{- end }} + {{- with .Values.monitoring.serviceMonitor.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + selector: + matchLabels: + {{- include "sim.app.selectorLabels" . | nindent 6 }} + endpoints: + - port: http + path: {{ .Values.monitoring.serviceMonitor.path }} + interval: {{ .Values.monitoring.serviceMonitor.interval }} + scrapeTimeout: {{ .Values.monitoring.serviceMonitor.scrapeTimeout }} + {{- with .Values.monitoring.serviceMonitor.metricRelabelings }} + metricRelabelings: + {{- toYaml . | nindent 6 }} + {{- end }} + {{- with .Values.monitoring.serviceMonitor.relabelings }} + relabelings: + {{- toYaml . | nindent 6 }} + {{- end }} + {{- if .Values.monitoring.serviceMonitor.targetLabels }} + targetLabels: + {{- toYaml .Values.monitoring.serviceMonitor.targetLabels | nindent 4 }} + {{- end }} +{{- end }} + +{{- if and .Values.monitoring.serviceMonitor.enabled .Values.realtime.enabled }} +--- +# ServiceMonitor for realtime service +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ include "sim.fullname" . }}-realtime + namespace: {{ .Release.Namespace }} + labels: + {{- include "sim.realtime.labels" . | nindent 4 }} + {{- with .Values.monitoring.serviceMonitor.labels }} + {{- toYaml . 
| nindent 4 }} + {{- end }} + {{- with .Values.monitoring.serviceMonitor.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + selector: + matchLabels: + {{- include "sim.realtime.selectorLabels" . | nindent 6 }} + endpoints: + - port: http + path: {{ .Values.monitoring.serviceMonitor.path }} + interval: {{ .Values.monitoring.serviceMonitor.interval }} + scrapeTimeout: {{ .Values.monitoring.serviceMonitor.scrapeTimeout }} + {{- with .Values.monitoring.serviceMonitor.metricRelabelings }} + metricRelabelings: + {{- toYaml . | nindent 6 }} + {{- end }} + {{- with .Values.monitoring.serviceMonitor.relabelings }} + relabelings: + {{- toYaml . | nindent 6 }} + {{- end }} + {{- if .Values.monitoring.serviceMonitor.targetLabels }} + targetLabels: + {{- toYaml .Values.monitoring.serviceMonitor.targetLabels | nindent 4 }} + {{- end }} +{{- end }} \ No newline at end of file diff --git a/helm/sim/templates/services.yaml b/helm/sim/templates/services.yaml new file mode 100644 index 000000000..7ce6634b7 --- /dev/null +++ b/helm/sim/templates/services.yaml @@ -0,0 +1,83 @@ +{{- if .Values.app.enabled }} +--- +# Service for main application +apiVersion: v1 +kind: Service +metadata: + name: {{ include "sim.fullname" . }}-app + namespace: {{ .Release.Namespace }} + labels: + {{- include "sim.app.labels" . | nindent 4 }} +spec: + type: {{ .Values.app.service.type }} + ports: + - port: {{ .Values.app.service.port }} + targetPort: {{ .Values.app.service.targetPort }} + protocol: TCP + name: http + selector: + {{- include "sim.app.selectorLabels" . | nindent 4 }} +{{- end }} + +{{- if .Values.realtime.enabled }} +--- +# Service for realtime server +apiVersion: v1 +kind: Service +metadata: + name: {{ include "sim.fullname" . }}-realtime + namespace: {{ .Release.Namespace }} + labels: + {{- include "sim.realtime.labels" . 
| nindent 4 }} +spec: + type: {{ .Values.realtime.service.type }} + ports: + - port: {{ .Values.realtime.service.port }} + targetPort: {{ .Values.realtime.service.targetPort }} + protocol: TCP + name: http + selector: + {{- include "sim.realtime.selectorLabels" . | nindent 4 }} +{{- end }} + +{{- if .Values.postgresql.enabled }} +--- +# Service for PostgreSQL +apiVersion: v1 +kind: Service +metadata: + name: {{ include "sim.fullname" . }}-postgresql + namespace: {{ .Release.Namespace }} + labels: + {{- include "sim.postgresql.labels" . | nindent 4 }} +spec: + type: {{ .Values.postgresql.service.type }} + ports: + - port: {{ .Values.postgresql.service.port }} + targetPort: {{ .Values.postgresql.service.targetPort }} + protocol: TCP + name: postgresql + selector: + {{- include "sim.postgresql.selectorLabels" . | nindent 4 }} +{{- end }} + +{{- if .Values.ollama.enabled }} +--- +# Service for Ollama +apiVersion: v1 +kind: Service +metadata: + name: {{ include "sim.fullname" . }}-ollama + namespace: {{ .Release.Namespace }} + labels: + {{- include "sim.ollama.labels" . | nindent 4 }} +spec: + type: {{ .Values.ollama.service.type }} + ports: + - port: {{ .Values.ollama.service.port }} + targetPort: {{ .Values.ollama.service.targetPort }} + protocol: TCP + name: http + selector: + {{- include "sim.ollama.selectorLabels" . 
| nindent 4 }} +{{- end }} \ No newline at end of file diff --git a/helm/sim/templates/shared-storage.yaml b/helm/sim/templates/shared-storage.yaml new file mode 100644 index 000000000..509af403c --- /dev/null +++ b/helm/sim/templates/shared-storage.yaml @@ -0,0 +1,48 @@ +{{- if .Values.sharedStorage.enabled }} +{{- range .Values.sharedStorage.volumes }} +--- +# Shared Storage PVC for {{ .name }} +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{ include "sim.fullname" $ }}-{{ .name }} + namespace: {{ $.Release.Namespace }} + labels: + {{- include "sim.labels" $ | nindent 4 }} + sim.ai/volume-type: shared-storage + sim.ai/volume-name: {{ .name }} + {{- with .annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + {{- if .storageClass }} + {{- if (eq "-" .storageClass) }} + storageClassName: "" + {{- else }} + storageClassName: {{ .storageClass | quote }} + {{- end }} + {{- else if $.Values.sharedStorage.storageClass }} + storageClassName: {{ $.Values.sharedStorage.storageClass | quote }} + {{- else if $.Values.global.storageClass }} + storageClassName: {{ $.Values.global.storageClass | quote }} + {{- end }} + accessModes: + {{- if .accessModes }} + {{- range .accessModes }} + - {{ . | quote }} + {{- end }} + {{- else }} + {{- range $.Values.sharedStorage.defaultAccessModes }} + - {{ . | quote }} + {{- end }} + {{- end }} + resources: + requests: + storage: {{ .size | quote }} + {{- if .selector }} + selector: + {{- toYaml .selector | nindent 4 }} + {{- end }} +{{- end }} +{{- end }} \ No newline at end of file diff --git a/helm/sim/templates/statefulset-postgresql.yaml b/helm/sim/templates/statefulset-postgresql.yaml new file mode 100644 index 000000000..f9283b614 --- /dev/null +++ b/helm/sim/templates/statefulset-postgresql.yaml @@ -0,0 +1,195 @@ +{{- if .Values.postgresql.enabled }} +--- +# ConfigMap for PostgreSQL configuration +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "sim.fullname" . 
}}-postgresql-config + namespace: {{ .Release.Namespace }} + labels: + {{- include "sim.postgresql.labels" . | nindent 4 }} +data: + postgresql.conf: | + hba_file = '/etc/postgresql/pg_hba.conf' + listen_addresses = '0.0.0.0' + max_connections = {{ .Values.postgresql.config.maxConnections }} + tcp_keepalives_idle = 60 + tcp_keepalives_interval = 5 + tcp_keepalives_count = 3 + authentication_timeout = 1min + password_encryption = scram-sha-256 + {{- if .Values.postgresql.tls.enabled }} + ssl = on + ssl_cert_file = '/etc/postgresql/tls/tls.crt' + ssl_key_file = '/etc/postgresql/tls/tls.key' + {{- else }} + ssl = off + {{- end }} + shared_buffers = {{ .Values.postgresql.config.sharedBuffers }} + dynamic_shared_memory_type = posix + max_wal_size = {{ .Values.postgresql.config.maxWalSize }} + min_wal_size = {{ .Values.postgresql.config.minWalSize }} + log_timezone = 'Etc/UTC' + idle_in_transaction_session_timeout = 50000000 + datestyle = 'iso, mdy' + timezone = 'Etc/UTC' + lc_messages = 'en_US.utf8' + lc_monetary = 'en_US.utf8' + lc_numeric = 'en_US.utf8' + lc_time = 'en_US.utf8' + default_text_search_config = 'pg_catalog.english' + + pg_hba.conf: | + # Secure authentication for all connections + local all all scram-sha-256 + host all all 127.0.0.1/32 scram-sha-256 + host all all ::1/128 scram-sha-256 + host all all all scram-sha-256 + + # Replication connections also require authentication + local replication all scram-sha-256 + host replication all 127.0.0.1/32 scram-sha-256 + host replication all ::1/128 scram-sha-256 + +--- +# ConfigMap for PostgreSQL environment variables +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "sim.fullname" . }}-postgresql-env + namespace: {{ .Release.Namespace }} + labels: + {{- include "sim.postgresql.labels" . 
| nindent 4 }} +data: + POSTGRES_DB: {{ .Values.postgresql.auth.database | quote }} + POSTGRES_USER: {{ .Values.postgresql.auth.username | quote }} + PGDATA: "/var/lib/postgresql/data/pgdata" + +--- +# Secret for PostgreSQL password +apiVersion: v1 +kind: Secret +metadata: + name: {{ include "sim.fullname" . }}-postgresql-secret + namespace: {{ .Release.Namespace }} + labels: + {{- include "sim.postgresql.labels" . | nindent 4 }} +type: Opaque +data: + POSTGRES_PASSWORD: {{ .Values.postgresql.auth.password | b64enc }} + +--- +# StatefulSet for PostgreSQL +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: {{ include "sim.fullname" . }}-postgresql + namespace: {{ .Release.Namespace }} + labels: + {{- include "sim.postgresql.labels" . | nindent 4 }} +spec: + serviceName: {{ include "sim.fullname" . }}-postgresql + replicas: 1 + minReadySeconds: 10 + selector: + matchLabels: + {{- include "sim.postgresql.selectorLabels" . | nindent 6 }} + template: + metadata: + annotations: + {{- with .Values.podAnnotations }} + {{- toYaml . | nindent 8 }} + {{- end }} + labels: + {{- include "sim.postgresql.selectorLabels" . | nindent 8 }} + {{- with .Values.podLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + spec: + {{- with .Values.global.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "sim.serviceAccountName" . }} + {{- include "sim.podSecurityContext" .Values.postgresql | nindent 6 }} + {{- include "sim.nodeSelector" .Values.postgresql | nindent 6 }} + {{- include "sim.tolerations" .Values | nindent 6 }} + {{- include "sim.affinity" .Values | nindent 6 }} + containers: + - name: postgresql + image: {{ include "sim.image" (dict "context" . 
"image" .Values.postgresql.image) }} + imagePullPolicy: {{ .Values.postgresql.image.pullPolicy }} + args: ["-c", "config_file=/etc/postgresql/postgresql.conf"] + ports: + - name: postgresql + containerPort: {{ .Values.postgresql.service.targetPort }} + protocol: TCP + envFrom: + - configMapRef: + name: {{ include "sim.fullname" . }}-postgresql-env + - secretRef: + name: {{ include "sim.fullname" . }}-postgresql-secret + {{- if .Values.postgresql.livenessProbe }} + livenessProbe: + {{- toYaml .Values.postgresql.livenessProbe | nindent 12 }} + {{- end }} + {{- if .Values.postgresql.readinessProbe }} + readinessProbe: + {{- toYaml .Values.postgresql.readinessProbe | nindent 12 }} + {{- end }} + {{- include "sim.resources" .Values.postgresql | nindent 10 }} + {{- include "sim.securityContext" .Values.postgresql | nindent 10 }} + volumeMounts: + {{- if .Values.postgresql.persistence.enabled }} + - name: postgresql-data + mountPath: /var/lib/postgresql/data + subPath: pgdata + {{- end }} + - name: postgresql-config + mountPath: "/etc/postgresql" + {{- if .Values.postgresql.tls.enabled }} + - name: postgresql-tls + mountPath: "/etc/postgresql/tls" + readOnly: true + {{- end }} + {{- with .Values.extraVolumeMounts }} + {{- toYaml . | nindent 12 }} + {{- end }} + volumes: + - name: postgresql-config + configMap: + name: {{ include "sim.fullname" . }}-postgresql-config + {{- if .Values.postgresql.tls.enabled }} + - name: postgresql-tls + secret: + secretName: {{ .Values.postgresql.tls.certificatesSecret }} + defaultMode: 0600 + {{- end }} + {{- with .Values.extraVolumes }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- if .Values.postgresql.persistence.enabled }} + volumeClaimTemplates: + - metadata: + name: postgresql-data + labels: + {{- include "sim.postgresql.labels" . 
| nindent 10 }} + spec: + {{- if .Values.postgresql.persistence.storageClass }} + {{- if (eq "-" .Values.postgresql.persistence.storageClass) }} + storageClassName: "" + {{- else }} + storageClassName: {{ .Values.postgresql.persistence.storageClass | quote }} + {{- end }} + {{- else if .Values.global.storageClass }} + storageClassName: {{ .Values.global.storageClass | quote }} + {{- end }} + accessModes: + {{- range .Values.postgresql.persistence.accessModes }} + - {{ . | quote }} + {{- end }} + resources: + requests: + storage: {{ .Values.postgresql.persistence.size | quote }} + {{- end }} +{{- end }} \ No newline at end of file diff --git a/helm/sim/templates/telemetry.yaml b/helm/sim/templates/telemetry.yaml new file mode 100644 index 000000000..0f4307b8b --- /dev/null +++ b/helm/sim/templates/telemetry.yaml @@ -0,0 +1,224 @@ +{{- if .Values.telemetry.enabled }} +--- +# OpenTelemetry Collector Configuration +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "sim.fullname" . }}-otel-config + namespace: {{ .Release.Namespace }} + labels: + {{- include "sim.labels" . | nindent 4 }} + app.kubernetes.io/component: telemetry +data: + otel-config.yaml: | + receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + prometheus: + config: + scrape_configs: + - job_name: 'sim-app' + static_configs: + - targets: ['{{ include "sim.fullname" . }}-app:{{ .Values.app.service.port }}'] + - job_name: 'sim-realtime' + static_configs: + - targets: ['{{ include "sim.fullname" . 
}}-realtime:{{ .Values.realtime.service.port }}'] + + processors: + batch: + timeout: 1s + send_batch_size: 1024 + memory_limiter: + limit_mib: 512 + + exporters: + {{- if .Values.telemetry.jaeger.enabled }} + jaeger: + endpoint: {{ .Values.telemetry.jaeger.endpoint }} + tls: + insecure: {{ not .Values.telemetry.jaeger.tls.enabled }} + {{- end }} + {{- if .Values.telemetry.prometheus.enabled }} + prometheusremotewrite: + endpoint: {{ .Values.telemetry.prometheus.endpoint }} + headers: + Authorization: {{ .Values.telemetry.prometheus.auth | quote }} + {{- end }} + {{- if .Values.telemetry.otlp.enabled }} + otlp: + endpoint: {{ .Values.telemetry.otlp.endpoint }} + tls: + insecure: {{ not .Values.telemetry.otlp.tls.enabled }} + {{- end }} + logging: + loglevel: info + + extensions: + health_check: + endpoint: 0.0.0.0:13133 + pprof: + endpoint: 0.0.0.0:1777 + zpages: + endpoint: 0.0.0.0:55679 + + service: + extensions: [health_check, pprof, zpages] + pipelines: + traces: + receivers: [otlp] + processors: [memory_limiter, batch] + exporters: + - logging + {{- if .Values.telemetry.jaeger.enabled }} + - jaeger + {{- end }} + {{- if .Values.telemetry.otlp.enabled }} + - otlp + {{- end }} + metrics: + receivers: [otlp, prometheus] + processors: [memory_limiter, batch] + exporters: + - logging + {{- if .Values.telemetry.prometheus.enabled }} + - prometheusremotewrite + {{- end }} + {{- if .Values.telemetry.otlp.enabled }} + - otlp + {{- end }} + logs: + receivers: [otlp] + processors: [memory_limiter, batch] + exporters: + - logging + {{- if .Values.telemetry.otlp.enabled }} + - otlp + {{- end }} +--- +# OpenTelemetry Collector Deployment +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "sim.fullname" . }}-otel-collector + namespace: {{ .Release.Namespace }} + labels: + {{- include "sim.labels" . 
| nindent 4 }} + app.kubernetes.io/component: telemetry +spec: + replicas: {{ .Values.telemetry.replicaCount }} + selector: + matchLabels: + {{- include "sim.selectorLabels" . | nindent 6 }} + app.kubernetes.io/component: telemetry + template: + metadata: + labels: + {{- include "sim.selectorLabels" . | nindent 8 }} + app.kubernetes.io/component: telemetry + spec: + {{- with .Values.global.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "sim.serviceAccountName" . }} + securityContext: + runAsNonRoot: true + runAsUser: 10001 + fsGroup: 10001 + containers: + - name: otel-collector + image: {{ include "sim.image" (dict "context" . "image" .Values.telemetry.image) }} + imagePullPolicy: {{ .Values.telemetry.image.pullPolicy }} + command: + - /otelcol-contrib + - --config=/etc/otel-collector-config/otel-config.yaml + ports: + - name: otlp-grpc + containerPort: 4317 + protocol: TCP + - name: otlp-http + containerPort: 4318 + protocol: TCP + - name: health + containerPort: 13133 + protocol: TCP + - name: pprof + containerPort: 1777 + protocol: TCP + - name: zpages + containerPort: 55679 + protocol: TCP + env: + - name: GOGC + value: "80" + volumeMounts: + - name: otel-config + mountPath: /etc/otel-collector-config + readOnly: true + livenessProbe: + httpGet: + path: / + port: health + initialDelaySeconds: 10 + periodSeconds: 30 + timeoutSeconds: 5 + failureThreshold: 3 + readinessProbe: + httpGet: + path: / + port: health + initialDelaySeconds: 5 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 3 + resources: + {{- toYaml .Values.telemetry.resources | nindent 12 }} + volumes: + - name: otel-config + configMap: + name: {{ include "sim.fullname" . }}-otel-config + {{- with .Values.telemetry.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.telemetry.tolerations }} + tolerations: + {{- toYaml . 
| nindent 8 }} + {{- end }} + {{- with .Values.telemetry.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} +--- +# OpenTelemetry Collector Service +apiVersion: v1 +kind: Service +metadata: + name: {{ include "sim.fullname" . }}-otel-collector + namespace: {{ .Release.Namespace }} + labels: + {{- include "sim.labels" . | nindent 4 }} + app.kubernetes.io/component: telemetry +spec: + type: {{ .Values.telemetry.service.type }} + ports: + - name: otlp-grpc + port: 4317 + targetPort: otlp-grpc + protocol: TCP + - name: otlp-http + port: 4318 + targetPort: otlp-http + protocol: TCP + - name: health + port: 13133 + targetPort: health + protocol: TCP + selector: + {{- include "sim.selectorLabels" . | nindent 4 }} + app.kubernetes.io/component: telemetry +{{- end }} \ No newline at end of file diff --git a/helm/sim/values.schema.json b/helm/sim/values.schema.json new file mode 100644 index 000000000..9f7ea1ac8 --- /dev/null +++ b/helm/sim/values.schema.json @@ -0,0 +1,556 @@ +{ + "$schema": "https://json-schema.org/draft-07/schema#", + "type": "object", + "required": ["app", "realtime"], + "properties": { + "global": { + "type": "object", + "properties": { + "imageRegistry": { + "type": "string", + "format": "hostname", + "description": "Global Docker image registry" + }, + "useRegistryForAllImages": { + "type": "boolean", + "description": "Use registry for all images, not just simstudioai/* images" + }, + "imagePullSecrets": { + "type": "array", + "items": { + "type": "object" + }, + "description": "Global image pull secrets" + }, + "storageClass": { + "type": "string", + "description": "Global storage class for PVCs" + } + } + }, + "app": { + "type": "object", + "required": ["enabled"], + "properties": { + "enabled": { + "type": "boolean", + "description": "Enable the main application" + }, + "replicaCount": { + "type": "integer", + "minimum": 1, + "description": "Number of app replicas" + }, + "resources": { + "type": "object", + "properties": { + 
"limits": { + "type": "object", + "properties": { + "memory": { + "type": "string", + "pattern": "^[0-9]+(Ki|Mi|Gi|Ti|Pi|Ei|m|k|M|G|T|P|E)?$", + "description": "Memory limit (e.g., 4Gi, 2048Mi)" + }, + "cpu": { + "type": "string", + "pattern": "^[0-9]+(\\.[0-9]+)?m?$", + "description": "CPU limit (e.g., 2000m, 2.0)" + }, + "nvidia.com/gpu": { + "type": "string", + "pattern": "^[0-9]+$", + "description": "GPU limit" + } + } + }, + "requests": { + "type": "object", + "properties": { + "memory": { + "type": "string", + "pattern": "^[0-9]+(Ki|Mi|Gi|Ti|Pi|Ei|m|k|M|G|T|P|E)?$", + "description": "Memory request (e.g., 2Gi, 1024Mi)" + }, + "cpu": { + "type": "string", + "pattern": "^[0-9]+(\\.[0-9]+)?m?$", + "description": "CPU request (e.g., 1000m, 1.0)" + } + } + } + } + }, + "env": { + "type": "object", + "properties": { + "BETTER_AUTH_SECRET": { + "type": "string", + "minLength": 32, + "description": "Auth secret (minimum 32 characters required)" + }, + "ENCRYPTION_KEY": { + "type": "string", + "minLength": 32, + "description": "Encryption key (minimum 32 characters required)" + }, + "NEXT_PUBLIC_APP_URL": { + "type": "string", + "format": "uri", + "description": "Public application URL" + }, + "BETTER_AUTH_URL": { + "type": "string", + "format": "uri", + "description": "Authentication service URL" + }, + "SOCKET_SERVER_URL": { + "type": "string", + "format": "uri", + "description": "Socket server URL" + }, + "NEXT_PUBLIC_SOCKET_URL": { + "type": "string", + "format": "uri", + "description": "Public socket URL" + } + } + } + } + }, + "realtime": { + "type": "object", + "required": ["enabled"], + "properties": { + "enabled": { + "type": "boolean", + "description": "Enable the realtime service" + }, + "replicaCount": { + "type": "integer", + "minimum": 1, + "description": "Number of realtime replicas" + }, + "resources": { + "type": "object", + "properties": { + "limits": { + "type": "object", + "properties": { + "memory": { + "type": "string", + "pattern": 
"^[0-9]+(Ki|Mi|Gi|Ti|Pi|Ei|m|k|M|G|T|P|E)?$", + "description": "Memory limit (e.g., 2Gi, 1024Mi)" + }, + "cpu": { + "type": "string", + "pattern": "^[0-9]+(\\.[0-9]+)?m?$", + "description": "CPU limit (e.g., 1000m, 1.0)" + } + } + }, + "requests": { + "type": "object", + "properties": { + "memory": { + "type": "string", + "pattern": "^[0-9]+(Ki|Mi|Gi|Ti|Pi|Ei|m|k|M|G|T|P|E)?$", + "description": "Memory request (e.g., 1Gi, 512Mi)" + }, + "cpu": { + "type": "string", + "pattern": "^[0-9]+(\\.[0-9]+)?m?$", + "description": "CPU request (e.g., 500m, 0.5)" + } + } + } + } + }, + "env": { + "type": "object", + "properties": { + "BETTER_AUTH_SECRET": { + "type": "string", + "minLength": 32, + "description": "Auth secret (minimum 32 characters required)" + }, + "NEXT_PUBLIC_APP_URL": { + "type": "string", + "format": "uri", + "description": "Public application URL" + }, + "BETTER_AUTH_URL": { + "type": "string", + "format": "uri", + "description": "Authentication service URL" + }, + "NEXT_PUBLIC_SOCKET_URL": { + "type": "string", + "format": "uri", + "description": "Public socket URL" + }, + "ALLOWED_ORIGINS": { + "type": "string", + "description": "CORS allowed origins" + } + } + } + } + }, + "postgresql": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean", + "description": "Enable internal PostgreSQL" + }, + "resources": { + "type": "object", + "properties": { + "limits": { + "type": "object", + "properties": { + "memory": { + "type": "string", + "pattern": "^[0-9]+(Ki|Mi|Gi|Ti|Pi|Ei|m|k|M|G|T|P|E)?$", + "description": "Memory limit (e.g., 2Gi, 1024Mi)" + }, + "cpu": { + "type": "string", + "pattern": "^[0-9]+(\\.[0-9]+)?m?$", + "description": "CPU limit" + } + } + }, + "requests": { + "type": "object", + "properties": { + "memory": { + "type": "string", + "pattern": "^[0-9]+(Ki|Mi|Gi|Ti|Pi|Ei|m|k|M|G|T|P|E)?$", + "description": "Memory request (e.g., 1Gi, 512Mi)" + }, + "cpu": { + "type": "string", + "pattern": "^[0-9]+(\\.[0-9]+)?m?$", + 
"description": "CPU request (e.g., 500m, 0.5)" + } + } + } + } + }, + "persistence": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean", + "description": "Enable persistent storage" + }, + "size": { + "type": "string", + "pattern": "^[0-9]+(Ki|Mi|Gi|Ti|Pi|Ei)$", + "description": "Storage size (e.g., 10Gi, 50Gi)" + } + } + }, + "auth": { + "type": "object", + "properties": { + "username": { + "type": "string", + "minLength": 1, + "description": "PostgreSQL username" + }, + "database": { + "type": "string", + "minLength": 1, + "description": "PostgreSQL database name" + }, + "password": { + "type": "string", + "minLength": 8, + "not": { + "const": "CHANGE-ME-SECURE-PASSWORD" + }, + "description": "PostgreSQL password (minimum 8 characters, must not be default placeholder)" + } + } + } + } + }, + "externalDatabase": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean", + "description": "Use external database" + }, + "host": { + "type": "string", + "format": "hostname", + "description": "External database host" + }, + "port": { + "type": "integer", + "minimum": 1, + "maximum": 65535, + "description": "External database port" + }, + "username": { + "type": "string", + "description": "External database username" + }, + "password": { + "type": "string", + "description": "External database password" + }, + "database": { + "type": "string", + "description": "External database name" + }, + "sslMode": { + "type": "string", + "enum": ["disable", "allow", "prefer", "require", "verify-ca", "verify-full"], + "description": "SSL mode for database connection" + } + }, + "if": { + "properties": { + "enabled": { + "const": true + } + } + }, + "then": { + "required": ["host", "username", "password", "database"] + } + }, + "autoscaling": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean", + "description": "Enable autoscaling" + }, + "minReplicas": { + "type": "integer", + "minimum": 1, + "description": "Minimum 
replicas" + }, + "maxReplicas": { + "type": "integer", + "minimum": 1, + "description": "Maximum replicas" + } + }, + "if": { + "properties": { + "enabled": { + "const": true + } + } + }, + "then": { + "required": ["minReplicas", "maxReplicas"] + } + }, + "ollama": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean", + "description": "Enable Ollama service" + }, + "replicaCount": { + "type": "integer", + "minimum": 1, + "description": "Number of Ollama replicas" + }, + "resources": { + "type": "object", + "properties": { + "limits": { + "type": "object", + "properties": { + "memory": { + "type": "string", + "pattern": "^[0-9]+(Ki|Mi|Gi|Ti|Pi|Ei|m|k|M|G|T|P|E)?$", + "description": "Memory limit (e.g., 8Gi, 4096Mi)" + }, + "cpu": { + "type": "string", + "pattern": "^[0-9]+(\\.[0-9]+)?m?$", + "description": "CPU limit" + }, + "nvidia.com/gpu": { + "type": "string", + "pattern": "^[0-9]+$", + "description": "GPU limit" + } + } + }, + "requests": { + "type": "object", + "properties": { + "memory": { + "type": "string", + "pattern": "^[0-9]+(Ki|Mi|Gi|Ti|Pi|Ei|m|k|M|G|T|P|E)?$", + "description": "Memory request (e.g., 4Gi, 2048Mi)" + }, + "cpu": { + "type": "string", + "pattern": "^[0-9]+(\\.[0-9]+)?m?$", + "description": "CPU request (e.g., 1000m, 1.0)" + } + } + } + } + }, + "persistence": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean", + "description": "Enable persistent storage" + }, + "size": { + "type": "string", + "pattern": "^[0-9]+(Ki|Mi|Gi|Ti|Pi|Ei)$", + "description": "Storage size (e.g., 100Gi, 500Gi)" + } + } + }, + "gpu": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean", + "description": "Enable GPU support" + }, + "count": { + "type": "integer", + "minimum": 1, + "description": "Number of GPUs" + } + } + } + } + }, + "telemetry": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean", + "description": "Enable telemetry collection" + }, + "replicaCount": { + 
"type": "integer", + "minimum": 1, + "description": "Number of telemetry collector replicas" + } + } + }, + "sharedStorage": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean", + "description": "Enable shared storage" + }, + "volumes": { + "type": "array", + "items": { + "type": "object", + "required": ["name", "size"], + "properties": { + "name": { + "type": "string", + "minLength": 1, + "description": "Volume name" + }, + "size": { + "type": "string", + "pattern": "^[0-9]+[KMGT]i$", + "description": "Volume size (e.g., 100Gi)" + } + } + } + } + } + }, + "ingress": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean", + "description": "Enable ingress" + }, + "className": { + "type": "string", + "description": "Ingress class name" + }, + "app": { + "type": "object", + "properties": { + "host": { + "type": "string", + "format": "hostname", + "description": "Main application hostname" + } + } + }, + "realtime": { + "type": "object", + "properties": { + "host": { + "type": "string", + "format": "hostname", + "description": "Realtime service hostname" + } + } + }, + "tls": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean", + "description": "Enable TLS" + }, + "secretName": { + "type": "string", + "minLength": 1, + "description": "TLS secret name" + } + } + } + } + } + }, + "allOf": [ + { + "if": { + "properties": { + "postgresql": { + "properties": { + "enabled": { + "const": false + } + } + } + } + }, + "then": { + "properties": { + "externalDatabase": { + "properties": { + "enabled": { + "const": true + } + }, + "required": ["enabled"] + } + } + } + } + ] +} diff --git a/helm/sim/values.yaml b/helm/sim/values.yaml new file mode 100644 index 000000000..026b3ac87 --- /dev/null +++ b/helm/sim/values.yaml @@ -0,0 +1,594 @@ +# Global configuration +global: + # Image registry and pull policy + imageRegistry: "ghcr.io" + # Use registry for all images, not just simstudioai/* images + 
useRegistryForAllImages: false + imagePullSecrets: [] + + # Common labels applied to all resources + commonLabels: {} + + # Storage class for persistent volumes + storageClass: "" + +# Main Sim application configuration +app: + # Enable/disable the main application + enabled: true + + # Image configuration + image: + repository: simstudioai/simstudio + tag: latest + pullPolicy: Always + + # Number of replicas + replicaCount: 1 + + # Resource limits and requests + resources: + limits: + memory: "4Gi" + cpu: "2000m" + requests: + memory: "2Gi" + cpu: "1000m" + + # Node selector for pod scheduling (leave empty to allow scheduling on any node) + nodeSelector: {} + + # Pod security context + podSecurityContext: + fsGroup: 1001 + + # Container security context + securityContext: + runAsNonRoot: true + runAsUser: 1001 + + # Environment variables + env: + # Application URLs + NEXT_PUBLIC_APP_URL: "http://localhost:3000" + BETTER_AUTH_URL: "http://localhost:3000" + SOCKET_SERVER_URL: "http://localhost:3002" + NEXT_PUBLIC_SOCKET_URL: "http://localhost:3002" + + # Node environment + NODE_ENV: "production" + NEXT_TELEMETRY_DISABLED: "1" + + # Authentication and encryption secrets (REQUIRED for production) + # Generate secure secrets (32 random bytes, i.e. 64 hex characters) using: openssl rand -hex 32 + BETTER_AUTH_SECRET: "" # REQUIRED - set via --set flag or external secret manager + ENCRYPTION_KEY: "" # REQUIRED - set via --set flag or external secret manager + + # Optional third-party service integrations (leave empty if not using) + FREESTYLE_API_KEY: "" + GOOGLE_CLIENT_ID: "" + GOOGLE_CLIENT_SECRET: "" + GITHUB_CLIENT_ID: "" + GITHUB_CLIENT_SECRET: "" + RESEND_API_KEY: "" + + # Service configuration + service: + type: ClusterIP + port: 3000 + targetPort: 3000 + + # Health checks + livenessProbe: + httpGet: + path: / + port: 3000 + initialDelaySeconds: 10 + periodSeconds: 90 + timeoutSeconds: 5 + failureThreshold: 3 + + readinessProbe: + httpGet: + path: / + port: 3000 + initialDelaySeconds: 10 +
periodSeconds: 90 + timeoutSeconds: 5 + failureThreshold: 3 + +# Realtime socket server configuration +realtime: + # Enable/disable the realtime service + enabled: true + + # Image configuration + image: + repository: simstudioai/realtime + tag: latest + pullPolicy: Always + + # Number of replicas + replicaCount: 1 + + # Resource limits and requests + resources: + limits: + memory: "2Gi" + cpu: "1000m" + requests: + memory: "1Gi" + cpu: "500m" + + # Node selector for pod scheduling (leave empty to allow scheduling on any node) + nodeSelector: {} + + # Pod security context + podSecurityContext: + fsGroup: 1001 + + # Container security context + securityContext: + runAsNonRoot: true + runAsUser: 1001 + + # Environment variables + env: + # Application URLs + NEXT_PUBLIC_APP_URL: "http://localhost:3000" + BETTER_AUTH_URL: "http://localhost:3000" + NEXT_PUBLIC_SOCKET_URL: "http://localhost:3002" + + # Authentication secret (REQUIRED for production) + # Must match the BETTER_AUTH_SECRET value from the main app configuration + BETTER_AUTH_SECRET: "" # REQUIRED - set via --set flag or external secret manager + + # Cross-Origin Resource Sharing (CORS) allowed origins + ALLOWED_ORIGINS: "http://localhost:3000" + + # Node environment + NODE_ENV: "production" + + # Service configuration + service: + type: ClusterIP + port: 3002 + targetPort: 3002 + + # Health checks + livenessProbe: + httpGet: + path: /health + port: 3002 + initialDelaySeconds: 10 + periodSeconds: 90 + timeoutSeconds: 5 + failureThreshold: 3 + + readinessProbe: + httpGet: + path: /health + port: 3002 + initialDelaySeconds: 10 + periodSeconds: 90 + timeoutSeconds: 5 + failureThreshold: 3 + +# Database migrations job configuration +migrations: + # Enable/disable migrations job + enabled: true + + # Image configuration + image: + repository: simstudioai/migrations + tag: latest + pullPolicy: Always + + # Resource limits and requests + resources: + limits: + memory: "1Gi" + requests: + memory: "512Mi" + cpu: 
"100m" + + # Pod security context + podSecurityContext: + fsGroup: 1001 + + # Container security context + securityContext: + runAsNonRoot: true + runAsUser: 1001 + +# PostgreSQL database configuration +postgresql: + # Enable/disable internal PostgreSQL deployment + enabled: true + + # Image configuration + image: + repository: pgvector/pgvector + tag: pg17 + pullPolicy: IfNotPresent + + # Authentication configuration + auth: + username: postgres + password: "" # REQUIRED - set via --set flag or external secret manager + database: sim + + # Node selector for database pod scheduling (leave empty to allow scheduling on any node) + nodeSelector: {} + + # Resource limits and requests + resources: + limits: + memory: "2Gi" + requests: + memory: "1Gi" + cpu: "500m" + + # Pod security context + podSecurityContext: + fsGroup: 999 + + # Container security context + securityContext: + runAsUser: 999 + + # Persistence configuration + persistence: + enabled: true + storageClass: "" + size: 10Gi + accessModes: + - ReadWriteOnce + + # SSL/TLS configuration (enable for production deployments with certificates) + tls: + enabled: false + certificatesSecret: postgres-tls-secret + + # PostgreSQL configuration + config: + maxConnections: 1000 + sharedBuffers: "1280MB" + maxWalSize: "4GB" + minWalSize: "80MB" + + # Service configuration + service: + type: ClusterIP + port: 5432 + targetPort: 5432 + + # Health checks + livenessProbe: + exec: + command: ["pg_isready", "-U", "postgres", "-d", "sim"] + initialDelaySeconds: 10 + periodSeconds: 5 + + readinessProbe: + exec: + command: ["pg_isready", "-U", "postgres", "-d", "sim"] + initialDelaySeconds: 5 + periodSeconds: 3 + +# External database configuration (use when connecting to managed database services) +externalDatabase: + # Enable to use an external database instead of the internal PostgreSQL instance + enabled: false + + # Database connection details + host: "external-db.example.com" + port: 5432 + username: postgres + password: "" 
+ database: sim + + # SSL configuration + sslMode: require + +# Ollama local AI models configuration +ollama: + # Enable/disable Ollama deployment + enabled: false + + # Image configuration + image: + repository: ollama/ollama + tag: latest + pullPolicy: Always + + # Number of replicas + replicaCount: 1 + + # GPU configuration + gpu: + enabled: false + count: 1 + + # Node selector for GPU workloads (adjust labels based on your cluster configuration) + nodeSelector: + accelerator: nvidia + + # Tolerations for GPU nodes (adjust based on your cluster's GPU node taints) + tolerations: + - key: "sku" + operator: "Equal" + value: "gpu" + effect: "NoSchedule" + + # Resource limits and requests + resources: + limits: + memory: "8Gi" + nvidia.com/gpu: "1" + requests: + memory: "4Gi" + cpu: "1000m" + + # Environment variables + env: + NVIDIA_DRIVER_CAPABILITIES: "all" + OLLAMA_LOAD_TIMEOUT: "-1" + OLLAMA_KEEP_ALIVE: "-1" + OLLAMA_DEBUG: "1" + + # Persistence configuration + persistence: + enabled: true + storageClass: "" + size: 100Gi + accessModes: + - ReadWriteOnce + + # Service configuration + service: + type: ClusterIP + port: 11434 + targetPort: 11434 + + # Health checks + startupProbe: + httpGet: + path: / + port: 11434 + initialDelaySeconds: 10 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 10 + + livenessProbe: + httpGet: + path: / + port: 11434 + initialDelaySeconds: 60 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 5 + + readinessProbe: + httpGet: + path: / + port: 11434 + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 3 + +# Ingress configuration +ingress: + # Enable/disable ingress + enabled: false + + # Ingress class name + className: nginx + + # Annotations + annotations: + nginx.ingress.kubernetes.io/force-ssl-redirect: "true" + + # Main application host configuration + app: + host: sim.local + paths: + - path: / + pathType: Prefix + + # Realtime service host configuration + realtime: + host: 
sim-ws.local + paths: + - path: / + pathType: Prefix + + # TLS configuration + tls: + enabled: false + secretName: sim-tls-secret + +# Service Account configuration +serviceAccount: + # Specifies whether a service account should be created + create: true + + # Annotations to add to the service account + annotations: {} + + # The name of the service account to use + name: "" + +# Horizontal Pod Autoscaler +autoscaling: + enabled: false + minReplicas: 1 + maxReplicas: 10 + targetCPUUtilizationPercentage: 80 + targetMemoryUtilizationPercentage: 80 + # Custom metrics for scaling (advanced users can add custom metrics here) + customMetrics: [] + # Scaling behavior configuration (customize scale-up/down policies) + # Example configuration: + # behavior: + # scaleDown: + # stabilizationWindowSeconds: 300 + # policies: + # - type: Percent + # value: 50 + # periodSeconds: 60 + # scaleUp: + # stabilizationWindowSeconds: 60 + # policies: + # - type: Percent + # value: 100 + # periodSeconds: 15 + # - type: Pods + # value: 2 + # periodSeconds: 60 + behavior: {} + +# Pod disruption budget +podDisruptionBudget: + enabled: false + minAvailable: 1 + +# Monitoring configuration +monitoring: + # ServiceMonitor for Prometheus + serviceMonitor: + enabled: false + # Additional labels for ServiceMonitor + labels: {} + # Additional annotations for ServiceMonitor + annotations: {} + # Metrics path + path: /metrics + # Scrape interval + interval: 30s + # Scrape timeout + scrapeTimeout: 10s + # Target labels to be added to scraped metrics + targetLabels: [] + # Metric relabeling configurations + metricRelabelings: [] + # Relabeling configurations + relabelings: [] + +# Network policies +networkPolicy: + enabled: false + + # Custom ingress rules + ingress: [] + + # Custom egress rules + egress: [] + +# Shared storage for enterprise workflows requiring data sharing between pods +sharedStorage: + enabled: false + # Storage class for shared volumes (must support ReadWriteMany access) + 
storageClass: "" + # Default access modes for shared volumes (ReadWriteMany required for multi-pod access) + defaultAccessModes: + - ReadWriteMany + # Define shared volumes for your workflows (uncomment and customize as needed) + # Example volume configurations: + # volumes: + # - name: output-share + # size: 100Gi + # accessModes: + # - ReadWriteMany + # annotations: {} + # - name: rawdata-share + # size: 500Gi + # accessModes: + # - ReadWriteMany + # - name: model-share + # size: 200Gi + # accessModes: + # - ReadWriteMany + # - name: logs-share + # size: 50Gi + # accessModes: + # - ReadWriteMany + volumes: [] + +# Additional volumes for custom configurations (advanced users) +extraVolumes: [] +extraVolumeMounts: [] + +# Additional environment variables for custom integrations +extraEnvVars: [] + +# Pod annotations for custom metadata +podAnnotations: {} + +# Pod labels for custom labeling +podLabels: {} + +# Affinity settings for advanced pod scheduling +affinity: {} + +# Tolerations for scheduling on tainted nodes +tolerations: [] + +# Observability and telemetry configuration +telemetry: + # Enable/disable telemetry collection + enabled: false + + # OpenTelemetry Collector image + image: + repository: otel/opentelemetry-collector-contrib + tag: 0.91.0 + pullPolicy: IfNotPresent + + # Number of collector replicas + replicaCount: 1 + + # Resource limits and requests + resources: + limits: + memory: "512Mi" + cpu: "500m" + requests: + memory: "256Mi" + cpu: "100m" + + # Node selector for telemetry pod scheduling (leave empty to allow scheduling on any node) + nodeSelector: {} + + # Tolerations for telemetry workloads + tolerations: [] + + # Affinity for telemetry workloads + affinity: {} + + # Service configuration + service: + type: ClusterIP + + # Jaeger tracing backend + jaeger: + enabled: false + endpoint: "http://jaeger-collector:14250" + tls: + enabled: false + + # Prometheus metrics backend + prometheus: + enabled: false + endpoint: 
"http://prometheus-server/api/v1/write" + auth: "" + + # Generic OTLP backend + otlp: + enabled: false + endpoint: "http://otlp-collector:4317" + tls: + enabled: false \ No newline at end of file diff --git a/package.json b/package.json index 1c9a1074b..e2948154c 100644 --- a/package.json +++ b/package.json @@ -18,6 +18,8 @@ "format:check": "bunx biome format .", "lint": "bunx biome check --write --unsafe .", "lint:check": "bunx biome check --unsafe .", + "lint:helm": "helm lint ./helm/sim --strict --values ./helm/sim/test/values-lint.yaml", + "lint:all": "bun run lint && bun run lint:helm", "check": "bunx biome check --files-ignore-unknown=true", "prepare": "bun husky", "prebuild": "bun run lint:check",