diff --git a/k8s/monitoring/alertmanager-templates.config.yaml b/k8s/monitoring/alertmanager-templates.config.yaml new file mode 100644 index 0000000000..ea082230d1 --- /dev/null +++ b/k8s/monitoring/alertmanager-templates.config.yaml @@ -0,0 +1,177 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + creationTimestamp: null + name: alertmanager-templates + namespace: istio-system +data: + default.tmpl: | + {{ define "__alertmanager" }}AlertManager{{ end }} + {{ define "__alertmanagerURL" }}{{ .ExternalURL }}/#/alerts?receiver={{ .Receiver }}{{ end }} + {{ define "__subject" }}[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .GroupLabels.SortedPairs.Values | join " " }} {{ if gt (len .CommonLabels) (len .GroupLabels) }}({{ with .CommonLabels.Remove .GroupLabels.Names }}{{ .Values | join " " }}{{ end }}){{ end }}{{ end }} + {{ define "__description" }}{{ end }} + {{ define "__text_alert_list" }}{{ range . }}Labels: + {{ range .Labels.SortedPairs }} - {{ .Name }} = {{ .Value }} + {{ end }}Annotations: + {{ range .Annotations.SortedPairs }} - {{ .Name }} = {{ .Value }} + {{ end }}Source: {{ .GeneratorURL }} + {{ end }}{{ end }} + {{ define "slack.default.title" }}{{ template "__subject" . }}{{ end }} + {{ define "slack.default.username" }}{{ template "__alertmanager" . }}{{ end }} + {{ define "slack.default.fallback" }}{{ template "slack.default.title" . }} | {{ template "slack.default.titlelink" . }}{{ end }} + {{ define "slack.default.pretext" }}{{ end }} + {{ define "slack.default.titlelink" }}{{ template "__alertmanagerURL" . }}{{ end }} + {{ define "slack.default.iconemoji" }}{{ end }} + {{ define "slack.default.iconurl" }}{{ end }} + {{ define "slack.default.text" }}{{ end }} + {{ define "hipchat.default.from" }}{{ template "__alertmanager" . }}{{ end }} + {{ define "hipchat.default.message" }}{{ template "__subject" . }}{{ end }} + {{ define "pagerduty.default.description" }}{{ template "__subject" . 
}}{{ end }} + {{ define "pagerduty.default.client" }}{{ template "__alertmanager" . }}{{ end }} + {{ define "pagerduty.default.clientURL" }}{{ template "__alertmanagerURL" . }}{{ end }} + {{ define "pagerduty.default.instances" }}{{ template "__text_alert_list" . }}{{ end }} + {{ define "opsgenie.default.message" }}{{ template "__subject" . }}{{ end }} + {{ define "opsgenie.default.description" }}{{ .CommonAnnotations.SortedPairs.Values | join " " }} + {{ if gt (len .Alerts.Firing) 0 -}} + Alerts Firing: + {{ template "__text_alert_list" .Alerts.Firing }} + {{- end }} + {{ if gt (len .Alerts.Resolved) 0 -}} + Alerts Resolved: + {{ template "__text_alert_list" .Alerts.Resolved }} + {{- end }} + {{- end }} + {{ define "opsgenie.default.source" }}{{ template "__alertmanagerURL" . }}{{ end }} + {{ define "victorops.default.message" }}{{ template "__subject" . }} | {{ template "__alertmanagerURL" . }}{{ end }} + {{ define "victorops.default.from" }}{{ template "__alertmanager" . }}{{ end }} + {{ define "email.default.subject" }}{{ template "__subject" . }}{{ end }} + {{ define "email.default.html" }} + + + + + + + {{ template "__subject" . }} + + + + + + + + +
+
+ + + + + + + +
+ {{ .Alerts | len }} alert{{ if gt (len .Alerts) 1 }}s{{ end }} for {{ range .GroupLabels.SortedPairs }} + {{ .Name }}={{ .Value }} + {{ end }} +
+ + + + + {{ if gt (len .Alerts.Firing) 0 }} + + + + {{ end }} + {{ range .Alerts.Firing }} + + + + {{ end }} + {{ if gt (len .Alerts.Resolved) 0 }} + {{ if gt (len .Alerts.Firing) 0 }} + + + + {{ end }} + + + + {{ end }} + {{ range .Alerts.Resolved }} + + + + {{ end }} +
+ View in {{ template "__alertmanager" . }} +
+ [{{ .Alerts.Firing | len }}] Firing +
+ Labels
+ {{ range .Labels.SortedPairs }}{{ .Name }} = {{ .Value }}
{{ end }} + {{ if gt (len .Annotations) 0 }}Annotations
{{ end }} + {{ range .Annotations.SortedPairs }}{{ .Name }} = {{ .Value }}
{{ end }} + Source
+
+
+
+
+
+ [{{ .Alerts.Resolved | len }}] Resolved +
+ Labels
+ {{ range .Labels.SortedPairs }}{{ .Name }} = {{ .Value }}
{{ end }} + {{ if gt (len .Annotations) 0 }}Annotations
{{ end }} + {{ range .Annotations.SortedPairs }}{{ .Name }} = {{ .Value }}
{{ end }} + Source
+
+
+
+
+ + + {{ end }} + {{ define "pushover.default.title" }}{{ template "__subject" . }}{{ end }} + {{ define "pushover.default.message" }}{{ .CommonAnnotations.SortedPairs.Values | join " " }} + {{ if gt (len .Alerts.Firing) 0 }} + Alerts Firing: + {{ template "__text_alert_list" .Alerts.Firing }} + {{ end }} + {{ if gt (len .Alerts.Resolved) 0 }} + Alerts Resolved: + {{ template "__text_alert_list" .Alerts.Resolved }} + {{ end }} + {{ end }} + {{ define "pushover.default.url" }}{{ template "__alertmanagerURL" . }}{{ end }} + slack.tmpl: | + {{ define "slack.devops.text" }} + {{range .Alerts}}{{.Annotations.DESCRIPTION}} + {{end}} + {{ end }} diff --git a/k8s/monitoring/alertmanager.config.yaml b/k8s/monitoring/alertmanager.config.yaml new file mode 100644 index 0000000000..a523cae438 --- /dev/null +++ b/k8s/monitoring/alertmanager.config.yaml @@ -0,0 +1,20 @@ +kind: ConfigMap +apiVersion: v1 +metadata: + name: alertmanager-config + namespace: istio-system +data: + config.yml: |- + global: + # Glob must match the templates-volume mountPath in the alertmanager Deployment (/etc/alertmanager-templates); /etc/alertmanager only holds config.yml. + templates: + - '/etc/alertmanager-templates/*.tmpl' + route: + receiver: discord + group_by: ['alertname', 'priority'] + group_wait: 10s + repeat_interval: 30m + receivers: + - name: discord + webhook_configs: + - url: http://alertmanager-discord:9094 diff --git a/k8s/monitoring/alertmanager.deploy.yaml b/k8s/monitoring/alertmanager.deploy.yaml new file mode 100644 index 0000000000..cc250d5917 --- /dev/null +++ b/k8s/monitoring/alertmanager.deploy.yaml @@ -0,0 +1,73 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: alertmanager + namespace: istio-system +spec: + replicas: 1 + selector: + matchLabels: + app: alertmanager + template: + metadata: + name: alertmanager + labels: + app: alertmanager + spec: + priorityClassName: monitoring-priority + containers: + - name: alertmanager + image: prom/alertmanager:latest + args: + - "--config.file=/etc/alertmanager/config.yml" + - "--storage.path=/alertmanager" + - "--log.level=debug" + ports: + - name: alertmanager + containerPort: 9093 + 
volumeMounts: + - name: config-volume + mountPath: /etc/alertmanager + - name: templates-volume + mountPath: /etc/alertmanager-templates + - name: alertmanager + mountPath: /alertmanager + volumes: + - name: config-volume + configMap: + name: alertmanager-config + - name: templates-volume + configMap: + name: alertmanager-templates + - name: alertmanager + emptyDir: {} +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: alertmanager-discord + namespace: istio-system +spec: + replicas: 1 + selector: + matchLabels: + app: alertmanager-discord + template: + metadata: + name: alertmanager-discord + labels: + app: alertmanager-discord + spec: + priorityClassName: monitoring-priority + containers: + - name: alertmanager-discord + image: gcr.io/prysmaticlabs/alertmanager-discord:latest + ports: + - name: webhook + containerPort: 9094 + env: + - name: DISCORD_WEBHOOK + valueFrom: + secretKeyRef: + name: discord-webhook + key: url diff --git a/k8s/monitoring/alertmanager.service.yaml b/k8s/monitoring/alertmanager.service.yaml new file mode 100644 index 0000000000..a7d4b959ff --- /dev/null +++ b/k8s/monitoring/alertmanager.service.yaml @@ -0,0 +1,30 @@ +apiVersion: v1 +kind: Service +metadata: + name: alertmanager + namespace: istio-system + annotations: + # Scrape target must be the port the alertmanager container actually exposes (9093); Alertmanager serves its metrics at /metrics. + prometheus.io/scrape: 'true' + prometheus.io/path: /metrics + prometheus.io/port: '9093' +spec: + selector: + app: alertmanager + type: ClusterIP + ports: + - port: 9093 + targetPort: 9093 +--- +apiVersion: v1 +kind: Service +metadata: + name: alertmanager-discord + namespace: istio-system +spec: + selector: + app: alertmanager-discord + type: ClusterIP + ports: + - port: 9094 + targetPort: 9094 diff --git a/k8s/monitoring/alerts.yaml b/k8s/monitoring/alerts.yaml new file mode 100644 index 0000000000..64f2ee2a8e --- /dev/null +++ b/k8s/monitoring/alerts.yaml @@ -0,0 +1,30 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: prometheus-alerts + namespace: istio-system +data: + prometheus.rules.yml: |- + groups: + - 
name: prysm + rules: + - alert: stalled_chain + expr: delta(avg(state_last_slot > scalar(max(state_last_slot) - 100))[10m:30s]) < 50 + for: 1m + annotations: + summary: No block slots advanced in 2 minutes + - alert: too_long_since_finality + expr: max(state_last_slot / 8) - floor(max(state_last_finalized_epoch)) > 10 + for: 1m + annotations: + summary: No finality in 10 epochs + - alert: high_reorg_rate + expr: max(delta(reorg_counter[10m])) > 5 + for: 1m + annotations: + summary: Some nodes are seeing more than 5 reorgs in 10 minutes + - alert: high_goroutines + expr: max_over_time(go_goroutines{component="beacon-chain"}[1m]) > 1000 + for: 1m + annotations: + summary: Some nodes are experiencing more than 1000 goroutines diff --git a/k8s/monitoring/discord-webhook.encrypted_sercret.yaml b/k8s/monitoring/discord-webhook.encrypted_sercret.yaml new file mode 100644 index 0000000000..64a9d5d8c4 --- /dev/null +++ b/k8s/monitoring/discord-webhook.encrypted_sercret.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +data: + url: 3Bh4HrgcSs/QdGDVnvg31lEggQELhDEycmTNAoa5WP7gb7UPd0XC20uPZXftWF37eVX2CF4wWkXn7ge4/E+Ut6Fo3K/nLYQOc5BBGiZQZcgVGnYn5adZ+8SLCXQTTwBaCBhLUmxwB8IQYA9icnJg/ZCgXKjq2pgRnkIEn58Y5kw2lIxlbQ1drFKGha1YCBHCbpJZ+dWHsryGoh7S2d0vDQVBu0AW3L8/cM1O5rIv7OcWkV1QfEY5P4xiAgCtz+uS.yYUhj5ra/rofJcNd.j2DDF/zrMlD0efGqYKhfkg== +kind: Secret +metadata: + name: discord-webhook + namespace: istio-system +type: Opaque +# kubesec:v:3 +# kubesec:gcp:projects/prysmaticlabs/locations/global/keyRings/prysmatic-k8s-secrets/cryptoKeys/testkey:CiQAaKPz55imKn09+ay5Fipt8Ejsa0fl9RAiDUwIB8QjWJniNfESSQD23sIsO81pJ6gDAzc7733PGECLQ+ftcvluf41iLs5GUBBHrk6ziqmtDmfiTzc9E2YDuXYwAU4EejVjXoIHMESywm9EvetOEt4= +# kubesec:mac:JaD9nI5ZuJQNiB9L.5gAZrH4oEyQNTUmUHmSjRQ==