Add eth1 balance monitoring alert (#2575)

* Add eth1 balance monitoring

* lint

* lint

* priority

* lint

* use value in alerts

* fix beacon-chain service

* working on stability

* more yaml

* add more alerts to the finality alerts

* add another header to ignore

* extend requirement time for low balance

* remove old flag

* remove extra flag

* feedback to use consistent flag
This commit is contained in:
Preston Van Loon
2019-05-19 10:52:17 -04:00
committed by Raul Jordan
parent 632f6797cd
commit 40588021d4
10 changed files with 399 additions and 9 deletions

View File

@@ -56,7 +56,7 @@ spec:
- --p2p-port=5000
#- --enable-tracing
- --tracing-process-name=$(POD_NAME)
- --tracing-endpoint=http://jaeger-collector.istio-system.svc.cluster.local:14268
- --tracing-endpoint=http://jaeger-collector.istio-system.svc.cluster.local:14268/api/traces
- --trace-sample-fraction=1.0
- --datadir=/data
- --p2p-max-peers=50

View File

@@ -9,7 +9,7 @@ spec:
servers:
- port:
number: 30002
name: grpc-beacon-chain
name: grpc-beacon-chain
protocol: GRPC
hosts:
- beacon.prylabs.net
@@ -52,6 +52,25 @@ spec:
port:
number: 4000
host: beacon-chain.beacon-chain.svc.cluster.local
---
kind: VirtualService
apiVersion: networking.istio.io/v1alpha3
metadata:
name: beacon-chain
namespace: beacon-chain
spec:
hosts:
- beacon-chain.beacon-chain.svc.cluster.local
gateways:
- mesh
http:
- match:
- port: 4000
route:
- destination:
port:
number: 4000
host: beacon-chain.beacon-chain.svc.cluster.local
---
apiVersion: networking.istio.io/v1alpha3
kind: DestinationRule

View File

@@ -4,7 +4,7 @@ metadata:
name: validator
namespace: beacon-chain
spec:
replicas: 8
replicas: 9
selector:
matchLabels:
component: validator
@@ -18,6 +18,9 @@ spec:
metadata:
labels:
component: validator
annotations:
prometheus.io/scrape: 'true'
prometheus.io/port: '9090'
spec:
priorityClassName: production-priority
containers:
@@ -27,10 +30,11 @@ spec:
- --keystore-path=/keystore
- --password=nopass
- --datadir=/data
- --beacon-rpc-provider=beacon-chain:4000
- --beacon-rpc-provider=beacon-chain.beacon-chain.svc.cluster.local:4000
- --monitoring-port=9090
- --enable-tracing
- --tracing-process-name=$(POD_NAME)
- --tracing-endpoint=http://jaeger-collector.istio-system.svc.cluster.local:14268
- --tracing-endpoint=http://jaeger-collector.istio-system.svc.cluster.local:14268/api/traces
- --trace-sample-fraction=1.0
- --log-format=fluentd
- --disable-rewards-penalties-logging

56
k8s/geth/eth1monitor.yaml Normal file
View File

@@ -0,0 +1,56 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: eth1monitor
namespace: pow
spec:
selector:
matchLabels:
app: eth1monitor
replicas: 1
template:
metadata:
labels:
app: eth1monitor
annotations:
prometheus.io/scrape: 'true'
prometheus.io/port: '9090'
spec:
priorityClassName: monitoring-priority
containers:
- name: monitor
image: gcr.io/prysmaticlabs/eth1monitor:latest
args:
- --port=9090
- --addresses=/etc/config/addresses.txt
- --web3-provider=http://public-rpc-nodes.pow.svc.cluster.local:8545
volumeMounts:
- name: addresses
mountPath: /etc/config
readOnly: true
- name: configmap-reload
image: jimmidyson/configmap-reload:v0.2.2
args:
- --volume-dir=/etc/config
- --webhook-url=http://localhost:9090/reload
volumeMounts:
- name: addresses
mountPath: /etc/config
readOnly: true
volumes:
- name: addresses
configMap:
name: eth1-addresses
---
apiVersion: v1
kind: ConfigMap
metadata:
name: eth1-addresses
namespace: pow
data:
addresses.txt: |
faucet:0xae1f3f522cfb1b0ed128819e8e72fda207c47e5e
pk-manager:0x1bcd2c7dd8d5ffd8b789d90c71ac9aab55e51d5d
prylabs:0xd9a5179f091d85051d3c982785efd1455cec8699

View File

@@ -13,18 +13,52 @@ data:
for: 1m
annotations:
summary: No block slots advanced in 2 minutes
- alert: too_long_since_finality
- alert: too_long_since_finality_10
expr: max(state_last_slot / 8) - floor(max(state_last_finalized_epoch)) > 10
for: 1m
annotations:
summary: No finality in 10 epochs
- alert: too_long_since_finality_25
expr: max(state_last_slot / 8) - floor(max(state_last_finalized_epoch)) > 25
for: 1m
annotations:
summary: No finality in 25 epochs
- alert: too_long_since_finality_100
expr: max(state_last_slot / 8) - floor(max(state_last_finalized_epoch)) > 100
for: 1m
annotations:
summary: No finality in 100 epochs
- alert: high_reorg_rate
expr: max(delta(reorg_counter[10m])) > 5
for: 1m
annotations:
summary: Some nodes are seeing more than 5 reorgs in 10 minutes
- alert: high_goroutines
expr: max_over_time(go_goroutines{component="beacon-chain"}[1m]) > 1000
expr: max_over_time(go_goroutines{component="beacon-chain"}[1m]) > 2000
for: 1m
annotations:
summary: Some nodes are experencing more than 1000 goroutines
summary: Pod {{ $labels.pod_name}} experencing more than 2000 goroutines
description: "{{ $labels.pod_name }} has {{ $value }} goroutines"
- alert: low_disk_space_1gb
expr: kubelet_volume_stats_available_bytes < 1e9
for: 1m
annotations:
summary: Pod has less than 1GB free disk space in their persistent disk.
- alert: balance_too_low
expr: eth_balance < 200
for: 15m
annotations:
summary: ETH1 wallet has less than 200 ETH
description: "{{ $labels.name }} ({{ $labels.address }}) has {{ $value }} ETH"
- alert: high_grpc_beacon_chain_error_rate
expr: sum(rate(istio_requests_total{reporter="destination",destination_service=~"beacon-chain.beacon-chain.svc.cluster.local",response_code!~"5.*"}[5m])) / sum(rate(istio_requests_total{reporter="destination",destination_service=~"beacon-chain.beacon-chain.svc.cluster.local"}[5m])) < 0.99
for: 1m
annotations:
summary: Beacon node global success rate is less than 99%.
description: Success rate = {{ $value }}
- alert: high_grpc_beacon_chain_method_error_rate
expr: sum(rate(grpc_server_handled_total{component="beacon-chain",grpc_code="OK"}[5m])) by (grpc_method) / sum(rate(grpc_server_handled_total{component="beacon-chain"}[5m])) by (grpc_method) < 0.95
for: 1m
annotations:
summary: Beacon chain gRPC method success rate is less than 95%.
description: "{{ $labels.grpc_method }} {{ $value }}"

View File

@@ -0,0 +1,18 @@
apiVersion: networking.istio.io/v1alpha3
kind: VirtualService
metadata:
name: jaeger-collector-virtual-service
namespace: istio-system
spec:
hosts:
- jaeger-collector.istio-system.svc.cluster.local
gateways:
- mesh
http:
- match:
- port: 14268
route:
- destination:
host: jaeger-collector.istio-system.svc.cluster.local
port:
number: 14268

39
k8s/x_headers_rule.yaml Normal file
View File

@@ -0,0 +1,39 @@
apiVersion: config.istio.io/v1alpha2
kind: rule
metadata:
name: x-headers
namespace: istio-system
spec:
match: destination.name != "unknown"
responseHeaderOperations:
- name: x-backend
values: [ destination.name ]
operation: APPEND
- name: x-source
values: [ source.workload.name ]
operation: APPEND
---
apiVersion: config.istio.io/v1alpha2
kind: rule
metadata:
name: x-headers-public
namespace: istio-system
spec:
match: source.workload.name == "istio-ingressgateway"
responseHeaderOperations:
- name: x-backend
operation: REMOVE
- name: x-source
operation: REMOVE
- name: x-envoy-upstream-service-time
operation: REMOVE
- name: server
operation: REMOVE
- name: grpc-metadata-x-backend
operation: REMOVE
- name: grpc-metadata-x-source
operation: REMOVE
- name: grpc-metadata-x-envoy-upstream-service-time
operation: REMOVE
- name: grpc-metadata-server
operation: REMOVE

View File

@@ -0,0 +1,47 @@
load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_library")
load("@io_bazel_rules_docker//go:image.bzl", "go_image")
load("@io_bazel_rules_docker//container:container.bzl", "container_push")
# Library target wrapping main.go for the eth1exporter tool. It is private to
# this package and depends on go-ethereum's common and ethclient packages.
go_library(
name = "go_default_library",
srcs = ["main.go"],
importpath = "github.com/prysmaticlabs/prysm/tools/eth1exporter",
visibility = ["//visibility:private"],
deps = [
"@com_github_ethereum_go_ethereum//common:go_default_library",
"@com_github_ethereum_go_ethereum//ethclient:go_default_library",
],
)
# Publicly visible executable built by embedding the library target above.
go_binary(
name = "eth1exporter",
embed = [":go_default_library"],
visibility = ["//visibility:public"],
)
# Container image for the exporter, compiled for linux/amd64 with the race
# detector and static linking both turned off. The "manual" tag keeps it out
# of wildcard target patterns, so it is only built when named explicitly.
# NOTE(review): srcs, importpath and deps repeat the go_library attributes
# instead of embedding it; keep the two lists in sync when either changes.
go_image(
name = "image",
srcs = ["main.go"],
goarch = "amd64",
goos = "linux",
importpath = "github.com/prysmaticlabs/prysm/tools/eth1exporter",
race = "off",
static = "off",
tags = ["manual"],
visibility = ["//visibility:private"],
deps = [
"@com_github_ethereum_go_ethereum//common:go_default_library",
"@com_github_ethereum_go_ethereum//ethclient:go_default_library",
],
)
# Pushes the :image target to gcr.io/prysmaticlabs/eth1monitor:latest in
# Docker format. The repository is named "eth1monitor" even though the tool
# lives in tools/eth1exporter. Tagged "manual", so it only runs when named
# explicitly.
container_push(
name = "push_image",
format = "Docker",
image = ":image",
registry = "gcr.io",
repository = "prysmaticlabs/eth1monitor",
tag = "latest",
tags = ["manual"],
visibility = ["//visibility:private"],
)

173
tools/eth1exporter/main.go Normal file
View File

@@ -0,0 +1,173 @@
// Prometheus exporter for Ethereum address balances.
// Forked from https://github.com/hunterlong/ethexporter
package main
import (
"bufio"
"context"
"flag"
"fmt"
"log"
"math/big"
"net/http"
"os"
"strings"
"time"
"github.com/ethereum/go-ethereum/common"
"github.com/ethereum/go-ethereum/ethclient"
)
var (
allWatching []*Watching
loadSeconds float64
totalLoaded int64
eth *ethclient.Client
)
var (
port = flag.Int("port", 9090, "Port to serve /metrics")
web3URL = flag.String("web3-provider", "https://goerli.prylabs.net", "Web3 URL to access information about ETH1")
prefix = flag.String("prefix", "", "Metrics prefix.")
addressFilePath = flag.String("addresses", "", "File path to addresses text file.")
)
// main parses the command-line flags, loads the watched addresses, connects
// to the ETH1 node, starts a background goroutine that refreshes every
// balance on a fixed interval, and then serves /metrics and /reload over HTTP.
//
// Any startup failure terminates the process with a non-zero exit status.
func main() {
	// pollInterval is the pause between two full balance refreshes.
	const pollInterval = 15 * time.Second

	flag.Parse()

	if *addressFilePath == "" {
		// A missing required flag is a configuration error; exit non-zero so
		// the failure is visible to whatever supervises the process.
		log.Fatal("--addresses is required")
	}

	if err := OpenAddresses(*addressFilePath); err != nil {
		log.Fatalf("loading addresses from %q: %v", *addressFilePath, err)
	}

	if err := ConnectionToGeth(*web3URL); err != nil {
		log.Fatalf("connecting to web3 provider %q: %v", *web3URL, err)
	}

	// Refresh address balances forever in the background.
	// NOTE(review): allWatching, totalLoaded and loadSeconds are shared with
	// the HTTP handlers without synchronization — confirm whether that is
	// acceptable or guard them with a mutex.
	go func() {
		for {
			totalLoaded = 0
			start := time.Now()
			fmt.Printf("Checking %v wallets...\n", len(allWatching))
			for _, v := range allWatching {
				v.Balance = GetEthBalance(v.Address).String()
				totalLoaded++
			}
			loadSeconds = time.Since(start).Seconds()
			fmt.Printf("Finished checking %v wallets in %0.0f seconds, sleeping for %v seconds.\n", len(allWatching), loadSeconds, pollInterval.Seconds())
			time.Sleep(pollInterval)
		}
	}()

	block := CurrentBlock()

	fmt.Printf("ETHexporter has started on port %v using web3 server: %v at block #%v\n", *port, *web3URL, block)
	http.HandleFunc("/metrics", MetricsHTTP)
	http.HandleFunc("/reload", ReloadHTTP)
	// ListenAndServe only returns on failure.
	log.Fatal(http.ListenAndServe(fmt.Sprintf("0.0.0.0:%d", *port), nil))
}
// Watching pairs a human-readable name with an ETH1 address and the most
// recently fetched balance for that address.
type Watching struct {
// Name is the label taken from the part before ':' in the addresses file;
// it is exported as the "name" metric label.
Name string
// Address is the hex-encoded ETH1 address taken from the part after ':'.
Address string
// Balance is the last fetched balance in ether, stored as a decimal string.
// It is empty until the first poll completes and is reported as 0 then.
Balance string
}
// ConnectionToGeth dials the web3 endpoint at url and stores the resulting
// client in the package-level eth variable. The dial error, if any, is
// returned to the caller.
func ConnectionToGeth(url string) error {
	client, dialErr := ethclient.Dial(url)
	eth = client
	return dialErr
}
// GetEthBalance returns the balance of address at the latest block, in ether,
// as reported by the connected node.
//
// When the lookup fails, the error is logged together with its cause and a
// zero balance is returned; the value that accompanies a failed call is never
// interpreted.
func GetEthBalance(address string) *big.Float {
	balance, err := eth.BalanceAt(context.TODO(), common.HexToAddress(address), nil)
	if err != nil {
		fmt.Printf("Error fetching ETH Balance for address: %v: %v\n", address, err)
		return big.NewFloat(0)
	}
	return ToEther(balance)
}
// CurrentBlock returns the number of the latest block known to the connected
// ETH1 node. If the block cannot be fetched the error is printed and 0 is
// returned.
func CurrentBlock() uint64 {
	latest, fetchErr := eth.BlockByNumber(context.TODO(), nil)
	if fetchErr == nil {
		return latest.NumberU64()
	}
	fmt.Printf("Error fetching current block height: %v\n", fetchErr)
	return 0
}
// ToEther converts an amount in wei to ether (1 ether = 1e18 wei).
//
// The conversion divides by 1e18, which is exactly representable as a
// float64, rather than multiplying by 1e-18, which is not; whole-ether
// amounts therefore convert exactly. A nil amount is treated as zero.
func ToEther(o *big.Int) *big.Float {
	if o == nil {
		return big.NewFloat(0)
	}
	// SetInt picks a precision of at least 64 bits, enough for the division
	// below to be correctly rounded for realistic balances.
	wei := new(big.Float).SetInt(o)
	return new(big.Float).Quo(wei, big.NewFloat(1e18))
}
// MetricsHTTP is the HTTP handler for /metrics. It writes, in Prometheus text
// format, one eth_balance sample per watched address followed by the summed
// balance, the duration of the last polling pass, the number of addresses
// loaded so far in the current pass, and the total number of watched
// addresses. Every metric name is prefixed with the --prefix flag value.
//
// An address whose balance has not been fetched yet is reported as 0. The
// handler only reads the watched entries; it never modifies them.
func MetricsHTTP(w http.ResponseWriter, r *http.Request) {
	watching := allWatching
	lines := make([]string, 0, len(watching)+4)
	total := big.NewFloat(0)
	for _, v := range watching {
		balance := v.Balance
		if balance == "" {
			balance = "0"
		}
		// Only balances that parse contribute to the total.
		if bal, ok := big.NewFloat(0).SetString(balance); ok {
			total.Add(total, bal)
		}
		lines = append(lines, fmt.Sprintf("%veth_balance{name=\"%v\",address=\"%v\"} %v", *prefix, v.Name, v.Address, balance))
	}
	lines = append(lines,
		fmt.Sprintf("%veth_balance_total %0.18f", *prefix, total),
		fmt.Sprintf("%veth_load_seconds %0.2f", *prefix, loadSeconds),
		fmt.Sprintf("%veth_loaded_addresses %v", *prefix, totalLoaded),
		fmt.Sprintf("%veth_total_addresses %v", *prefix, len(watching)),
	)
	fmt.Fprintln(w, strings.Join(lines, "\n"))
}
// ReloadHTTP is the HTTP handler for /reload. It re-reads the addresses file
// named by the --addresses flag and responds 200 on success. On failure it
// logs the cause and responds 500.
func ReloadHTTP(w http.ResponseWriter, _ *http.Request) {
	if err := OpenAddresses(*addressFilePath); err != nil {
		// Record why the reload failed; the response carries no body.
		log.Printf("Failed to reload addresses from %v: %v", *addressFilePath, err)
		w.WriteHeader(http.StatusInternalServerError)
		return
	}
	w.WriteHeader(http.StatusOK)
	log.Println("Reloaded addresses")
}
// OpenAddresses from text file (name:address)
func OpenAddresses(filename string) error {
file, err := os.Open(filename)
if err != nil {
return err
}
defer file.Close()
scanner := bufio.NewScanner(file)
allWatching = []*Watching{}
for scanner.Scan() {
object := strings.Split(scanner.Text(), ":")
if common.IsHexAddress(object[1]) {
w := &Watching{
Name: object[0],
Address: object[1],
}
allWatching = append(allWatching, w)
}
}
if err := scanner.Err(); err != nil {
return err
}
return err
}

View File

@@ -43,7 +43,7 @@ go_image(
importpath = IMPORT_PATH,
deps = DEPS,
pure = "off", # depends on cgo for go-ethereum crypto
static = "on",
static = "off", # go-ethereum is bad about static
tags = ["manual"],
goarch = "amd64",
goos = "linux",