forked from lavanet/lava
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
PRT-add-health-checker (lavanet#1036)
* WIP health command * finished health data gathering * added alerts to health * revert upgrading go version * revert * adapt to go 1.20 * added suppression system * lint * WIP alert attributes for monitoring and suppression * added suppression support, WIP testing and configurability * fix spec raw call to spec expanded * added suppression count configuration * added health example * added latest block metric on all checks * added unhealthy as 0 * fix missing logs issues * fix panic in state tracker when the node is unavailable * added fixes and apiInterface data * added the autogenerating file to ignore * prettify errors, improve suppression by attribute * verified working alert mechanism * more sleep to health test * better error handling * fix resource exhaustion on context done in grpc connector * lint
- Loading branch information
1 parent
4d71b87
commit f6c9d5f
Showing
21 changed files
with
1,775 additions
and
72 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
max-provider-latency: 150ms | ||
subscription-days-left-alert: 10 | ||
interval: 5m | ||
allowed_time_lag: 30s | ||
query-retries: 5 | ||
alert-webhook-url: <alert-hook> | ||
identifier: health_example | ||
cu-percent-threshold: 0.2 | ||
alert-suppression-interval: 6h | ||
disable-alert-suppression: false | ||
suppression-alert-count-threshold: 3 | ||
metrics-listen-address: ":7776" | ||
disable-alert-logging: false | ||
subscription_addresses: | ||
- lava@... | ||
- lava@... | ||
provider_addresses: | ||
- lava@... | ||
- lava@... | ||
- lava@... | ||
consumer_endpoints: | ||
- chain-id: ETH1 | ||
api-interface: jsonrpc | ||
network-address: 127.0.0.1:3333 | ||
reference_endpoints: | ||
- chain-id: ETH1 | ||
api-interface: jsonrpc | ||
network-address: public-rpc-1 | ||
- chain-id: ETH1 | ||
api-interface: jsonrpc | ||
network-address: public-rpc-2 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
max-provider-latency: 150ms | ||
subscription-days-left-alert: 10 | ||
interval: 5s | ||
allowed_time_lag: 30s | ||
query-retries: 5 | ||
identifier: health_example | ||
cu-percent-threshold: 0.2 | ||
alert-suppression-interval: 60s | ||
disable-alert-suppression: false | ||
suppression-alert-count-threshold: 2 | ||
metrics-listen-address: ":7776" | ||
disable-alert-logging: false | ||
allow-insecure-provider-dialing: true | ||
consumer_endpoints: | ||
- chain-id: ETH1 | ||
api-interface: jsonrpc | ||
network-address: http://127.0.0.1:3333 | ||
- chain-id: LAV1 | ||
api-interface: rest | ||
network-address: http://127.0.0.1:3360 | ||
- chain-id: LAV1 | ||
api-interface: tendermintrpc | ||
network-address: http://127.0.0.1:3361 | ||
- chain-id: LAV1 | ||
api-interface: grpc | ||
network-address: 127.0.0.1:3362 | ||
#REPLACED | ||
subscription_addresses: | ||
provider_addresses: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
package metrics | ||
|
||
import ( | ||
"net/http" | ||
|
||
"github.com/lavanet/lava/utils" | ||
"github.com/prometheus/client_golang/prometheus" | ||
"github.com/prometheus/client_golang/prometheus/promhttp" | ||
) | ||
|
||
type HealthMetrics struct { | ||
failedRuns *prometheus.CounterVec | ||
successfulRuns *prometheus.CounterVec | ||
failureAlerts *prometheus.GaugeVec | ||
healthyChecks *prometheus.GaugeVec | ||
unhealthyChecks *prometheus.GaugeVec | ||
latestBlocks *prometheus.GaugeVec | ||
} | ||
|
||
func NewHealthMetrics(networkAddress string) *HealthMetrics { | ||
if networkAddress == DisabledFlagOption { | ||
utils.LavaFormatWarning("prometheus endpoint inactive, option is disabled", nil) | ||
return nil | ||
} | ||
|
||
latestBlocks := prometheus.NewGaugeVec(prometheus.GaugeOpts{ | ||
Name: "lava_health_latest_blocks", | ||
Help: "The latest blocks queried on all checks", | ||
}, []string{"identifier", "entity"}) | ||
|
||
failureAlerts := prometheus.NewGaugeVec(prometheus.GaugeOpts{ | ||
Name: "lava_health_failure_alerts", | ||
Help: "The current amount of active alerts", | ||
}, []string{"identifier"}) | ||
|
||
healthyChecks := prometheus.NewGaugeVec(prometheus.GaugeOpts{ | ||
Name: "lava_healthy_entities", | ||
Help: "The current amount of healthy checks", | ||
}, []string{"identifier"}) | ||
|
||
unhealthyChecks := prometheus.NewGaugeVec(prometheus.GaugeOpts{ | ||
Name: "lava_unhealthy_entities", | ||
Help: "The current amount of healthy checks", | ||
}, []string{"identifier"}) | ||
|
||
failedRuns := prometheus.NewCounterVec(prometheus.CounterOpts{ | ||
Name: "lava_health_failed_runs", | ||
Help: "The total of runs failed", | ||
}, []string{"identifier"}) | ||
|
||
successfulRuns := prometheus.NewCounterVec(prometheus.CounterOpts{ | ||
Name: "lava_health_successful_runs", | ||
Help: "The total of runs succeeded", | ||
}, []string{"identifier"}) | ||
// Register the metrics with the Prometheus registry. | ||
prometheus.MustRegister(failedRuns) | ||
prometheus.MustRegister(successfulRuns) | ||
prometheus.MustRegister(failureAlerts) | ||
prometheus.MustRegister(healthyChecks) | ||
prometheus.MustRegister(unhealthyChecks) | ||
prometheus.MustRegister(latestBlocks) | ||
http.Handle("/metrics", promhttp.Handler()) | ||
go func() { | ||
utils.LavaFormatInfo("prometheus endpoint listening", utils.Attribute{Key: "Listen Address", Value: networkAddress}) | ||
http.ListenAndServe(networkAddress, nil) | ||
}() | ||
return &HealthMetrics{ | ||
failedRuns: failedRuns, | ||
successfulRuns: successfulRuns, | ||
failureAlerts: failureAlerts, | ||
healthyChecks: healthyChecks, | ||
unhealthyChecks: unhealthyChecks, | ||
latestBlocks: latestBlocks, | ||
} | ||
} | ||
|
||
func (pme *HealthMetrics) SetFailedRun(label string) { | ||
if pme == nil { | ||
return | ||
} | ||
pme.failedRuns.WithLabelValues(label).Add(1) | ||
} | ||
|
||
func (pme *HealthMetrics) SetSuccess(label string) { | ||
if pme == nil { | ||
return | ||
} | ||
pme.successfulRuns.WithLabelValues(label).Add(1) | ||
} | ||
|
||
func (pme *HealthMetrics) SetLatestBlockData(label string, data map[string]uint64) { | ||
if pme == nil { | ||
return | ||
} | ||
for entity, value := range data { | ||
pme.latestBlocks.WithLabelValues(label, entity).Set(float64(value)) | ||
} | ||
} | ||
|
||
func (pme *HealthMetrics) SetAlertResults(label string, fails uint64, unhealthy uint64, healthy uint64) { | ||
if pme == nil { | ||
return | ||
} | ||
pme.failureAlerts.WithLabelValues(label).Set(float64(fails)) | ||
pme.unhealthyChecks.WithLabelValues(label).Set(float64(unhealthy)) | ||
pme.healthyChecks.WithLabelValues(label).Set(float64(healthy)) | ||
} |
Oops, something went wrong.