Skip to content

Commit

Permalink
adding metrics for health checks
Browse files Browse the repository at this point in the history
  • Loading branch information
otherview committed Mar 8, 2021
1 parent bc2841f commit 2d56f10
Show file tree
Hide file tree
Showing 4 changed files with 84 additions and 18 deletions.
20 changes: 11 additions & 9 deletions api/health/service.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,14 @@ import (
"net/http"
"time"

stdjson "encoding/json"

"github.com/gorilla/rpc/v2"

health "github.com/AppsFlyer/go-sundheit"

"github.com/ava-labs/avalanchego/snow/engine/common"
"github.com/ava-labs/avalanchego/utils/json"
"github.com/ava-labs/avalanchego/utils/logging"
"github.com/gorilla/rpc/v2"
"github.com/prometheus/client_golang/prometheus"

stdjson "encoding/json"
health "github.com/AppsFlyer/go-sundheit"
healthlib "github.com/ava-labs/avalanchego/health"
)

Expand All @@ -29,11 +27,15 @@ type Service interface {
Handler() (*common.HTTPHandler, error)
}

func NewService(checkFreq time.Duration, log logging.Logger) Service {
func NewService(checkFreq time.Duration, log logging.Logger, registry prometheus.Registerer) (Service, error) {
healthL, err := healthlib.NewService(checkFreq, log, registry)
if err != nil {
return nil, err
}
return &apiServer{
Service: healthlib.NewService(checkFreq, log),
Service: healthL,
log: log,
}
}, nil
}

// APIServer serves HTTP for a health service
Expand Down
49 changes: 49 additions & 0 deletions health/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
// (c) 2019-2021, Ava Labs, Inc. All rights reserved.
// See the file LICENSE for licensing terms.

package health

import (
health "github.com/AppsFlyer/go-sundheit"
"github.com/prometheus/client_golang/prometheus"

"github.com/ava-labs/avalanchego/utils/logging"
"github.com/ava-labs/avalanchego/utils/wrappers"
)

// Metrics holds the Prometheus collectors that describe the state of the
// health checks.
type Metrics struct {
// log is the logger handed to newMetrics. None of the methods visible in
// this file write to it.
log logging.Logger

// failingChecks is a gauge meant to hold the number of health checks that
// are currently failing. UnHealthy increments it and Healthy decrements it.
failingChecks prometheus.Gauge
}

// newMetrics builds a Metrics whose failingChecks gauge lives under
// [namespace] and registers that gauge with [registerer].
//
// metricName and descriptionName are accepted so existing call sites keep
// compiling, but they are not used: the gauge name and help text are fixed
// below.
//
// On success it returns the populated Metrics and a nil error. If the gauge
// cannot be registered it returns a nil *Metrics together with the error
// reported by [registerer], so callers never receive a half-initialised value.
func newMetrics(metricName, descriptionName string, log logging.Logger, namespace string, registerer prometheus.Registerer) (*Metrics, error) {
	failingChecks := prometheus.NewGauge(prometheus.GaugeOpts{
		Namespace: namespace,
		Name:      "health_checks_failing",
		Help:      "number of currently failing health checks",
	})

	// wrappers.Errs keeps the first error added to it; with a single
	// collector this is simply the result of Register.
	errs := wrappers.Errs{}
	errs.Add(
		registerer.Register(failingChecks),
	)
	if errs.Err != nil {
		return nil, errs.Err
	}

	return &Metrics{
		log:           log,
		failingChecks: failingChecks,
	}, nil
}

// Healthy records a healthy check result by decrementing the failingChecks
// gauge by one. The result argument is not inspected.
//
// NOTE(review): the decrement is unconditional, so the gauge only equals the
// number of failing checks if this is called exactly once per
// unhealthy -> healthy transition; calling it for a check that was already
// healthy pushes the gauge below its true value. Confirm against the caller.
func (m *Metrics) Healthy(result health.Result) {
	m.failingChecks.Dec()
}

// UnHealthy records an unhealthy check result by incrementing the
// failingChecks gauge by one. The result argument is not inspected.
//
// NOTE(review): the increment is unconditional, so the gauge only equals the
// number of failing checks if this is called exactly once per
// healthy -> unhealthy transition — confirm against the caller.
func (m *Metrics) UnHealthy(result health.Result) {
m.failingChecks.Inc()
}
25 changes: 18 additions & 7 deletions health/service.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,11 @@ import (
"sync"
"time"

health "github.com/AppsFlyer/go-sundheit"

"github.com/ava-labs/avalanchego/utils/constants"
"github.com/ava-labs/avalanchego/utils/logging"
"github.com/prometheus/client_golang/prometheus"

health "github.com/AppsFlyer/go-sundheit"
)

// Service performs health checks. Other things register health checks
Expand All @@ -21,17 +22,22 @@ type Service interface {

// NewService returns a new [Service] where the health checks
// run every [checkFreq]
func NewService(checkFreq time.Duration, log logging.Logger) Service {
func NewService(checkFreq time.Duration, log logging.Logger, registry prometheus.Registerer) (Service, error) {
healthChecker := health.New()
metrics, err := newMetrics("healthCheck", "healthCheck metrics", log, "healthcheck", registry)
if err != nil {
return nil, err
}
// Add the check listener to report when a check changes status.
healthChecker.WithCheckListener(&checkListener{
log: log,
checks: make(map[string]bool),
log: log,
checks: make(map[string]bool),
metrics: metrics,
})
return &service{
Health: healthChecker,
checkFreq: checkFreq,
}
}, nil
}

// service implements Service
Expand Down Expand Up @@ -79,7 +85,8 @@ type checkListener struct {
// lock ensures that updates and reads to [checks] are atomic
lock sync.Mutex
// checks maps name -> is healthy
checks map[string]bool
checks map[string]bool
metrics *Metrics
}

func (c *checkListener) OnCheckStarted(name string) {
Expand All @@ -104,15 +111,19 @@ func (c *checkListener) OnCheckCompleted(name string, result health.Result) {
if !exists || isHealthy == previouslyHealthy {
if isHealthy {
c.log.Debug("%q returned healthy with: %s", name, string(resultJSON))
c.metrics.Healthy(result)
} else {
c.log.Debug("%q returned unhealthy with: %s", name, string(resultJSON))
c.metrics.UnHealthy(result)
}
return
}

if isHealthy {
c.log.Info("%q became healthy with: %s", name, string(resultJSON))
c.metrics.Healthy(result)
} else {
c.log.Warn("%q became unhealthy with: %s", name, string(resultJSON))
c.metrics.UnHealthy(result)
}
}
8 changes: 6 additions & 2 deletions node/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -730,7 +730,11 @@ func (n *Node) initHealthAPI() error {
}

n.Log.Info("initializing Health API")
n.healthService = health.NewService(n.Config.HealthCheckFreq, n.Log)
healthService, err := health.NewService(n.Config.HealthCheckFreq, n.Log, n.Config.ConsensusParams.Metrics)
if err != nil {
return err
}
n.healthService = healthService

isBootstrappedFunc := func() (interface{}, error) {
if pChainID, err := n.chainManager.Lookup("P"); err != nil {
Expand All @@ -751,7 +755,7 @@ func (n *Node) initHealthAPI() error {
return nil, nil
}
// Passes if the P, X and C chains are finished bootstrapping
err := n.healthService.RegisterMonotonicCheck("isBootstrapped", isBootstrappedFunc)
err = n.healthService.RegisterMonotonicCheck("isBootstrapped", isBootstrappedFunc)
if err != nil {
return fmt.Errorf("couldn't register isBootstrapped health check: %w", err)
}
Expand Down

0 comments on commit 2d56f10

Please sign in to comment.