Skip to content

Commit

Permalink
Add additional consensus metrics (ava-labs#1877)
Browse files Browse the repository at this point in the history
Co-authored-by: Stephen Buttolph <[email protected]>
  • Loading branch information
gyuho and StephenButtolph authored Oct 13, 2022
1 parent 32040ca commit 9cb5b46
Show file tree
Hide file tree
Showing 10 changed files with 160 additions and 61 deletions.
9 changes: 5 additions & 4 deletions snow/consensus/avalanche/topological.go
Original file line number Diff line number Diff line change
Expand Up @@ -574,7 +574,7 @@ func (ta *Topological) update(vtx Vertex) error {
return err
}
delete(ta.nodes, vtxID)
ta.Latency.Rejected(vtxID, ta.pollNumber)
ta.Latency.Rejected(vtxID, ta.pollNumber, len(vtx.Bytes()))

ta.preferenceCache[vtxID] = false
ta.virtuousCache[vtxID] = false
Expand Down Expand Up @@ -641,15 +641,16 @@ func (ta *Topological) update(vtx Vertex) error {
// I'm acceptable, why not accept?
// Note that ConsensusAcceptor.Accept must be called before vtx.Accept
// to honor Acceptor.Accept's invariant.
if err := ta.ctx.ConsensusAcceptor.Accept(ta.ctx, vtxID, vtx.Bytes()); err != nil {
vtxBytes := vtx.Bytes()
if err := ta.ctx.ConsensusAcceptor.Accept(ta.ctx, vtxID, vtxBytes); err != nil {
return err
}

if err := vtx.Accept(); err != nil {
return err
}
delete(ta.nodes, vtxID)
ta.Latency.Accepted(vtxID, ta.pollNumber)
ta.Latency.Accepted(vtxID, ta.pollNumber, len(vtxBytes))
case rejectable:
// I'm rejectable, why not reject?
ta.ctx.Log.Trace("rejecting vertex",
Expand All @@ -666,7 +667,7 @@ func (ta *Topological) update(vtx Vertex) error {
return err
}
delete(ta.nodes, vtxID)
ta.Latency.Rejected(vtxID, ta.pollNumber)
ta.Latency.Rejected(vtxID, ta.pollNumber, len(vtx.Bytes()))
}
return nil
}
Expand Down
54 changes: 47 additions & 7 deletions snow/consensus/metrics/latency.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,12 @@ type Latency interface {
Issued(id ids.ID, pollNumber uint64)

// Accepted marks the item as having been accepted.
Accepted(id ids.ID, pollNumber uint64)
// Pass the container size in bytes for metrics tracking.
Accepted(id ids.ID, pollNumber uint64, containerSize int)

// Rejected marks the item as having been rejected.
Rejected(id ids.ID, pollNumber uint64)
// Pass the container size in bytes for metrics tracking.
Rejected(id ids.ID, pollNumber uint64, containerSize int)

// MeasureAndGetOldestDuration returns the amount of time the oldest item
// has been processing.
Expand Down Expand Up @@ -67,11 +69,13 @@ type latency struct {

// latAccepted tracks the number of nanoseconds that an item was processing
// before being accepted
latAccepted metric.Averager
latAccepted metric.Averager
containerSizeAcceptedSum prometheus.Gauge

// rejected tracks the number of nanoseconds that an item was processing
// before being rejected
latRejected metric.Averager
latRejected metric.Averager
containerSizeRejectedSum prometheus.Gauge
}

// Initialize the metrics with the provided names.
Expand All @@ -80,11 +84,15 @@ func NewLatency(metricName, descriptionName string, log logging.Logger, namespac
l := &latency{
processingEntries: linkedhashmap.New[ids.ID, opStart](),
log: log,

// e.g.,
// "avalanche_7y7zwo7XatqnX4dtTakLo32o7jkMX4XuDa26WaxbCXoCT1qKK_blks_processing" to count how blocks are currently processing
numProcessing: prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: namespace,
Name: fmt.Sprintf("%s_processing", metricName),
Help: fmt.Sprintf("Number of currently processing %s", metricName),
}),

pollsAccepted: metric.NewAveragerWithErrs(
namespace,
fmt.Sprintf("%s_polls_accepted", metricName),
Expand All @@ -99,22 +107,50 @@ func NewLatency(metricName, descriptionName string, log logging.Logger, namespac
reg,
&errs,
),

// e.g.,
// "avalanche_C_blks_accepted_count" to count how many "Observe" gets called -- count all "Accept"
// "avalanche_C_blks_accepted_sum" to count how many ns have elapsed since its issuance on acceptance
// "avalanche_C_blks_accepted_sum / avalanche_C_blks_accepted_count" is the average block acceptance latency in ns
// "avalanche_C_blks_accepted_container_size_sum" to track cumulative sum of all accepted blocks' sizes
// "avalanche_C_blks_accepted_container_size_sum / avalanche_C_blks_accepted_count" is the average block size
latAccepted: metric.NewAveragerWithErrs(
namespace,
fmt.Sprintf("%s_accepted", metricName),
fmt.Sprintf("time (in ns) from issuance of a %s to its acceptance", descriptionName),
reg,
&errs,
),
containerSizeAcceptedSum: prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: namespace,
Name: fmt.Sprintf("%s_accepted_container_size_sum", metricName),
Help: fmt.Sprintf("Cumulative sum of container size of all accepted %s", metricName),
}),

// e.g.,
// "avalanche_P_blks_rejected_count" to count how many "Observe" gets called -- count all "Reject"
// "avalanche_P_blks_rejected_sum" to count how many ns have elapsed since its issuance on rejection
// "avalanche_P_blks_accepted_sum / avalanche_P_blks_accepted_count" is the average block acceptance latency in ns
// "avalanche_P_blks_accepted_container_size_sum" to track cumulative sum of all accepted blocks' sizes
// "avalanche_P_blks_accepted_container_size_sum / avalanche_P_blks_accepted_count" is the average block size
latRejected: metric.NewAveragerWithErrs(
namespace,
fmt.Sprintf("%s_rejected", metricName),
fmt.Sprintf("time (in ns) from issuance of a %s to its rejection", descriptionName),
reg,
&errs,
),
containerSizeRejectedSum: prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: namespace,
Name: fmt.Sprintf("%s_rejected_container_size_sum", metricName),
Help: fmt.Sprintf("Cumulative sum of container size of all rejected %s", metricName),
}),
}
errs.Add(reg.Register(l.numProcessing))
errs.Add(
reg.Register(l.numProcessing),
reg.Register(l.containerSizeAcceptedSum),
reg.Register(l.containerSizeRejectedSum),
)
return l, errs.Err
}

Expand All @@ -126,7 +162,7 @@ func (l *latency) Issued(id ids.ID, pollNumber uint64) {
l.numProcessing.Inc()
}

func (l *latency) Accepted(id ids.ID, pollNumber uint64) {
func (l *latency) Accepted(id ids.ID, pollNumber uint64, containerSize int) {
start, ok := l.processingEntries.Get(id)
if !ok {
l.log.Debug("unable to measure tx latency",
Expand All @@ -142,9 +178,11 @@ func (l *latency) Accepted(id ids.ID, pollNumber uint64) {
duration := time.Since(start.time)
l.latAccepted.Observe(float64(duration))
l.numProcessing.Dec()

l.containerSizeAcceptedSum.Add(float64(containerSize))
}

func (l *latency) Rejected(id ids.ID, pollNumber uint64) {
func (l *latency) Rejected(id ids.ID, pollNumber uint64, containerSize int) {
start, ok := l.processingEntries.Get(id)
if !ok {
l.log.Debug("unable to measure tx latency",
Expand All @@ -160,6 +198,8 @@ func (l *latency) Rejected(id ids.ID, pollNumber uint64) {
duration := time.Since(start.time)
l.latRejected.Observe(float64(duration))
l.numProcessing.Dec()

l.containerSizeRejectedSum.Add(float64(containerSize))
}

func (l *latency) MeasureAndGetOldestDuration() time.Duration {
Expand Down
38 changes: 38 additions & 0 deletions snow/consensus/metrics/timestamp.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
// Copyright (C) 2019-2022, Ava Labs, Inc. All rights reserved.
// See the file LICENSE for licensing terms.

package metrics

import (
"time"

"github.com/prometheus/client_golang/prometheus"
)

var _ Timestamp = &timestamp{}

// Timestamp reports the last accepted block time,
// to track it in unix seconds.
type Timestamp interface {
Accepted(ts time.Time)
}

type timestamp struct {
// lastAcceptedTimestamp keeps track of the last accepted timestamp
lastAcceptedTimestamp prometheus.Gauge
}

func NewTimestamp(namespace string, reg prometheus.Registerer) (Timestamp, error) {
t := &timestamp{
lastAcceptedTimestamp: prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: namespace,
Name: "last_accepted_timestamp",
Help: "Last accepted block timestamp in unix seconds",
}),
}
return t, reg.Register(t.lastAcceptedTimestamp)
}

func (t *timestamp) Accepted(ts time.Time) {
t.lastAcceptedTimestamp.Set(float64(ts.Unix()))
}
3 changes: 3 additions & 0 deletions snow/consensus/snowman/consensus.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
package snowman

import (
"time"

"github.com/ava-labs/avalanchego/ids"
"github.com/ava-labs/avalanchego/snow"
"github.com/ava-labs/avalanchego/snow/consensus/snowball"
Expand All @@ -18,6 +20,7 @@ type Consensus interface {
params snowball.Parameters,
lastAcceptedID ids.ID,
lastAcceptedHeight uint64,
lastAcceptedTime time.Time,
) error

// Returns the parameters that describe this snowman instance
Expand Down
Loading

0 comments on commit 9cb5b46

Please sign in to comment.