Skip to content

Commit

Permalink
test: metrics gathering improvements (algorand#4639)
Browse files Browse the repository at this point in the history
  • Loading branch information
brianolson authored Oct 26, 2022
1 parent ad08f74 commit a3e90ad
Show file tree
Hide file tree
Showing 9 changed files with 85 additions and 14 deletions.
4 changes: 4 additions & 0 deletions config/localTemplate.go
Original file line number Diff line number Diff line change
Expand Up @@ -452,6 +452,10 @@ type Local struct {
// MaxAcctLookback sets the maximum lookback range for account states,
// i.e. the ledger can answer account states questions for the range Latest-MaxAcctLookback...Latest
MaxAcctLookback uint64 `version[23]:"4"`

// EnableUsageLog enables 10Hz log of CPU and RAM usage.
// Also adds 'algod_ram_usage` (number of bytes in use) to /metrics
EnableUsageLog bool `version[24]:"false"`
}

// DNSBootstrapArray returns an array of one or more DNS Bootstrap identifiers
Expand Down
1 change: 1 addition & 0 deletions config/local_defaults.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ var defaultLocal = Local{
EnableRequestLogger: false,
EnableRuntimeMetrics: false,
EnableTopAccountsReporting: false,
EnableUsageLog: false,
EnableVerbosedTransactionSyncLogging: false,
EndpointAddress: "127.0.0.1:0",
FallbackDNSResolverAddress: "",
Expand Down
1 change: 1 addition & 0 deletions installer/config.json.example
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
"EnableRequestLogger": false,
"EnableRuntimeMetrics": false,
"EnableTopAccountsReporting": false,
"EnableUsageLog": false,
"EnableVerbosedTransactionSyncLogging": false,
"EndpointAddress": "127.0.0.1:0",
"FallbackDNSResolverAddress": "",
Expand Down
10 changes: 9 additions & 1 deletion logging/usage.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,16 @@ package logging

import (
"context"
"runtime"
"sync"
"time"

"github.com/algorand/go-algorand/util"
"github.com/algorand/go-algorand/util/metrics"
)

var ramUsageGauge = metrics.MakeGauge(metrics.MetricName{Name: "algod_ram_usage", Description: "number of bytes runtime.ReadMemStats().HeapInuse"})

// UsageLogThread utility logging method
func UsageLogThread(ctx context.Context, log Logger, period time.Duration, wg *sync.WaitGroup) {
if wg != nil {
Expand All @@ -34,6 +38,7 @@ func UsageLogThread(ctx context.Context, log Logger, period time.Duration, wg *s
var prevUtime, prevStime int64
var Utime, Stime int64
var prevTime time.Time
var mst runtime.MemStats

ticker := time.NewTicker(period)
hasPrev := false
Expand All @@ -48,13 +53,16 @@ func UsageLogThread(ctx context.Context, log Logger, period time.Duration, wg *s
now = time.Now()
Utime, Stime, _ = util.GetCurrentProcessTimes()

runtime.ReadMemStats(&mst)
ramUsageGauge.Set(float64(mst.HeapInuse))

if hasPrev {
userNanos := Utime - prevUtime
sysNanos := Stime - prevStime
wallNanos := now.Sub(prevTime).Nanoseconds()
userf := float64(userNanos) / float64(wallNanos)
sysf := float64(sysNanos) / float64(wallNanos)
log.Infof("usage nanos wall=%d user=%d sys=%d pu=%0.4f%% ps=%0.4f%%", wallNanos, userNanos, sysNanos, userf*100.0, sysf*100.0)
log.Infof("usage nanos wall=%d user=%d sys=%d pu=%0.4f%% ps=%0.4f%% inuse=%d", wallNanos, userNanos, sysNanos, userf*100.0, sysf*100.0, mst.HeapInuse)
} else {
hasPrev = true
}
Expand Down
5 changes: 3 additions & 2 deletions node/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -385,8 +385,9 @@ func (node *AlgorandFullNode) startMonitoringRoutines() {
// Delete old participation keys
go node.oldKeyDeletionThread(node.ctx.Done())

// TODO re-enable with configuration flag post V1
//go logging.UsageLogThread(node.ctx, node.log, 100*time.Millisecond, nil)
if node.config.EnableUsageLog {
go logging.UsageLogThread(node.ctx, node.log, 100*time.Millisecond, nil)
}
}

// waitMonitoringRoutines waits for all the monitoring routines to exit. Note that
Expand Down
19 changes: 17 additions & 2 deletions test/heapwatch/block_history.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,19 @@ def addr_token_from_algod(algorand_data):
def loads(blob):
return msgpack.loads(base64.b64decode(blob), strict_map_key=False)

def bstr(x):
if isinstance(x, bytes):
try:
return x.decode()
except:
pass
return x

def obnice(ob):
if isinstance(ob, dict):
return {bstr(k):obnice(v) for k,v in ob.items()}
return ob

def dumps(blob):
return base64.b64encode(msgpack.dumps(blob))

Expand Down Expand Up @@ -180,8 +193,10 @@ def _loop_inner(self, lastround):
if b is None:
print("got None nextblock. exiting")
return
b = msgpack.loads(b, strict_map_key=False)
b = msgpack.loads(b, strict_map_key=False, raw=True)
b = obnice(b)
nowround = b['block'].get('rnd', 0)
logger.debug('r%d', nowround)
if (lastround is not None) and (nowround != lastround + 1):
logger.info('round jump %d to %d', lastround, nowround)
self._block_handler(b)
Expand Down Expand Up @@ -226,7 +241,7 @@ def main():
logging.basicConfig(level=logging.INFO)

algorand_data = args.algod or os.getenv('ALGORAND_DATA')
if not algorand_data and not (args.token and args.addr):
if not algorand_data and not ((args.token or args.headers) and args.addr):
sys.stderr.write('must specify algod data dir by $ALGORAND_DATA or -d/--algod; OR --a/--addr and -t/--token\n')
sys.exit(1)

Expand Down
9 changes: 7 additions & 2 deletions test/heapwatch/block_history_plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,8 +119,12 @@ def process(path, args):
ax1.set_title('round time (seconds)')
ax1.hist(list(filter(lambda x: x < 9,dtv[start:end])),bins=20)

ax2.set_title('TPS')
ax2.hist(tpsv[start:end],bins=20)
if args.rtime:
ax2.set_title('round time')
ax2.plot(dtv)
else:
ax2.set_title('TPS')
ax2.hist(tpsv[start:end],bins=20)

ax3.set_title('txn/block')
ax3.hist(txnv[start:end],bins=20)
Expand Down Expand Up @@ -152,6 +156,7 @@ def main():
ap.add_argument('files', nargs='+')
ap.add_argument('--all', default=False, action='store_true')
ap.add_argument('--tps1', default=False, action='store_true')
ap.add_argument('--rtime', default=False, action='store_true')
ap.add_argument('--start', default=0, type=int, help='start round')
args = ap.parse_args()

Expand Down
49 changes: 42 additions & 7 deletions test/heapwatch/metrics_delta.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@ def __call__(self, ttr, nick):
def blockinfo(self, curtime):
return self.biByTime.get(curtime)

def byMsg(self):
def byMsg(self, html=False):
txPSums = {}
rxPSums = {}
secondsSum = 0
Expand All @@ -209,18 +209,27 @@ def byMsg(self):
dictMax(rxMax, ns.rxPLists)
dictMin(txMin, ns.txPLists)
dictMin(rxMin, ns.rxPLists)
lines = [
'{} nodes: {}'.format(len(nicks), nicks),
'\ttx B/s\trx B/s',
]
nodesummary = '{} nodes: {}'.format(len(nicks), nicks)
lines = []
if html:
lines.append('<div>{}</div>'.format(nodesummary))
lines.append('<table><tr><th></th><th>tx B/s</th><th>rx B/s</th></tr>')
else:
lines.append(nodesummary)
lines.append('\ttx B/s\trx B/s')
for msg, txB in txPSums.items():
if msg not in rxPSums:
rxPSums[msg] = 0
for rxBps, msg in sorted([(rxB/secondsSum, msg) for msg, rxB in rxPSums.items()], reverse=True):
txBps = txPSums.get(msg,0)/secondsSum
if (txBps < 0.5) and (rxBps < 0.5):
continue
lines.append('{}\t{:.0f}\t{:.0f}'.format(msg, txBps, rxBps))
if html:
lines.append('<tr><td>{}</td><td>{:.0f}</td><td>{:.0f}</td></tr>'.format(msg, txBps, rxBps))
else:
lines.append('{}\t{:.0f}\t{:.0f}'.format(msg, txBps, rxBps))
if html:
lines.append('</table>')
return '\n'.join(lines)

def txPool(self):
Expand All @@ -242,6 +251,12 @@ def txPool(self):
)

def __str__(self):
return self.str(html=False)

def html(self):
return self.str(html=True)

def str(self, html=False):
if not self.sumsCount:
tps, txbps, rxbps = math.nan, math.nan, math.nan
blockTimes = math.nan
Expand All @@ -256,9 +271,17 @@ def __str__(self):
labelspace = self.label + " "
if self.verifyMillis:
verifyMillis = labelspace + 'verify ms ({:.0f}/{:.0f}/{:.0f})\n'.format(min(self.verifyMillis), meanOrZero(self.verifyMillis), max(self.verifyMillis))
if html:
verifyMillis = '<div>' + verifyMillis + '</div>'
else:
verifyMillis = ''
return '{byMsg}\n{verifyMillis}{labelspace}{txPool}\n{labelspace}summary: {TPS:0.2f} TPS, {bt:1.2f}s/block, tx {txBps}B/s, rx {rxBps}B/s'.format(labelspace=labelspace, byMsg=self.byMsg(), txPool=self.txPool(), TPS=tps, txBps=hunum(txbps), rxBps=hunum(rxbps), bt=blockTimes, verifyMillis=verifyMillis)
if html:
fmt = '{byMsg}\n{verifyMillis}<div>{labelspace}{txPool}</div>\n<div>{labelspace}summary: {TPS:0.2f} TPS, {bt:1.2f}s/block, tx {txBps}B/s, rx {rxBps}B/s</div>'
if self.label:
fmt = '<div class="lh">' + self.label + '</div>' + fmt
else:
fmt = '{byMsg}\n{verifyMillis}{labelspace}{txPool}\n{labelspace}summary: {TPS:0.2f} TPS, {bt:1.2f}s/block, tx {txBps}B/s, rx {rxBps}B/s'
return fmt.format(labelspace=labelspace, byMsg=self.byMsg(html), txPool=self.txPool(), TPS=tps, txBps=hunum(txbps), rxBps=hunum(rxbps), bt=blockTimes, verifyMillis=verifyMillis)

def plot_pool(self, outpath):
from matplotlib import pyplot as plt
Expand Down Expand Up @@ -330,6 +353,7 @@ def main():
ap.add_argument('--mintps', default=None, type=float, help="records below min TPS don't add into summary")
ap.add_argument('--deltas', default=None, help='path to write csv deltas')
ap.add_argument('--report', default=None, help='path to write csv report')
ap.add_argument('--html-summary', default=None, help='path to write html summary')
ap.add_argument('--nick-re', action='append', default=[], help='regexp to filter node names, may be repeated')
ap.add_argument('--nick-lre', action='append', default=[], help='label:regexp to filter node names, may be repeated')
ap.add_argument('--pool-plot-root', help='write to foo.svg and .png')
Expand Down Expand Up @@ -396,6 +420,9 @@ def main():
if args.pool_plot_root:
grsum.plot_pool(args.pool_plot_root)

htmlout = None
if args.html_summary:
htmlout = open(args.html_summary, 'wt')
# maybe subprocess for stats across named groups
if args.nick_re:
# use each --nick-re=foo as a group
Expand All @@ -404,6 +431,8 @@ def main():
process_nick_re(nre, filesByNick, nick_to_tfname, rsum, args, grsum)
print(rsum)
print('\n')
if htmlout:
htmlout.write(rsum.html())
return 0
if args.nick_lre:
for lnre in args.nick_lre:
Expand All @@ -412,10 +441,14 @@ def main():
process_nick_re(nre, filesByNick, nick_to_tfname, rsum, args, grsum)
print(rsum)
print('\n')
if htmlout:
htmlout.write(rsum.html())
return 0

# no filters, print global result
print(grsum)
if htmlout:
htmlout.write(grsum.html())
return 0

def perProtocol(prefix, lists, sums, deltas, dt):
Expand Down Expand Up @@ -515,6 +548,8 @@ def process_files(self, args, nick=None, metrics_files=None, bisource=None):
self.biByTime[curtime] = bi
if bi is None:
bi = bisource.get(curtime)
if bi is None:
logger.warning('%s no blockinfo', path)
self.txPool.append(cur.get('algod_tx_pool_count{}'))
#logger.debug('%s: %r', path, cur)
verifyGood = cur.get('algod_agreement_proposal_verify_good{}')
Expand Down
1 change: 1 addition & 0 deletions test/testdata/configs/config-v24.json
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
"EnableRequestLogger": false,
"EnableRuntimeMetrics": false,
"EnableTopAccountsReporting": false,
"EnableUsageLog": false,
"EnableVerbosedTransactionSyncLogging": false,
"EndpointAddress": "127.0.0.1:0",
"FallbackDNSResolverAddress": "",
Expand Down

0 comments on commit a3e90ad

Please sign in to comment.