From f0a613b64298f4f2154c8efb6ffa05eac04017c4 Mon Sep 17 00:00:00 2001 From: Samar Abbas - Uber Date: Fri, 10 May 2019 14:59:59 -0700 Subject: [PATCH] Correctly emit workflow execution stats (#1829) Workflow execution stats were incorrectly emitted under persistence scope instead of execution stats scope. Session state metric was not tagged correctly for all domains scope. --- service/history/shardContext.go | 7 +++++-- service/history/workflowExecutionContext.go | 18 +++++++++++++----- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/service/history/shardContext.go b/service/history/shardContext.go index ac782d54bc9..887c4366d13 100644 --- a/service/history/shardContext.go +++ b/service/history/shardContext.go @@ -776,9 +776,12 @@ func (s *shardContextImpl) AppendHistoryEvents(request *persistence.AppendHistor defer func() { // N.B. - Dual emit here makes sense so that we can see aggregate timer stats across all // domains along with the individual domains stats - s.metricsClient.RecordTimer(metrics.SessionSizeStatsScope, metrics.HistorySize, time.Duration(size)) + allDomainSizeScope := s.metricsClient.Scope(metrics.SessionSizeStatsScope, metrics.DomainAllTag()) + allDomainSizeScope.RecordTimer(metrics.HistorySize, time.Duration(size)) if domainEntry != nil && domainEntry.GetInfo() != nil { - s.metricsClient.Scope(metrics.SessionSizeStatsScope, metrics.DomainTag(domainEntry.GetInfo().Name)).RecordTimer(metrics.HistorySize, time.Duration(size)) + domainSizeScope := s.metricsClient.Scope(metrics.SessionSizeStatsScope, metrics.DomainTag( + domainEntry.GetInfo().Name)) + domainSizeScope.RecordTimer(metrics.HistorySize, time.Duration(size)) } if size >= historySizeLogThreshold { s.throttledLogger.Warn("history size threshold breached", diff --git a/service/history/workflowExecutionContext.go b/service/history/workflowExecutionContext.go index 608798a1f54..471aa6656ef 100644 --- a/service/history/workflowExecutionContext.go +++ 
b/service/history/workflowExecutionContext.go @@ -622,14 +622,22 @@ func (c *workflowExecutionContextImpl) update(transferTasks []persistence.Task, historyCount := int(c.msBuilder.GetNextEventID()) - 1 historySize := int(c.msBuilder.GetHistorySize()) + newHistorySize + // All execution stats are emitted under emitWorkflowExecutionStats, which is only invoked when the mutableState + // is loaded. It looks like MutableStateStats are returned by the persistence layer when mutableState is loaded from DB. + // It would be much better to emit the entire execution stats on each update. So for now we are explicitly emitting + // the historySize and historyCount metrics for the execution on each update. // N.B. - Dual emit is required here so that we can see aggregate timer stats across all // domains along with the individual domains stats - c.metricsClient.RecordTimer(metrics.PersistenceUpdateWorkflowExecutionScope, metrics.HistorySize, time.Duration(historySize)) - c.metricsClient.RecordTimer(metrics.PersistenceUpdateWorkflowExecutionScope, metrics.HistoryCount, time.Duration(historyCount)) + allDomainSizeScope := c.metricsClient.Scope(metrics.ExecutionSizeStatsScope, metrics.DomainAllTag()) + allDomainCountScope := c.metricsClient.Scope(metrics.ExecutionCountStatsScope, metrics.DomainAllTag()) + allDomainSizeScope.RecordTimer(metrics.HistorySize, time.Duration(historySize)) + allDomainCountScope.RecordTimer(metrics.HistoryCount, time.Duration(historyCount)) if entry, err := c.shard.GetDomainCache().GetDomainByID(executionInfo.DomainID); err == nil && entry != nil && entry.GetInfo() != nil { - scope := c.metricsClient.Scope(metrics.PersistenceUpdateWorkflowExecutionScope, metrics.DomainTag(entry.GetInfo().Name)) - scope.RecordTimer(metrics.HistorySize, time.Duration(historySize)) - scope.RecordTimer(metrics.HistoryCount, time.Duration(historyCount)) + domain := entry.GetInfo().Name + domainSizeScope := c.metricsClient.Scope(metrics.ExecutionSizeStatsScope, metrics.DomainTag(domain)) 
+ domainCountScope := c.metricsClient.Scope(metrics.ExecutionCountStatsScope, metrics.DomainTag(domain)) + domainSizeScope.RecordTimer(metrics.HistorySize, time.Duration(historySize)) + domainCountScope.RecordTimer(metrics.HistoryCount, time.Duration(historyCount)) } if historySize > sizeLimitWarn || historyCount > countLimitWarn {