Skip to content

Commit

Permalink
Add logging/metrics for decision attempts (cadence-workflow#3849)
Browse files Browse the repository at this point in the history
  • Loading branch information
yycptt authored Dec 11, 2020
1 parent c5a16ab commit e855a6a
Show file tree
Hide file tree
Showing 6 changed files with 21 additions and 2 deletions.
2 changes: 2 additions & 0 deletions common/metrics/defs.go
Original file line number Diff line number Diff line change
Expand Up @@ -1777,6 +1777,7 @@ const (
EmptyCompletionDecisionsCounter
MultipleCompletionDecisionsCounter
FailedDecisionsCounter
DecisionAttemptTimer
StaleMutableStateCounter
AutoResetPointsLimitExceededCounter
AutoResetPointCorruptionCounter
Expand Down Expand Up @@ -2251,6 +2252,7 @@ var MetricDefs = map[ServiceIdx]map[int]metricDefinition{
EmptyCompletionDecisionsCounter: {metricName: "empty_completion_decisions", metricType: Counter},
MultipleCompletionDecisionsCounter: {metricName: "multiple_completion_decisions", metricType: Counter},
FailedDecisionsCounter: {metricName: "failed_decisions", metricType: Counter},
DecisionAttemptTimer: {metricName: "decision_attempt", metricType: Timer},
StaleMutableStateCounter: {metricName: "stale_mutable_state", metricType: Counter},
AutoResetPointsLimitExceededCounter: {metricName: "auto_reset_points_exceed_limit", metricType: Counter},
AutoResetPointCorruptionCounter: {metricName: "auto_reset_point_corruption", metricType: Counter},
Expand Down
3 changes: 3 additions & 0 deletions common/service/dynamicconfig/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,7 @@ var keys = map[Key]string{
HistoryThrottledLogRPS: "history.throttledLogRPS",
StickyTTL: "history.stickyTTL",
DecisionHeartbeatTimeout: "history.decisionHeartbeatTimeout",
DecisionRetryCriticalAttempts: "history.decisionRetryCriticalAttempts",
ParentClosePolicyThreshold: "history.parentClosePolicyThreshold",
NumParentClosePolicySystemWorkflows: "history.numParentClosePolicySystemWorkflows",
ReplicationTaskFetcherParallelism: "history.ReplicationTaskFetcherParallelism",
Expand Down Expand Up @@ -781,6 +782,8 @@ const (
StickyTTL
// DecisionHeartbeatTimeout for decision heartbeat
DecisionHeartbeatTimeout
// DecisionRetryCriticalAttempts is the decision attempt threshold for logging and emitting metrics
DecisionRetryCriticalAttempts

// EnableDropStuckTaskByDomainID is whether stuck timer/transfer task should be dropped for a domain
EnableDropStuckTaskByDomainID
Expand Down
2 changes: 2 additions & 0 deletions service/history/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,7 @@ type Config struct {
DecisionHeartbeatTimeout dynamicconfig.DurationPropertyFnWithDomainFilter
// MaxDecisionStartToCloseSeconds is the StartToCloseSeconds for decision
MaxDecisionStartToCloseSeconds dynamicconfig.IntPropertyFnWithDomainFilter
DecisionRetryCriticalAttempts dynamicconfig.IntPropertyFn

// The following is used by the new RPC replication stack
ReplicationTaskFetcherParallelism dynamicconfig.IntPropertyFn
Expand Down Expand Up @@ -439,6 +440,7 @@ func New(dc *dynamicconfig.Collection, numberOfShards int, storeType string, isA
SearchAttributesTotalSizeLimit: dc.GetIntPropertyFilteredByDomain(dynamicconfig.SearchAttributesTotalSizeLimit, 40*1024),
StickyTTL: dc.GetDurationPropertyFilteredByDomain(dynamicconfig.StickyTTL, time.Hour*24*365),
DecisionHeartbeatTimeout: dc.GetDurationPropertyFilteredByDomain(dynamicconfig.DecisionHeartbeatTimeout, time.Minute*30),
DecisionRetryCriticalAttempts: dc.GetIntProperty(dynamicconfig.DecisionRetryCriticalAttempts, 10), // about 30m

ReplicationTaskFetcherParallelism: dc.GetIntProperty(dynamicconfig.ReplicationTaskFetcherParallelism, 1),
ReplicationTaskFetcherAggregationInterval: dc.GetDurationProperty(dynamicconfig.ReplicationTaskFetcherAggregationInterval, 2*time.Second),
Expand Down
12 changes: 12 additions & 0 deletions service/history/execution/mutable_state_decision_task_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import (
"github.com/uber/cadence/common"
"github.com/uber/cadence/common/errors"
"github.com/uber/cadence/common/log/tag"
"github.com/uber/cadence/common/metrics"
"github.com/uber/cadence/common/persistence"
"github.com/uber/cadence/common/types"
)
Expand Down Expand Up @@ -585,6 +586,17 @@ func (m *mutableStateDecisionTaskManagerImpl) FailDecision(
if incrementAttempt {
failDecisionInfo.Attempt = m.msb.executionInfo.DecisionAttempt + 1
failDecisionInfo.ScheduledTimestamp = m.msb.timeSource.Now().UnixNano()

if failDecisionInfo.Attempt >= int64(m.msb.shard.GetConfig().DecisionRetryCriticalAttempts()) {
domainName := m.msb.GetDomainEntry().GetInfo().Name
domainTag := metrics.DomainTag(domainName)
m.msb.metricsClient.Scope(metrics.WorkflowContextScope, domainTag).RecordTimer(metrics.DecisionAttemptTimer, time.Duration(failDecisionInfo.Attempt))
m.msb.logger.Warn("Critical error processing decision task, retrying.",
tag.WorkflowDomainName(m.msb.GetDomainEntry().GetInfo().Name),
tag.WorkflowID(m.msb.GetExecutionInfo().WorkflowID),
tag.WorkflowRunID(m.msb.GetExecutionInfo().RunID),
)
}
}
m.UpdateDecision(failDecisionInfo)
}
Expand Down
2 changes: 1 addition & 1 deletion service/history/ndc/events_reapplier_mock.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion service/history/reset/resetter_mock.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit e855a6a

Please sign in to comment.