Skip to content

Commit

Permalink
Fix failover error causing child workflows to get stuck (cadence-work…
Browse files Browse the repository at this point in the history
…flow#5919)

* Fix failover error causing child workflows to get stuck

* fix test
  • Loading branch information
davidporter-id-au authored Apr 19, 2024
1 parent cc0805f commit b005d32
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 17 deletions.
20 changes: 7 additions & 13 deletions service/history/task/task.go
Original file line number Diff line number Diff line change
Expand Up @@ -288,20 +288,14 @@ func (t *taskImpl) HandleErr(err error) (retErr error) {
err = nil
}

// target domain not active error, we should retry the task
// so that a cross-cluster task can be created.
if err == errTargetDomainNotActive {
t.scope.IncCounter(metrics.TaskTargetNotActiveCounterPerDomain)
t.logger.Error("Dropping 'domain-not-active' error as non-retriable", tag.Error(err))
return nil
}

// this is a transient error, and means source domain not active
// TODO remove this error check special case
// since the new task life cycle will not give up until task processed / verified
if _, ok := err.(*types.DomainNotActiveError); ok {
if t.timeSource.Now().Sub(t.submitTime) > 2*cache.DomainCacheRefreshInterval {
// using a fairly long timeout here because domain updates are an async process
// which could take a fair while to be processed by the domain queue, the DB updated,
// the domain cache refreshed and then updated here.
var e *types.DomainNotActiveError
if errors.As(err, &e) || errors.Is(err, errTargetDomainNotActive) {
if t.timeSource.Now().Sub(t.submitTime) > 5*cache.DomainCacheRefreshInterval {
t.scope.IncCounter(metrics.TaskNotActiveCounterPerDomain)
// If the domain is *still* not active, drop after a while.
return nil
}

Expand Down
8 changes: 4 additions & 4 deletions service/history/task/task_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -162,11 +162,11 @@ func (s *taskSuite) TestHandleErr_ErrTargetDomainNotActive() {

// we should always return the target domain not active error
// no matter what the submit time is
taskBase.submitTime = time.Now().Add(-cache.DomainCacheRefreshInterval * time.Duration(2))
s.Equal(nil, taskBase.HandleErr(err))
taskBase.submitTime = time.Now().Add(-cache.DomainCacheRefreshInterval*time.Duration(5) - time.Second)
s.Equal(nil, taskBase.HandleErr(err), "should drop errors after a reasonable time")

taskBase.submitTime = time.Now()
s.Equal(nil, taskBase.HandleErr(err))
s.Equal(err, taskBase.HandleErr(err))
}

func (s *taskSuite) TestHandleErr_ErrDomainNotActive() {
Expand All @@ -176,7 +176,7 @@ func (s *taskSuite) TestHandleErr_ErrDomainNotActive() {

err := &types.DomainNotActiveError{}

taskBase.submitTime = time.Now().Add(-cache.DomainCacheRefreshInterval * time.Duration(2))
taskBase.submitTime = time.Now().Add(-cache.DomainCacheRefreshInterval*time.Duration(5) - time.Second)
s.NoError(taskBase.HandleErr(err))

taskBase.submitTime = time.Now()
Expand Down

0 comments on commit b005d32

Please sign in to comment.