Skip to content

Commit

Permalink
Add a step to scan workflow to be in DLQ (cadence-workflow#4471)
Browse files Browse the repository at this point in the history
* Add a step to scan workflow to be in DLQ
  • Loading branch information
yux0 authored Sep 10, 2021
1 parent df3e552 commit 1cc94d5
Show file tree
Hide file tree
Showing 4 changed files with 154 additions and 11 deletions.
2 changes: 2 additions & 0 deletions common/metrics/defs.go
Original file line number Diff line number Diff line change
Expand Up @@ -2025,6 +2025,7 @@ const (
ReplicationDLQAckLevelGauge
ReplicationDLQProbeFailed
ReplicationDLQSize
ReplicationDLQValidationFailed
GetReplicationMessagesForShardLatency
GetDLQReplicationMessagesLatency
EventReapplySkippedCount
Expand Down Expand Up @@ -2528,6 +2529,7 @@ var MetricDefs = map[ServiceIdx]map[int]metricDefinition{
ReplicationDLQAckLevelGauge: {metricName: "replication_dlq_ack_level", metricType: Gauge},
ReplicationDLQProbeFailed: {metricName: "replication_dlq_probe_failed", metricType: Counter},
ReplicationDLQSize: {metricName: "replication_dlq_size", metricType: Gauge},
ReplicationDLQValidationFailed: {metricName: "replication_dlq_validation_failed", metricType: Counter},
GetReplicationMessagesForShardLatency: {metricName: "get_replication_messages_for_shard", metricType: Timer},
GetDLQReplicationMessagesLatency: {metricName: "get_dlq_replication_messages", metricType: Timer},
EventReapplySkippedCount: {metricName: "event_reapply_skipped_count", metricType: Counter},
Expand Down
32 changes: 32 additions & 0 deletions common/reconciliation/constants.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
// Copyright (c) 2021 Uber Technologies, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

package reconciliation

// Names that identify the execution fixer workflow and the signal sent to it.
const (
	// ExecutionFixerWorkflowType is the workflow type name of the execution fixer workflow.
	ExecutionFixerWorkflowType = "execution-fixer-workflow"
	// ExecutionFixerWorkflowTaskList is the name of the task list used by the execution fixer workflow.
	ExecutionFixerWorkflowTaskList = "execution-fixer-tl"
	// ExecutionFixerWorkflowSignalName is the name of the signal delivered to the execution fixer workflow.
	ExecutionFixerWorkflowSignalName = "execution-fixer-signal"
	// ExecutionFixerWorkflowID is the fixed workflow ID of the execution fixer workflow.
	ExecutionFixerWorkflowID = "execution-fixer-workflow-id"
)

// Timeouts applied to the execution fixer workflow. Both values are expressed
// in seconds and are left untyped so callers can use them as any integer type.
const (
	// ExecutionFixerWorkflowTimeout is the workflow execution timeout, in seconds (24 hours).
	ExecutionFixerWorkflowTimeout = 24 * 60 * 60
	// ExecutionFixerWorkflowTaskTimeoutInSeconds is the workflow task timeout, in seconds.
	ExecutionFixerWorkflowTaskTimeoutInSeconds = 60
)
65 changes: 65 additions & 0 deletions service/history/replication/task_processor.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,15 @@ package replication

import (
"context"
"encoding/json"
"errors"
"fmt"
"math"
"strconv"
"sync/atomic"
"time"

"github.com/pborman/uuid"
"go.uber.org/yarpc/yarpcerrors"

"github.com/uber/cadence/common"
Expand All @@ -39,6 +42,8 @@ import (
"github.com/uber/cadence/common/metrics"
"github.com/uber/cadence/common/persistence"
"github.com/uber/cadence/common/quotas"
"github.com/uber/cadence/common/reconciliation"
"github.com/uber/cadence/common/reconciliation/entity"
"github.com/uber/cadence/common/types"
"github.com/uber/cadence/service/history/config"
"github.com/uber/cadence/service/history/engine"
Expand Down Expand Up @@ -427,6 +432,11 @@ func (p *taskProcessorImpl) processSingleTask(replicationTask *types.Replication
tag.TaskID(replicationTask.GetSourceTaskID()),
tag.Error(err),
)
//TODO: uncomment this when the execution fixer workflow is ready
//if err = p.triggerDataInconsistencyScan(replicationTask); err != nil {
// p.logger.Warn("Failed to trigger data scan", tag.Error(err))
// p.metricsClient.IncCounter(metrics.ReplicationDLQStatsScope, metrics.ReplicationDLQValidationFailed)
//}
return p.putReplicationTaskToDLQ(replicationTask)
}
}
Expand Down Expand Up @@ -540,6 +550,61 @@ func (p *taskProcessorImpl) generateDLQRequest(
}
}

// triggerDataInconsistencyScan asks the execution fixer workflow to examine the
// workflow execution referenced by replicationTask.
//
// The target execution and its failover version are read from the task's
// history (V2) attributes or, when those are absent, from its sync-activity
// attributes. A task carrying neither kind of attributes is skipped and nil is
// returned. The request is sent as a SignalWithStart through the remote
// frontend client of the cluster that the cluster metadata reports for the
// failover version.
//
// An error is returned when a history task has no version history items, when
// the signal payload cannot be JSON-encoded, or when the remote call fails.
func (p *taskProcessorImpl) triggerDataInconsistencyScan(replicationTask *types.ReplicationTask) error {
	var (
		version int64
		target  entity.Execution
	)

	if historyAttr := replicationTask.GetHistoryTaskV2Attributes(); historyAttr != nil {
		items := historyAttr.GetVersionHistoryItems()
		// len of a nil slice is 0, so this also covers missing items.
		if len(items) == 0 {
			return errors.New("failed to trigger data scan due to invalid version history")
		}
		// Only the first item is consulted: items within one batch are
		// expected to carry the same version.
		version = items[0].GetVersion()
		target.DomainID = historyAttr.GetDomainID()
		target.WorkflowID = historyAttr.GetWorkflowID()
		target.RunID = historyAttr.GetRunID()
	} else if activityAttr := replicationTask.GetSyncActivityTaskAttributes(); activityAttr != nil {
		version = activityAttr.Version
		target.DomainID = activityAttr.GetDomainID()
		target.WorkflowID = activityAttr.GetWorkflowID()
		target.RunID = activityAttr.GetRunID()
	} else {
		// Other task types carry no execution to scan.
		return nil
	}

	sourceCluster := p.shard.GetClusterMetadata().ClusterNameForFailoverVersion(version)
	remoteFrontend := p.shard.GetService().GetClientBean().GetRemoteFrontendClient(sourceCluster)

	signalInput, err := json.Marshal(target)
	if err != nil {
		return err
	}

	// The workflow is assumed to be corrupted at this point; the fixer is
	// relied upon to validate it through its invariants.
	request := &types.SignalWithStartWorkflowExecutionRequest{
		Domain:                              common.SystemLocalDomainName,
		WorkflowID:                          reconciliation.ExecutionFixerWorkflowID,
		WorkflowType:                        &types.WorkflowType{Name: reconciliation.ExecutionFixerWorkflowType},
		TaskList:                            &types.TaskList{Name: reconciliation.ExecutionFixerWorkflowTaskList},
		ExecutionStartToCloseTimeoutSeconds: common.Int32Ptr(reconciliation.ExecutionFixerWorkflowTimeout),
		TaskStartToCloseTimeoutSeconds:      common.Int32Ptr(reconciliation.ExecutionFixerWorkflowTaskTimeoutInSeconds),
		Identity:                            "cadence-history-replication",
		RequestID:                           uuid.New(),
		WorkflowIDReusePolicy:               types.WorkflowIDReusePolicyAllowDuplicate.Ptr(),
		SignalName:                          reconciliation.ExecutionFixerWorkflowSignalName,
		SignalInput:                         signalInput,
	}
	_, err = remoteFrontend.SignalWithStartWorkflowExecution(context.Background(), request)
	return err
}

func (p *taskProcessorImpl) emitDLQSizeMetricsLoop() {
timer := time.NewTimer(backoff.JitDuration(
dlqMetricsEmitTimerInterval,
Expand Down
66 changes: 55 additions & 11 deletions service/history/replication/task_processor_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
package replication

import (
"context"
"encoding/json"
"testing"
"time"

Expand All @@ -33,6 +35,7 @@ import (

"github.com/uber/cadence/client"
"github.com/uber/cadence/client/admin"
"github.com/uber/cadence/client/frontend"
"github.com/uber/cadence/common"
"github.com/uber/cadence/common/cache"
"github.com/uber/cadence/common/cluster"
Expand All @@ -41,6 +44,8 @@ import (
"github.com/uber/cadence/common/mocks"
"github.com/uber/cadence/common/persistence"
"github.com/uber/cadence/common/quotas"
"github.com/uber/cadence/common/reconciliation"
"github.com/uber/cadence/common/reconciliation/entity"
"github.com/uber/cadence/common/types"
"github.com/uber/cadence/service/history/config"
"github.com/uber/cadence/service/history/engine"
Expand All @@ -53,17 +58,18 @@ type (
*require.Assertions
controller *gomock.Controller

mockShard *shard.TestContext
mockEngine *engine.MockEngine
config *config.Config
taskFetcher *MockTaskFetcher
mockDomainCache *cache.MockDomainCache
mockClientBean *client.MockBean
adminClient *admin.MockClient
clusterMetadata *cluster.MockMetadata
executionManager *mocks.ExecutionManager
requestChan chan *request
taskExecutor *MockTaskExecutor
mockShard *shard.TestContext
mockEngine *engine.MockEngine
config *config.Config
taskFetcher *MockTaskFetcher
mockDomainCache *cache.MockDomainCache
mockClientBean *client.MockBean
mockFrontendClient *frontend.MockClient
adminClient *admin.MockClient
clusterMetadata *cluster.MockMetadata
executionManager *mocks.ExecutionManager
requestChan chan *request
taskExecutor *MockTaskExecutor

taskProcessor *taskProcessorImpl
}
Expand Down Expand Up @@ -98,6 +104,7 @@ func (s *taskProcessorSuite) SetupTest() {

s.mockDomainCache = s.mockShard.Resource.DomainCache
s.mockClientBean = s.mockShard.Resource.ClientBean
s.mockFrontendClient = s.mockShard.Resource.RemoteFrontendClient
s.adminClient = s.mockShard.Resource.RemoteAdminClient
s.clusterMetadata = s.mockShard.Resource.ClusterMetadata
s.executionManager = s.mockShard.Resource.ExecutionMgr
Expand Down Expand Up @@ -293,3 +300,40 @@ func (s *taskProcessorSuite) TestGenerateDLQRequest_ReplicationTaskTypeSyncActiv
s.Equal(runID, request.TaskInfo.GetRunID())
s.Equal(persistence.ReplicationTaskTypeSyncActivity, request.TaskInfo.GetTaskType())
}

// TestTriggerDataInconsistencyScan_Success checks that a sync-activity
// replication task makes triggerDataInconsistencyScan send a SignalWithStart
// request for the execution fixer workflow, carrying the task's execution as
// the JSON-encoded signal input, and that no error is returned.
func (s *taskProcessorSuite) TestTriggerDataInconsistencyScan_Success() {
	domainID := uuid.New()
	workflowID := uuid.New()
	runID := uuid.New()
	task := &types.ReplicationTask{
		TaskType: types.ReplicationTaskTypeSyncActivity.Ptr(),
		SyncActivityTaskAttributes: &types.SyncActivityTaskAttributes{
			DomainID:    domainID,
			WorkflowID:  workflowID,
			RunID:       runID,
			ScheduledID: 1,
			Version:     100,
		},
	}
	fixExecution := entity.Execution{
		DomainID:   domainID,
		WorkflowID: workflowID,
		RunID:      runID,
	}
	expectedSignalInput, err := json.Marshal(fixExecution)
	s.NoError(err)

	// The callback only inspects the request and returns nothing, so it is
	// registered with Do; the mocked return values come from Return.
	// DoAndReturn is reserved for callbacks that produce the return values.
	s.mockFrontendClient.EXPECT().SignalWithStartWorkflowExecution(gomock.Any(), gomock.Any()).Do(
		func(_ context.Context, request *types.SignalWithStartWorkflowExecutionRequest) {
			s.Equal(common.SystemLocalDomainName, request.GetDomain())
			s.Equal(reconciliation.ExecutionFixerWorkflowID, request.GetWorkflowID())
			s.Equal(reconciliation.ExecutionFixerWorkflowType, request.GetWorkflowType().GetName())
			s.Equal(reconciliation.ExecutionFixerWorkflowTaskList, request.GetTaskList().GetName())
			s.Equal(types.WorkflowIDReusePolicyAllowDuplicate.String(), request.GetWorkflowIDReusePolicy().String())
			s.Equal(reconciliation.ExecutionFixerWorkflowSignalName, request.GetSignalName())
			s.Equal(expectedSignalInput, request.GetSignalInput())
		}).Return(&types.StartWorkflowExecutionResponse{}, nil)
	// The failover version of the task (100) selects the remote cluster.
	s.clusterMetadata.EXPECT().ClusterNameForFailoverVersion(int64(100)).Return("active")

	err = s.taskProcessor.triggerDataInconsistencyScan(task)
	s.NoError(err)
}

0 comments on commit 1cc94d5

Please sign in to comment.