Optimize failover performance by batching domain failovers (cadence-workflow#1280)

* The domain cache periodically scans the domain v2 table for domain updates
* Domain updates are sent to the history engine in batches, and every domain failover within a scan period is handled by one failover processor (a rough sketch of this flow follows below)
wxing1292 authored Dec 4, 2018
1 parent 5a8d1ed commit 38e1a0b
Showing 23 changed files with 495 additions and 336 deletions.
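The commit message above captures the whole mechanism: rather than invoking one callback per domain change, the cache scans on an interval and hands the accumulated updates to the history engine as a single batch, so one failover pass covers every domain that changed in that period. A minimal, self-contained sketch of that pattern — the names, types, and scan interval are illustrative assumptions, not the actual domainCache code:

package main

import (
	"fmt"
	"sync"
	"time"
)

// domainUpdate is a stand-in for a changed domain record picked up by a scan.
type domainUpdate struct {
	ID                  string
	NotificationVersion int64
}

// batchNotifier accumulates updates and delivers them to one callback per tick,
// mirroring the "one failover processor per period" behavior described above.
type batchNotifier struct {
	mu       sync.Mutex
	pending  []domainUpdate
	prepare  func()               // runs before delivery, e.g. pause task processing
	callback func([]domainUpdate) // receives the whole batch at once
}

func (n *batchNotifier) enqueue(u domainUpdate) {
	n.mu.Lock()
	defer n.mu.Unlock()
	n.pending = append(n.pending, u)
}

// refreshLoop drains pending updates on every tick and delivers them in a single call.
func (n *batchNotifier) refreshLoop(interval time.Duration, done <-chan struct{}) {
	ticker := time.NewTicker(interval)
	defer ticker.Stop()
	for {
		select {
		case <-done:
			return
		case <-ticker.C:
			n.mu.Lock()
			batch := n.pending
			n.pending = nil
			n.mu.Unlock()
			if len(batch) == 0 {
				continue
			}
			n.prepare()
			n.callback(batch)
		}
	}
}

func main() {
	done := make(chan struct{})
	n := &batchNotifier{
		prepare: func() { fmt.Println("prepare: lock task processing") },
		callback: func(batch []domainUpdate) {
			fmt.Printf("failover handled for a batch of %d domains\n", len(batch))
		},
	}
	go n.refreshLoop(50*time.Millisecond, done)
	n.enqueue(domainUpdate{ID: "domain-a", NotificationVersion: 1})
	n.enqueue(domainUpdate{ID: "domain-b", NotificationVersion: 2})
	time.Sleep(200 * time.Millisecond)
	close(done)
}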
237 changes: 132 additions & 105 deletions common/cache/domainCache.go

Large diffs are not rendered by default.

21 changes: 4 additions & 17 deletions common/cache/domainCache_mock.go
@@ -131,23 +131,10 @@ func (_m *DomainCacheMock) GetDomainID(name string) (string, error) {
return r0, r1
}

// GetDomainNotificationVersion provides a mock function with given fields:
func (_m *DomainCacheMock) GetDomainNotificationVersion() int64 {
ret := _m.Called()

var r0 int64
if rf, ok := ret.Get(0).(func() int64); ok {
r0 = rf()
} else {
r0 = ret.Get(0).(int64)
}

return r0
}

// RegisterDomainChangeCallback provides a mock function with given fields: shard, initialNotificationVersion, beforeCallback, afterCallback
func (_m *DomainCacheMock) RegisterDomainChangeCallback(shard int, initialNotificationVersion int64, beforeCallback CallbackFn, afterCallback CallbackFn) {
_m.Called(shard, initialNotificationVersion, beforeCallback, afterCallback)
// RegisterDomainChangeCallback provides a mock function with given fields: shard, initialNotificationVersion, prepareCallbackFn, callback
func (_m *DomainCacheMock) RegisterDomainChangeCallback(shard int, initialNotificationVersion int64,
prepareCallbackFn PrepareCallbackFn, callback CallbackFn) {
_m.Called(shard, initialNotificationVersion, prepareCallbackFn, callback)
}

// Start provides a mock function with given fields:
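The mock's RegisterDomainChangeCallback now mirrors the new production shape: a PrepareCallbackFn invoked while the cache holds its lock, followed by a CallbackFn that receives batches of domain entries. A hedged sketch of stubbing and exercising it in a test, in the testify style this file already uses; the test name, zero-value arguments, and the callback shapes (inferred from how the history engine registers them in this commit) are assumptions rather than guaranteed signatures:

package cache

import (
	"testing"

	"github.com/stretchr/testify/mock"
)

// Placeholder test: only the four-argument call shape matters here, so the
// concrete callbacks do nothing.
func TestRegisterDomainChangeCallbackStub(t *testing.T) {
	m := &DomainCacheMock{}
	m.On("RegisterDomainChangeCallback",
		mock.Anything, // shard ID
		mock.Anything, // initial notification version
		mock.Anything, // PrepareCallbackFn, runs before entries are updated
		mock.Anything, // CallbackFn, runs afterwards with the batch of entries
	).Return()

	m.RegisterDomainChangeCallback(0, 0,
		func() {},
		func(prev []*DomainCacheEntry, next []*DomainCacheEntry) {})
	m.AssertExpectations(t)
}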
152 changes: 81 additions & 71 deletions common/cache/domainCache_test.go

Large diffs are not rendered by default.

10 changes: 4 additions & 6 deletions common/metrics/defs.go
@@ -872,9 +872,8 @@ const (
CadenceClientFailures
CadenceClientLatency

DomainCacheTotalCallbacksLatency
DomainCacheBeforeCallbackLatency
DomainCacheAfterCallbackLatency
DomainCachePrepareCallbacksLatency
DomainCacheCallbacksLatency

HistorySize

@@ -1050,9 +1049,8 @@ var MetricDefs = map[ServiceIdx]map[int]metricDefinition{
CadenceClientRequests: {metricName: "cadence.client.requests", metricType: Counter},
CadenceClientFailures: {metricName: "cadence.client.errors", metricType: Counter},
CadenceClientLatency: {metricName: "cadence.client.latency", metricType: Timer},
DomainCacheTotalCallbacksLatency: {metricName: "domain-cache.total-callbacks.latency", metricType: Timer},
DomainCacheBeforeCallbackLatency: {metricName: "domain-cache.before-callbacks.latency", metricType: Timer},
DomainCacheAfterCallbackLatency: {metricName: "domain-cache.after-callbacks.latency", metricType: Timer},
DomainCachePrepareCallbacksLatency: {metricName: "domain-cache.prepare-callbacks.latency", metricType: Timer},
DomainCacheCallbacksLatency: {metricName: "domain-cache.callbacks.latency", metricType: Timer},
HistorySize: {metricName: "history-size", metricType: Timer},
},
Frontend: {},
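The three per-phase timers collapse into two that match the new callback model: one covering the prepare callbacks and one covering the batch callbacks. Illustrative only — the recordTimer parameter is a stand-in, not the Cadence metrics client API — this is the bracketing the renamed definitions imply:

package main

import (
	"fmt"
	"time"
)

// runCallbacksWithMetrics shows where the two renamed timers would be observed:
// DomainCachePrepareCallbacksLatency around the prepare phase and
// DomainCacheCallbacksLatency around the batch callback phase.
func runCallbacksWithMetrics(prepare, callback func(), recordTimer func(name string, d time.Duration)) {
	start := time.Now()
	prepare()
	recordTimer("domain-cache.prepare-callbacks.latency", time.Since(start))

	start = time.Now()
	callback()
	recordTimer("domain-cache.callbacks.latency", time.Since(start))
}

func main() {
	runCallbacksWithMetrics(
		func() { time.Sleep(5 * time.Millisecond) },
		func() { time.Sleep(10 * time.Millisecond) },
		func(name string, d time.Duration) { fmt.Printf("%s = %v\n", name, d) },
	)
}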
@@ -22,7 +22,6 @@ package cassandra

import (
"fmt"

"sort"

"github.com/gocql/gocql"
@@ -25,7 +25,6 @@ import (

"github.com/gocql/gocql"
"github.com/uber-common/bark"

workflow "github.com/uber/cadence/.gen/go/shared"
p "github.com/uber/cadence/common/persistence"
"github.com/uber/cadence/common/service/config"
@@ -317,6 +316,9 @@ func (m *cassandraMetadataPersistenceV2) GetDomain(request *p.GetDomainRequest)
return nil, handleError(request.Name, request.ID, err)
}

if info.Data == nil {
info.Data = map[string]string{}
}
replicationConfig.ActiveClusterName = p.GetOrUseDefaultActiveCluster(m.currentClusterName, replicationConfig.ActiveClusterName)
replicationConfig.Clusters = p.DeserializeClusterConfigs(replicationClusters)
replicationConfig.Clusters = p.GetOrUseDefaultClusters(m.currentClusterName, replicationConfig.Clusters)
@@ -364,6 +366,9 @@ func (m *cassandraMetadataPersistenceV2) ListDomains(request *p.ListDomainsReque
) {
if name != domainMetadataRecordName {
// do not include the metadata record
if domain.Info.Data == nil {
domain.Info.Data = map[string]string{}
}
domain.ReplicationConfig.ActiveClusterName = p.GetOrUseDefaultActiveCluster(m.currentClusterName, domain.ReplicationConfig.ActiveClusterName)
domain.ReplicationConfig.Clusters = p.DeserializeClusterConfigs(replicationClusters)
domain.ReplicationConfig.Clusters = p.GetOrUseDefaultClusters(m.currentClusterName, domain.ReplicationConfig.Clusters)
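Both read paths (GetDomain and ListDomains) now normalize a nil Data column into an empty map so callers never have to nil-check it. A minimal sketch of the same guard in isolation, with DomainInfo trimmed to the one relevant field:

package main

import "fmt"

// DomainInfo is reduced to the field the guard touches.
type DomainInfo struct {
	Data map[string]string
}

// normalizeDomainData mirrors the added check: Cassandra yields a nil map for an
// absent column, and the read path substitutes an empty one.
func normalizeDomainData(info *DomainInfo) {
	if info.Data == nil {
		info.Data = map[string]string{}
	}
}

func main() {
	info := &DomainInfo{} // as if read from a row with no data column
	normalizeDomainData(info)
	fmt.Println(info.Data == nil, len(info.Data)) // false 0
}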
4 changes: 2 additions & 2 deletions common/persistence/dataInterfaces.go
@@ -195,7 +195,7 @@ type (
MinLevel int64
CurrentLevel int64
MaxLevel int64
DomainIDs []string
DomainIDs map[string]struct{}
}

// TimerFailoverLevel contains domain IDs and corresponding start / end level
@@ -204,7 +204,7 @@ type (
MinLevel time.Time
CurrentLevel time.Time
MaxLevel time.Time
DomainIDs []string
DomainIDs map[string]struct{}
}

// WorkflowExecutionInfo describes a workflow execution
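Switching DomainIDs from a slice to map[string]struct{} turns each failover level into a set, so a queue processor filtering tasks for a whole failover batch can test membership in constant time per task instead of scanning a slice. A small sketch of that lookup (the helper is illustrative, not part of the diff):

package main

import "fmt"

// isFailoverDomain reports whether a task's domain belongs to the batch
// currently being failed over.
func isFailoverDomain(failoverDomainIDs map[string]struct{}, taskDomainID string) bool {
	_, ok := failoverDomainIDs[taskDomainID]
	return ok
}

func main() {
	batch := map[string]struct{}{"domain-a": {}, "domain-b": {}}
	fmt.Println(isFailoverDomain(batch, "domain-a")) // true
	fmt.Println(isFailoverDomain(batch, "domain-c")) // false
}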
14 changes: 12 additions & 2 deletions service/history/MockTimerQueueProcessor.go
@@ -43,11 +43,21 @@ func (_m *MockTimerQueueProcessor) Stop() {
}

// FailoverDomain is mock implementation for FailoverDomain of Processor
func (_m *MockTimerQueueProcessor) FailoverDomain(domainID string) {
_m.Called(domainID)
func (_m *MockTimerQueueProcessor) FailoverDomain(domainIDs map[string]struct{}) {
_m.Called(domainIDs)
}

// NotifyNewTimers is mock implementation for NotifyNewTimers of Processor
func (_m *MockTimerQueueProcessor) NotifyNewTimers(clusterName string, currentTime time.Time, timerTask []persistence.Task) {
_m.Called(clusterName, currentTime, timerTask)
}

// LockTaskPrrocessing is mock implementation for LockTaskPrrocessing of Processor
func (_m *MockTimerQueueProcessor) LockTaskPrrocessing() {
_m.Called()
}

// UnlockTaskPrrocessing is mock implementation for UnlockTaskPrrocessing of Processor
func (_m *MockTimerQueueProcessor) UnlockTaskPrrocessing() {
_m.Called()
}
14 changes: 12 additions & 2 deletions service/history/MockTransferQueueProcessor.go
@@ -41,11 +41,21 @@ func (_m *MockTransferQueueProcessor) Stop() {
}

// FailoverDomain is mock implementation for FailoverDomain of Processor
func (_m *MockTransferQueueProcessor) FailoverDomain(domainID string) {
_m.Called(domainID)
func (_m *MockTransferQueueProcessor) FailoverDomain(domainIDs map[string]struct{}) {
_m.Called(domainIDs)
}

// NotifyNewTask is mock implementation for NotifyNewTask of Processor
func (_m *MockTransferQueueProcessor) NotifyNewTask(clusterName string, transferTask []persistence.Task) {
_m.Called(clusterName, transferTask)
}

// LockTaskPrrocessing is mock implementation for LockTaskPrrocessing of Processor
func (_m *MockTransferQueueProcessor) LockTaskPrrocessing() {
_m.Called()
}

// UnlockTaskPrrocessing is mock implementation for UnlockTaskPrrocessing of Processor
func (_m *MockTransferQueueProcessor) UnlockTaskPrrocessing() {
_m.Called()
}
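Both processor mocks pick up the same three changes: FailoverDomain now receives the full set of domain IDs, and the lock/unlock hooks (keeping the interface's existing LockTaskPrrocessing spelling) are expected to bracket a batch. A hedged example of driving the transfer mock in a test, in the testify style these files already use; the test name and expectations are placeholders, not code from this commit:

package history

import (
	"testing"

	"github.com/stretchr/testify/mock"
)

// Placeholder test: assert that a batched failover locks processing, passes the
// whole domain-ID set once, and unlocks afterwards.
func TestBatchedFailoverMockUsage(t *testing.T) {
	processor := &MockTransferQueueProcessor{}
	processor.On("LockTaskPrrocessing").Return()
	processor.On("UnlockTaskPrrocessing").Return()
	processor.On("FailoverDomain", mock.MatchedBy(func(ids map[string]struct{}) bool {
		_, ok := ids["domain-a"]
		return ok && len(ids) == 2
	})).Return()

	processor.LockTaskPrrocessing()
	processor.FailoverDomain(map[string]struct{}{"domain-a": {}, "domain-b": {}})
	processor.UnlockTaskPrrocessing()
	processor.AssertExpectations(t)
}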
59 changes: 0 additions & 59 deletions service/history/failoverCheck.go
@@ -27,65 +27,6 @@ import (
"github.com/uber/cadence/common/persistence"
)

// verifyActiveTask, will return true if task activeness check is successful
func verifyActiveTask(shard ShardContext, logger bark.Logger, taskDomainID string, task interface{}) (bool, error) {
currentClusterName := shard.GetService().GetClusterMetadata().GetCurrentClusterName()
domainEntry, err := shard.GetDomainCache().GetDomainByID(taskDomainID)
if err != nil {
// it is possible that the domain is deleted
// we should treat that domain as active
if _, ok := err.(*workflow.EntityNotExistsError); !ok {
logger.Warnf("Cannot find domainID: %v, err: %v.", taskDomainID)
return false, err
}
logger.Warnf("Cannot find domainID: %v, default to process task: %v.", taskDomainID, task)
return true, nil
}
if domainEntry.IsGlobalDomain() && currentClusterName != domainEntry.GetReplicationConfig().ActiveClusterName {
// timer task does not belong to cluster name
logger.Debugf("DomainID: %v is not active, skip task: %v.", taskDomainID, task)
return false, nil
}
logger.Debugf("DomainID: %v is active, process task: %v.", taskDomainID, task)
return true, nil
}

// verifyFailoverActiveTask, will return true if task activeness check is successful
func verifyFailoverActiveTask(logger bark.Logger, targetDomainID string, taskDomainID string, task interface{}) (bool, error) {
if targetDomainID == taskDomainID {
logger.Debugf("Failover DomainID: %v is active, process task: %v.", taskDomainID, task)
return true, nil
}
logger.Debugf("Failover DomainID: %v is not active, skip task: %v.", taskDomainID, task)
return false, nil
}

// verifyStandbyTask, will return true if task standbyness check is successful
func verifyStandbyTask(shard ShardContext, logger bark.Logger, standbyCluster string, taskDomainID string, task interface{}) (bool, error) {
domainEntry, err := shard.GetDomainCache().GetDomainByID(taskDomainID)
if err != nil {
// it is possible that the domain is deleted
// we should treat that domain as not active
if _, ok := err.(*workflow.EntityNotExistsError); !ok {
logger.Warnf("Cannot find domainID: %v, err: %v.", taskDomainID)
return false, err
}
logger.Warnf("Cannot find domainID: %v, default to not process task: %v.", taskDomainID, task)
return false, nil
}
if !domainEntry.IsGlobalDomain() {
// non global domain, timer task does not belong here
logger.Debugf("DomainID: %v is not global, skip task: %v.", taskDomainID, task)
return false, nil
} else if domainEntry.IsGlobalDomain() && domainEntry.GetReplicationConfig().ActiveClusterName != standbyCluster {
// timer task does not belong here
logger.Debugf("DomainID: %v is not standby, skip task: %v.", taskDomainID, task)
return false, nil
}
logger.Debugf("DomainID: %v is standby, process task: %v.", taskDomainID, task)
return true, nil
}

// verifyTaskVersion, will return true if failover version check is successful
func verifyTaskVersion(shard ShardContext, logger bark.Logger, domainID string, version int64, taskVersion int64, task interface{}) (bool, error) {
if !shard.GetService().GetClusterMetadata().IsGlobalDomainEnabled() {
64 changes: 36 additions & 28 deletions service/history/historyEngine.go
@@ -22,14 +22,11 @@ package history

import (
"context"
"encoding/json"
"errors"
"fmt"
"time"

"encoding/json"

"go.uber.org/yarpc"

"github.com/pborman/uuid"
"github.com/uber-common/bark"
h "github.com/uber/cadence/.gen/go/history"
@@ -45,6 +42,7 @@ import (
"github.com/uber/cadence/common/messaging"
"github.com/uber/cadence/common/metrics"
"github.com/uber/cadence/common/persistence"
"go.uber.org/yarpc"
)

const (
@@ -65,6 +63,7 @@ type (
executionManager persistence.ExecutionManager
txProcessor transferQueueProcessor
timerProcessor timerQueueProcessor
taskAllocator taskAllocator
replicator *historyReplicator
replicatorProcessor queueProcessor
historyEventNotifier historyEventNotifier
@@ -220,9 +219,8 @@ func (e *historyEngineImpl) Stop() {

func (e *historyEngineImpl) registerDomainFailoverCallback() {

failoverPredicate := func(nextDomain *cache.DomainCacheEntry, action func()) {
failoverPredicate := func(shardNotificationVersion int64, nextDomain *cache.DomainCacheEntry, action func()) {
domainFailoverNotificationVersion := nextDomain.GetFailoverNotificationVersion()
shardNotificationVersion := e.shard.GetDomainNotificationVersion()
domainActiveCluster := nextDomain.GetReplicationConfig().ActiveClusterName

if nextDomain.IsGlobalDomain() &&
@@ -235,27 +233,37 @@ func (e *historyEngineImpl) registerDomainFailoverCallback() {
// first set the failover callback
e.shard.GetDomainCache().RegisterDomainChangeCallback(
e.shard.GetShardID(),
e.shard.GetDomainCache().GetDomainNotificationVersion(),
// before the domain change, this will be invoked when (most of time) domain cache is locked
func(prevDomain *cache.DomainCacheEntry, nextDomain *cache.DomainCacheEntry) {
e.logger.Infof("Domain Change Event: Shard: %v, Domain: %v, ID: %v, Failover Notification Version: %v, Active Cluster: %v, Shard Domain Notification Version: %v\n",
e.shard.GetShardID(), nextDomain.GetInfo().Name, nextDomain.GetInfo().ID,
nextDomain.GetFailoverNotificationVersion(), nextDomain.GetReplicationConfig().ActiveClusterName, e.shard.GetDomainNotificationVersion())

failoverPredicate(nextDomain, func() {
e.logger.Infof("Domain Failover Start: Shard: %v, Domain: %v, ID: %v\n",
e.shard.GetShardID(), nextDomain.GetInfo().Name, nextDomain.GetInfo().ID)

domainID := nextDomain.GetInfo().ID
e.txProcessor.FailoverDomain(domainID)
e.timerProcessor.FailoverDomain(domainID)
})
e.shard.GetDomainNotificationVersion(),
func() {
e.txProcessor.LockTaskPrrocessing()
e.timerProcessor.LockTaskPrrocessing()
},
// after the domain change, this will be invoked when domain cache is NOT locked
func(prevDomain *cache.DomainCacheEntry, nextDomain *cache.DomainCacheEntry) {
failoverPredicate(nextDomain, func() {
e.logger.Infof("Domain Failover Notify Active: Shard: %v, Domain: %v, ID: %v\n",
e.shard.GetShardID(), nextDomain.GetInfo().Name, nextDomain.GetInfo().ID)
func(prevDomains []*cache.DomainCacheEntry, nextDomains []*cache.DomainCacheEntry) {
defer func() {
e.txProcessor.UnlockTaskPrrocessing()
e.timerProcessor.UnlockTaskPrrocessing()
}()

if len(nextDomains) == 0 {
return
}

shardNotificationVersion := e.shard.GetDomainNotificationVersion()
failoverDomainIDs := map[string]struct{}{}

for _, nextDomain := range nextDomains {
failoverPredicate(shardNotificationVersion, nextDomain, func() {
failoverDomainIDs[nextDomain.GetInfo().ID] = struct{}{}
})
}

if len(failoverDomainIDs) > 0 {
e.logger.WithFields(bark.Fields{
logging.TagDomainID: failoverDomainIDs,
}).Infof("Domain Failover Start.")

e.txProcessor.FailoverDomain(failoverDomainIDs)
e.timerProcessor.FailoverDomain(failoverDomainIDs)

now := e.shard.GetTimeSource().Now()
// the fake tasks will not be actually used, we just need to make sure
@@ -264,8 +272,8 @@ func (e *historyEngineImpl) registerDomainFailoverCallback() {
fakeDecisionTimeoutTask := []persistence.Task{&persistence.DecisionTimeoutTask{VisibilityTimestamp: now}}
e.txProcessor.NotifyNewTask(e.currentClusterName, fakeDecisionTask)
e.timerProcessor.NotifyNewTimers(e.currentClusterName, now, fakeDecisionTimeoutTask)
})
e.shard.UpdateDomainNotificationVersion(nextDomain.GetNotificationVersion() + 1)
}
e.shard.UpdateDomainNotificationVersion(nextDomains[len(nextDomains)-1].GetNotificationVersion() + 1)
},
)
}
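Condensed, the new registration works like this: the prepare callback pauses both queue processors while the cache updates its entries; the batch callback then collects every global domain whose failover notification version has reached the shard's level and whose active cluster is now this cluster, issues a single FailoverDomain call per processor for that whole set, nudges the queues with fake tasks so the new failover levels are picked up, and finally advances the shard's domain notification version past the last entry in the batch. A simplified, self-contained restatement of that filtering and version bookkeeping — the types and names are illustrative, not the production ones:

package main

import "fmt"

// domainEntry carries only the fields the failover predicate reads.
type domainEntry struct {
	ID                          string
	IsGlobal                    bool
	ActiveCluster               string
	FailoverNotificationVersion int64
	NotificationVersion         int64
}

// collectFailoverDomains applies the predicate to a batch and returns the set of
// domain IDs that just became active on currentCluster, plus the next shard
// domain notification version.
func collectFailoverDomains(batch []domainEntry, shardNotificationVersion int64, currentCluster string) (map[string]struct{}, int64) {
	failoverIDs := map[string]struct{}{}
	for _, d := range batch {
		if d.IsGlobal &&
			d.FailoverNotificationVersion >= shardNotificationVersion &&
			d.ActiveCluster == currentCluster {
			failoverIDs[d.ID] = struct{}{}
		}
	}
	if len(batch) == 0 {
		return failoverIDs, shardNotificationVersion
	}
	return failoverIDs, batch[len(batch)-1].NotificationVersion + 1
}

func main() {
	batch := []domainEntry{
		{ID: "d1", IsGlobal: true, ActiveCluster: "clusterA", FailoverNotificationVersion: 10, NotificationVersion: 11},
		{ID: "d2", IsGlobal: true, ActiveCluster: "clusterB", FailoverNotificationVersion: 12, NotificationVersion: 13},
	}
	ids, next := collectFailoverDomains(batch, 10, "clusterA")
	fmt.Println(ids, next) // map[d1:{}] 14
}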
8 changes: 6 additions & 2 deletions service/history/historyEngineInterfaces.go
@@ -112,17 +112,21 @@ type (

transferQueueProcessor interface {
common.Daemon
FailoverDomain(domainID string)
FailoverDomain(domainIDs map[string]struct{})
NotifyNewTask(clusterName string, transferTasks []persistence.Task)
LockTaskPrrocessing()
UnlockTaskPrrocessing()
}

// TODO the timer queue processor and the one below, timer processor
// in combination are confusing, we should consider a better naming
// convention, or at least come with a better name for this case.
timerQueueProcessor interface {
common.Daemon
FailoverDomain(domainID string)
FailoverDomain(domainIDs map[string]struct{})
NotifyNewTimers(clusterName string, currentTime time.Time, timerTask []persistence.Task)
LockTaskPrrocessing()
UnlockTaskPrrocessing()
}

timerProcessor interface {
Remaining changed files are not rendered.

