Skip to content

Commit

Permalink
Standardize metrics for task isolation leaking and include cause (cad…
Browse files Browse the repository at this point in the history
  • Loading branch information
natemort authored Dec 18, 2024
1 parent 6032d70 commit 2bd1e5d
Show file tree
Hide file tree
Showing 14 changed files with 140 additions and 289 deletions.
50 changes: 7 additions & 43 deletions common/isolationgroup/defaultisolationgroupstate/state.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,14 +74,12 @@ func NewDefaultIsolationGroupStateWatcherWithConfigStoreClient(
}, nil
}

func (z *defaultIsolationGroupStateHandler) AvailableIsolationGroupsByDomainID(ctx context.Context, domainID string, tasklistName string, availablePollerIsolationGroups []string) (types.IsolationGroupConfiguration, error) {
func (z *defaultIsolationGroupStateHandler) IsolationGroupsByDomainID(ctx context.Context, domainID string) (types.IsolationGroupConfiguration, error) {
state, err := z.getByDomainID(ctx, domainID)
if err != nil {
return nil, fmt.Errorf("unable to get isolation group state: %w", err)
}
availableIsolationGroupsCfg := isolationGroupHealthyListToConfig(availablePollerIsolationGroups)
scope := z.createAvailableisolationGroupMetricsScope(domainID, tasklistName)
return availableIG(z.config.AllIsolationGroups(), availableIsolationGroupsCfg, state.Global, state.Domain, scope), nil
return toIsolationGroupConfiguration(z.config.AllIsolationGroups(), state.Global, state.Domain), nil
}

func (z *defaultIsolationGroupStateHandler) IsDrained(ctx context.Context, domain string, isolationGroup string) (bool, error) {
Expand Down Expand Up @@ -162,44 +160,21 @@ func (z *defaultIsolationGroupStateHandler) get(ctx context.Context, domain stri
return ig, nil
}

func (z *defaultIsolationGroupStateHandler) createAvailableisolationGroupMetricsScope(domainID string, tasklistName string) metrics.Scope {
domainName, _ := z.domainCache.GetDomainName(domainID)
return z.metricsClient.Scope(metrics.GetAvailableIsolationGroupsScope).
Tagged(metrics.DomainTag(domainName)).
Tagged(metrics.TaskListTag(tasklistName))
}

// A simple explicit deny-based isolation group implementation
func availableIG(
func toIsolationGroupConfiguration(
allIsolationGroups []string,
availablePollers types.IsolationGroupConfiguration,
global types.IsolationGroupConfiguration,
domain types.IsolationGroupConfiguration,
scope metrics.Scope,
) types.IsolationGroupConfiguration {
out := types.IsolationGroupConfiguration{}
for _, isolationGroup := range allIsolationGroups {
_, hasAvailablePollers := availablePollers[isolationGroup]
globalCfg, hasGlobalConfig := global[isolationGroup]
domainCfg, hasDomainConfig := domain[isolationGroup]
if hasGlobalConfig {
if globalCfg.State == types.IsolationGroupStateDrained {
scope.Tagged(metrics.PollerIsolationGroupTag(isolationGroup)).IncCounter(metrics.IsolationGroupStateDrained)
continue
}
}
if hasDomainConfig {
if domainCfg.State == types.IsolationGroupStateDrained {
scope.Tagged(metrics.PollerIsolationGroupTag(isolationGroup)).IncCounter(metrics.IsolationGroupStateDrained)
continue
if isDrained(isolationGroup, global, domain) {
out[isolationGroup] = types.IsolationGroupPartition{
Name: isolationGroup,
State: types.IsolationGroupStateDrained,
}
}
if !hasAvailablePollers {
// we don't attempt to dispatch tasks to isolation groups where there are no pollers
scope.Tagged(metrics.PollerIsolationGroupTag(isolationGroup)).IncCounter(metrics.IsolationGroupStatePollerUnavailable)
continue
}
scope.Tagged(metrics.PollerIsolationGroupTag(isolationGroup)).IncCounter(metrics.IsolationGroupStateHealthy)
out[isolationGroup] = types.IsolationGroupPartition{
Name: isolationGroup,
State: types.IsolationGroupStateHealthy,
Expand All @@ -223,14 +198,3 @@ func isDrained(isolationGroup string, global types.IsolationGroupConfiguration,
}
return false
}

func isolationGroupHealthyListToConfig(igs []string) types.IsolationGroupConfiguration {
out := make(types.IsolationGroupConfiguration, len(igs))
for _, ig := range igs {
out[ig] = types.IsolationGroupPartition{
Name: ig,
State: types.IsolationGroupStateHealthy,
}
}
return out
}
112 changes: 33 additions & 79 deletions common/isolationgroup/defaultisolationgroupstate/state_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,24 +74,21 @@ func TestAvailableIsolationGroupsHandler(t *testing.T) {
json.Unmarshal(validCfgDataDrained, &dynamicConfigResponseDrained)

tests := map[string]struct {
availablePollerIsolationGroups []string
dcAffordance func(client *dynamicconfig.MockClient)
domainAffordance func(mock *cache.MockDomainCache)
cfg defaultConfig
expected types.IsolationGroupConfiguration
expectedErr error
dcAffordance func(client *dynamicconfig.MockClient)
domainAffordance func(mock *cache.MockDomainCache)
cfg defaultConfig
expected types.IsolationGroupConfiguration
expectedErr error
}{
"normal case - feature is disabled": {
availablePollerIsolationGroups: []string{"zone-1", "zone-2"},
cfg: defaultConfig{
IsolationGroupEnabled: func(string) bool { return false },
AllIsolationGroups: func() []string { return []string{"zone-1", "zone-2", "zone-3"} },
AllIsolationGroups: func() []string { return []string{"zone-1", "zone-2"} },
},
dcAffordance: func(client *dynamicconfig.MockClient) {},
domainAffordance: func(mock *cache.MockDomainCache) {
domainResponse := cache.NewDomainCacheEntryForTest(&persistence.DomainInfo{ID: "domain-id", Name: "domain"}, &persistence.DomainConfig{}, true, nil, 0, nil, 0, 0, 0)
mock.EXPECT().GetDomainByID("domain-id").Return(domainResponse, nil)
mock.EXPECT().GetDomainName("domain-id").Return("domain", nil)
},
expected: types.IsolationGroupConfiguration{
"zone-1": {
Expand All @@ -106,10 +103,9 @@ func TestAvailableIsolationGroupsHandler(t *testing.T) {
},

"normal case - no drains present - no configuration specifying a drain - feature is enabled": {
availablePollerIsolationGroups: []string{"zone-1", "zone-2"},
cfg: defaultConfig{
IsolationGroupEnabled: func(string) bool { return true },
AllIsolationGroups: func() []string { return []string{"zone-1", "zone-2", "zone-3"} },
AllIsolationGroups: func() []string { return []string{"zone-1", "zone-2"} },
},
dcAffordance: func(client *dynamicconfig.MockClient) {
client.EXPECT().GetListValue(
Expand All @@ -120,7 +116,6 @@ func TestAvailableIsolationGroupsHandler(t *testing.T) {
domainAffordance: func(mock *cache.MockDomainCache) {
domainResponse := cache.NewDomainCacheEntryForTest(&persistence.DomainInfo{ID: "domain-id", Name: "domain"}, &persistence.DomainConfig{}, true, nil, 0, nil, 0, 0, 0)
mock.EXPECT().GetDomainByID("domain-id").Return(domainResponse, nil)
mock.EXPECT().GetDomainName("domain-id").Return("domain", nil)
mock.EXPECT().GetDomain("domain").Return(domainResponse, nil).AnyTimes()
},
expected: types.IsolationGroupConfiguration{
Expand All @@ -136,10 +131,9 @@ func TestAvailableIsolationGroupsHandler(t *testing.T) {
},

"normal case - one drain present - no configuration specifying a drain - feature is enabled": {
availablePollerIsolationGroups: []string{"zone-1", "zone-2"},
cfg: defaultConfig{
IsolationGroupEnabled: func(string) bool { return true },
AllIsolationGroups: func() []string { return []string{"zone-1", "zone-2", "zone-3"} },
AllIsolationGroups: func() []string { return []string{"zone-1", "zone-2"} },
},
dcAffordance: func(client *dynamicconfig.MockClient) {
client.EXPECT().GetListValue(
Expand All @@ -150,21 +144,23 @@ func TestAvailableIsolationGroupsHandler(t *testing.T) {
domainAffordance: func(mock *cache.MockDomainCache) {
domainResponse := cache.NewDomainCacheEntryForTest(&persistence.DomainInfo{ID: "domain-id", Name: "domain"}, &persistence.DomainConfig{}, true, nil, 0, nil, 0, 0, 0)
mock.EXPECT().GetDomainByID("domain-id").Return(domainResponse, nil)
mock.EXPECT().GetDomainName("domain-id").Return("domain", nil)
mock.EXPECT().GetDomain("domain").Return(domainResponse, nil).AnyTimes()
},
expected: types.IsolationGroupConfiguration{
"zone-1": {
Name: "zone-1",
State: types.IsolationGroupStateDrained,
},
"zone-2": {
Name: "zone-2",
State: types.IsolationGroupStateHealthy,
},
},
},
"expected case - no global drain data configured": {
availablePollerIsolationGroups: []string{"zone-1", "zone-2"},
cfg: defaultConfig{
IsolationGroupEnabled: func(string) bool { return true },
AllIsolationGroups: func() []string { return []string{"zone-1", "zone-2", "zone-3"} },
AllIsolationGroups: func() []string { return []string{"zone-1", "zone-2"} },
},
dcAffordance: func(client *dynamicconfig.MockClient) {
client.EXPECT().GetListValue(
Expand All @@ -175,7 +171,6 @@ func TestAvailableIsolationGroupsHandler(t *testing.T) {
domainAffordance: func(mock *cache.MockDomainCache) {
domainResponse := cache.NewDomainCacheEntryForTest(&persistence.DomainInfo{ID: "domain-id", Name: "domain"}, &persistence.DomainConfig{}, true, nil, 0, nil, 0, 0, 0)
mock.EXPECT().GetDomainByID("domain-id").Return(domainResponse, nil)
mock.EXPECT().GetDomainName("domain-id").Return("domain", nil)
mock.EXPECT().GetDomain("domain").Return(domainResponse, nil).AnyTimes()
},
expected: types.IsolationGroupConfiguration{
Expand All @@ -190,7 +185,6 @@ func TestAvailableIsolationGroupsHandler(t *testing.T) {
},
},
"pathological case - problems with global drain data - 1": {
availablePollerIsolationGroups: []string{"zone-1", "zone-2"},
cfg: defaultConfig{
IsolationGroupEnabled: func(string) bool { return true },
AllIsolationGroups: func() []string { return []string{"zone-1", "zone-2", "zone-3"} },
Expand All @@ -210,7 +204,6 @@ func TestAvailableIsolationGroupsHandler(t *testing.T) {
expectedErr: errors.New("unable to get isolation group state: could not resolve global drains in an error"),
},
"pathological case - problems with domain drain data - cannot resolve domain": {
availablePollerIsolationGroups: []string{"zone-1", "zone-2"},
cfg: defaultConfig{
IsolationGroupEnabled: func(string) bool { return true },
AllIsolationGroups: func() []string { return []string{"zone-1", "zone-2", "zone-3"} },
Expand All @@ -224,7 +217,6 @@ func TestAvailableIsolationGroupsHandler(t *testing.T) {
},

"pathological case - problems with global drain data - malformed data returned 1": {
availablePollerIsolationGroups: []string{"zone-1", "zone-2"},
cfg: defaultConfig{
IsolationGroupEnabled: func(string) bool { return true },
AllIsolationGroups: func() []string { return []string{"zone-1", "zone-2", "zone-3"} },
Expand All @@ -244,7 +236,6 @@ func TestAvailableIsolationGroupsHandler(t *testing.T) {
expectedErr: errors.New("unable to get isolation group state: could not resolve global drains in an error"),
},
"pathological case - problems with domain drain data - malformed data returned 1": {
availablePollerIsolationGroups: []string{"zone-1", "zone-2"},
cfg: defaultConfig{
IsolationGroupEnabled: func(string) bool { return true },
AllIsolationGroups: func() []string { return []string{"zone-1", "zone-2", "zone-3"} },
Expand All @@ -260,7 +251,6 @@ func TestAvailableIsolationGroupsHandler(t *testing.T) {
expectedErr: errors.New("unable to get isolation group state: could not resolve domain in isolationGroup handler: a failure"),
},
"pathological case - problems with domain drain data - malformed data returned 2": {
availablePollerIsolationGroups: []string{"zone-1", "zone-2"},
cfg: defaultConfig{
IsolationGroupEnabled: func(string) bool { return true },
AllIsolationGroups: func() []string { return []string{"zone-1", "zone-2", "zone-3"} },
Expand All @@ -275,26 +265,6 @@ func TestAvailableIsolationGroupsHandler(t *testing.T) {
expected: nil,
expectedErr: errors.New("unable to get isolation group state: could not resolve domain in isolationGroup handler: %!w(<nil>)"),
},
"pathological case - no available pollers": {
availablePollerIsolationGroups: nil,
cfg: defaultConfig{
IsolationGroupEnabled: func(string) bool { return true },
AllIsolationGroups: func() []string { return []string{"zone-1", "zone-2", "zone-3"} },
},
dcAffordance: func(client *dynamicconfig.MockClient) {
client.EXPECT().GetListValue(
dynamicconfig.DefaultIsolationGroupConfigStoreManagerGlobalMapping,
gomock.Any(),
).Return(nil, nil)
},
domainAffordance: func(mock *cache.MockDomainCache) {
domainResponse := cache.NewDomainCacheEntryForTest(&persistence.DomainInfo{ID: "domain-id", Name: "domain"}, &persistence.DomainConfig{}, true, nil, 0, nil, 0, 0, 0)
mock.EXPECT().GetDomainByID("domain-id").Return(domainResponse, nil)
mock.EXPECT().GetDomainName("domain-id").Return("domain", nil)
mock.EXPECT().GetDomain("domain").Return(domainResponse, nil).AnyTimes()
},
expected: types.IsolationGroupConfiguration{},
},
}

for name, td := range tests {
Expand All @@ -311,7 +281,7 @@ func TestAvailableIsolationGroupsHandler(t *testing.T) {
config: td.cfg,
metricsClient: metrics.NewNoopMetricsClient(),
}
res, err := handler.AvailableIsolationGroupsByDomainID(context.TODO(), "domain-id", "a-tasklist", td.availablePollerIsolationGroups)
res, err := handler.IsolationGroupsByDomainID(context.TODO(), "domain-id")
assert.Equal(t, td.expected, res)
if td.expectedErr != nil {
assert.Equal(t, td.expectedErr.Error(), err.Error())
Expand Down Expand Up @@ -445,7 +415,7 @@ func TestAvailableIsolationGroups(t *testing.T) {
},
}

isolationGroupsSetB := types.IsolationGroupConfiguration{
isolationGroupsWithCDrained := types.IsolationGroupConfiguration{
igA: {
Name: igA,
State: types.IsolationGroupStateHealthy,
Expand All @@ -454,60 +424,44 @@ func TestAvailableIsolationGroups(t *testing.T) {
Name: igB,
State: types.IsolationGroupStateHealthy,
},
igC: {
Name: igC,
State: types.IsolationGroupStateDrained,
},
}

isolationGroupsSetC := types.IsolationGroupConfiguration{
drainedC := types.IsolationGroupConfiguration{
igC: {
Name: igC,
State: types.IsolationGroupStateDrained,
},
}

tests := map[string]struct {
globalIGCfg types.IsolationGroupConfiguration
domainIGCfg types.IsolationGroupConfiguration
availablePollers types.IsolationGroupConfiguration
expected types.IsolationGroupConfiguration
globalIGCfg types.IsolationGroupConfiguration
domainIGCfg types.IsolationGroupConfiguration
expected types.IsolationGroupConfiguration
}{
"default behaviour - no drains - everything should be healthy": {
globalIGCfg: types.IsolationGroupConfiguration{},
domainIGCfg: types.IsolationGroupConfiguration{},
availablePollers: isolationGroupsAllHealthy,
expected: isolationGroupsAllHealthy,
},
"default behaviour - no drains - only one zone is healthy in terms of pollers, should only return that": {
globalIGCfg: types.IsolationGroupConfiguration{},
domainIGCfg: types.IsolationGroupConfiguration{},
availablePollers: types.IsolationGroupConfiguration{
igC: types.IsolationGroupPartition{
Name: igC,
State: types.IsolationGroupStateHealthy,
},
},
expected: types.IsolationGroupConfiguration{
igC: types.IsolationGroupPartition{
Name: igC,
State: types.IsolationGroupStateHealthy,
},
},
expected: isolationGroupsAllHealthy,
},
"default behaviour - one is drained - should return remaining 1/2": {
globalIGCfg: types.IsolationGroupConfiguration{},
availablePollers: isolationGroupsAllHealthy,
domainIGCfg: isolationGroupsSetC, // C is drained
expected: isolationGroupsSetB, // A and B
"default behaviour - drained at domain level": {
globalIGCfg: types.IsolationGroupConfiguration{},
domainIGCfg: drainedC, // C is drained
expected: isolationGroupsWithCDrained, // A and B
},
"default behaviour - one is drained - should return remaining 2/2": {
globalIGCfg: isolationGroupsSetC, // C is drained
availablePollers: isolationGroupsAllHealthy,
domainIGCfg: types.IsolationGroupConfiguration{},
expected: isolationGroupsSetB, // A and B
"default behaviour - drained at global level": {
globalIGCfg: drainedC, // C is drained
domainIGCfg: types.IsolationGroupConfiguration{},
expected: isolationGroupsWithCDrained, // A and B
},
}

for name, td := range tests {
t.Run(name, func(t *testing.T) {
assert.Equal(t, td.expected, availableIG(all, td.availablePollers, td.globalIGCfg, td.domainIGCfg, metrics.NewNoopMetricsClient().Scope(0)))
assert.Equal(t, td.expected, toIsolationGroupConfiguration(all, td.globalIGCfg, td.domainIGCfg))
})
}
}
Expand Down
2 changes: 1 addition & 1 deletion common/isolationgroup/interface.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,5 +43,5 @@ type State interface {

// AvailableIsolationGroupsByDomainID returns the available isolation zones for a domain.
// Takes into account global and domain zones
AvailableIsolationGroupsByDomainID(ctx context.Context, domainID string, tasklistName string, availableIsolationGroups []string) (types.IsolationGroupConfiguration, error)
IsolationGroupsByDomainID(ctx context.Context, domainID string) (types.IsolationGroupConfiguration, error)
}
30 changes: 15 additions & 15 deletions common/isolationgroup/isolation_group_mock.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 2bd1e5d

Please sign in to comment.