Skip to content

Commit

Permalink
Merge pull request kubernetes#16053 from saad-ali/attachDetachMutextFix
Browse files Browse the repository at this point in the history
Fix GCE Cloud/Attach/Detach stability issues
  • Loading branch information
lavalamp committed Oct 26, 2015
2 parents e42f5af + 19115b2 commit b07dd73
Show file tree
Hide file tree
Showing 6 changed files with 264 additions and 352 deletions.
77 changes: 39 additions & 38 deletions pkg/cloudprovider/providers/gce/gce.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@ const (
gceAffinityTypeClientIP = "CLIENT_IP"
// AffinityTypeClientIPProto - affinity based on Client IP and port.
gceAffinityTypeClientIPProto = "CLIENT_IP_PROTO"

operationPollInterval = 3 * time.Second
operationPollTimeoutDuration = 30 * time.Minute
)

// GCECloud is an implementation of Interface, TCPLoadBalancer and Instances for Google Compute Engine.
Expand Down Expand Up @@ -259,48 +262,57 @@ func (gce *GCECloud) targetPoolURL(name, region string) string {
return fmt.Sprintf("https://www.googleapis.com/compute/v1/projects/%s/regions/%s/targetPools/%s", gce.projectID, region, name)
}

func waitForOp(op *compute.Operation, getOperation func() (*compute.Operation, error)) error {
pollOp := op
consecPollFails := 0
for pollOp.Status != "DONE" {
var err error
time.Sleep(3 * time.Second)
pollOp, err = getOperation()
func waitForOp(op *compute.Operation, getOperation func(operationName string) (*compute.Operation, error)) error {
if op == nil {
return fmt.Errorf("operation must not be nil")
}

if opIsDone(op) {
return getErrorFromOp(op)
}

opName := op.Name
return wait.Poll(operationPollInterval, operationPollTimeoutDuration, func() (bool, error) {
pollOp, err := getOperation(opName)
if err != nil {
if consecPollFails == 2 {
// Only bail if we've seen 3 consecutive polling errors.
return err
}
consecPollFails++
} else {
consecPollFails = 0
glog.Warningf("GCE poll operation failed: %v", err)
}
}
if pollOp.Error != nil && len(pollOp.Error.Errors) > 0 {
return &googleapi.Error{
Code: int(pollOp.HttpErrorStatusCode),
Message: pollOp.Error.Errors[0].Message,
return opIsDone(pollOp), getErrorFromOp(pollOp)
})
}

func opIsDone(op *compute.Operation) bool {
return op != nil && op.Status == "DONE"
}

func getErrorFromOp(op *compute.Operation) error {
if op != nil && op.Error != nil && len(op.Error.Errors) > 0 {
err := &googleapi.Error{
Code: int(op.HttpErrorStatusCode),
Message: op.Error.Errors[0].Message,
}
glog.Errorf("GCE operation failed: %v", err)
return err
}
return nil

return nil
}

func (gce *GCECloud) waitForGlobalOp(op *compute.Operation) error {
return waitForOp(op, func() (*compute.Operation, error) {
return gce.service.GlobalOperations.Get(gce.projectID, op.Name).Do()
return waitForOp(op, func(operationName string) (*compute.Operation, error) {
return gce.service.GlobalOperations.Get(gce.projectID, operationName).Do()
})
}

func (gce *GCECloud) waitForRegionOp(op *compute.Operation, region string) error {
return waitForOp(op, func() (*compute.Operation, error) {
return gce.service.RegionOperations.Get(gce.projectID, region, op.Name).Do()
return waitForOp(op, func(operationName string) (*compute.Operation, error) {
return gce.service.RegionOperations.Get(gce.projectID, region, operationName).Do()
})
}

func (gce *GCECloud) waitForZoneOp(op *compute.Operation) error {
return waitForOp(op, func() (*compute.Operation, error) {
return gce.service.ZoneOperations.Get(gce.projectID, gce.zone, op.Name).Do()
return waitForOp(op, func(operationName string) (*compute.Operation, error) {
return gce.service.ZoneOperations.Get(gce.projectID, gce.zone, operationName).Do()
})
}

Expand Down Expand Up @@ -1457,18 +1469,7 @@ func (gce *GCECloud) AttachDisk(diskName string, readOnly bool) error {

attachOp, err := gce.service.Instances.AttachDisk(gce.projectID, gce.zone, gce.instanceID, attachedDisk).Do()
if err != nil {
// Check if the disk is already attached to this instance. We do this only
// in the error case, since it is expected to be exceptional.
instance, err := gce.service.Instances.Get(gce.projectID, gce.zone, gce.instanceID).Do()
if err != nil {
return err
}
for _, disk := range instance.Disks {
if disk.Source == attachedDisk.Source {
// Disk is already attached, we're good to go.
return nil
}
}
return err
}

return gce.waitForZoneOp(attachOp)
Expand Down
82 changes: 82 additions & 0 deletions pkg/util/keymutex/keymutex.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package keymutex

import (
"fmt"
"github.com/golang/glog"
"sync"
)

// KeyMutex is a thread-safe interface for aquiring locks on arbitrary strings.
type KeyMutex interface {
// Aquires a lock associated with the specified ID, creates the lock if one doesn't already exist.
LockKey(id string)

// Releases the lock associated with the specified ID.
// Returns an error if the specified ID doesn't exist.
UnlockKey(id string) error
}

// Returns a new instance of a key mutex.
func NewKeyMutex() KeyMutex {
return &keyMutex{
mutexMap: make(map[string]*sync.Mutex),
}
}

type keyMutex struct {
sync.RWMutex
mutexMap map[string]*sync.Mutex
}

// Aquires a lock associated with the specified ID (creates the lock if one doesn't already exist).
func (km *keyMutex) LockKey(id string) {
glog.V(5).Infof("LockKey(...) called for id %q\r\n", id)
mutex := km.getOrCreateLock(id)
mutex.Lock()
glog.V(5).Infof("LockKey(...) for id %q completed.\r\n", id)
}

// Releases the lock associated with the specified ID.
// Returns an error if the specified ID doesn't exist.
func (km *keyMutex) UnlockKey(id string) error {
glog.V(5).Infof("UnlockKey(...) called for id %q\r\n", id)
km.RLock()
defer km.RUnlock()
mutex, exists := km.mutexMap[id]
if !exists {
return fmt.Errorf("id %q not found", id)
}
glog.V(5).Infof("UnlockKey(...) for id. Mutex found, trying to unlock it. %q\r\n", id)

mutex.Unlock()
glog.V(5).Infof("UnlockKey(...) for id %q completed.\r\n", id)
return nil
}

// Returns lock associated with the specified ID, or creates the lock if one doesn't already exist.
func (km *keyMutex) getOrCreateLock(id string) *sync.Mutex {
km.Lock()
defer km.Unlock()

if _, exists := km.mutexMap[id]; !exists {
km.mutexMap[id] = &sync.Mutex{}
}

return km.mutexMap[id]
}
111 changes: 111 additions & 0 deletions pkg/util/keymutex/keymutex_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package keymutex

import (
"testing"
"time"
)

const (
callbackTimeout = 1 * time.Second
)

func Test_SingleLock_NoUnlock(t *testing.T) {
// Arrange
km := NewKeyMutex()
key := "fakeid"
callbackCh := make(chan interface{})

// Act
go lockAndCallback(km, key, callbackCh)

// Assert
verifyCallbackHappens(t, callbackCh)
}

func Test_SingleLock_SingleUnlock(t *testing.T) {
// Arrange
km := NewKeyMutex()
key := "fakeid"
callbackCh := make(chan interface{})

// Act & Assert
go lockAndCallback(km, key, callbackCh)
verifyCallbackHappens(t, callbackCh)
km.UnlockKey(key)
}

func Test_DoubleLock_DoubleUnlock(t *testing.T) {
// Arrange
km := NewKeyMutex()
key := "fakeid"
callbackCh1stLock := make(chan interface{})
callbackCh2ndLock := make(chan interface{})

// Act & Assert
go lockAndCallback(km, key, callbackCh1stLock)
verifyCallbackHappens(t, callbackCh1stLock)
go lockAndCallback(km, key, callbackCh2ndLock)
verifyCallbackDoesntHappens(t, callbackCh2ndLock)
km.UnlockKey(key)
verifyCallbackHappens(t, callbackCh2ndLock)
km.UnlockKey(key)
}

func lockAndCallback(km KeyMutex, id string, callbackCh chan<- interface{}) {
km.LockKey(id)
callbackCh <- true
}

func verifyCallbackHappens(t *testing.T, callbackCh <-chan interface{}) bool {
select {
case <-callbackCh:
return true
case <-time.After(callbackTimeout):
t.Fatalf("Timed out waiting for callback.")
return false
}
}

func verifyCallbackDoesntHappens(t *testing.T, callbackCh <-chan interface{}) bool {
select {
case <-callbackCh:
t.Fatalf("Unexpected callback.")
return false
case <-time.After(callbackTimeout):
return true
}
}

func verifyNoError(t *testing.T, err error, name string) {
if err != nil {
t.Fatalf("Unexpected response on %q. Expected: <no error> Actual: <%v>", name, err)
}
}

func verifyError(t *testing.T, err error, name string) {
if err == nil {
t.Fatalf("Unexpected response on %q. Expected: <error> Actual: <no error>", name)
}
}

func verifyMsg(t *testing.T, expected, actual string) {
if actual != expected {
t.Fatalf("Unexpected testMsg value. Expected: <%v> Actual: <%v>", expected, actual)
}
}
Loading

0 comments on commit b07dd73

Please sign in to comment.