From 98b62cbec8fda7f2628c49a8d736e65da48819aa Mon Sep 17 00:00:00 2001 From: Frank Wessels Date: Fri, 11 Aug 2017 11:38:46 -0700 Subject: [PATCH] Implement an offline mode for a distributed node (#4646) Implement an offline mode for remote storage to cache the offline status of a node in order to prevent network calls that are bound to fail. After a time interval an attempt will be made to restore the connection and mark the node as online if successful. Fixes #4183 --- cmd/prepare-storage.go | 2 + cmd/retry-storage.go | 279 ++++++++++++++++++++------------- cmd/retry-storage_test.go | 29 ++++ cmd/storage-errors.go | 8 +- cmd/storage-rpc-client.go | 4 +- cmd/storage-rpc-client_test.go | 4 +- 6 files changed, 214 insertions(+), 112 deletions(-) diff --git a/cmd/prepare-storage.go b/cmd/prepare-storage.go index ebf57676f14f6..c97aa2091e250 100644 --- a/cmd/prepare-storage.go +++ b/cmd/prepare-storage.go @@ -341,6 +341,7 @@ func waitForFormatXLDisks(firstDisk bool, endpoints EndpointList, storageDisks [ maxRetryAttempts: globalStorageInitRetryThreshold, retryUnit: time.Second, retryCap: time.Second * 30, // 30 seconds. + offlineTimestamp: UTCNow(), } } @@ -361,6 +362,7 @@ func waitForFormatXLDisks(firstDisk bool, endpoints EndpointList, storageDisks [ maxRetryAttempts: globalStorageRetryThreshold, retryUnit: time.Millisecond, retryCap: time.Millisecond * 5, // 5 milliseconds. + offlineTimestamp: UTCNow(), // Set timestamp to prevent immediate marking as offline } } diff --git a/cmd/retry-storage.go b/cmd/retry-storage.go index ea88460c3d391..d92345055e538 100644 --- a/cmd/retry-storage.go +++ b/cmd/retry-storage.go @@ -30,210 +30,275 @@ const ( // Attempt to retry only this many number of times before // giving up on the remote disk entirely after initialization. globalStorageRetryThreshold = 1 + + // Interval to check health status of a node whether it has + // come back up online + globalStorageHealthCheckInterval = 5 * time.Minute ) +// Converts rpc.ServerError to underlying error. This function is +// written so that the storageAPI errors are consistent across network +// disks as well. +func retryToStorageErr(err error) error { + if err == errDiskNotFoundFromNetError || err == errDiskNotFoundFromRPCShutdown { + return errDiskNotFound + } + return err +} + // Retry storage is an instance of StorageAPI which // additionally verifies upon network shutdown if the // underlying storage is available and is really -// formatted. +// formatted. After the initialization phase it will +// also cache when the underlying storage is offline +// to prevent needless calls and recheck the health of +// underlying storage in regular intervals. type retryStorage struct { remoteStorage StorageAPI maxRetryAttempts int retryUnit time.Duration retryCap time.Duration + offline bool // Mark whether node is offline + offlineTimestamp time.Time // Last timestamp of checking status of node } // String representation of remoteStorage. -func (f retryStorage) String() string { +func (f *retryStorage) String() string { return f.remoteStorage.String() } -// Reconncts to underlying remote storage. -func (f retryStorage) Init() (err error) { - return f.remoteStorage.Init() +// Reconnects to underlying remote storage. +func (f *retryStorage) Init() (err error) { + return retryToStorageErr(f.remoteStorage.Init()) } // Closes the underlying remote storage connection. -func (f retryStorage) Close() (err error) { - return f.remoteStorage.Close() +func (f *retryStorage) Close() (err error) { + return retryToStorageErr(f.remoteStorage.Close()) +} + +// Return whether the underlying remote storage is offline +// and, if so, try to reconnect at regular intervals to +// restore the connection +func (f *retryStorage) IsOffline() bool { + // Check if offline and whether enough time has lapsed since most recent check + if f.offline && UTCNow().Sub(f.offlineTimestamp) >= globalStorageHealthCheckInterval { + f.offlineTimestamp = UTCNow() // reset timestamp + + if e := f.reInit(nil); e == nil { + // Connection has been re-established + f.offline = false // Mark node as back online + } + } + return f.offline } // DiskInfo - a retryable implementation of disk info. -func (f retryStorage) DiskInfo() (info disk.Info, err error) { +func (f *retryStorage) DiskInfo() (info disk.Info, err error) { + if f.IsOffline() { + return info, errDiskNotFound + } info, err = f.remoteStorage.DiskInfo() - if err == errDiskNotFound { - err = f.reInit() - if err == nil { - return f.remoteStorage.DiskInfo() - } + if f.reInitUponDiskNotFound(err) { + info, err = f.remoteStorage.DiskInfo() + return info, retryToStorageErr(err) } - return info, err + return info, retryToStorageErr(err) } // MakeVol - a retryable implementation of creating a volume. -func (f retryStorage) MakeVol(volume string) (err error) { +func (f *retryStorage) MakeVol(volume string) (err error) { + if f.IsOffline() { + return errDiskNotFound + } err = f.remoteStorage.MakeVol(volume) - if err == errDiskNotFound { - err = f.reInit() - if err == nil { - return f.remoteStorage.MakeVol(volume) - } + if f.reInitUponDiskNotFound(err) { + return retryToStorageErr(f.remoteStorage.MakeVol(volume)) } - return err + return retryToStorageErr(err) } // ListVols - a retryable implementation of listing all the volumes. -func (f retryStorage) ListVols() (vols []VolInfo, err error) { +func (f *retryStorage) ListVols() (vols []VolInfo, err error) { + if f.IsOffline() { + return vols, errDiskNotFound + } vols, err = f.remoteStorage.ListVols() - if err == errDiskNotFound { - err = f.reInit() - if err == nil { - return f.remoteStorage.ListVols() - } + if f.reInitUponDiskNotFound(err) { + vols, err = f.remoteStorage.ListVols() + return vols, retryToStorageErr(err) } - return vols, err + return vols, retryToStorageErr(err) } // StatVol - a retryable implementation of stating a volume. -func (f retryStorage) StatVol(volume string) (vol VolInfo, err error) { +func (f *retryStorage) StatVol(volume string) (vol VolInfo, err error) { + if f.IsOffline() { + return vol, errDiskNotFound + } vol, err = f.remoteStorage.StatVol(volume) - if err == errDiskNotFound { - err = f.reInit() - if err == nil { - return f.remoteStorage.StatVol(volume) - } + if f.reInitUponDiskNotFound(err) { + vol, err = f.remoteStorage.StatVol(volume) + return vol, retryToStorageErr(err) } - return vol, err + return vol, retryToStorageErr(err) } // DeleteVol - a retryable implementation of deleting a volume. -func (f retryStorage) DeleteVol(volume string) (err error) { +func (f *retryStorage) DeleteVol(volume string) (err error) { + if f.IsOffline() { + return errDiskNotFound + } err = f.remoteStorage.DeleteVol(volume) - if err == errDiskNotFound { - err = f.reInit() - if err == nil { - return f.remoteStorage.DeleteVol(volume) - } + if f.reInitUponDiskNotFound(err) { + return retryToStorageErr(f.remoteStorage.DeleteVol(volume)) } - return err + return retryToStorageErr(err) } // PrepareFile - a retryable implementation of preparing a file. -func (f retryStorage) PrepareFile(volume, path string, length int64) (err error) { +func (f *retryStorage) PrepareFile(volume, path string, length int64) (err error) { + if f.IsOffline() { + return errDiskNotFound + } err = f.remoteStorage.PrepareFile(volume, path, length) - if err == errDiskNotFound { - err = f.reInit() - if err == nil { - return f.remoteStorage.PrepareFile(volume, path, length) - } + if f.reInitUponDiskNotFound(err) { + return retryToStorageErr(f.remoteStorage.PrepareFile(volume, path, length)) } - return err + return retryToStorageErr(err) } // AppendFile - a retryable implementation of append to a file. -func (f retryStorage) AppendFile(volume, path string, buffer []byte) (err error) { +func (f *retryStorage) AppendFile(volume, path string, buffer []byte) (err error) { + if f.IsOffline() { + return errDiskNotFound + } err = f.remoteStorage.AppendFile(volume, path, buffer) - if err == errDiskNotFound { - err = f.reInit() - if err == nil { - return f.remoteStorage.AppendFile(volume, path, buffer) - } + if f.reInitUponDiskNotFound(err) { + return retryToStorageErr(f.remoteStorage.AppendFile(volume, path, buffer)) } - return err + return retryToStorageErr(err) } // StatFile - a retryable implementation of stating a file. -func (f retryStorage) StatFile(volume, path string) (fileInfo FileInfo, err error) { +func (f *retryStorage) StatFile(volume, path string) (fileInfo FileInfo, err error) { + if f.IsOffline() { + return fileInfo, errDiskNotFound + } fileInfo, err = f.remoteStorage.StatFile(volume, path) - if err == errDiskNotFound { - err = f.reInit() - if err == nil { - return f.remoteStorage.StatFile(volume, path) - } + if f.reInitUponDiskNotFound(err) { + fileInfo, err = f.remoteStorage.StatFile(volume, path) + return fileInfo, retryToStorageErr(err) } - return fileInfo, err + return fileInfo, retryToStorageErr(err) } // ReadAll - a retryable implementation of reading all the content from a file. -func (f retryStorage) ReadAll(volume, path string) (buf []byte, err error) { +func (f *retryStorage) ReadAll(volume, path string) (buf []byte, err error) { + if f.IsOffline() { + return buf, errDiskNotFound + } buf, err = f.remoteStorage.ReadAll(volume, path) - if err == errDiskNotFound { - err = f.reInit() - if err == nil { - return f.remoteStorage.ReadAll(volume, path) - } + if f.reInitUponDiskNotFound(err) { + buf, err = f.remoteStorage.ReadAll(volume, path) + return buf, retryToStorageErr(err) } - return buf, err + return buf, retryToStorageErr(err) } // ReadFile - a retryable implementation of reading at offset from a file. -func (f retryStorage) ReadFile(volume, path string, offset int64, buffer []byte) (m int64, err error) { +func (f *retryStorage) ReadFile(volume, path string, offset int64, buffer []byte) (m int64, err error) { + if f.IsOffline() { + return m, errDiskNotFound + } m, err = f.remoteStorage.ReadFile(volume, path, offset, buffer) - if err == errDiskNotFound { - err = f.reInit() - if err == nil { - return f.remoteStorage.ReadFile(volume, path, offset, buffer) - } + if f.reInitUponDiskNotFound(err) { + m, err = f.remoteStorage.ReadFile(volume, path, offset, buffer) + return m, retryToStorageErr(err) } - return m, err + return m, retryToStorageErr(err) } // ReadFileWithVerify - a retryable implementation of reading at // offset from a file with verification. -func (f retryStorage) ReadFileWithVerify(volume, path string, offset int64, buffer []byte, +func (f *retryStorage) ReadFileWithVerify(volume, path string, offset int64, buffer []byte, algo HashAlgo, expectedHash string) (m int64, err error) { + if f.IsOffline() { + return m, errDiskNotFound + } m, err = f.remoteStorage.ReadFileWithVerify(volume, path, offset, buffer, algo, expectedHash) - if err == errDiskNotFound { - err = f.reInit() - if err == nil { - return f.remoteStorage.ReadFileWithVerify(volume, path, - offset, buffer, algo, expectedHash) - } + if f.reInitUponDiskNotFound(err) { + m, err = f.remoteStorage.ReadFileWithVerify(volume, path, + offset, buffer, algo, expectedHash) + return m, retryToStorageErr(err) } - return m, err + return m, retryToStorageErr(err) } // ListDir - a retryable implementation of listing directory entries. -func (f retryStorage) ListDir(volume, path string) (entries []string, err error) { +func (f *retryStorage) ListDir(volume, path string) (entries []string, err error) { + if f.IsOffline() { + return entries, errDiskNotFound + } entries, err = f.remoteStorage.ListDir(volume, path) - if err == errDiskNotFound { - err = f.reInit() - if err == nil { - return f.remoteStorage.ListDir(volume, path) - } + if f.reInitUponDiskNotFound(err) { + entries, err = f.remoteStorage.ListDir(volume, path) + return entries, retryToStorageErr(err) } - return entries, err + return entries, retryToStorageErr(err) } // DeleteFile - a retryable implementation of deleting a file. -func (f retryStorage) DeleteFile(volume, path string) (err error) { +func (f *retryStorage) DeleteFile(volume, path string) (err error) { + if f.IsOffline() { + return errDiskNotFound + } err = f.remoteStorage.DeleteFile(volume, path) - if err == errDiskNotFound { - err = f.reInit() - if err == nil { - return f.remoteStorage.DeleteFile(volume, path) - } + if f.reInitUponDiskNotFound(err) { + return retryToStorageErr(f.remoteStorage.DeleteFile(volume, path)) } - return err + return retryToStorageErr(err) } // RenameFile - a retryable implementation of renaming a file. -func (f retryStorage) RenameFile(srcVolume, srcPath, dstVolume, dstPath string) (err error) { +func (f *retryStorage) RenameFile(srcVolume, srcPath, dstVolume, dstPath string) (err error) { + if f.IsOffline() { + return errDiskNotFound + } err = f.remoteStorage.RenameFile(srcVolume, srcPath, dstVolume, dstPath) - if err == errDiskNotFound { - err = f.reInit() - if err == nil { - return f.remoteStorage.RenameFile(srcVolume, srcPath, dstVolume, dstPath) - } + if f.reInitUponDiskNotFound(err) { + return retryToStorageErr(f.remoteStorage.RenameFile(srcVolume, srcPath, dstVolume, dstPath)) } - return err + return retryToStorageErr(err) +} + +// Try to reinitialize the connection when we have some form of DiskNotFound error +func (f *retryStorage) reInitUponDiskNotFound(err error) bool { + if err == errDiskNotFound || err == errDiskNotFoundFromNetError || err == errDiskNotFoundFromRPCShutdown { + return f.reInit(err) == nil + } + return false } // Connect and attempt to load the format from a disconnected node, // attempts three times before giving up. -func (f retryStorage) reInit() (err error) { +func (f *retryStorage) reInit(e error) (err error) { + + // Only after initialization and minimum of one interval + // has passed (to prevent marking a node as offline right + // after initialization), check whether node has gone offline + if f.maxRetryAttempts == globalStorageRetryThreshold && + UTCNow().Sub(f.offlineTimestamp) >= globalStorageHealthCheckInterval { + if e == errDiskNotFoundFromNetError { // Make node offline due to network error + f.offline = true // Marking node offline + f.offlineTimestamp = UTCNow() + return errDiskNotFound + } + // Continue for other errors like RPC shutdown (and retry connection below) + } + // Close the underlying connection. f.remoteStorage.Close() // Error here is purposefully ignored. diff --git a/cmd/retry-storage_test.go b/cmd/retry-storage_test.go index 2e2b227e8b41f..ae87adaddcc6d 100644 --- a/cmd/retry-storage_test.go +++ b/cmd/retry-storage_test.go @@ -20,6 +20,7 @@ import ( "bytes" "crypto/sha256" "encoding/hex" + "errors" "reflect" "testing" "time" @@ -423,3 +424,31 @@ func TestRetryStorage(t *testing.T) { } } } + +// Tests reply storage error transformation. +func TestReplyStorageErr(t *testing.T) { + unknownErr := errors.New("Unknown error") + testCases := []struct { + expectedErr error + err error + }{ + { + expectedErr: errDiskNotFound, + err: errDiskNotFoundFromNetError, + }, + { + expectedErr: errDiskNotFound, + err: errDiskNotFoundFromRPCShutdown, + }, + { + expectedErr: unknownErr, + err: unknownErr, + }, + } + for i, testCase := range testCases { + resultErr := retryToStorageErr(testCase.err) + if testCase.expectedErr != resultErr { + t.Errorf("Test %d: Expected %s, got %s", i+1, testCase.expectedErr, resultErr) + } + } +} diff --git a/cmd/storage-errors.go b/cmd/storage-errors.go index 0148985e23d5d..12ba66b8a756a 100644 --- a/cmd/storage-errors.go +++ b/cmd/storage-errors.go @@ -37,9 +37,15 @@ var errUnformattedDisk = errors.New("unformatted disk found") // errDiskFull - cannot create volume or files when disk is full. var errDiskFull = errors.New("disk path full") -// errDiskNotFount - cannot find the underlying configured disk anymore. +// errDiskNotFound - cannot find the underlying configured disk anymore. var errDiskNotFound = errors.New("disk not found") +// errDiskNotFoundFromNetError - cannot find the underlying configured disk anymore due to network error. +var errDiskNotFoundFromNetError = errors.New("disk not found from net error") + +// errDiskNotFoundFromShutdown - cannot find the underlying configured disk anymore due to rpc shutdown. +var errDiskNotFoundFromRPCShutdown = errors.New("disk not found from rpc shutdown") + // errFaultyRemoteDisk - remote disk is faulty. var errFaultyRemoteDisk = errors.New("remote disk is faulty") diff --git a/cmd/storage-rpc-client.go b/cmd/storage-rpc-client.go index 982e5e1585ac4..2208d923cdadc 100644 --- a/cmd/storage-rpc-client.go +++ b/cmd/storage-rpc-client.go @@ -45,11 +45,11 @@ func toStorageErr(err error) error { switch err.(type) { case *net.OpError: - return errDiskNotFound + return errDiskNotFoundFromNetError } if err == rpc.ErrShutdown { - return errDiskNotFound + return errDiskNotFoundFromRPCShutdown } switch err.Error() { diff --git a/cmd/storage-rpc-client_test.go b/cmd/storage-rpc-client_test.go index a3bf44f52ea06..9fb1df1c7542f 100644 --- a/cmd/storage-rpc-client_test.go +++ b/cmd/storage-rpc-client_test.go @@ -127,11 +127,11 @@ func TestStorageErr(t *testing.T) { err: fmt.Errorf("%s", io.ErrUnexpectedEOF.Error()), }, { - expectedErr: errDiskNotFound, + expectedErr: errDiskNotFoundFromNetError, err: &net.OpError{}, }, { - expectedErr: errDiskNotFound, + expectedErr: errDiskNotFoundFromRPCShutdown, err: rpc.ErrShutdown, }, {