Skip to content

Commit

Permalink
heal: Enable periodic bitrot scan configuration (minio#14464)
Browse files Browse the repository at this point in the history
  • Loading branch information
vadmeste authored Apr 7, 2022
1 parent ee49a23 commit 16431d2
Show file tree
Hide file tree
Showing 20 changed files with 192 additions and 50 deletions.
2 changes: 1 addition & 1 deletion cmd/admin-heal-ops.go
Original file line number Diff line number Diff line change
Expand Up @@ -701,7 +701,7 @@ func (h *healSequence) queueHealTask(source healSource, healType madmin.HealItem
if source.opts != nil {
task.opts = *source.opts
} else {
task.opts.ScanMode = globalHealConfig.ScanMode()
task.opts.ScanMode = madmin.HealNormalScan
}

h.mutex.Lock()
Expand Down
98 changes: 90 additions & 8 deletions cmd/data-scanner.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"bytes"
"context"
"encoding/binary"
"encoding/json"
"errors"
"fmt"
"io/fs"
Expand Down Expand Up @@ -103,6 +104,63 @@ func (s *safeDuration) Get() time.Duration {
return s.t
}

// getCycleScanMode picks the heal scan mode to use for scanner cycle
// currentCycle, based on the value returned by globalHealConfig.BitrotScanCycle():
//
//   - -1 always yields madmin.HealNormalScan;
//   - 0 always yields madmin.HealDeepScan;
//   - any other value yields madmin.HealDeepScan while fewer than
//     healObjectSelectProb cycles have run since bitrotStartCycle, or once more
//     than that configured duration has passed since bitrotStartTime, and
//     madmin.HealNormalScan otherwise.
func getCycleScanMode(currentCycle, bitrotStartCycle uint64, bitrotStartTime time.Time) madmin.HealScanMode {
	scanInterval := globalHealConfig.BitrotScanCycle()
	if scanInterval == -1 {
		return madmin.HealNormalScan
	}
	if scanInterval == 0 {
		return madmin.HealDeepScan
	}

	// The subtraction is unsigned: should bitrotStartCycle ever exceed
	// currentCycle the difference wraps to a very large value, so the
	// cycle-count test fails and only the elapsed-time test can select
	// a deep scan.
	cyclesSinceStart := currentCycle - bitrotStartCycle
	if cyclesSinceStart < healObjectSelectProb || time.Since(bitrotStartTime) > scanInterval {
		return madmin.HealDeepScan
	}
	return madmin.HealNormalScan
}

// backgroundHealInfo is the bitrot-scan bookkeeping that the data scanner
// persists as JSON at backgroundHealInfoPath between scanner cycles.
type backgroundHealInfo struct {
// BitrotStartTime is the UTC time recorded when the scan mode last
// switched to a deep scan.
BitrotStartTime time.Time `json:"bitrotStartTime"`
// BitrotStartCycle is the scanner cycle number recorded at that same switch.
BitrotStartCycle uint64 `json:"bitrotStartCycle"`
// CurrentScanMode is the scan mode that was selected when this record
// was last saved.
CurrentScanMode madmin.HealScanMode `json:"currentScanMode"`
}

// readBackgroundHealInfo loads the persisted background heal state stored at
// backgroundHealInfoPath and decodes it from JSON.
//
// It never fails: when the config cannot be read or decoded, the zero
// backgroundHealInfo is returned. Every error except errConfigNotFound is
// logged; a missing config is treated as "nothing saved yet" and stays silent.
func readBackgroundHealInfo(ctx context.Context, objAPI ObjectLayer) backgroundHealInfo {
	var empty backgroundHealInfo

	raw, err := readConfig(ctx, objAPI, backgroundHealInfoPath)
	switch {
	case err == nil:
		// Config was read, fall through to decoding.
	case errors.Is(err, errConfigNotFound):
		return empty
	default:
		logger.LogIf(ctx, err)
		return empty
	}

	var healInfo backgroundHealInfo
	if err = json.Unmarshal(raw, &healInfo); err != nil {
		logger.LogIf(ctx, err)
		// Discard anything that was partially decoded.
		return empty
	}
	return healInfo
}

// saveBackgroundHealInfo encodes info as JSON and writes it to
// backgroundHealInfoPath. Encoding and write failures are logged and are
// not reported to the caller.
func saveBackgroundHealInfo(ctx context.Context, objAPI ObjectLayer, info backgroundHealInfo) {
	encoded, err := json.Marshal(info)
	if err != nil {
		logger.LogIf(ctx, err)
		return
	}

	// Persist the updated healing information.
	if err = saveConfig(ctx, objAPI, backgroundHealInfoPath, encoded); err != nil {
		logger.LogIf(ctx, err)
	}
}

// runDataScanner will start a data scanner.
// The function will block until the context is canceled.
// There should only ever be one scanner running per cluster.
Expand Down Expand Up @@ -145,12 +203,24 @@ func runDataScanner(pctx context.Context, objAPI ObjectLayer) {
console.Debugln("starting scanner cycle")
}

bgHealInfo := readBackgroundHealInfo(ctx, objAPI)
scanMode := getCycleScanMode(nextBloomCycle, bgHealInfo.BitrotStartCycle, bgHealInfo.BitrotStartTime)
if bgHealInfo.CurrentScanMode != scanMode {
newHealInfo := bgHealInfo
newHealInfo.CurrentScanMode = scanMode
if scanMode == madmin.HealDeepScan {
newHealInfo.BitrotStartTime = time.Now().UTC()
newHealInfo.BitrotStartCycle = nextBloomCycle
}
saveBackgroundHealInfo(ctx, objAPI, newHealInfo)
}

// Wait before starting next cycle and wait on startup.
results := make(chan DataUsageInfo, 1)
go storeDataUsageInBackend(ctx, objAPI, results)
bf, err := globalNotificationSys.updateBloomFilter(ctx, nextBloomCycle)
logger.LogIf(ctx, err)
err = objAPI.NSScanner(ctx, bf, results, uint32(nextBloomCycle))
err = objAPI.NSScanner(ctx, bf, results, uint32(nextBloomCycle), scanMode)
logger.LogIf(ctx, err)
if err == nil {
// Store new cycle...
Expand Down Expand Up @@ -182,6 +252,7 @@ type folderScanner struct {
dataUsageScannerDebug bool
healFolderInclude uint32 // Include a clean folder one in n cycles.
healObjectSelect uint32 // Do a heal check on an object once every n cycles. Must divide into healFolderInclude
scanMode madmin.HealScanMode

disks []StorageAPI
disksQuorum int
Expand Down Expand Up @@ -250,7 +321,7 @@ var globalScannerStats scannerStats
// The returned cache will always be valid, but may not be updated from the existing.
// Before each operation sleepDuration is called which can be used to temporarily halt the scanner.
// If the supplied context is canceled the function will return at the first chance.
func scanDataFolder(ctx context.Context, poolIdx, setIdx int, basePath string, cache dataUsageCache, getSize getSizeFn) (dataUsageCache, error) {
func scanDataFolder(ctx context.Context, poolIdx, setIdx int, basePath string, cache dataUsageCache, getSize getSizeFn, scanMode madmin.HealScanMode) (dataUsageCache, error) {
t := UTCNow()

logPrefix := color.Green("data-usage: ")
Expand Down Expand Up @@ -279,6 +350,7 @@ func scanDataFolder(ctx context.Context, poolIdx, setIdx int, basePath string, c
dataUsageScannerDebug: intDataUpdateTracker.debug,
healFolderInclude: 0,
healObjectSelect: 0,
scanMode: scanMode,
updates: cache.Info.updates,
}

Expand Down Expand Up @@ -482,12 +554,15 @@ func (f *folderScanner) scanFolder(ctx context.Context, folder cachedFolder, int
debug: f.dataUsageScannerDebug,
lifeCycle: activeLifeCycle,
replication: replicationCfg,
heal: thisHash.modAlt(f.oldCache.Info.NextCycle/folder.objectHealProbDiv, f.healObjectSelect/folder.objectHealProbDiv) && globalIsErasure,
}

item.heal.enabled = thisHash.modAlt(f.oldCache.Info.NextCycle/folder.objectHealProbDiv, f.healObjectSelect/folder.objectHealProbDiv) && globalIsErasure
item.heal.bitrot = f.scanMode == madmin.HealDeepScan

// if the drive belongs to an erasure set
// that is already being healed, skip the
// healing attempt on this drive.
item.heal = item.heal && f.healObjectSelect > 0
item.heal.enabled = item.heal.enabled && f.healObjectSelect > 0

sz, err := f.getSize(item)
if err != nil {
Expand Down Expand Up @@ -821,8 +896,11 @@ type scannerItem struct {
replication replicationConfig
lifeCycle *lifecycle.Lifecycle
Typ fs.FileMode
heal bool // Has the object been selected for heal check?
debug bool
heal struct {
enabled bool
bitrot bool
} // Has the object been selected for heal check?
debug bool
}

type sizeSummary struct {
Expand Down Expand Up @@ -874,9 +952,13 @@ func (i *scannerItem) applyHealing(ctx context.Context, o ObjectLayer, oi Object
console.Debugf(applyActionsLogPrefix+" heal checking: %v/%v\n", i.bucket, i.objectPath())
}
}
scanMode := madmin.HealNormalScan
if i.heal.bitrot {
scanMode = madmin.HealDeepScan
}
healOpts := madmin.HealOpts{
Remove: healDeleteDangling,
ScanMode: globalHealConfig.ScanMode(),
ScanMode: scanMode,
}
res, err := o.HealObject(ctx, i.bucket, i.objectPath(), oi.VersionID, healOpts)
if err != nil && !errors.Is(err, NotImplemented{}) {
Expand Down Expand Up @@ -1040,7 +1122,7 @@ func (i *scannerItem) applyActions(ctx context.Context, o ObjectLayer, oi Object
// from the current deployment, which means we don't have to call healing
// routine even if we are asked to do via heal flag.
if !applied {
if i.heal {
if i.heal.enabled {
size = i.applyHealing(ctx, o, oi)
}
// replicate only if lifecycle rules are not applied.
Expand Down
2 changes: 2 additions & 0 deletions cmd/data-usage.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ const (
dataUsageBloomName = ".bloomcycle.bin"
dataUsageBloomNamePath = bucketMetaPrefix + SlashSeparator + dataUsageBloomName

backgroundHealInfoPath = bucketMetaPrefix + SlashSeparator + ".background-heal.json"

dataUsageCacheName = ".usage-cache.bin"
)

Expand Down
10 changes: 5 additions & 5 deletions cmd/data-usage_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ func TestDataUsageUpdate(t *testing.T) {
return
}

got, err := scanDataFolder(context.Background(), 0, 0, base, dataUsageCache{Info: dataUsageCacheInfo{Name: bucket}}, getSize)
got, err := scanDataFolder(context.Background(), 0, 0, base, dataUsageCache{Info: dataUsageCacheInfo{Name: bucket}}, getSize, 0)
if err != nil {
t.Fatal(err)
}
Expand Down Expand Up @@ -178,7 +178,7 @@ func TestDataUsageUpdate(t *testing.T) {
}
// Changed dir must be picked up in this many cycles.
for i := 0; i < dataUsageUpdateDirCycles; i++ {
got, err = scanDataFolder(context.Background(), 0, 0, base, got, getSize)
got, err = scanDataFolder(context.Background(), 0, 0, base, got, getSize, 0)
got.Info.NextCycle++
if err != nil {
t.Fatal(err)
Expand Down Expand Up @@ -289,7 +289,7 @@ func TestDataUsageUpdatePrefix(t *testing.T) {
}
return
}
got, err := scanDataFolder(context.Background(), 0, 0, base, dataUsageCache{Info: dataUsageCacheInfo{Name: "bucket"}}, getSize)
got, err := scanDataFolder(context.Background(), 0, 0, base, dataUsageCache{Info: dataUsageCacheInfo{Name: "bucket"}}, getSize, 0)
if err != nil {
t.Fatal(err)
}
Expand Down Expand Up @@ -423,7 +423,7 @@ func TestDataUsageUpdatePrefix(t *testing.T) {
}
// Changed dir must be picked up in this many cycles.
for i := 0; i < dataUsageUpdateDirCycles; i++ {
got, err = scanDataFolder(context.Background(), 0, 0, base, got, getSize)
got, err = scanDataFolder(context.Background(), 0, 0, base, got, getSize, 0)
got.Info.NextCycle++
if err != nil {
t.Fatal(err)
Expand Down Expand Up @@ -575,7 +575,7 @@ func TestDataUsageCacheSerialize(t *testing.T) {
}
return
}
want, err := scanDataFolder(context.Background(), 0, 0, base, dataUsageCache{Info: dataUsageCacheInfo{Name: bucket}}, getSize)
want, err := scanDataFolder(context.Background(), 0, 0, base, dataUsageCache{Info: dataUsageCacheInfo{Name: bucket}}, getSize, 0)
if err != nil {
t.Fatal(err)
}
Expand Down
4 changes: 2 additions & 2 deletions cmd/erasure-server-pool.go
Original file line number Diff line number Diff line change
Expand Up @@ -531,7 +531,7 @@ func (z *erasureServerPools) StorageInfo(ctx context.Context) (StorageInfo, []er
return storageInfo, errs
}

func (z *erasureServerPools) NSScanner(ctx context.Context, bf *bloomFilter, updates chan<- DataUsageInfo, wantCycle uint32) error {
func (z *erasureServerPools) NSScanner(ctx context.Context, bf *bloomFilter, updates chan<- DataUsageInfo, wantCycle uint32, healScanMode madmin.HealScanMode) error {
// Updates must be closed before we return.
defer close(updates)

Expand Down Expand Up @@ -576,7 +576,7 @@ func (z *erasureServerPools) NSScanner(ctx context.Context, bf *bloomFilter, upd
}
}()
// Start scanner. Blocks until done.
err := erObj.nsScanner(ctx, allBuckets, bf, wantCycle, updates)
err := erObj.nsScanner(ctx, allBuckets, bf, wantCycle, updates, healScanMode)
if err != nil {
logger.LogIf(ctx, err)
mu.Lock()
Expand Down
4 changes: 2 additions & 2 deletions cmd/erasure.go
Original file line number Diff line number Diff line change
Expand Up @@ -344,7 +344,7 @@ func (er erasureObjects) cleanupDeletedObjects(ctx context.Context) {

// nsScanner will start scanning buckets and send updated totals as they are traversed.
// Updates are sent on a regular basis and the caller *must* consume them.
func (er erasureObjects) nsScanner(ctx context.Context, buckets []BucketInfo, bf *bloomFilter, wantCycle uint32, updates chan<- dataUsageCache) error {
func (er erasureObjects) nsScanner(ctx context.Context, buckets []BucketInfo, bf *bloomFilter, wantCycle uint32, updates chan<- dataUsageCache, healScanMode madmin.HealScanMode) error {
if len(buckets) == 0 {
return nil
}
Expand Down Expand Up @@ -490,7 +490,7 @@ func (er erasureObjects) nsScanner(ctx context.Context, buckets []BucketInfo, bf
// Calc usage
before := cache.Info.LastUpdate
var err error
cache, err = disk.NSScanner(ctx, cache, updates)
cache, err = disk.NSScanner(ctx, cache, updates, healScanMode)
cache.Info.BloomFilter = nil
if err != nil {
if !cache.Info.LastUpdate.IsZero() && cache.Info.LastUpdate.After(before) {
Expand Down
4 changes: 2 additions & 2 deletions cmd/fs-v1.go
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ func (fs *FSObjects) StorageInfo(ctx context.Context) (StorageInfo, []error) {
}

// NSScanner returns data usage stats of the current FS deployment
func (fs *FSObjects) NSScanner(ctx context.Context, bf *bloomFilter, updates chan<- DataUsageInfo, wantCycle uint32) error {
func (fs *FSObjects) NSScanner(ctx context.Context, bf *bloomFilter, updates chan<- DataUsageInfo, wantCycle uint32, _ madmin.HealScanMode) error {
defer close(updates)
// Load bucket totals
var totalCache dataUsageCache
Expand Down Expand Up @@ -396,7 +396,7 @@ func (fs *FSObjects) scanBucket(ctx context.Context, bucket string, cache dataUs
}

return sizeSummary{totalSize: fi.Size(), versions: 1}, nil
})
}, 0)

return cache, err
}
Expand Down
2 changes: 1 addition & 1 deletion cmd/gateway-unsupported.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ func (a GatewayUnsupported) LocalStorageInfo(ctx context.Context) (StorageInfo,
}

// NSScanner - scanner is not implemented for gateway
func (a GatewayUnsupported) NSScanner(ctx context.Context, bf *bloomFilter, updates chan<- DataUsageInfo, wantCycle uint32) error {
func (a GatewayUnsupported) NSScanner(ctx context.Context, bf *bloomFilter, updates chan<- DataUsageInfo, wantCycle uint32, scanMode madmin.HealScanMode) error {
logger.CriticalIf(ctx, errors.New("not implemented"))
return NotImplemented{}
}
Expand Down
5 changes: 2 additions & 3 deletions cmd/global-heal.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,7 @@ func newBgHealSequence() *healSequence {

hs := madmin.HealOpts{
// Remove objects that do not have read-quorum
Remove: healDeleteDangling,
ScanMode: globalHealConfig.ScanMode(),
Remove: healDeleteDangling,
}

return &healSequence{
Expand Down Expand Up @@ -165,7 +164,7 @@ func mustGetHealSequence(ctx context.Context) *healSequence {
// healErasureSet lists and heals all objects in a specific erasure set
func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string, tracker *healingTracker) error {
bgSeq := mustGetHealSequence(ctx)
scanMode := globalHealConfig.ScanMode()
scanMode := madmin.HealNormalScan

// Make sure to copy since `buckets slice`
// is modified in place by tracker.
Expand Down
2 changes: 1 addition & 1 deletion cmd/mrf.go
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ func (m *mrfState) healRoutine() {
defer idler.Stop()

mrfHealingOpts := madmin.HealOpts{
ScanMode: globalHealConfig.ScanMode(),
ScanMode: madmin.HealNormalScan,
Remove: healDeleteDangling,
}

Expand Down
6 changes: 4 additions & 2 deletions cmd/naughty-disk_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ import (
"io"
"sync"
"time"

"github.com/minio/madmin-go"
)

// naughtyDisk wraps a POSIX disk and returns programmed errors
Expand Down Expand Up @@ -110,8 +112,8 @@ func (d *naughtyDisk) SetDiskID(id string) {
d.disk.SetDiskID(id)
}

func (d *naughtyDisk) NSScanner(ctx context.Context, cache dataUsageCache, updates chan<- dataUsageEntry) (info dataUsageCache, err error) {
return d.disk.NSScanner(ctx, cache, updates)
func (d *naughtyDisk) NSScanner(ctx context.Context, cache dataUsageCache, updates chan<- dataUsageEntry, scanMode madmin.HealScanMode) (info dataUsageCache, err error) {
return d.disk.NSScanner(ctx, cache, updates, scanMode)
}

func (d *naughtyDisk) DiskInfo(ctx context.Context) (info DiskInfo, err error) {
Expand Down
2 changes: 1 addition & 1 deletion cmd/object-api-interface.go
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ type ObjectLayer interface {

// Storage operations.
Shutdown(context.Context) error
NSScanner(ctx context.Context, bf *bloomFilter, updates chan<- DataUsageInfo, wantCycle uint32) error
NSScanner(ctx context.Context, bf *bloomFilter, updates chan<- DataUsageInfo, wantCycle uint32, scanMode madmin.HealScanMode) error
BackendInfo() madmin.BackendInfo
StorageInfo(ctx context.Context) (StorageInfo, []error)
LocalStorageInfo(ctx context.Context) (StorageInfo, []error)
Expand Down
6 changes: 4 additions & 2 deletions cmd/storage-interface.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ import (
"context"
"io"
"time"

"github.com/minio/madmin-go"
)

// StorageAPI interface.
Expand Down Expand Up @@ -64,7 +66,7 @@ type StorageAPI interface {
// has never been replaced.
Healing() *healingTracker
DiskInfo(ctx context.Context) (info DiskInfo, err error)
NSScanner(ctx context.Context, cache dataUsageCache, updates chan<- dataUsageEntry) (dataUsageCache, error)
NSScanner(ctx context.Context, cache dataUsageCache, updates chan<- dataUsageEntry, scanMode madmin.HealScanMode) (dataUsageCache, error)

// Volume operations.
MakeVol(ctx context.Context, volume string) (err error)
Expand Down Expand Up @@ -142,7 +144,7 @@ func (p *unrecognizedDisk) Healing() *healingTracker {
return nil
}

func (p *unrecognizedDisk) NSScanner(ctx context.Context, cache dataUsageCache, updates chan<- dataUsageEntry) (dataUsageCache, error) {
func (p *unrecognizedDisk) NSScanner(ctx context.Context, cache dataUsageCache, updates chan<- dataUsageEntry, scanMode madmin.HealScanMode) (dataUsageCache, error) {
return dataUsageCache{}, errDiskNotFound
}

Expand Down
Loading

0 comments on commit 16431d2

Please sign in to comment.