diff --git a/br/pkg/errors/errors.go b/br/pkg/errors/errors.go
index 130569514db35..1b78a861fa668 100644
--- a/br/pkg/errors/errors.go
+++ b/br/pkg/errors/errors.go
@@ -97,6 +97,7 @@ var (
 	ErrKVClusterIDMismatch = errors.Normalize("tikv cluster ID mismatch", errors.RFCCodeText("BR:KV:ErrKVClusterIDMismatch"))
 	ErrKVNotLeader         = errors.Normalize("not leader", errors.RFCCodeText("BR:KV:ErrKVNotLeader"))
 	ErrKVNotTiKV           = errors.Normalize("storage is not tikv", errors.RFCCodeText("BR:KV:ErrNotTiKVStorage"))
+	ErrKVDiskFull          = errors.Normalize("disk is full", errors.RFCCodeText("BR:KV:ErrKVDiskFull"))
 
 	// ErrKVEpochNotMatch is the error raised when ingestion failed with "epoch
 	// not match". This error is retryable.
diff --git a/br/pkg/task/BUILD.bazel b/br/pkg/task/BUILD.bazel
index cdaebcd217366..b8bb4177bd6af 100644
--- a/br/pkg/task/BUILD.bazel
+++ b/br/pkg/task/BUILD.bazel
@@ -135,6 +135,7 @@ go_test(
         "//pkg/testkit",
         "//pkg/types",
         "//pkg/util/table-filter",
+        "@com_github_docker_go_units//:go-units",
         "@com_github_gogo_protobuf//proto",
         "@com_github_golang_protobuf//proto",
         "@com_github_pingcap_errors//:errors",
diff --git a/br/pkg/task/restore.go b/br/pkg/task/restore.go
index 1c047044d09b0..8bc6383be78b6 100644
--- a/br/pkg/task/restore.go
+++ b/br/pkg/task/restore.go
@@ -102,7 +102,6 @@ const (
 	defaultFlagDdlBatchSize   = 128
 	resetSpeedLimitRetryTimes = 3
 	maxRestoreBatchSizeLimit  = 10240
-	pb                        = 1024 * 1024 * 1024 * 1024 * 1024
 )
 
 const (
@@ -1240,37 +1239,41 @@ func getStores(ctx context.Context, mgr *conn.Mgr) (stores *http.StoresInfo, err
 	return stores, nil
 }
 
-func EstimateTikvUsage(files []*backuppb.File, maxReplica uint64, storeCnt int) uint64 {
+func EstimateTikvUsage(files []*backuppb.File, replicaCnt uint64, storeCnt uint64) uint64 {
 	if storeCnt == 0 {
 		return 0
 	}
-	var totalSize uint64 = 0
+	if replicaCnt > storeCnt {
+		replicaCnt = storeCnt
+	}
+	totalSize := uint64(0)
 	for _, file := range files {
 		totalSize += file.GetSize_()
 	}
-	return totalSize * maxReplica / uint64(storeCnt)
+	log.Info("estimate tikv usage", zap.Uint64("total size", totalSize), zap.Uint64("replicaCnt", replicaCnt), zap.Uint64("store count", storeCnt))
+	return totalSize * replicaCnt / storeCnt
 }
 
-func EstimateTiflashUsage(tables []*metautil.Table, storeCnt int) uint64 {
+func EstimateTiflashUsage(tables []*metautil.Table, storeCnt uint64) uint64 {
 	if storeCnt == 0 {
 		return 0
 	}
-	var tiflashTotal uint64 = 0
+	tiflashTotal := uint64(0)
 	for _, table := range tables {
-		if table.TiFlashReplicas <= 0 {
+		if table.Info.TiFlashReplica == nil || table.Info.TiFlashReplica.Count <= 0 {
 			continue
 		}
 		tableBytes := uint64(0)
 		for _, file := range table.Files {
 			tableBytes += file.GetSize_()
 		}
-		tiflashTotal += tableBytes * uint64(table.TiFlashReplicas)
+		tiflashTotal += tableBytes * table.Info.TiFlashReplica.Count
 	}
-	return tiflashTotal / uint64(storeCnt)
+	log.Info("estimate tiflash usage", zap.Uint64("total size", tiflashTotal), zap.Uint64("store count", storeCnt))
+	return tiflashTotal / storeCnt
 }
 
 func CheckStoreSpace(necessary uint64, store *http.StoreInfo) error {
-	// Be careful editing the message, it is used in DiskCheckBackoffer
 	available, err := units.RAMInBytes(store.Status.Available)
 	if err != nil {
 		return errors.Annotatef(berrors.ErrPDInvalidResponse, "store %d has invalid available space %s", store.Store.ID, store.Status.Available)
@@ -1279,7 +1282,7 @@ func CheckStoreSpace(necessary uint64, store *http.StoreInfo) error {
 		return errors.Annotatef(berrors.ErrPDInvalidResponse, "store %d has invalid available space %s", store.Store.ID, store.Status.Available)
 	}
 	if uint64(available) < necessary {
-		return errors.Errorf("store %d has no space left on device, available %s, necessary %s",
+		return errors.Annotatef(berrors.ErrKVDiskFull, "store %d has no space left on device, available %s, necessary %s",
 			store.Store.ID, units.BytesSize(float64(available)), units.BytesSize(float64(necessary)))
 	}
 	return nil
@@ -1295,7 +1298,7 @@ func checkDiskSpace(ctx context.Context, mgr *conn.Mgr, files []*backuppb.File,
 		return errors.Trace(err)
 	}
 
-	tikvCnt, tiflashCnt := 0, 0
+	var tikvCnt, tiflashCnt uint64 = 0, 0
 	for i := range stores.Stores {
 		store := &stores.Stores[i]
 		if engine.IsTiFlashHTTPResp(&store.Store) {
@@ -1307,13 +1310,14 @@ func checkDiskSpace(ctx context.Context, mgr *conn.Mgr, files []*backuppb.File,
 	// We won't need to restore more than 1800 PB data at one time, right?
 	preserve := func(base uint64, ratio float32) uint64 {
-		if base > 1000*pb {
+		if base > 1000*units.PB {
 			return base
 		}
 		return base * uint64(ratio*10) / 10
 	}
 
 	tikvUsage := preserve(EstimateTikvUsage(files, maxReplica, tikvCnt), 1.1)
-	tiflashUsage := preserve(EstimateTiflashUsage(tables, tiflashCnt), 1.1)
+	tiflashUsage := preserve(EstimateTiflashUsage(tables, tiflashCnt), 1.4)
+	log.Info("preserved disk space", zap.Uint64("tikv", tikvUsage), zap.Uint64("tiflash", tiflashUsage))
 
 	err = utils.WithRetry(ctx, func() error {
 		stores, err = getStores(ctx, mgr)
diff --git a/br/pkg/task/restore_test.go b/br/pkg/task/restore_test.go
index 4f50946acd449..10ac59f7b0932 100644
--- a/br/pkg/task/restore_test.go
+++ b/br/pkg/task/restore_test.go
@@ -9,6 +9,7 @@ import (
 	"strconv"
 	"testing"
 
+	"github.com/docker/go-units"
 	"github.com/gogo/protobuf/proto"
 	backuppb "github.com/pingcap/kvproto/pkg/brpb"
 	"github.com/pingcap/kvproto/pkg/encryptionpb"
@@ -32,7 +33,7 @@ import (
 	pdhttp "github.com/tikv/pd/client/http"
 )
 
-const pb uint64 = 1024 * 1024 * 1024 * 1024 * 1024
+const pb uint64 = units.PB
 
 func TestPreCheckTableTiFlashReplicas(t *testing.T) {
 	mockStores := []*metapb.Store{
@@ -492,23 +493,26 @@ func TestTikvUsage(t *testing.T) {
 		{Name: "F5", Size_: 5 * pb},
 	}
 	replica := uint64(3)
-	storeCnt := 6
+	storeCnt := uint64(6)
 	total := uint64(0)
 	for _, f := range files {
 		total += f.GetSize_()
 	}
 	ret := task.EstimateTikvUsage(files, replica, storeCnt)
-	require.Equal(t, total*replica/uint64(storeCnt), ret)
+	require.Equal(t, total*replica/storeCnt, ret)
 }
 
 func TestTiflashUsage(t *testing.T) {
 	tables := []*metautil.Table{
-		{TiFlashReplicas: 0, Files: []*backuppb.File{{Size_: 1 * pb}}},
-		{TiFlashReplicas: 1, Files: []*backuppb.File{{Size_: 2 * pb}}},
-		{TiFlashReplicas: 2, Files: []*backuppb.File{{Size_: 3 * pb}}},
+		{Info: &model.TableInfo{TiFlashReplica: &model.TiFlashReplicaInfo{Count: 0}},
+			Files: []*backuppb.File{{Size_: 1 * pb}}},
+		{Info: &model.TableInfo{TiFlashReplica: &model.TiFlashReplicaInfo{Count: 1}},
+			Files: []*backuppb.File{{Size_: 2 * pb}}},
+		{Info: &model.TableInfo{TiFlashReplica: &model.TiFlashReplicaInfo{Count: 2}},
+			Files: []*backuppb.File{{Size_: 3 * pb}}},
 	}
-	storeCnt := 3
+	var storeCnt uint64 = 3
 	ret := task.EstimateTiflashUsage(tables, storeCnt)
 	require.Equal(t, 8*pb/3, ret)
 }
diff --git a/br/pkg/utils/backoff.go b/br/pkg/utils/backoff.go
index fb947980e572e..b9ce6dcb0b5d6 100644
--- a/br/pkg/utils/backoff.go
+++ b/br/pkg/utils/backoff.go
@@ -298,23 +298,18 @@ func NewDiskCheckBackoffer() Backoffer {
 func (bo *DiskCheckBackoffer) NextBackoff(err error) time.Duration {
 	e := errors.Cause(err)
 	switch e { // nolint:errorlint
-	case nil, context.Canceled, context.DeadlineExceeded:
+	case nil, context.Canceled, context.DeadlineExceeded, berrors.ErrKVDiskFull:
 		bo.delayTime = 0
 		bo.attempt = 0
 	case berrors.ErrPDInvalidResponse:
 		bo.delayTime = 2 * bo.delayTime
 		bo.attempt--
 	default:
-		if strings.Contains(e.Error(), "no space left on device") {
-			bo.delayTime = 0
-			bo.attempt = 0
-		} else {
-			bo.delayTime = 2 * bo.delayTime
-			if bo.attempt > 5 {
-				bo.attempt = 5
-			}
-			bo.attempt--
+		bo.delayTime = 2 * bo.delayTime
+		if bo.attempt > 5 {
+			bo.attempt = 5
 		}
+		bo.attempt--
 	}
 
 	if bo.delayTime > bo.maxDelayTime {
diff --git a/errors.toml b/errors.toml
index b3a2e27a2a0df..b97bde0ae5c53 100644
--- a/errors.toml
+++ b/errors.toml
@@ -111,6 +111,11 @@ error = '''
 tikv cluster ID mismatch
 '''
 
+["BR:KV:ErrKVDiskFull"]
+error = '''
+disk is full
+'''
+
 ["BR:KV:ErrKVDownloadFailed"]
 error = '''
 download sst failed
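
Reviewer note: below is a minimal, self-contained sketch of the estimation math this patch settles on, with hypothetical file sizes. Only `github.com/docker/go-units` is assumed; `estimateTikvUsage` and `preserve` merely mirror the patched helpers for illustration and are not BR's exported API.

```go
package main

import (
	"fmt"

	"github.com/docker/go-units"
)

// estimateTikvUsage mirrors EstimateTikvUsage after this patch: the replica
// count is clamped to the store count, since one store never holds two
// replicas of the same region.
func estimateTikvUsage(fileSizes []uint64, replicaCnt, storeCnt uint64) uint64 {
	if storeCnt == 0 {
		return 0
	}
	if replicaCnt > storeCnt {
		replicaCnt = storeCnt
	}
	total := uint64(0)
	for _, s := range fileSizes {
		total += s
	}
	return total * replicaCnt / storeCnt
}

// preserve mirrors the headroom closure in checkDiskSpace: scale the
// estimate by ratio, skipping the multiplication for absurdly large bases
// so the uint64 arithmetic cannot overflow.
func preserve(base uint64, ratio float32) uint64 {
	if base > 1000*units.PB {
		return base
	}
	return base * uint64(ratio*10) / 10
}

func main() {
	// Hypothetical backup: three SSTs, 5 replicas configured, 3 TiKV stores.
	files := []uint64{10 * units.GiB, 20 * units.GiB, 30 * units.GiB}
	perStore := estimateTikvUsage(files, 5, 3)                     // replicas clamp to 3 -> 60GiB per store
	fmt.Println(units.BytesSize(float64(preserve(perStore, 1.1)))) // 66GiB
}
```

The `1000*units.PB` guard presumably exists so that `base * uint64(ratio*10)` stays clear of uint64 overflow: 2^64 bytes is about 18.4 EB, and dividing by the scale factor of 10 leaves roughly 1.8 EB, i.e. the "1800 PB" mentioned in the comment inside checkDiskSpace.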