store/tikv: update backoff. (pingcap#1506)
disksing authored Jul 28, 2016
1 parent b38a5cd commit a5aca8a
Showing 15 changed files with 353 additions and 255 deletions.
98 changes: 89 additions & 9 deletions store/tikv/backoff.go
@@ -19,6 +19,7 @@ import (
"time"

"github.com/juju/errors"
"github.com/ngaut/log"
)

const (
@@ -32,17 +33,13 @@ const (
DecorrJitter
)

// NewBackoff creates a backoff func which implements exponential backoff with
// NewBackoffFn creates a backoff func which implements exponential backoff with
// optional jitters.
// See http://www.awsarchitectureblog.com/2015/03/backoff.html
func NewBackoff(retry, base, cap, jitter int) func() error {
func NewBackoffFn(base, cap, jitter int) func() int {
attempts := 0
totalSleep := 0
lastSleep := base
return func() error {
if attempts >= retry {
return errors.Errorf("still fail after %d retries, total sleep %dms", attempts, totalSleep)
}
return func() int {
var sleep int
switch jitter {
case NoJitter:
@@ -59,12 +56,95 @@ func NewBackoff(retry, base, cap, jitter int) func() error {
time.Sleep(time.Duration(sleep) * time.Millisecond)

attempts++
totalSleep += sleep
lastSleep = sleep
return nil
return lastSleep
}
}

func expo(base, cap, n int) int {
return int(math.Min(float64(cap), float64(base)*math.Pow(2.0, float64(n))))
}
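
Editor's note: the body of NewBackoffFn's jitter switch is collapsed in this diff, so the following is only a minimal sketch of the jitter strategies described in the AWS architecture blog post cited above, not the committed code. It assumes the NoJitter/FullJitter/EqualJitter/DecorrJitter constants and the expo helper defined in this file, imports of "math" and "math/rand", and base >= 2.

// nextSleep is a hypothetical helper showing one plausible way the jitter
// modes could compute the sleep interval (in ms) for a given attempt.
func nextSleep(jitter, base, cap, attempts, lastSleep int) int {
	switch jitter {
	case NoJitter:
		// plain capped exponential backoff
		return expo(base, cap, attempts)
	case FullJitter:
		// uniformly random in [0, expo)
		return rand.Intn(expo(base, cap, attempts))
	case EqualJitter:
		// half deterministic, half random
		v := expo(base, cap, attempts)
		return v/2 + rand.Intn(v/2)
	case DecorrJitter:
		// decorrelated jitter: grows from the previous sleep, capped at cap
		return int(math.Min(float64(cap), float64(base+rand.Intn(lastSleep*3-base))))
	}
	return base
}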

type backoffType int

const (
boTiKVRPC backoffType = iota
boTxnLock
boPDRPC
boRegionMiss
)

func (t backoffType) createFn() func() int {
switch t {
case boTiKVRPC:
return NewBackoffFn(100, 2000, EqualJitter)
case boTxnLock:
return NewBackoffFn(300, 3000, EqualJitter)
case boPDRPC:
return NewBackoffFn(500, 3000, EqualJitter)
case boRegionMiss:
return NewBackoffFn(100, 500, NoJitter)
}
return nil
}

// Maximum total sleep time(in ms) for kv/cop commands.
const (
copBuildTaskMaxBackoff = 5000
tsoMaxBackoff = 5000
scannerNextMaxBackoff = 5000
batchGetMaxBackoff = 10000
copNextMaxBackoff = 10000
getMaxBackoff = 10000
prewriteMaxBackoff = 10000
commitMaxBackoff = 10000
cleanupMaxBackoff = 10000
)

// Backoffer is a utility for retrying queries.
type Backoffer struct {
fn map[backoffType]func() int
maxSleep int
totalSleep int
errors []error
}

// NewBackoffer creates a Backoffer with maximum sleep time(in ms).
func NewBackoffer(maxSleep int) *Backoffer {
return &Backoffer{
maxSleep: maxSleep,
}
}

// Backoff sleeps a while based on the backoffType and records the error message.
// It returns a retryable error if total sleep time exceeds maxSleep.
func (b *Backoffer) Backoff(typ backoffType, err error) error {
// Lazy initialize.
if b.fn == nil {
b.fn = make(map[backoffType]func() int)
}
f, ok := b.fn[typ]
if !ok {
f = typ.createFn()
b.fn[typ] = f
}

b.totalSleep += f()

log.Warnf("%v, retry later(totalSleep %dms, maxSleep %dms)", err, b.totalSleep, b.maxSleep)
b.errors = append(b.errors, err)
if b.totalSleep >= b.maxSleep {
e := errors.Errorf("backoffer.maxSleep %dms is exceeded, errors: %v", b.maxSleep, b.errors)
return errors.Annotate(e, txnRetryableMark)
}
return nil
}
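
For reference, a minimal usage sketch (not part of this commit) of the retry loop callers are expected to drive with Backoff, mirroring how handleTask in coprocessor.go uses it: retry the work, let Backoff sleep after each retryable failure, and stop once it reports that the maxSleep budget is exhausted. doWithBackoff and the work callback are hypothetical names; NewBackoffer, Backoff, and boTiKVRPC are from this file.

// doWithBackoff is a hypothetical helper illustrating the Backoffer retry loop.
func doWithBackoff(maxSleep int, work func() error) error {
	bo := NewBackoffer(maxSleep)
	for {
		err := work()
		if err == nil {
			return nil
		}
		// Backoff sleeps per the boTiKVRPC policy; a non-nil return means
		// the total sleep time has exceeded maxSleep.
		if berr := bo.Backoff(boTiKVRPC, err); berr != nil {
			return berr
		}
	}
}

// Example (callTiKV is a placeholder for the real RPC):
//   err := doWithBackoff(getMaxBackoff, func() error { return callTiKV() })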

// Fork creates a new Backoffer which keeps current Backoffer's sleep time and errors.
func (b *Backoffer) Fork() *Backoffer {
return &Backoffer{
maxSleep: b.maxSleep,
totalSleep: b.totalSleep,
errors: b.errors,
}
}
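
Fork copies the accumulated sleep time and errors into a fresh Backoffer. A plausible use, sketched below as an assumption rather than code from this commit, is to hand each concurrent sub-request its own Backoffer so goroutines do not share mutable state but still inherit how much of the sleep budget has already been spent.

// runForked is a hypothetical sketch: run sub-tasks concurrently, each with a
// Backoffer forked from the parent's totalSleep and errors.
func runForked(parent *Backoffer, jobs []func(*Backoffer) error) error {
	errCh := make(chan error, len(jobs))
	for _, job := range jobs {
		go func(j func(*Backoffer) error) {
			errCh <- j(parent.Fork())
		}(job)
	}
	for range jobs {
		if err := <-errCh; err != nil {
			return err // report the first failure; remaining goroutines finish in the background
		}
	}
	return nil
}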
11 changes: 0 additions & 11 deletions store/tikv/client.go
@@ -126,14 +126,3 @@ func (c *rpcClient) Close() error {
c.p.Close()
return nil
}

// rpcBackoff is for RPC (with TiKV) retry.
// It is expected to sleep for about 10s(+/-3s) in total before abort.
func rpcBackoff() func() error {
const (
maxRetry = 10
sleepBase = 100
sleepCap = 2000
)
return NewBackoff(maxRetry, sleepBase, sleepCap, EqualJitter)
}
48 changes: 30 additions & 18 deletions store/tikv/coprocessor.go
@@ -76,7 +76,8 @@ func supportExpr(exprType tipb.ExprType) bool {

// Send builds the request and gets the coprocessor iterator response.
func (c *CopClient) Send(req *kv.Request) kv.Response {
tasks, err := buildCopTasks(c.store.regionCache, req.KeyRanges, req.Desc)
bo := NewBackoffer(copBuildTaskMaxBackoff)
tasks, err := buildCopTasks(bo, c.store.regionCache, req.KeyRanges, req.Desc)
if err != nil {
return copErrorResponse{err}
}
@@ -133,11 +134,11 @@ func (t *copTask) pbRanges() []*coprocessor.KeyRange {
return ranges
}

func buildCopTasks(cache *RegionCache, ranges []kv.KeyRange, desc bool) ([]*copTask, error) {
func buildCopTasks(bo *Backoffer, cache *RegionCache, ranges []kv.KeyRange, desc bool) ([]*copTask, error) {
var tasks []*copTask
for _, r := range ranges {
var err error
if tasks, err = appendTask(tasks, cache, r); err != nil {
if tasks, err = appendTask(tasks, bo, cache, r); err != nil {
return nil, errors.Trace(err)
}
}
@@ -155,14 +156,14 @@ func reverseTasks(tasks []*copTask) {
}
}

func appendTask(tasks []*copTask, cache *RegionCache, r kv.KeyRange) ([]*copTask, error) {
func appendTask(tasks []*copTask, bo *Backoffer, cache *RegionCache, r kv.KeyRange) ([]*copTask, error) {
var last *copTask
if len(tasks) > 0 {
last = tasks[len(tasks)-1]
}
// Ensure `r` (or part of `r`) is inside `last`, create a task if need.
if last == nil || !last.region.Contains(r.StartKey) {
region, err := cache.GetRegion(r.StartKey)
region, err := cache.GetRegion(bo, r.StartKey)
if err != nil {
return nil, errors.Trace(err)
}
@@ -187,7 +188,7 @@ func appendTask(tasks []*copTask, cache *RegionCache, r kv.KeyRange) ([]*copTask
StartKey: last.region.EndKey(),
EndKey: r.EndKey,
}
return appendTask(tasks, cache, remain)
return appendTask(tasks, bo, cache, remain)
}
return tasks, nil
}
@@ -207,6 +208,7 @@ type copIterator struct {

// Pick the next new copTask and send request to tikv-server.
func (it *copIterator) work() {
bo := NewBackoffer(copNextMaxBackoff)
for {
it.mu.Lock()
if it.finished {
@@ -227,7 +229,7 @@ }
}
task.status = taskRunning
it.mu.Unlock()
resp, err := it.handleTask(task)
resp, err := it.handleTask(bo, task)
if err != nil {
it.errChan <- err
break
@@ -303,9 +305,8 @@ func (it *copIterator) Next() (io.ReadCloser, error) {
}

// Handle single copTask.
func (it *copIterator) handleTask(task *copTask) (*coprocessor.Response, error) {
var backoffErr error
for backoff := rpcBackoff(); backoffErr == nil; backoffErr = backoff() {
func (it *copIterator) handleTask(bo *Backoffer, task *copTask) (*coprocessor.Response, error) {
for {
req := &coprocessor.Request{
Context: task.region.GetContext(),
Tp: proto.Int64(it.req.Tp),
@@ -315,9 +316,13 @@ func (it *copIterator) handleTask(task *copTask) (*coprocessor.Response, error)
resp, err := it.store.client.SendCopReq(task.region.GetAddress(), req)
if err != nil {
it.store.regionCache.NextPeer(task.region.VerID())
err1 := it.rebuildCurrentTask(task)
if err1 != nil {
return nil, errors.Trace(err1)
err = bo.Backoff(boTiKVRPC, err)
if err != nil {
return nil, errors.Trace(err)
}
err = it.rebuildCurrentTask(bo, task)
if err != nil {
return nil, errors.Trace(err)
}
log.Warnf("send coprocessor request error: %v, try next peer later", err)
continue
@@ -328,7 +333,11 @@ func (it *copIterator) handleTask(task *copTask) (*coprocessor.Response, error)
} else {
it.store.regionCache.DropRegion(task.region.VerID())
}
err = it.rebuildCurrentTask(task)
err = bo.Backoff(boRegionMiss, err)
if err != nil {
return nil, errors.Trace(err)
}
err = it.rebuildCurrentTask(bo, task)
if err != nil {
return nil, errors.Trace(err)
}
@@ -337,8 +346,12 @@ func (it *copIterator) handleTask(task *copTask) (*coprocessor.Response, error)
}
if e := resp.GetLocked(); e != nil {
lock := newLock(it.store, e.GetPrimaryLock(), e.GetLockVersion(), e.GetKey(), e.GetLockVersion())
_, lockErr := lock.cleanup()
_, lockErr := lock.cleanup(bo)
if lockErr == nil || terror.ErrorEqual(lockErr, errInnerRetryable) {
err = bo.Backoff(boTxnLock, lockErr)
if err != nil {
return nil, errors.Trace(err)
}
continue
}
log.Warnf("cleanup lock error: %v", lockErr)
@@ -351,12 +364,11 @@ func (it *copIterator) handleTask(task *copTask) (*coprocessor.Response, error)
}
return resp, nil
}
return nil, errors.Trace(backoffErr)
}

// Rebuild current task. It may be split into multiple tasks (in region split scenario).
func (it *copIterator) rebuildCurrentTask(task *copTask) error {
newTasks, err := buildCopTasks(it.store.regionCache, task.ranges, it.req.Desc)
func (it *copIterator) rebuildCurrentTask(bo *Backoffer, task *copTask) error {
newTasks, err := buildCopTasks(bo, it.store.regionCache, task.ranges, it.req.Desc)
if err != nil {
return errors.Trace(err)
}
31 changes: 17 additions & 14 deletions store/tikv/coprocessor_test.go
@@ -30,52 +30,54 @@ func (s *testCoprocessorSuite) TestBuildTasks(c *C) {
_, regionIDs, _ := mocktikv.BootstrapWithMultiRegions(cluster, []byte("g"), []byte("n"), []byte("t"))
cache := NewRegionCache(mocktikv.NewPDClient(cluster))

tasks, err := buildCopTasks(cache, s.buildKeyRanges("a", "c"), false)
bo := NewBackoffer(3000)

tasks, err := buildCopTasks(bo, cache, s.buildKeyRanges("a", "c"), false)
c.Assert(err, IsNil)
c.Assert(tasks, HasLen, 1)
s.taskEqual(c, tasks[0], regionIDs[0], "a", "c")

tasks, err = buildCopTasks(cache, s.buildKeyRanges("g", "n"), false)
tasks, err = buildCopTasks(bo, cache, s.buildKeyRanges("g", "n"), false)
c.Assert(err, IsNil)
c.Assert(tasks, HasLen, 1)
s.taskEqual(c, tasks[0], regionIDs[1], "g", "n")

tasks, err = buildCopTasks(cache, s.buildKeyRanges("m", "n"), false)
tasks, err = buildCopTasks(bo, cache, s.buildKeyRanges("m", "n"), false)
c.Assert(err, IsNil)
c.Assert(tasks, HasLen, 1)
s.taskEqual(c, tasks[0], regionIDs[1], "m", "n")

tasks, err = buildCopTasks(cache, s.buildKeyRanges("a", "k"), false)
tasks, err = buildCopTasks(bo, cache, s.buildKeyRanges("a", "k"), false)
c.Assert(err, IsNil)
c.Assert(tasks, HasLen, 2)
s.taskEqual(c, tasks[0], regionIDs[0], "a", "g")
s.taskEqual(c, tasks[1], regionIDs[1], "g", "k")

tasks, err = buildCopTasks(cache, s.buildKeyRanges("a", "x"), false)
tasks, err = buildCopTasks(bo, cache, s.buildKeyRanges("a", "x"), false)
c.Assert(err, IsNil)
c.Assert(tasks, HasLen, 4)
s.taskEqual(c, tasks[0], regionIDs[0], "a", "g")
s.taskEqual(c, tasks[1], regionIDs[1], "g", "n")
s.taskEqual(c, tasks[2], regionIDs[2], "n", "t")
s.taskEqual(c, tasks[3], regionIDs[3], "t", "x")

tasks, err = buildCopTasks(cache, s.buildKeyRanges("a", "b", "b", "c"), false)
tasks, err = buildCopTasks(bo, cache, s.buildKeyRanges("a", "b", "b", "c"), false)
c.Assert(err, IsNil)
c.Assert(tasks, HasLen, 1)
s.taskEqual(c, tasks[0], regionIDs[0], "a", "b", "b", "c")

tasks, err = buildCopTasks(cache, s.buildKeyRanges("a", "b", "e", "f"), false)
tasks, err = buildCopTasks(bo, cache, s.buildKeyRanges("a", "b", "e", "f"), false)
c.Assert(err, IsNil)
c.Assert(tasks, HasLen, 1)
s.taskEqual(c, tasks[0], regionIDs[0], "a", "b", "e", "f")

tasks, err = buildCopTasks(cache, s.buildKeyRanges("g", "n", "o", "p"), false)
tasks, err = buildCopTasks(bo, cache, s.buildKeyRanges("g", "n", "o", "p"), false)
c.Assert(err, IsNil)
c.Assert(tasks, HasLen, 2)
s.taskEqual(c, tasks[0], regionIDs[1], "g", "n")
s.taskEqual(c, tasks[1], regionIDs[2], "o", "p")

tasks, err = buildCopTasks(cache, s.buildKeyRanges("h", "k", "m", "p"), false)
tasks, err = buildCopTasks(bo, cache, s.buildKeyRanges("h", "k", "m", "p"), false)
c.Assert(err, IsNil)
c.Assert(tasks, HasLen, 2)
s.taskEqual(c, tasks[0], regionIDs[1], "h", "k", "m", "n")
@@ -88,8 +90,9 @@ func (s *testCoprocessorSuite) TestRebuild(c *C) {
cluster := mocktikv.NewCluster()
storeID, regionIDs, peerIDs := mocktikv.BootstrapWithMultiRegions(cluster, []byte("m"))
cache := NewRegionCache(mocktikv.NewPDClient(cluster))
bo := NewBackoffer(3000)

tasks, err := buildCopTasks(cache, s.buildKeyRanges("a", "z"), false)
tasks, err := buildCopTasks(bo, cache, s.buildKeyRanges("a", "z"), false)
c.Assert(err, IsNil)
c.Assert(tasks, HasLen, 2)
s.taskEqual(c, tasks[0], regionIDs[0], "a", "m")
@@ -102,7 +105,7 @@ cluster.Split(regionIDs[1], regionIDs[2], []byte("q"), []uint64{peerIDs[2]}, storeID)
cluster.Split(regionIDs[1], regionIDs[2], []byte("q"), []uint64{peerIDs[2]}, storeID)
cache.DropRegion(tasks[1].region.VerID())

tasks, err = buildCopTasks(cache, s.buildKeyRanges("a", "z"), true)
tasks, err = buildCopTasks(bo, cache, s.buildKeyRanges("a", "z"), true)
c.Assert(err, IsNil)
iter := &copIterator{
store: &tikvStore{
@@ -113,14 +116,14 @@ },
},
tasks: tasks,
}
err = iter.rebuildCurrentTask(iter.tasks[0])
err = iter.rebuildCurrentTask(bo, iter.tasks[0])
c.Assert(err, IsNil)
c.Assert(iter.tasks, HasLen, 3)
s.taskEqual(c, iter.tasks[2], regionIDs[0], "a", "m")
s.taskEqual(c, iter.tasks[1], regionIDs[1], "m", "q")
s.taskEqual(c, iter.tasks[0], regionIDs[2], "q", "z")

tasks, err = buildCopTasks(cache, s.buildKeyRanges("a", "z"), true)
tasks, err = buildCopTasks(bo, cache, s.buildKeyRanges("a", "z"), true)
iter = &copIterator{
store: &tikvStore{
regionCache: cache,
@@ -130,7 +133,7 @@ },
},
tasks: tasks,
}
err = iter.rebuildCurrentTask(iter.tasks[2])
err = iter.rebuildCurrentTask(bo, iter.tasks[2])
c.Assert(err, IsNil)
c.Assert(iter.tasks, HasLen, 3)
s.taskEqual(c, iter.tasks[2], regionIDs[0], "a", "m")