Skip to content

Commit

Permalink
importer: generate string by stats (pingcap#5804)
Browse files Browse the repository at this point in the history
  • Loading branch information
hanfei1991 authored Feb 7, 2018
1 parent fe165d9 commit be1eeac
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 3 deletions.
6 changes: 6 additions & 0 deletions cmd/importer/db.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,12 @@ func intRangeValue(column *column, min int64, max int64) (int64, int64) {
}

func randStringValue(column *column, n int) string {
if column.hist != nil {
if column.hist.avgLen == 0 {
column.hist.avgLen = column.hist.getAvgLen(n)
}
return column.hist.randString()
}
if len(column.set) > 0 {
idx := randInt(0, len(column.set)-1)
return column.set[idx]
Expand Down
55 changes: 52 additions & 3 deletions cmd/importer/stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ package main
import (
"encoding/json"
"io/ioutil"
"math/rand"

"github.com/juju/errors"
"github.com/pingcap/tidb/model"
Expand Down Expand Up @@ -43,11 +44,12 @@ func loadStats(tblInfo *model.TableInfo, path string) (*stats.Table, error) {
// histogram wraps a stats.Histogram with the extra state the importer needs
// to generate random values from it.
type histogram struct {
	// Embedded statistics histogram; its Buckets and Bounds drive the
	// random value generation methods defined on this type.
	stats.Histogram

	// NOTE(review): presumably the index this histogram was built for —
	// confirm against the code that constructs histogram values.
	index *model.IndexInfo

	// avgLen is the target byte length for generated strings. Zero means it
	// has not been computed yet; randStringValue fills it in from getAvgLen.
	avgLen int
}

// When the cnt falls in the middle of bucket, we return the idx of lower bound which is an even number.
// When the cnt falls in the end of bucket, we return the upper bound which is odd.
// When randCnt falls in the middle of a bucket, we return the index of its lower bound, which is an even number.
// When randCnt falls at the end of a bucket, we return the index of its upper bound, which is an odd number.
func (h *histogram) getRandomBoundIdx() int {
cnt := h.Buckets[len(h.Buckets)-1].Count
randCnt := randInt64(0, cnt)
Expand Down Expand Up @@ -83,3 +85,50 @@ func (h *histogram) randInt() int64 {
}
return h.Bounds.GetRow(idx).GetInt64(0)
}

// getValidPrefix returns a random string prefix lying between lower and upper.
//
// The two strings are compared byte by byte. At the first position i where
// they differ, the result is lower[:i] followed by one random byte drawn from
// the half-open range [lower[i], upper[i]). If lower is a prefix of upper (or
// equal to it), lower itself is returned.
//
// lower must not sort after upper in byte order; a violation is reported
// through log.Fatalf.
func getValidPrefix(lower, upper string) string {
	// Index bytes explicitly: ranging over a string advances by whole runes
	// and would skip the continuation bytes of multi-byte UTF-8 characters.
	for i := 0; i < len(lower); i++ {
		if i >= len(upper) {
			log.Fatalf("lower %s is larger than upper %s", lower, upper)
		}
		lo, hi := lower[i], upper[i]
		if lo == hi {
			continue
		}
		if lo > hi {
			// Without this check hi-lo would wrap around as an unsigned byte
			// and produce a byte outside the requested range.
			log.Fatalf("lower %s is larger than upper %s", lower, upper)
		}
		ch := lo + byte(rand.Intn(int(hi-lo)))
		// Convert through a byte slice so that bytes >= 0x80 are kept
		// verbatim instead of being re-encoded as a UTF-8 rune.
		return lower[:i] + string([]byte{ch})
	}
	return lower
}

// getAvgLen returns the average byte length of the histogram's bound values,
// read as strings from column 0 of h.Bounds.
//
// The average is capped at maxLen, and an average of zero is raised to 1.
// A histogram without any bounds yields 1 rather than dividing by zero.
func (h *histogram) getAvgLen(maxLen int) int {
	numBounds := h.Bounds.NumRows()
	if numBounds == 0 {
		// No bounds to average over; use the same floor applied below.
		return 1
	}
	totalLen := 0
	for i := 0; i < numBounds; i++ {
		totalLen += len(h.Bounds.GetRow(i).GetString(0))
	}
	avg := totalLen / numBounds
	if avg > maxLen {
		avg = maxLen
	}
	if avg == 0 {
		avg = 1
	}
	return avg
}

// randString produces a random string value guided by the histogram.
//
// A bound index is picked with getRandomBoundIdx. An odd index returns that
// bound's value unchanged. An even index is treated as the lower bound of a
// pair: a prefix between it and the following bound is built with
// getValidPrefix and then extended with random characters until it reaches
// h.avgLen bytes (no padding is added if the prefix is already that long).
func (h *histogram) randString() string {
	boundIdx := h.getRandomBoundIdx()
	if boundIdx%2 != 0 {
		return h.Bounds.GetRow(boundIdx).GetString(0)
	}

	lo := h.Bounds.GetRow(boundIdx).GetString(0)
	hi := h.Bounds.GetRow(boundIdx + 1).GetString(0)
	result := getValidPrefix(lo, hi)
	if padding := h.avgLen - len(result); padding > 0 {
		result += randString(padding)
	}
	return result
}

0 comments on commit be1eeac

Please sign in to comment.