Skip to content

Commit

Permalink
stats: fix building too many buckets (pingcap#5978)
Browse files Browse the repository at this point in the history
  • Loading branch information
alivxxx authored Mar 9, 2018
1 parent 831c93f commit b6ad6a2
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 7 deletions.
5 changes: 4 additions & 1 deletion statistics/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -110,10 +110,13 @@ func BuildColumn(ctx sessionctx.Context, numBuckets, id int64, collector *Sample
ndv = count
}
hg := NewHistogram(id, ndv, collector.NullCount, 0, tp, int(numBuckets))
valuesPerBucket := float64(count)/float64(numBuckets) + 1

// As we use samples to build the histogram, the bucket count and repeat should be multiplied by a factor.
sampleFactor := float64(count) / float64(len(samples))
// Since the bucket count is increased by sampleFactor, the actual max values per bucket is
// floor(valuesPerBucket/sampleFactor)*sampleFactor, which may be less than valuesPerBucket,
// thus we need to add a sampleFactor to avoid building too many buckets.
valuesPerBucket := float64(count)/float64(numBuckets) + sampleFactor
ndvFactor := float64(count) / float64(hg.NDV)
if ndvFactor > sampleFactor {
ndvFactor = sampleFactor
Expand Down
13 changes: 7 additions & 6 deletions statistics/statistics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -246,23 +246,23 @@ func (s *testStatisticsSuite) TestBuild(c *C) {
c.Check(err, IsNil)
checkRepeats(c, col)
col.PreCalculateScalar()
c.Check(col.Len(), Equals, 232)
c.Check(col.Len(), Equals, 226)
count := col.equalRowCount(types.NewIntDatum(1000))
c.Check(int(count), Equals, 0)
count = col.lessRowCount(types.NewIntDatum(1000))
c.Check(int(count), Equals, 10000)
count = col.lessRowCount(types.NewIntDatum(2000))
c.Check(int(count), Equals, 19995)
c.Check(int(count), Equals, 19999)
count = col.greaterRowCount(types.NewIntDatum(2000))
c.Check(int(count), Equals, 80003)
c.Check(int(count), Equals, 80000)
count = col.lessRowCount(types.NewIntDatum(200000000))
c.Check(int(count), Equals, 100000)
count = col.greaterRowCount(types.NewIntDatum(200000000))
c.Check(count, Equals, 0.0)
count = col.equalRowCount(types.NewIntDatum(200000000))
c.Check(count, Equals, 0.0)
count = col.betweenRowCount(types.NewIntDatum(3000), types.NewIntDatum(3500))
c.Check(int(count), Equals, 5008)
c.Check(int(count), Equals, 4994)
count = col.lessRowCount(types.NewIntDatum(1))
c.Check(int(count), Equals, 9)

Expand All @@ -280,6 +280,7 @@ func (s *testStatisticsSuite) TestBuild(c *C) {
col, err = BuildColumn(mock.NewContext(), 256, 2, collectors[0], types.NewFieldType(mysql.TypeLonglong))
c.Assert(err, IsNil)
checkRepeats(c, col)
c.Assert(col.Len(), Equals, 250)

tblCount, col, _, err := buildIndex(ctx, bucketCount, 1, ast.RecordSet(s.rc))
c.Check(err, IsNil)
Expand Down Expand Up @@ -497,12 +498,12 @@ func (s *testStatisticsSuite) TestColumnRange(c *C) {
ran[0].HighExclude = true
count, err = tbl.GetRowCountByColumnRanges(sc, 0, ran)
c.Assert(err, IsNil)
c.Assert(int(count), Equals, 9994)
c.Assert(int(count), Equals, 9998)
ran[0].LowExclude = false
ran[0].HighExclude = false
count, err = tbl.GetRowCountByColumnRanges(sc, 0, ran)
c.Assert(err, IsNil)
c.Assert(int(count), Equals, 9996)
c.Assert(int(count), Equals, 10000)
ran[0].LowVal[0] = ran[0].HighVal[0]
count, err = tbl.GetRowCountByColumnRanges(sc, 0, ran)
c.Assert(err, IsNil)
Expand Down

0 comments on commit b6ad6a2

Please sign in to comment.