diff --git a/statistics/builder.go b/statistics/builder.go index 69528dc27b1cc..1eb85057fc9ea 100644 --- a/statistics/builder.go +++ b/statistics/builder.go @@ -110,10 +110,13 @@ func BuildColumn(ctx sessionctx.Context, numBuckets, id int64, collector *Sample ndv = count } hg := NewHistogram(id, ndv, collector.NullCount, 0, tp, int(numBuckets)) - valuesPerBucket := float64(count)/float64(numBuckets) + 1 // As we use samples to build the histogram, the bucket number and repeat should multiply a factor. sampleFactor := float64(count) / float64(len(samples)) + // Since bucket count is increased by sampleFactor, the actual max values per bucket is + // floor(valuesPerBucket/sampleFactor)*sampleFactor, which may be less than valuesPerBucket, + // thus we need to add a sampleFactor to avoid building too many buckets. + valuesPerBucket := float64(count)/float64(numBuckets) + sampleFactor ndvFactor := float64(count) / float64(hg.NDV) if ndvFactor > sampleFactor { ndvFactor = sampleFactor diff --git a/statistics/statistics_test.go b/statistics/statistics_test.go index f1c4c2ffc13a4..c53c720c40711 100644 --- a/statistics/statistics_test.go +++ b/statistics/statistics_test.go @@ -246,15 +246,15 @@ func (s *testStatisticsSuite) TestBuild(c *C) { c.Check(err, IsNil) checkRepeats(c, col) col.PreCalculateScalar() - c.Check(col.Len(), Equals, 232) + c.Check(col.Len(), Equals, 226) count := col.equalRowCount(types.NewIntDatum(1000)) c.Check(int(count), Equals, 0) count = col.lessRowCount(types.NewIntDatum(1000)) c.Check(int(count), Equals, 10000) count = col.lessRowCount(types.NewIntDatum(2000)) - c.Check(int(count), Equals, 19995) + c.Check(int(count), Equals, 19999) count = col.greaterRowCount(types.NewIntDatum(2000)) - c.Check(int(count), Equals, 80003) + c.Check(int(count), Equals, 80000) count = col.lessRowCount(types.NewIntDatum(200000000)) c.Check(int(count), Equals, 100000) count = col.greaterRowCount(types.NewIntDatum(200000000)) @@ -262,7 +262,7 @@ func (s 
*testStatisticsSuite) TestBuild(c *C) { count = col.equalRowCount(types.NewIntDatum(200000000)) c.Check(count, Equals, 0.0) count = col.betweenRowCount(types.NewIntDatum(3000), types.NewIntDatum(3500)) - c.Check(int(count), Equals, 5008) + c.Check(int(count), Equals, 4994) count = col.lessRowCount(types.NewIntDatum(1)) c.Check(int(count), Equals, 9) @@ -280,6 +280,7 @@ func (s *testStatisticsSuite) TestBuild(c *C) { col, err = BuildColumn(mock.NewContext(), 256, 2, collectors[0], types.NewFieldType(mysql.TypeLonglong)) c.Assert(err, IsNil) checkRepeats(c, col) + c.Assert(col.Len(), Equals, 250) tblCount, col, _, err := buildIndex(ctx, bucketCount, 1, ast.RecordSet(s.rc)) c.Check(err, IsNil) @@ -497,12 +498,12 @@ func (s *testStatisticsSuite) TestColumnRange(c *C) { ran[0].HighExclude = true count, err = tbl.GetRowCountByColumnRanges(sc, 0, ran) c.Assert(err, IsNil) - c.Assert(int(count), Equals, 9994) + c.Assert(int(count), Equals, 9998) ran[0].LowExclude = false ran[0].HighExclude = false count, err = tbl.GetRowCountByColumnRanges(sc, 0, ran) c.Assert(err, IsNil) - c.Assert(int(count), Equals, 9996) + c.Assert(int(count), Equals, 10000) ran[0].LowVal[0] = ran[0].HighVal[0] count, err = tbl.GetRowCountByColumnRanges(sc, 0, ran) c.Assert(err, IsNil)