Skip to content

Commit

Permalink
statistics: fix the top-n size to not hold the small things (pingcap#…
Browse files Browse the repository at this point in the history
  • Loading branch information
winoros authored Jun 3, 2021
1 parent b21aed8 commit 3e3f977
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 14 deletions.
55 changes: 55 additions & 0 deletions statistics/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ package statistics

import (
"bytes"
"math"

"github.com/pingcap/errors"
"github.com/pingcap/tidb/sessionctx"
Expand Down Expand Up @@ -329,6 +330,8 @@ func BuildHistAndTopN(
}
}

topNList = pruneTopNItem(topNList, ndv, nullCount, sampleNum, count)

// Step2: exclude topn from samples
for i := int64(0); i < int64(len(samples)); i++ {
sampleBytes, err := getComparedBytes(samples[i].Value)
Expand Down Expand Up @@ -366,3 +369,55 @@ func BuildHistAndTopN(

return hg, topn, nil
}

// pruneTopNItem drops entries from the tail of the top-n list whose sampled
// frequency is not significantly higher than the frequency that would be
// estimated for them if they were not in the list.
//
// A value outside the list is assumed to have selectivity
//
//	(1 - fraction covered by the kept top-n items - null fraction) / remaining NDV
//
// NOTE(review): the original comment states this mirrors the internal
// implementation of EqualRowCount; confirm against that function.
//
// Parameters:
//   - topns: candidate top-n entries. The pruning only inspects the tail and
//     stops at the first entry worth keeping, so this assumes topns is ordered
//     from most to least common (TODO confirm with the caller).
//   - ndv: number of distinct values.
//   - nullCount: number of NULLs; it is divided by totalRows to get the null fraction.
//   - sampleRows: number of sampled rows the Count fields refer to.
//   - totalRows: number of rows in the whole population.
//
// The result is a prefix of topns that shares its backing array. The input is
// returned unchanged when nothing can be evaluated: an empty list, a population
// of at most one row, no samples, or a sample that covers the whole population
// (sampleRows >= totalRows).
func pruneTopNItem(topns []TopNMeta, ndv, nullCount, sampleRows, totalRows int64) []TopNMeta {
	// Without these guards the arithmetic below produces NaN (division by zero,
	// or the square root of a negative variance). Every comparison against NaN
	// is false, which would silently prune the entire list.
	if len(topns) == 0 || totalRows <= 1 || sampleRows <= 0 || sampleRows >= totalRows {
		return topns
	}

	total := float64(totalRows)
	sampled := float64(sampleRows)
	nullFraction := float64(nullCount) / total

	// Sum the occurrences of everything except the least common entry, which is
	// the first candidate to be checked.
	sumCount := uint64(0)
	for i := 0; i < len(topns)-1; i++ {
		sumCount += topns[i].Count
	}

	topNNum := len(topns)
	for topNNum > 0 {
		candidate := float64(topns[topNNum-1].Count)

		// Selectivity the candidate would be given if it were not in the list.
		selectivity := 1.0 - float64(sumCount)/sampled - nullFraction
		selectivity = math.Max(0, math.Min(1, selectivity))
		// sumCount covers only the topNNum-1 entries ahead of the candidate, so
		// the candidate itself is one of the remaining distinct values.
		otherNDV := float64(ndv) - float64(topNNum-1)
		if otherNDV > 1 {
			selectivity /= otherNDV
		}

		// Scale the sampled count up to the whole population.
		populationCount := total * candidate / sampled
		// Sampling is without replacement, so the sampled count follows a
		// hypergeometric distribution with the variance below.
		variance := sampled * populationCount * (total - populationCount) * (total - sampled) / (total * total * (total - 1))
		stddev := math.Sqrt(variance)

		// Keep the candidate only if it exceeds the expected sampled count by
		// two standard deviations plus 0.5 for the continuity correction
		// (a continuity-corrected Wald interval).
		if candidate > selectivity*sampled+2*stddev+0.5 {
			// This entry is worth storing, so the ones before it are kept as well.
			break
		}

		// Not worth storing: drop it and make the previous entry the candidate,
		// which means removing that entry from the "kept" sum.
		topNNum--
		if topNNum == 0 {
			break
		}
		sumCount -= topns[topNNum-1].Count
	}
	return topns[:topNNum]
}
18 changes: 11 additions & 7 deletions statistics/handle/handle_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -848,9 +848,13 @@ func (s *testStatsSuite) prepareForGlobalStatsWithOpts(c *C, tk *testkit.TestKit
buf1.WriteString(fmt.Sprintf(", (%v)", i))
buf2.WriteString(fmt.Sprintf(", (%v)", 100000+i))
}
for i := 0; i < 1000; i++ {
buf1.WriteString(fmt.Sprintf(", (%v)", 0))
buf2.WriteString(fmt.Sprintf(", (%v)", 100000))
}
tk.MustExec(buf1.String())
tk.MustExec(buf2.String())
tk.MustExec("set @@tidb_analyze_version=2")
tk.MustExec("set @@tidb_analyze_version=3")
tk.MustExec("set @@tidb_partition_prune_mode='dynamic'")
c.Assert(s.do.StatsHandle().DumpStatsDeltaToKV(handle.DumpAll), IsNil)
}
Expand Down Expand Up @@ -907,23 +911,23 @@ func (s *testStatsSuite) TestAnalyzeGlobalStatsWithOpts2(c *C) {
tk := testkit.NewTestKit(c, s.store)
s.prepareForGlobalStatsWithOpts(c, tk)

tk.MustExec("analyze table t with 20 topn, 50 buckets")
s.checkForGlobalStatsWithOpts(c, tk, "global", 20, 50)
s.checkForGlobalStatsWithOpts(c, tk, "p0", 20, 50)
s.checkForGlobalStatsWithOpts(c, tk, "p1", 20, 50)
tk.MustExec("analyze table t with 20 topn, 50 buckets, 1000 samples")
s.checkForGlobalStatsWithOpts(c, tk, "global", 2, 50)
s.checkForGlobalStatsWithOpts(c, tk, "p0", 1, 50)
s.checkForGlobalStatsWithOpts(c, tk, "p1", 1, 50)

// analyze a partition to let its options be different with others'
tk.MustExec("analyze table t partition p0 with 10 topn, 20 buckets")
s.checkForGlobalStatsWithOpts(c, tk, "global", 10, 20) // use new options
s.checkForGlobalStatsWithOpts(c, tk, "p0", 10, 20)
s.checkForGlobalStatsWithOpts(c, tk, "p1", 20, 50)
s.checkForGlobalStatsWithOpts(c, tk, "p1", 1, 50)

tk.MustExec("analyze table t partition p1 with 100 topn, 200 buckets")
s.checkForGlobalStatsWithOpts(c, tk, "global", 100, 200)
s.checkForGlobalStatsWithOpts(c, tk, "p0", 10, 20)
s.checkForGlobalStatsWithOpts(c, tk, "p1", 100, 200)

tk.MustExec("analyze table t partition p0") // default options
tk.MustExec("analyze table t partition p0 with 20 topn") // change back to 20 topn
s.checkForGlobalStatsWithOpts(c, tk, "global", 20, 256)
s.checkForGlobalStatsWithOpts(c, tk, "p0", 20, 256)
s.checkForGlobalStatsWithOpts(c, tk, "p1", 100, 200)
Expand Down
17 changes: 10 additions & 7 deletions statistics/statistics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -281,23 +281,26 @@ func (s *testStatisticsSuite) TestBuild(c *C) {
colv2, topnv2, err := BuildHistAndTopN(ctx, int(bucketCount), topNCount, 2, collector, types.NewFieldType(mysql.TypeLonglong), true)
c.Check(err, IsNil)
c.Check(topnv2.TopN, NotNil)
expectedTopNCount := []uint64{9990, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30}
// The most common one's occurrence is 9990, the second most common one's occurrence is 30.
// The ndv of the histogram is 73344, the total count of it is 90010. 90010/73344 vs 30, it's not a bad estimate.
expectedTopNCount := []uint64{9990}
c.Assert(len(topnv2.TopN), Equals, len(expectedTopNCount))
for i, meta := range topnv2.TopN {
c.Check(meta.Count, Equals, expectedTopNCount[i])
}
c.Check(colv2.Len(), Equals, 256)
c.Check(colv2.Len(), Equals, 251)
count = colv2.lessRowCount(types.NewIntDatum(1000))
c.Check(int(count), Equals, 325)
c.Check(int(count), Equals, 328)
count = colv2.lessRowCount(types.NewIntDatum(2000))
c.Check(int(count), Equals, 9430)
c.Check(int(count), Equals, 10007)
count = colv2.greaterRowCount(types.NewIntDatum(2000))
c.Check(int(count), Equals, 80008)
c.Check(int(count), Equals, 80001)
count = colv2.lessRowCount(types.NewIntDatum(200000000))
c.Check(int(count), Equals, 89440)
c.Check(int(count), Equals, 90010)
count = colv2.greaterRowCount(types.NewIntDatum(200000000))
c.Check(count, Equals, 0.0)
count = colv2.BetweenRowCount(types.NewIntDatum(3000), types.NewIntDatum(3500))
c.Check(int(count), Equals, 4995)
c.Check(int(count), Equals, 5001)
count = colv2.lessRowCount(types.NewIntDatum(1))
c.Check(int(count), Equals, 0)

Expand Down

0 comments on commit 3e3f977

Please sign in to comment.