statistics: fix the top-n size to not hold the small things (pingcap#…

…24906)
Yui-Song · Jun 3, 2021 · 3e3f977 · 3e3f977
1 parent b21aed8
commit 3e3f977
Show file tree

Hide file tree

Showing 3 changed files with 76 additions and 14 deletions.
diff --git a/statistics/builder.go b/statistics/builder.go
@@ -15,6 +15,7 @@ package statistics
 
 import (
 	"bytes"
+	"math"
 
 	"github.com/pingcap/errors"
 	"github.com/pingcap/tidb/sessionctx"
@@ -329,6 +330,8 @@ func BuildHistAndTopN(
 		}
 	}
 
+	topNList = pruneTopNItem(topNList, ndv, nullCount, sampleNum, count)
+
 	// Step2: exclude topn from samples
 	for i := int64(0); i < int64(len(samples)); i++ {
 		sampleBytes, err := getComparedBytes(samples[i].Value)
@@ -366,3 +369,55 @@ func BuildHistAndTopN(
 
 	return hg, topn, nil
 }
+
+// pruneTopNItem drops trailing entries of topns that are not significantly more common than a value outside the list is expected to be, and returns the kept prefix topns[:k] (presumably topns is sorted by descending Count; verify against caller).
+//   Each value outside the top-n list is assumed to have selectivity (non-top-n, non-null fraction)/remained_ndv; the original author notes this mirrors the internal implementation of EqualRowCount (not visible here).
+func pruneTopNItem(topns []TopNMeta, ndv, nullCount, sampleRows, totalRows int64) []TopNMeta {
+	// If the sample holds all rows, or the table has at most one row, nothing is pruned and the top-n is returned as is.
+	if sampleRows == totalRows || totalRows <= 1 {
+		return topns
+	}
+	// Sum the occurrences of every entry except the last (least common) one. The last entry is the first pruning candidate,
+	// so its own count is kept out of the "in top-n" fraction that is used below to judge whether it is worth storing.
+	sumCount := uint64(0)
+	for i := 0; i < len(topns)-1; i++ {
+		sumCount += topns[i].Count
+	}
+	topNNum := len(topns)
+	for topNNum > 0 {
+		// Selectivity for a single value not in the kept list: (1 - top-n fraction of the sample - null fraction of all rows), clamped to [0, 1],
+		// then divided by the remained ndv when that is > 1. NOTE(review): sumCount excludes the candidate but otherNDV subtracts topNNum, which still counts it - confirm intended.
+		selectivity := 1.0 - float64(sumCount)/float64(sampleRows) - float64(nullCount)/float64(totalRows)
+		if selectivity < 0.0 {
+			selectivity = 0
+		}
+		if selectivity > 1 {
+			selectivity = 1
+		}
+		otherNDV := float64(ndv) - float64(topNNum)
+		if otherNDV > 1 {
+			selectivity /= otherNDV
+		}
+		N := float64(totalRows)
+		n := float64(sampleRows)
+		K := N * float64(topns[topNNum-1].Count) / n
+		// K above is the candidate's sample count scaled up to the whole table. Since we are sampling without replacement, the
+		// sample count is modelled as hypergeometric, whose variance is n*K*(N-K)*(N-n) / (N^2*(N-1)).
+		variance := n * K * (N - K) * (N - n) / (N * N * (N - 1))
+		stddev := math.Sqrt(variance)
+		// Keep the candidate only if its count exceeds the expected sample count of an outside value (selectivity*n) plus two stddev, plus an additional 0.5 for the continuity correction.
+		//   Note:
+		//  	The mean + 2 * stddev bound is described by the original author as a Wald confidence interval; adding 0.5 makes it the continuity-corrected Wald interval.
+		if float64(topns[topNNum-1].Count) > selectivity*n+2*stddev+0.5 {
+			// The current one is worth storing, and the entries before it are treated as worth storing too, so pruning stops here.
+			break
+		}
+		// The current one is not worth storing: shrink the list, then subtract the new last entry's count from sumCount so that entry becomes the next candidate.
+		topNNum--
+		if topNNum == 0 {
+			break
+		}
+		sumCount -= topns[topNNum-1].Count
+	}
+	return topns[:topNNum]
+}
diff --git a/statistics/handle/handle_test.go b/statistics/handle/handle_test.go
@@ -848,9 +848,13 @@ func (s *testStatsSuite) prepareForGlobalStatsWithOpts(c *C, tk *testkit.TestKit
 		buf1.WriteString(fmt.Sprintf(", (%v)", i))
 		buf2.WriteString(fmt.Sprintf(", (%v)", 100000+i))
 	}
+	for i := 0; i < 1000; i++ {
+		buf1.WriteString(fmt.Sprintf(", (%v)", 0))
+		buf2.WriteString(fmt.Sprintf(", (%v)", 100000))
+	}
 	tk.MustExec(buf1.String())
 	tk.MustExec(buf2.String())
-	tk.MustExec("set @@tidb_analyze_version=2")
+	tk.MustExec("set @@tidb_analyze_version=3")
 	tk.MustExec("set @@tidb_partition_prune_mode='dynamic'")
 	c.Assert(s.do.StatsHandle().DumpStatsDeltaToKV(handle.DumpAll), IsNil)
 }
@@ -907,23 +911,23 @@ func (s *testStatsSuite) TestAnalyzeGlobalStatsWithOpts2(c *C) {
 	tk := testkit.NewTestKit(c, s.store)
 	s.prepareForGlobalStatsWithOpts(c, tk)
 
-	tk.MustExec("analyze table t with 20 topn, 50 buckets")
-	s.checkForGlobalStatsWithOpts(c, tk, "global", 20, 50)
-	s.checkForGlobalStatsWithOpts(c, tk, "p0", 20, 50)
-	s.checkForGlobalStatsWithOpts(c, tk, "p1", 20, 50)
+	tk.MustExec("analyze table t with 20 topn, 50 buckets, 1000 samples")
+	s.checkForGlobalStatsWithOpts(c, tk, "global", 2, 50)
+	s.checkForGlobalStatsWithOpts(c, tk, "p0", 1, 50)
+	s.checkForGlobalStatsWithOpts(c, tk, "p1", 1, 50)
 
 	// analyze a partition to let its options be different with others'
 	tk.MustExec("analyze table t partition p0 with 10 topn, 20 buckets")
 	s.checkForGlobalStatsWithOpts(c, tk, "global", 10, 20) // use new options
 	s.checkForGlobalStatsWithOpts(c, tk, "p0", 10, 20)
-	s.checkForGlobalStatsWithOpts(c, tk, "p1", 20, 50)
+	s.checkForGlobalStatsWithOpts(c, tk, "p1", 1, 50)
 
 	tk.MustExec("analyze table t partition p1 with 100 topn, 200 buckets")
 	s.checkForGlobalStatsWithOpts(c, tk, "global", 100, 200)
 	s.checkForGlobalStatsWithOpts(c, tk, "p0", 10, 20)
 	s.checkForGlobalStatsWithOpts(c, tk, "p1", 100, 200)
 
-	tk.MustExec("analyze table t partition p0") // default options
+	tk.MustExec("analyze table t partition p0 with 20 topn") // change back to 20 topn
 	s.checkForGlobalStatsWithOpts(c, tk, "global", 20, 256)
 	s.checkForGlobalStatsWithOpts(c, tk, "p0", 20, 256)
 	s.checkForGlobalStatsWithOpts(c, tk, "p1", 100, 200)

diff --git a/statistics/statistics_test.go b/statistics/statistics_test.go
@@ -281,23 +281,26 @@ func (s *testStatisticsSuite) TestBuild(c *C) {
 	colv2, topnv2, err := BuildHistAndTopN(ctx, int(bucketCount), topNCount, 2, collector, types.NewFieldType(mysql.TypeLonglong), true)
 	c.Check(err, IsNil)
 	c.Check(topnv2.TopN, NotNil)
-	expectedTopNCount := []uint64{9990, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30}
+	// The most common value occurs 9990 times; the second most common value occurs only 30 times.
+	// The ndv of the histogram is 73344 and its total count is 90010. Comparing 90010/73344 against 30, it is not a bad estimate, so only the first value is expected in the top-n.
+	expectedTopNCount := []uint64{9990}
+	c.Assert(len(topnv2.TopN), Equals, len(expectedTopNCount))
 	for i, meta := range topnv2.TopN {
 		c.Check(meta.Count, Equals, expectedTopNCount[i])
 	}
-	c.Check(colv2.Len(), Equals, 256)
+	c.Check(colv2.Len(), Equals, 251)
 	count = colv2.lessRowCount(types.NewIntDatum(1000))
-	c.Check(int(count), Equals, 325)
+	c.Check(int(count), Equals, 328)
 	count = colv2.lessRowCount(types.NewIntDatum(2000))
-	c.Check(int(count), Equals, 9430)
+	c.Check(int(count), Equals, 10007)
 	count = colv2.greaterRowCount(types.NewIntDatum(2000))
-	c.Check(int(count), Equals, 80008)
+	c.Check(int(count), Equals, 80001)
 	count = colv2.lessRowCount(types.NewIntDatum(200000000))
-	c.Check(int(count), Equals, 89440)
+	c.Check(int(count), Equals, 90010)
 	count = colv2.greaterRowCount(types.NewIntDatum(200000000))
 	c.Check(count, Equals, 0.0)
 	count = colv2.BetweenRowCount(types.NewIntDatum(3000), types.NewIntDatum(3500))
-	c.Check(int(count), Equals, 4995)
+	c.Check(int(count), Equals, 5001)
 	count = colv2.lessRowCount(types.NewIntDatum(1))
 	c.Check(int(count), Equals, 0)