From 384a1080e78ad6b376ebb2e49887a0d9e572d78a Mon Sep 17 00:00:00 2001 From: Haibin Xie Date: Fri, 19 Jan 2018 11:35:30 +0800 Subject: [PATCH] stats: fix estimation in between row count (#5682) --- statistics/histogram.go | 5 ++++- statistics/statistics_test.go | 2 ++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/statistics/histogram.go b/statistics/histogram.go index e9f95243f0e25..5dace7da0e449 100644 --- a/statistics/histogram.go +++ b/statistics/histogram.go @@ -16,6 +16,7 @@ package statistics import ( "bytes" "fmt" + "math" "strings" "time" @@ -344,8 +345,10 @@ func (hg *Histogram) lessAndEqRowCount(value types.Datum) float64 { func (hg *Histogram) betweenRowCount(a, b types.Datum) float64 { lessCountA := hg.lessRowCount(a) lessCountB := hg.lessRowCount(b) + // If lessCountA is not less than lessCountB, it may be that they fall into the same bucket and we cannot estimate + // the fraction, so we use `totalCount / NDV` to estimate the row count, but the result should not be greater than lessCountB. if lessCountA >= lessCountB { - return hg.totalRowCount() / float64(hg.NDV) + return math.Min(lessCountB, hg.totalRowCount()/float64(hg.NDV)) } return lessCountB - lessCountA } diff --git a/statistics/statistics_test.go b/statistics/statistics_test.go index d7a04800cd1f9..0e6e30d731ac6 100644 --- a/statistics/statistics_test.go +++ b/statistics/statistics_test.go @@ -271,6 +271,8 @@ func (s *testStatisticsSuite) TestBuild(c *C) { c.Check(int(count), Equals, 19999) count = col.betweenRowCount(encodeKey(types.NewIntDatum(30000)), encodeKey(types.NewIntDatum(35000))) c.Check(int(count), Equals, 4999) + count = col.betweenRowCount(encodeKey(types.MinNotNullDatum()), encodeKey(types.NewIntDatum(0))) + c.Check(int(count), Equals, 0) count = col.lessRowCount(encodeKey(types.NewIntDatum(0))) c.Check(int(count), Equals, 0)