forked from pingcap/tidb
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathestimate.go
50 lines (43 loc) · 1.76 KB
/
estimate.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
// Copyright 2019 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.
package statistics
import (
"math"
"github.com/cznic/mathutil"
)
// calculateEstimateNDV calculates the estimate ndv of a sampled data from a multisize with size total.
func calculateEstimateNDV(h *topNHelper, rowCount uint64) (ndv uint64, scaleRatio uint64) {
sampleSize, sampleNDV, onlyOnceItems := h.sampleSize, uint64(len(h.sorted)), h.onlyOnceItems
scaleRatio = rowCount / sampleSize
if onlyOnceItems == sampleSize {
// Assume this is a unique column, so do not scale up the count of elements
return rowCount, 1
} else if onlyOnceItems == 0 {
// Assume data only consists of sampled data
// Nothing to do, no change with scale ratio
return sampleNDV, scaleRatio
}
// Charikar, Moses, et al. "Towards estimation error guarantees for distinct values."
// Proceedings of the nineteenth ACM SIGMOD-SIGACT-SIGART symposium on Principles of database systems. ACM, 2000.
// This is GEE in that paper.
// estimateNDV = sqrt(N/n) f_1 + sum_2..inf f_i
// f_i = number of elements occurred i times in sample
f1 := float64(onlyOnceItems)
n := float64(sampleSize)
N := float64(rowCount)
d := float64(sampleNDV)
ndv = uint64(math.Sqrt(N/n)*f1 + d - f1 + 0.5)
ndv = mathutil.MaxUint64(ndv, sampleNDV)
ndv = mathutil.MinUint64(ndv, rowCount)
return ndv, scaleRatio
}