stats: log detailed stats info for query feedback (pingcap#7293)
alivxxx authored and zz-jason committed Aug 15, 2018
1 parent 9fc67b9 commit 29addca
Showing 10 changed files with 351 additions and 59 deletions.
14 changes: 14 additions & 0 deletions statistics/boostrap.go
@@ -14,6 +14,8 @@
package statistics

import (
"fmt"

"github.com/juju/errors"
"github.com/pingcap/tidb/infoschema"
"github.com/pingcap/tidb/model"
@@ -49,6 +51,7 @@ func (h *Handle) initStatsMeta4Chunk(is infoschema.InfoSchema, tables statsCache
tbl := &Table{
HistColl: newHistColl,
Version: row.GetUint64(0),
name: getFullTableName(is, tableInfo),
}
tables[physicalID] = tbl
}
@@ -257,3 +260,14 @@ func (h *Handle) InitStats(is infoschema.InfoSchema) error {
h.statsCache.Store(tables)
return nil
}

func getFullTableName(is infoschema.InfoSchema, tblInfo *model.TableInfo) string {
for _, schema := range is.AllSchemas() {
if t, err := is.TableByName(schema.Name, tblInfo.Name); err == nil {
if t.Meta().ID == tblInfo.ID {
return schema.Name.O + "." + tblInfo.Name.O
}
}
}
return fmt.Sprintf("%d", tblInfo.ID)
}
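
For illustration only (the schema name, table name, and ID below are assumed, not taken from this commit), the new helper resolves a table that is still visible in the information schema to its qualified name and otherwise falls back to the numeric table ID:

name := getFullTableName(is, tblInfo)
// -> "test.t" while some schema still contains a table whose ID matches tblInfo.ID
// -> "45"     if no such table is found, so only the numeric ID can be reported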
137 changes: 135 additions & 2 deletions statistics/feedback.go
@@ -16,6 +16,7 @@ package statistics
import (
"bytes"
"encoding/gob"
"fmt"
"math"
"math/rand"
"sort"
@@ -102,11 +103,11 @@ func (q *QueryFeedback) DecodeToRanges(isIndex bool) ([]*ranger.Range, error) {
if isIndex {
var err error
// As we do not know the original length, just use a custom value here.
lowVal, err = codec.Decode(low.GetBytes(), 4)
lowVal, err = codec.DecodeRange(low.GetBytes(), 4)
if err != nil {
return nil, errors.Trace(err)
}
highVal, err = codec.Decode(high.GetBytes(), 4)
highVal, err = codec.DecodeRange(high.GetBytes(), 4)
if err != nil {
return nil, errors.Trace(err)
}
@@ -759,3 +760,135 @@ func splitFeedbackByQueryType(feedbacks []feedback) ([]feedback, []feedback) {
}
return eqFB, ranFB
}

// formatBuckets formats the buckets from lowBkt to highBkt.
func formatBuckets(hg *Histogram, lowBkt, highBkt, idxCols int) string {
if lowBkt == highBkt {
return hg.bucketToString(lowBkt, idxCols)
}
if lowBkt+1 == highBkt {
return fmt.Sprintf("%s, %s", hg.bucketToString(lowBkt, 0), hg.bucketToString(highBkt, 0))
}
// we do not care about the middle buckets
return fmt.Sprintf("%s, (%d buckets, total count %d), %s", hg.bucketToString(lowBkt, 0),
highBkt-lowBkt-1, hg.Buckets[highBkt-1].Count-hg.Buckets[lowBkt].Count, hg.bucketToString(highBkt, 0))
}

func colRangeToStr(c *Column, ran *ranger.Range, actual int64, factor float64) string {
lowCount, lowBkt := c.lessRowCountWithBktIdx(ran.LowVal[0])
highCount, highBkt := c.lessRowCountWithBktIdx(ran.HighVal[0])
return fmt.Sprintf("range: %s, actual: %d, expected: %d, buckets: {%s}", ran.String(), actual,
int64((highCount-lowCount)*factor), formatBuckets(&c.Histogram, lowBkt, highBkt, 0))
}

func logForPK(prefix string, c *Column, ranges []*ranger.Range, actual []int64, factor float64) {
for i, ran := range ranges {
if ran.LowVal[0].GetInt64()+1 >= ran.HighVal[0].GetInt64() {
continue
}
log.Debugf("%s column: %s, %s", prefix, c.Info.Name, colRangeToStr(c, ran, actual[i], factor))
}
}

func logForIndexRange(idx *Index, ran *ranger.Range, actual int64, factor float64) string {
sc := &stmtctx.StatementContext{TimeZone: time.UTC}
lb, err := codec.EncodeKey(sc, nil, ran.LowVal...)
if err != nil {
return ""
}
rb, err := codec.EncodeKey(sc, nil, ran.HighVal...)
if err != nil {
return ""
}
if idx.CMSketch != nil && bytes.Compare(kv.Key(lb).PrefixNext(), rb) >= 0 {
str, err := types.DatumsToString(ran.LowVal, true)
if err != nil {
return ""
}
return fmt.Sprintf("value: %s, actual: %d, expected: %d", str, actual, int64(float64(idx.QueryBytes(lb))*factor))
}
l, r := types.NewBytesDatum(lb), types.NewBytesDatum(rb)
lowCount, lowBkt := idx.lessRowCountWithBktIdx(l)
highCount, highBkt := idx.lessRowCountWithBktIdx(r)
return fmt.Sprintf("range: %s, actual: %d, expected: %d, histogram: {%s}", ran.String(), actual,
int64((highCount-lowCount)*factor), formatBuckets(&idx.Histogram, lowBkt, highBkt, len(idx.Info.Columns)))
}

func logForIndex(prefix string, t *Table, idx *Index, ranges []*ranger.Range, actual []int64, factor float64) {
sc := &stmtctx.StatementContext{TimeZone: time.UTC}
if idx.CMSketch == nil || idx.statsVer != version1 {
for i, ran := range ranges {
log.Debugf("%s index: %s, %s", prefix, idx.Info.Name.O, logForIndexRange(idx, ran, actual[i], factor))
}
return
}
for i, ran := range ranges {
rangePosition := getOrdinalOfRangeCond(sc, ran)
// the range contains only a range query or only an equality query
if rangePosition == 0 || rangePosition == len(ran.LowVal) {
log.Debugf("%s index: %s, %s", prefix, idx.Info.Name.O, logForIndexRange(idx, ran, actual[i], factor))
continue
}
equalityString, err := types.DatumsToString(ran.LowVal[:rangePosition], true)
if err != nil {
continue
}
bytes, err := codec.EncodeKey(sc, nil, ran.LowVal[:rangePosition]...)
if err != nil {
continue
}
equalityCount := idx.CMSketch.QueryBytes(bytes)
rang := ranger.Range{
LowVal: []types.Datum{ran.LowVal[rangePosition]},
HighVal: []types.Datum{ran.HighVal[rangePosition]},
}
colName := idx.Info.Columns[rangePosition].Name.L
var rangeString string
// prefer index stats over column stats
if idx, ok := t.colName2Idx[colName]; ok {
if t.Indices[idx] == nil {
return
}
rangeString = logForIndexRange(t.Indices[idx], &rang, -1, factor)
} else {
id := t.colName2ID[colName]
if t.Columns[id] == nil {
return
}
rangeString = colRangeToStr(t.Columns[t.colName2ID[colName]], &rang, -1, factor)
}
log.Debugf("%s index: %s, actual: %d, equality: %s, expected equality: %d, %s", prefix, idx.Info.Name.O,
actual[i], equalityString, equalityCount, rangeString)
}
}

func (q *QueryFeedback) logDetailedInfo(h *Handle) {
t, ok := h.statsCache.Load().(statsCache)[q.tableID]
if !ok {
return
}
isIndex := q.hist.isIndexHist()
ranges, err := q.DecodeToRanges(isIndex)
if err != nil {
log.Debug(err)
return
}
actual := make([]int64, 0, len(q.feedback))
for _, fb := range q.feedback {
actual = append(actual, fb.count)
}
logPrefix := fmt.Sprintf("[stats-feedback] %s,", t.name)
if isIndex {
idx := t.Indices[q.hist.ID]
if idx == nil {
return
}
logForIndex(logPrefix, t, idx, ranges, actual, idx.getIncreaseFactor(t.Count))
} else {
c := t.Columns[q.hist.ID]
if c == nil {
return
}
logForPK(logPrefix, c, ranges, actual, c.getIncreaseFactor(t.Count))
}
}
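
For illustration, the debug lines emitted by logDetailedInfo follow the formats built above; with assumed table, index, and column names and made-up counts, an index line and a primary-key column line might look like:

[stats-feedback] test.t, index: idx_a, range: [1,10], actual: 120, expected: 100, histogram: {num: 30 lower_bound: 1 upper_bound: 5 repeats: 0, num: 70 lower_bound: 6 upper_bound: 10 repeats: 2}
[stats-feedback] test.t, column: a, range: [1,10), actual: 120, expected: 100, buckets: {num: 30 lower_bound: 1 upper_bound: 5 repeats: 0, num: 70 lower_bound: 6 upper_bound: 10 repeats: 2}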
56 changes: 28 additions & 28 deletions statistics/feedback_test.go
@@ -71,13 +71,13 @@ func (s *testFeedbackSuite) TestUpdateHistogram(c *C) {
defer func() { defaultBucketCount = originBucketCount }()
c.Assert(UpdateHistogram(q.Hist(), q).ToString(0), Equals,
"column:0 ndv:0 totColSize:0\n"+
"num: 10000\tlower_bound: 0\tupper_bound: 1\trepeats: 0\n"+
"num: 10008\tlower_bound: 2\tupper_bound: 7\trepeats: 0\n"+
"num: 10019\tlower_bound: 8\tupper_bound: 19\trepeats: 0\n"+
"num: 10019\tlower_bound: 20\tupper_bound: 20\trepeats: 0\n"+
"num: 10037\tlower_bound: 21\tupper_bound: 39\trepeats: 0\n"+
"num: 10055\tlower_bound: 40\tupper_bound: 58\trepeats: 0\n"+
"num: 10057\tlower_bound: 59\tupper_bound: 60\trepeats: 0")
"num: 10000 lower_bound: 0 upper_bound: 1 repeats: 0\n"+
"num: 8 lower_bound: 2 upper_bound: 7 repeats: 0\n"+
"num: 11 lower_bound: 8 upper_bound: 19 repeats: 0\n"+
"num: 0 lower_bound: 20 upper_bound: 20 repeats: 0\n"+
"num: 18 lower_bound: 21 upper_bound: 39 repeats: 0\n"+
"num: 18 lower_bound: 40 upper_bound: 58 repeats: 0\n"+
"num: 2 lower_bound: 59 upper_bound: 60 repeats: 0")
}

func (s *testFeedbackSuite) TestSplitBuckets(c *C) {
@@ -91,12 +91,12 @@ func (s *testFeedbackSuite) TestSplitBuckets(c *C) {
buckets, isNewBuckets, totalCount := splitBuckets(q.Hist(), q)
c.Assert(buildNewHistogram(q.Hist(), buckets).ToString(0), Equals,
"column:0 ndv:0 totColSize:0\n"+
"num: 1\tlower_bound: 0\tupper_bound: 1\trepeats: 0\n"+
"num: 1\tlower_bound: 2\tupper_bound: 3\trepeats: 0\n"+
"num: 1\tlower_bound: 5\tupper_bound: 7\trepeats: 0\n"+
"num: 6\tlower_bound: 10\tupper_bound: 15\trepeats: 0\n"+
"num: 6\tlower_bound: 16\tupper_bound: 20\trepeats: 0\n"+
"num: 6\tlower_bound: 30\tupper_bound: 50\trepeats: 0")
"num: 1 lower_bound: 0 upper_bound: 1 repeats: 0\n"+
"num: 0 lower_bound: 2 upper_bound: 3 repeats: 0\n"+
"num: 0 lower_bound: 5 upper_bound: 7 repeats: 0\n"+
"num: 5 lower_bound: 10 upper_bound: 15 repeats: 0\n"+
"num: 0 lower_bound: 16 upper_bound: 20 repeats: 0\n"+
"num: 0 lower_bound: 30 upper_bound: 50 repeats: 0")
c.Assert(isNewBuckets, DeepEquals, []bool{false, false, false, true, true, false})
c.Assert(totalCount, Equals, int64(6))

@@ -110,12 +110,12 @@ func (s *testFeedbackSuite) TestSplitBuckets(c *C) {
buckets, isNewBuckets, totalCount = splitBuckets(q.Hist(), q)
c.Assert(buildNewHistogram(q.Hist(), buckets).ToString(0), Equals,
"column:0 ndv:0 totColSize:0\n"+
"num: 100000\tlower_bound: 0\tupper_bound: 1\trepeats: 0\n"+
"num: 100000\tlower_bound: 2\tupper_bound: 3\trepeats: 0\n"+
"num: 100000\tlower_bound: 5\tupper_bound: 7\trepeats: 0\n"+
"num: 100001\tlower_bound: 10\tupper_bound: 15\trepeats: 0\n"+
"num: 100001\tlower_bound: 16\tupper_bound: 20\trepeats: 0\n"+
"num: 100001\tlower_bound: 30\tupper_bound: 50\trepeats: 0")
"num: 100000 lower_bound: 0 upper_bound: 1 repeats: 0\n"+
"num: 0 lower_bound: 2 upper_bound: 3 repeats: 0\n"+
"num: 0 lower_bound: 5 upper_bound: 7 repeats: 0\n"+
"num: 1 lower_bound: 10 upper_bound: 15 repeats: 0\n"+
"num: 0 lower_bound: 16 upper_bound: 20 repeats: 0\n"+
"num: 0 lower_bound: 30 upper_bound: 50 repeats: 0")
c.Assert(isNewBuckets, DeepEquals, []bool{false, false, false, true, true, false})
c.Assert(totalCount, Equals, int64(100001))

@@ -132,7 +132,7 @@ func (s *testFeedbackSuite) TestSplitBuckets(c *C) {
buckets, isNewBuckets, totalCount = splitBuckets(q.Hist(), q)
c.Assert(buildNewHistogram(q.Hist(), buckets).ToString(0), Equals,
"column:0 ndv:0 totColSize:0\n"+
"num: 1000000\tlower_bound: 0\tupper_bound: 1000000\trepeats: 0")
"num: 1000000 lower_bound: 0 upper_bound: 1000000 repeats: 0")
c.Assert(isNewBuckets, DeepEquals, []bool{false})
c.Assert(totalCount, Equals, int64(1000000))

@@ -148,8 +148,8 @@ func (s *testFeedbackSuite) TestSplitBuckets(c *C) {
buckets, isNewBuckets, totalCount = splitBuckets(q.Hist(), q)
c.Assert(buildNewHistogram(q.Hist(), buckets).ToString(0), Equals,
"column:0 ndv:0 totColSize:0\n"+
"num: 1\tlower_bound: 0\tupper_bound: 10\trepeats: 0\n"+
"num: 1\tlower_bound: 11\tupper_bound: 1000000\trepeats: 0")
"num: 1 lower_bound: 0 upper_bound: 10 repeats: 0\n"+
"num: 0 lower_bound: 11 upper_bound: 1000000 repeats: 0")
c.Assert(isNewBuckets, DeepEquals, []bool{true, true})
c.Assert(totalCount, Equals, int64(1))
}
@@ -169,16 +169,16 @@ func (s *testFeedbackSuite) TestMergeBuckets(c *C) {
counts: []int64{1},
isNewBuckets: []bool{false},
bucketCount: 1,
result: "column:0 ndv:0 totColSize:0\nnum: 1\tlower_bound: 1\tupper_bound: 2\trepeats: 0",
result: "column:0 ndv:0 totColSize:0\nnum: 1 lower_bound: 1 upper_bound: 2 repeats: 0",
},
{
points: []int64{1, 2, 2, 3, 3, 4},
counts: []int64{100000, 1, 1},
isNewBuckets: []bool{false, false, false},
bucketCount: 2,
result: "column:0 ndv:0 totColSize:0\n" +
"num: 100000\tlower_bound: 1\tupper_bound: 2\trepeats: 0\n" +
"num: 100002\tlower_bound: 2\tupper_bound: 4\trepeats: 0",
"num: 100000 lower_bound: 1 upper_bound: 2 repeats: 0\n" +
"num: 2 lower_bound: 2 upper_bound: 4 repeats: 0",
},
// test do not merge if the result bucket count is too large
{
Expand All @@ -187,9 +187,9 @@ func (s *testFeedbackSuite) TestMergeBuckets(c *C) {
isNewBuckets: []bool{false, false, false, false},
bucketCount: 3,
result: "column:0 ndv:0 totColSize:0\n" +
"num: 2\tlower_bound: 1\tupper_bound: 3\trepeats: 0\n" +
"num: 100002\tlower_bound: 3\tupper_bound: 4\trepeats: 0\n" +
"num: 200002\tlower_bound: 4\tupper_bound: 5\trepeats: 0",
"num: 2 lower_bound: 1 upper_bound: 3 repeats: 0\n" +
"num: 100000 lower_bound: 3 upper_bound: 4 repeats: 0\n" +
"num: 100000 lower_bound: 4 upper_bound: 5 repeats: 0",
},
}
for _, t := range tests {
1 change: 1 addition & 0 deletions statistics/handle.go
@@ -159,6 +159,7 @@ func (h *Handle) Update(is infoschema.InfoSchema) error {
tbl.Version = version
tbl.Count = count
tbl.ModifyCount = modifyCount
tbl.name = getFullTableName(is, tableInfo)
tables = append(tables, tbl)
}
h.mu.Lock()
33 changes: 21 additions & 12 deletions statistics/histogram.go
@@ -341,7 +341,7 @@ func ValueToString(value *types.Datum, idxCols int) (string, error) {
if idxCols == 0 {
return value.ToString()
}
decodedVals, err := codec.Decode(value.GetBytes(), idxCols)
decodedVals, err := codec.DecodeRange(value.GetBytes(), idxCols)
if err != nil {
return "", errors.Trace(err)
}
@@ -352,6 +352,14 @@ func ValueToString(value *types.Datum, idxCols int) (string, error) {
return str, nil
}

func (hg *Histogram) bucketToString(bktID, idxCols int) string {
upperVal, err := ValueToString(hg.GetUpper(bktID), idxCols)
terror.Log(errors.Trace(err))
lowerVal, err := ValueToString(hg.GetLower(bktID), idxCols)
terror.Log(errors.Trace(err))
return fmt.Sprintf("num: %d lower_bound: %s upper_bound: %s repeats: %d", hg.bucketCount(bktID), lowerVal, upperVal, hg.Buckets[bktID].Repeat)
}

// ToString gets the string representation for the histogram.
func (hg *Histogram) ToString(idxCols int) string {
strs := make([]string, 0, hg.Len()+1)
@@ -361,11 +369,7 @@ func (hg *Histogram) ToString(idxCols int) string {
strs = append(strs, fmt.Sprintf("column:%d ndv:%d totColSize:%d", hg.ID, hg.NDV, hg.TotColSize))
}
for i := 0; i < hg.Len(); i++ {
upperVal, err := ValueToString(hg.GetUpper(i), idxCols)
terror.Log(errors.Trace(err))
lowerVal, err := ValueToString(hg.GetLower(i), idxCols)
terror.Log(errors.Trace(err))
strs = append(strs, fmt.Sprintf("num: %d\tlower_bound: %s\tupper_bound: %s\trepeats: %d", hg.Buckets[i].Count, lowerVal, upperVal, hg.Buckets[i].Repeat))
strs = append(strs, hg.bucketToString(i, idxCols))
}
return strings.Join(strs, "\n")
}
@@ -405,14 +409,14 @@ func (hg *Histogram) greaterAndEqRowCount(value types.Datum) float64 {
}

// lessRowCount estimates the row count where the column less than value.
func (hg *Histogram) lessRowCount(value types.Datum) float64 {
func (hg *Histogram) lessRowCountWithBktIdx(value types.Datum) (float64, int) {
// all the values are null
if hg.Bounds == nil {
return 0
return 0, 0
}
index, match := hg.Bounds.LowerBound(0, &value)
if index == hg.Bounds.NumRows() {
return hg.totalRowCount()
return hg.totalRowCount(), hg.Len() - 1
}
// Since we store the lower and upper bounds together, dividing the index by 2 gives the bucket index.
bucketIdx := index / 2
@@ -423,11 +427,16 @@ func (hg *Histogram) lessRowCount(value types.Datum) float64 {
}
if index%2 == 1 {
if match {
return curCount - curRepeat
return curCount - curRepeat, bucketIdx
}
return preCount + hg.calcFraction(bucketIdx, &value)*(curCount-curRepeat-preCount)
return preCount + hg.calcFraction(bucketIdx, &value)*(curCount-curRepeat-preCount), bucketIdx
}
return preCount
return preCount, bucketIdx
}

func (hg *Histogram) lessRowCount(value types.Datum) float64 {
result, _ := hg.lessRowCountWithBktIdx(value)
return result
}

// lessAndEqRowCount estimates the row count where the column less than or equal to value.
(Diffs for the remaining changed files in this commit are not shown here.)
