Skip to content

Commit

Permalink
plan, executor: give special treat for index columns when analyze tab…
Browse files Browse the repository at this point in the history
…le (pingcap#2436)

* plan/statistics: statistics support add index info
  • Loading branch information
alivxxx authored and shenli committed Jan 26, 2017
1 parent 7e92364 commit 4d9b70f
Show file tree
Hide file tree
Showing 16 changed files with 519 additions and 144 deletions.
170 changes: 170 additions & 0 deletions executor/analyze.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
// Copyright 2017 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package executor

import (
"math/rand"

"github.com/juju/errors"
"github.com/pingcap/tidb/ast"
"github.com/pingcap/tidb/context"
"github.com/pingcap/tidb/expression"
"github.com/pingcap/tidb/meta"
"github.com/pingcap/tidb/model"
"github.com/pingcap/tidb/plan/statistics"
"github.com/pingcap/tidb/plan/statscache"
"github.com/pingcap/tidb/util/types"
)

var _ Executor = &AnalyzeExec{}

// AnalyzeExec represents Analyze executor.
type AnalyzeExec struct {
schema *expression.Schema
tblInfo *model.TableInfo
ctx context.Context
idxOffsets []int
colOffsets []int
pkOffset int
Srcs []Executor
}

const (
maxSampleCount = 10000
defaultBucketCount = 256
)

// Schema implements the Executor Schema interface.
func (e *AnalyzeExec) Schema() *expression.Schema {
return e.schema
}

// Close implements the Executor Close interface.
func (e *AnalyzeExec) Close() error {
for _, src := range e.Srcs {
err := src.Close()
if err != nil {
return errors.Trace(err)
}
}
return nil
}

// Next implements the Executor Next interface.
func (e *AnalyzeExec) Next() (*Row, error) {
for _, src := range e.Srcs {
ae := src.(*AnalyzeExec)
var count int64 = -1
var sampleRows []*ast.Row
if ae.colOffsets != nil {
rs := &recordSet{executor: ae.Srcs[len(ae.Srcs)-1]}
var err error
count, sampleRows, err = collectSamples(rs)
if err != nil {
return nil, errors.Trace(err)
}
}
columnSamples := rowsToColumnSamples(sampleRows)
var pkRS ast.RecordSet
if ae.pkOffset != -1 {
offset := len(ae.Srcs) - 1
if ae.colOffsets != nil {
offset--
}
pkRS = &recordSet{executor: ae.Srcs[offset]}
}
idxRS := make([]ast.RecordSet, 0, len(ae.idxOffsets))
for i := range ae.idxOffsets {
idxRS = append(idxRS, &recordSet{executor: ae.Srcs[i]})
}
err := ae.buildStatisticsAndSaveToKV(count, columnSamples, idxRS, pkRS)
if err != nil {
return nil, errors.Trace(err)
}
}
return nil, nil
}

func (e *AnalyzeExec) buildStatisticsAndSaveToKV(count int64, columnSamples [][]types.Datum, idxRS []ast.RecordSet, pkRS ast.RecordSet) error {
txn := e.ctx.Txn()
statBuilder := &statistics.Builder{
Sc: e.ctx.GetSessionVars().StmtCtx,
TblInfo: e.tblInfo,
StartTS: int64(txn.StartTS()),
Count: count,
NumBuckets: defaultBucketCount,
ColumnSamples: columnSamples,
ColOffsets: e.colOffsets,
IdxRecords: idxRS,
IdxOffsets: e.idxOffsets,
PkRecords: pkRS,
PkOffset: e.pkOffset,
}
t, err := statBuilder.NewTable()
if err != nil {
return errors.Trace(err)
}
statscache.SetStatisticsTableCache(e.tblInfo.ID, t)
tpb, err := t.ToPB()
if err != nil {
return errors.Trace(err)
}
m := meta.NewMeta(txn)
err = m.SetTableStats(e.tblInfo.ID, tpb)
if err != nil {
return errors.Trace(err)
}
return nil
}

// collectSamples collects sample from the result set, using Reservoir Sampling algorithm.
// See https://en.wikipedia.org/wiki/Reservoir_sampling
func collectSamples(e ast.RecordSet) (count int64, samples []*ast.Row, err error) {
for {
row, err := e.Next()
if err != nil {
return count, samples, errors.Trace(err)
}
if row == nil {
break
}
if len(samples) < maxSampleCount {
samples = append(samples, row)
} else {
shouldAdd := rand.Int63n(count) < maxSampleCount
if shouldAdd {
idx := rand.Intn(maxSampleCount)
samples[idx] = row
}
}
count++
}
return count, samples, nil
}

func rowsToColumnSamples(rows []*ast.Row) [][]types.Datum {
if len(rows) == 0 {
return nil
}
columnSamples := make([][]types.Datum, len(rows[0].Data))
for i := range columnSamples {
columnSamples[i] = make([]types.Datum, len(rows))
}
for j, row := range rows {
for i, val := range row.Data {
columnSamples[i][j] = val
}
}
return columnSamples
}
40 changes: 40 additions & 0 deletions executor/analyze_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
// Copyright 2017 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package executor_test

import (
"fmt"
"strings"

. "github.com/pingcap/check"
"github.com/pingcap/tidb/util/testkit"
"github.com/pingcap/tidb/util/testleak"
)

func (s *testSuite) TestAnalyzeTable(c *C) {
defer testleak.AfterTest(c)()
tk := testkit.NewTestKit(c, s.store)
tk.MustExec("use test")
tk.MustExec("drop table if exists t1")
tk.MustExec("create table t1 (a int)")
tk.MustExec("create index ind_a on t1 (a)")
tk.MustExec("insert into t1 (a) values (1)")
result := tk.MustQuery("explain select * from t1 where t1.a = 1")
rowStr := fmt.Sprintf("%s", result.Rows())
c.Check(strings.Split(rowStr, "{")[0], Equals, "[[IndexScan_5 ")
tk.MustExec("analyze table t1")
result = tk.MustQuery("explain select * from t1 where t1.a = 1")
rowStr = fmt.Sprintf("%s", result.Rows())
c.Check(strings.Split(rowStr, "{")[0], Equals, "[[TableScan_4 ")
}
23 changes: 23 additions & 0 deletions executor/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,8 @@ func (b *executorBuilder) build(p plan.Plan) Executor {
return b.buildDummyScan(v)
case *plan.Cache:
return b.buildCache(v)
case *plan.Analyze:
return b.buildAnalyze(v)
default:
b.err = ErrUnknownPlan.Gen("Unknown Plan %T", p)
return nil
Expand Down Expand Up @@ -679,3 +681,24 @@ func (b *executorBuilder) buildCache(v *plan.Cache) Executor {
Src: src,
}
}

func (b *executorBuilder) buildAnalyze(v *plan.Analyze) Executor {
var tblInfo *model.TableInfo
if v.Table != nil {
tblInfo = v.Table.TableInfo
}
e := &AnalyzeExec{
schema: v.Schema(),
tblInfo: tblInfo,
ctx: b.ctx,
idxOffsets: v.IdxOffsets,
colOffsets: v.ColOffsets,
pkOffset: v.PkOffset,
Srcs: make([]Executor, len(v.Children())),
}
for i, child := range v.Children() {
childExec := b.build(child)
e.Srcs[i] = childExec
}
return e
}
119 changes: 0 additions & 119 deletions executor/executor_simple.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ package executor

import (
"fmt"
"math/rand"
"strings"

"github.com/juju/errors"
Expand All @@ -24,16 +23,12 @@ import (
"github.com/pingcap/tidb/context"
"github.com/pingcap/tidb/expression"
"github.com/pingcap/tidb/infoschema"
"github.com/pingcap/tidb/meta"
"github.com/pingcap/tidb/model"
"github.com/pingcap/tidb/mysql"
"github.com/pingcap/tidb/plan/statistics"
"github.com/pingcap/tidb/plan/statscache"
"github.com/pingcap/tidb/sessionctx/variable"
"github.com/pingcap/tidb/terror"
"github.com/pingcap/tidb/util"
"github.com/pingcap/tidb/util/sqlexec"
"github.com/pingcap/tidb/util/types"
)

// SimpleExec represents simple statement executor.
Expand Down Expand Up @@ -78,8 +73,6 @@ func (e *SimpleExec) Next() (*Row, error) {
err = e.executeDropUser(x)
case *ast.SetPwdStmt:
err = e.executeSetPwd(x)
case *ast.AnalyzeTableStmt:
err = e.executeAnalyzeTable(x)
case *ast.BinlogStmt:
// We just ignore it.
return nil, nil
Expand Down Expand Up @@ -313,115 +306,3 @@ func (e *SimpleExec) executeFlushTable(s *ast.FlushTableStmt) error {
// TODO: A dummy implement
return nil
}

func (e *SimpleExec) executeAnalyzeTable(s *ast.AnalyzeTableStmt) error {
for _, table := range s.TableNames {
err := e.createStatisticsForTable(table)
if err != nil {
return errors.Trace(err)
}
}
return nil
}

const (
maxSampleCount = 10000
defaultBucketCount = 256
)

func (e *SimpleExec) createStatisticsForTable(tn *ast.TableName) error {
var tableName string
if tn.Schema.L == "" {
tableName = tn.Name.L
} else {
tableName = tn.Schema.L + "." + tn.Name.L
}
sql := "select * from " + tableName
result, err := e.ctx.(sqlexec.RestrictedSQLExecutor).ExecRestrictedSQL(e.ctx, sql)
if err != nil {
return errors.Trace(err)
}
count, samples, err := e.collectSamples(result)
result.Close()
if err != nil {
return errors.Trace(err)
}
err = e.buildStatisticsAndSaveToKV(tn, count, samples)
if err != nil {
return errors.Trace(err)
}
return nil
}

// collectSamples collects sample from the result set, using Reservoir Sampling algorithm.
// See https://en.wikipedia.org/wiki/Reservoir_sampling
func (e *SimpleExec) collectSamples(result ast.RecordSet) (count int64, samples []*ast.Row, err error) {
for {
var row *ast.Row
row, err = result.Next()
if err != nil {
return count, samples, errors.Trace(err)
}
if row == nil {
break
}
if len(samples) < maxSampleCount {
samples = append(samples, row)
} else {
shouldAdd := rand.Int63n(count) < maxSampleCount
if shouldAdd {
idx := rand.Intn(maxSampleCount)
samples[idx] = row
}
}
count++
}
return count, samples, nil
}

func (e *SimpleExec) buildStatisticsAndSaveToKV(tn *ast.TableName, count int64, sampleRows []*ast.Row) error {
txn := e.ctx.Txn()
statBuilder := &statistics.Builder{
Sc: e.ctx.GetSessionVars().StmtCtx,
TblInfo: tn.TableInfo,
StartTS: int64(txn.StartTS()),
Count: count,
NumBuckets: defaultBucketCount,
ColumnSamples: rowsToColumnSamples(sampleRows),
PkOffset: -1,
}
for i := range statBuilder.ColumnSamples {
statBuilder.ColOffsets = append(statBuilder.ColOffsets, i)
}
t, err := statBuilder.NewTable()
if err != nil {
return errors.Trace(err)
}
statscache.SetStatisticsTableCache(tn.TableInfo.ID, t)
tpb, err := t.ToPB()
if err != nil {
return errors.Trace(err)
}
m := meta.NewMeta(txn)
err = m.SetTableStats(tn.TableInfo.ID, tpb)
if err != nil {
return errors.Trace(err)
}
return nil
}

func rowsToColumnSamples(rows []*ast.Row) [][]types.Datum {
if len(rows) == 0 {
return nil
}
columnSamples := make([][]types.Datum, len(rows[0].Data))
for i := range columnSamples {
columnSamples[i] = make([]types.Datum, len(rows))
}
for j, row := range rows {
for i, val := range row.Data {
columnSamples[i][j] = val
}
}
return columnSamples
}
Loading

0 comments on commit 4d9b70f

Please sign in to comment.