Skip to content

Commit

Permalink
*: add metrics package and recover panic of workers. (pingcap#5733)
Browse files Browse the repository at this point in the history
  • Loading branch information
coocood authored Jan 31, 2018
1 parent d520b0b commit 9e67bd8
Show file tree
Hide file tree
Showing 7 changed files with 92 additions and 12 deletions.
11 changes: 10 additions & 1 deletion ddl/ddl_worker.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,11 @@ import (
"github.com/pingcap/tidb/context"
"github.com/pingcap/tidb/kv"
"github.com/pingcap/tidb/meta"
"github.com/pingcap/tidb/metrics"
"github.com/pingcap/tidb/model"
"github.com/pingcap/tidb/sessionctx/binloginfo"
"github.com/pingcap/tidb/terror"
"github.com/pingcap/tidb/util"
log "github.com/sirupsen/logrus"
goctx "golang.org/x/net/context"
)
Expand All @@ -42,7 +44,14 @@ func (d *ddl) onDDLWorker() {

ticker := time.NewTicker(checkTime)
defer ticker.Stop()

defer func() {
r := recover()
if r != nil {
buf := util.GetStack()
log.Errorf("ddlWorker %v %s", r, buf)
metrics.PanicCounter.WithLabelValues(metrics.LabelDDL).Inc()
}
}()
for {
select {
case <-ticker.C:
Expand Down
22 changes: 22 additions & 0 deletions domain/domain.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ package domain

import (
"crypto/tls"
"os"
"sync"
"sync/atomic"
"time"
Expand All @@ -29,12 +30,14 @@ import (
"github.com/pingcap/tidb/infoschema"
"github.com/pingcap/tidb/kv"
"github.com/pingcap/tidb/meta"
"github.com/pingcap/tidb/metrics"
"github.com/pingcap/tidb/model"
"github.com/pingcap/tidb/owner"
"github.com/pingcap/tidb/privilege/privileges"
"github.com/pingcap/tidb/sessionctx/variable"
"github.com/pingcap/tidb/statistics"
"github.com/pingcap/tidb/terror"
"github.com/pingcap/tidb/util"
log "github.com/sirupsen/logrus"
goctx "golang.org/x/net/context"
"google.golang.org/grpc"
Expand Down Expand Up @@ -323,6 +326,7 @@ func (do *Domain) loadSchemaInLoop(lease time.Duration) {
// Use lease/2 here as recommended by the paper.
ticker := time.NewTicker(lease / 2)
defer ticker.Stop()
defer recoverInDomain("loadSchemaInLoop", true)
syncer := do.ddl.SchemaSyncer()

for {
Expand Down Expand Up @@ -526,6 +530,7 @@ func (do *Domain) LoadPrivilegeLoop(ctx context.Context) error {
}

go func() {
defer recoverInDomain("loadPrivilegeInLoop", false)
var count int
for {
ok := true
Expand Down Expand Up @@ -632,6 +637,7 @@ func (do *Domain) updateStatsWorker(ctx context.Context, owner owner.Manager) {
} else {
log.Info("[stats] init stats info takes ", time.Now().Sub(t))
}
defer recoverInDomain("updateStatsWorker", false)
for {
select {
case <-loadTicker.C:
Expand Down Expand Up @@ -681,6 +687,7 @@ func (do *Domain) autoAnalyzeWorker(owner owner.Manager) {
statsHandle := do.StatsHandle()
analyzeTicker := time.NewTicker(do.statsLease)
defer analyzeTicker.Stop()
defer recoverInDomain("autoAnalyzeWorker", false)
for {
select {
case <-analyzeTicker.C:
Expand Down Expand Up @@ -711,6 +718,21 @@ func (do *Domain) NotifyUpdatePrivilege(ctx context.Context) {
}
}

// recoverInDomain handles a panic raised in one of the domain's background
// loops. It has to be installed as `defer recoverInDomain(...)` so that
// recover is invoked directly by the deferred function.
//
// If no panic is in flight it does nothing. Otherwise it logs funcName, the
// panic value and a stack trace, and increments the panic counter with the
// domain label. When quit is true it then sleeps for 15 seconds and
// terminates the process with exit status 1.
func recoverInDomain(funcName string, quit bool) {
	panicVal := recover()
	if panicVal == nil {
		return
	}

	stack := util.GetStack()
	log.Errorf("%s, %v, %s", funcName, panicVal, stack)
	metrics.PanicCounter.WithLabelValues(metrics.LabelDomain).Inc()

	if !quit {
		return
	}
	// Wait for metrics to be pushed.
	time.Sleep(15 * time.Second)
	os.Exit(1)
}

// Domain error codes.
const (
codeInfoSchemaExpired terror.ErrCode = 1
Expand Down
38 changes: 38 additions & 0 deletions metrics/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
// Copyright 2018 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package metrics

import "github.com/prometheus/client_golang/prometheus"

var (
	// PanicCounter counts recovered panics. It is partitioned by a single
	// "type" label whose value names the component that recovered the panic
	// (LabelSession, LabelDomain, LabelDDL or LabelGCWorker).
	//
	// Exactly one label name is declared because callers pass exactly one
	// value to WithLabelValues; WithLabelValues panics when the number of
	// values differs from the number of declared label names, which would
	// turn every recover handler into a second panic.
	PanicCounter = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: "tidb",
			Name:      "panic",
			Help:      "Counter of panic.",
		}, []string{"type"})
)

// Label values passed to PanicCounter.WithLabelValues to identify where a
// panic was recovered.
const (
// LabelSession is used by the client connection handler (clientConn.Run).
LabelSession = "session"
// LabelDomain is used by the domain background loops (recoverInDomain).
LabelDomain = "domain"
// LabelDDL is used by the DDL worker loop (onDDLWorker).
LabelDDL = "ddl"
// LabelGCWorker is used by the GC worker loop (GCWorker.start).
LabelGCWorker = "gcworker"
)

// init registers PanicCounter through prometheus.MustRegister when the
// metrics package is initialized.
func init() {
prometheus.MustRegister(PanicCounter)
}
3 changes: 2 additions & 1 deletion server/conn.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ import (
"github.com/pingcap/tidb/context"
"github.com/pingcap/tidb/executor"
"github.com/pingcap/tidb/kv"
"github.com/pingcap/tidb/metrics"
"github.com/pingcap/tidb/mysql"
"github.com/pingcap/tidb/terror"
"github.com/pingcap/tidb/util/arena"
Expand Down Expand Up @@ -408,7 +409,7 @@ func (cc *clientConn) Run() {
stackSize := runtime.Stack(buf, false)
buf = buf[:stackSize]
log.Errorf("lastCmd %s, %v, %s", cc.lastCmd, r, buf)
panicCounter.Add(1)
metrics.PanicCounter.WithLabelValues(metrics.LabelSession).Inc()
}
if !closedOutside {
err := cc.Close()
Expand Down
10 changes: 0 additions & 10 deletions server/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,23 +62,13 @@ var (
Name: "critical_error",
Help: "Counter of critical errors.",
})

// panicCounter measures the count of panics.
panicCounter = prometheus.NewCounter(
prometheus.CounterOpts{
Namespace: "tidb",
Subsystem: "server",
Name: "panic",
Help: "Counter of panic.",
})
)

// init registers this package's metric collectors through
// prometheus.MustRegister when the package is initialized.
func init() {
prometheus.MustRegister(queryHistogram)
prometheus.MustRegister(queryCounter)
prometheus.MustRegister(connGauge)
prometheus.MustRegister(criticalErrorCounter)
prometheus.MustRegister(panicCounter)
}

func executeErrorToLabel(err error) string {
Expand Down
10 changes: 10 additions & 0 deletions store/tikv/gcworker/gc_worker.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,13 @@ import (
"github.com/pingcap/tidb/ddl/util"
"github.com/pingcap/tidb/kv"
"github.com/pingcap/tidb/meta"
"github.com/pingcap/tidb/metrics"
"github.com/pingcap/tidb/privilege"
"github.com/pingcap/tidb/store/tikv"
"github.com/pingcap/tidb/store/tikv/oracle"
"github.com/pingcap/tidb/store/tikv/tikvrpc"
"github.com/pingcap/tidb/terror"
tidbutil "github.com/pingcap/tidb/util"
log "github.com/sirupsen/logrus"
goctx "golang.org/x/net/context"
)
Expand Down Expand Up @@ -125,6 +127,14 @@ func (w *GCWorker) start(ctx goctx.Context, wg *sync.WaitGroup) {

ticker := time.NewTicker(gcWorkerTickInterval)
defer ticker.Stop()
defer func() {
r := recover()
if r != nil {
buf := tidbutil.GetStack()
log.Errorf("gcWorker %v %s", r, buf)
metrics.PanicCounter.WithLabelValues(metrics.LabelGCWorker).Inc()
}
}()
for {
select {
case <-ticker.C:
Expand Down
10 changes: 10 additions & 0 deletions util/misc.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
package util

import (
"runtime"
"time"

"github.com/juju/errors"
Expand Down Expand Up @@ -43,3 +44,12 @@ func RunWithRetry(retryCnt int, backoff uint64, f func() (bool, error)) (err err
}
return errors.Trace(err)
}

// GetStack returns the stack trace of the calling goroutine, formatted by
// runtime.Stack.
//
// The buffer starts at 4 KiB and is doubled until the trace fits, so deep
// stacks are not cut off at 4096 bytes. Growth stops at 64 KiB to bound the
// size of a single log entry; a trace larger than that is truncated.
func GetStack() []byte {
	const (
		initialSize = 4096
		maxSize     = 64 << 10
	)
	for size := initialSize; ; size *= 2 {
		buf := make([]byte, size)
		n := runtime.Stack(buf, false)
		// runtime.Stack returns the number of bytes written; a completely
		// filled buffer means the trace may have been truncated, so retry
		// with a larger one unless the cap has been reached.
		if n < size || size >= maxSize {
			return buf[:n]
		}
	}
}

0 comments on commit 9e67bd8

Please sign in to comment.