Skip to content

Commit

Permalink
server: add metric for connection idle time (pingcap#21265)
Browse files: browse the repository at this point in the history
  • Loading branch information
crazycs520 authored Nov 25, 2020
1 parent e01f5e4 commit 727d8f1
Show file tree
Hide file tree
Showing 6 changed files with 170 additions and 3 deletions.
16 changes: 16 additions & 0 deletions infoschema/metric_table_def.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,22 @@ var MetricTableMap = map[string]MetricTableDef{
Labels: []string{"instance"},
Comment: "TiDB current connection counts",
},
"tidb_connection_idle_duration": {
PromQL: `histogram_quantile($QUANTILE, sum(rate(tidb_server_conn_idle_duration_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,in_txn,instance))`,
Labels: []string{"instance", "in_txn"},
Quantile: 0.90,
Comment: "The quantile of TiDB connection idle durations(second)",
},
"tidb_connection_idle_total_count": {
PromQL: `sum(increase(tidb_server_conn_idle_duration_seconds_count{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (in_txn,instance)`,
Labels: []string{"instance", "in_txn"},
Comment: "The total count of TiDB connection idle",
},
"tidb_connection_idle_total_time": {
PromQL: `sum(increase(tidb_server_conn_idle_duration_seconds_sum{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (in_txn,instance)`,
Labels: []string{"instance", "in_txn"},
Comment: "The total time of TiDB connection idle",
},
"node_process_open_fd_count": {
PromQL: "process_open_fds{$LABEL_CONDITIONS}",
Labels: []string{"instance", "job"},
Expand Down
132 changes: 130 additions & 2 deletions metrics/grafana/tidb.json
Original file line number Diff line number Diff line change
Expand Up @@ -606,9 +606,9 @@
"fill": 1,
"gridPos": {
"h": 6,
"w": 24,
"w": 12,
"x": 0,
"y": 19
"y": 25
},
"id": 112,
"legend": {
Expand Down Expand Up @@ -696,6 +696,134 @@
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_TEST-CLUSTER}",
"description": "TiDB connection idle durations",
"fill": 1,
"gridPos": {
"h": 6,
"w": 12,
"x": 12,
"y": 25
},
"id": 218,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "histogram_quantile(0.99, sum(rate(tidb_server_conn_idle_duration_seconds_bucket{in_txn='1'}[1m])) by (le,in_txn,instance))",
"format": "time_series",
"interval": "",
"intervalFactor": 2,
"legendFormat": "99-in-txn",
"refId": "A"
},
{
"expr": "histogram_quantile(0.99, sum(rate(tidb_server_conn_idle_duration_seconds_bucket{in_txn='0'}[1m])) by (le,in_txn,instance))",
"format": "time_series",
"interval": "",
"intervalFactor": 2,
"legendFormat": "99-not-in-txn",
"refId": "B"
},
{
"expr": "histogram_quantile(0.90, sum(rate(tidb_server_conn_idle_duration_seconds_bucket{in_txn='1'}[1m])) by (le,in_txn,instance))",
"format": "time_series",
"interval": "",
"intervalFactor": 2,
"legendFormat": "90-in-txn",
"refId": "C"
},
{
"expr": "histogram_quantile(0.90, sum(rate(tidb_server_conn_idle_duration_seconds_bucket{in_txn='0'}[1m])) by (le,in_txn,instance))",
"format": "time_series",
"interval": "",
"intervalFactor": 2,
"legendFormat": "90-not-in-txn",
"refId": "D"
},
{
"expr": "histogram_quantile(0.80, sum(rate(tidb_server_conn_idle_duration_seconds_bucket{in_txn='1'}[1m])) by (le,in_txn,instance))",
"format": "time_series",
"interval": "",
"intervalFactor": 2,
"legendFormat": "80-in-txn",
"refId": "E"
},
{
"expr": "histogram_quantile(0.80, sum(rate(tidb_server_conn_idle_duration_seconds_bucket{in_txn='0'}[1m])) by (le,in_txn,instance))",
"format": "time_series",
"interval": "",
"intervalFactor": 2,
"legendFormat": "80-not-in-txn",
"refId": "F"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Connection Idle Duration",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"$$hashKey": "object:518",
"format": "s",
"label": null,
"logBase": 2,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
Expand Down
1 change: 1 addition & 0 deletions metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -173,4 +173,5 @@ func RegisterMetrics() {
prometheus.MustRegister(TiKVOnePCTxnCounter)
prometheus.MustRegister(MaxProcs)
prometheus.MustRegister(GOGC)
prometheus.MustRegister(ConnIdleDurationHistogram)
}
9 changes: 9 additions & 0 deletions metrics/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,15 @@ var (
Name: "gogc",
Help: "The value of GOGC",
})

// ConnIdleDurationHistogram is a histogram vector of connection idle time,
// in seconds, partitioned by the LblInTxn label. With the namespace,
// subsystem and name below it is queried as
// tidb_server_conn_idle_duration_seconds.
ConnIdleDurationHistogram = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Namespace: "tidb",
Subsystem: "server",
Name: "conn_idle_duration_seconds",
Help: "Bucketed histogram of connection idle time (s).",
Buckets: prometheus.ExponentialBuckets(0.0005, 2, 29), // 29 buckets doubling from 0.5ms; the largest bound is 0.0005 * 2^28 s, i.e. 0.5ms ~ 1.5days
}, []string{LblInTxn})
)

// ExecuteErrorToLabel converts an execute error to label.
Expand Down
1 change: 1 addition & 0 deletions metrics/session.go
Original file line number Diff line number Diff line change
Expand Up @@ -143,4 +143,5 @@ const (
LblBatchGet = "batch_get"
LblGet = "get"
LblLockKeys = "lock_keys"
LblInTxn = "in_txn"
)
14 changes: 13 additions & 1 deletion server/conn.go
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,9 @@ var (
disconnectNormal = metrics.DisconnectionCounter.WithLabelValues(metrics.LblOK)
disconnectByClientWithError = metrics.DisconnectionCounter.WithLabelValues(metrics.LblError)
disconnectErrorUndetermined = metrics.DisconnectionCounter.WithLabelValues("undetermined")

connIdleDurationHistogramNotInTxn = metrics.ConnIdleDurationHistogram.WithLabelValues("0")
connIdleDurationHistogramInTxn = metrics.ConnIdleDurationHistogram.WithLabelValues("1")
)

// newClientConn creates a *clientConn object.
Expand All @@ -150,6 +153,7 @@ func newClientConn(s *Server) *clientConn {
collation: mysql.DefaultCollationID,
alloc: arena.NewAllocator(32 * 1024),
status: connStatusDispatching,
lastActive: time.Now(),
}
}

Expand All @@ -174,6 +178,7 @@ type clientConn struct {
status int32 // dispatching/reading/shutdown/waitshutdown
lastCode uint16 // last error code
collation uint8 // collation used by client, may be different from the collation used by database.
lastActive time.Time

// mu is used for cancelling the execution of current transaction.
mu struct {
Expand Down Expand Up @@ -923,6 +928,13 @@ func (cc *clientConn) dispatch(ctx context.Context, data []byte) error {
// reset killed for each request
atomic.StoreUint32(&cc.ctx.GetSessionVars().Killed, 0)
}()
t := time.Now()
if (cc.ctx.Status() & mysql.ServerStatusInTrans) > 0 {
connIdleDurationHistogramInTxn.Observe(t.Sub(cc.lastActive).Seconds())
} else {
connIdleDurationHistogramNotInTxn.Observe(t.Sub(cc.lastActive).Seconds())
}

span := opentracing.StartSpan("server.dispatch")
ctx = opentracing.ContextWithSpan(ctx, span)

Expand All @@ -932,7 +944,6 @@ func (cc *clientConn) dispatch(ctx context.Context, data []byte) error {
cc.mu.cancelFunc = cancelFunc
cc.mu.Unlock()

t := time.Now()
cc.lastPacket = data
cmd := data[0]
data = data[1:]
Expand Down Expand Up @@ -969,6 +980,7 @@ func (cc *clientConn) dispatch(ctx context.Context, data []byte) error {

cc.server.releaseToken(token)
span.Finish()
cc.lastActive = time.Now()
}()

vars := cc.ctx.GetSessionVars()
Expand Down

0 comments on commit 727d8f1

Please sign in to comment.