forked from pingcap/tidb
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmetric_table_def.go
3171 lines (3155 loc) · 161 KB
/
metric_table_def.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// Copyright 2019 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package infoschema
// MetricTableMap records the metric table definitions. It is exported for use in tests.
// TODO: read the definitions from a system table.
var MetricTableMap = map[string]MetricTableDef{
"tidb_query_duration": {
PromQL: `histogram_quantile($QUANTILE, sum(rate(tidb_server_handle_query_duration_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,sql_type,instance))`,
Labels: []string{"instance", "sql_type"},
Quantile: 0.90,
Comment: "The quantile of TiDB query durations(second)",
},
"tidb_qps": {
PromQL: `sum(rate(tidb_server_query_total{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (result,type,instance)`,
Labels: []string{"instance", "type", "result"},
Comment: "TiDB query processing numbers per second",
},
"tidb_qps_ideal": {
PromQL: `sum(tidb_server_connections) * sum(rate(tidb_server_handle_query_duration_seconds_count[$RANGE_DURATION])) / sum(rate(tidb_server_handle_query_duration_seconds_sum[$RANGE_DURATION]))`,
},
"tidb_ops_statement": {
PromQL: `sum(rate(tidb_executor_statement_total{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance,type)`,
Labels: []string{"instance", "type"},
Comment: "TiDB statement statistics",
},
"tidb_failed_query_opm": {
PromQL: `sum(increase(tidb_server_execute_error_total{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (type, instance)`,
Labels: []string{"instance", "type"},
Comment: "TiDB failed query opm",
},
"tidb_slow_query_duration": {
PromQL: "histogram_quantile($QUANTILE, sum(rate(tidb_server_slow_query_process_duration_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,instance))",
Labels: []string{"instance"},
Quantile: 0.90,
Comment: "The quantile of TiDB slow query statistics with slow query time(second)",
},
"tidb_slow_query_cop_process_duration": {
PromQL: "histogram_quantile($QUANTILE, sum(rate(tidb_server_slow_query_cop_duration_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,instance))",
Labels: []string{"instance"},
Quantile: 0.90,
Comment: "The quantile of TiDB slow query statistics with slow query total cop process time(second)",
},
"tidb_slow_query_cop_wait_duration": {
PromQL: "histogram_quantile($QUANTILE, sum(rate(tidb_server_slow_query_wait_duration_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,instance))",
Labels: []string{"instance"},
Quantile: 0.90,
Comment: "The quantile of TiDB slow query statistics with slow query total cop wait time(second)",
},
"tidb_ops_internal": {
PromQL: "sum(rate(tidb_session_restricted_sql_total{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance)",
Labels: []string{"instance"},
Comment: "TiDB internal SQL is used by TiDB itself.",
},
"tidb_process_mem_usage": {
PromQL: "process_resident_memory_bytes{$LABEL_CONDITIONS}",
Labels: []string{"instance", "job"},
Comment: "process rss memory usage",
},
"go_heap_mem_usage": {
PromQL: "go_memstats_heap_alloc_bytes{$LABEL_CONDITIONS}",
Labels: []string{"instance", "job"},
Comment: "TiDB heap memory size in use",
},
"process_cpu_usage": {
PromQL: "rate(process_cpu_seconds_total{$LABEL_CONDITIONS}[$RANGE_DURATION])",
Labels: []string{"instance", "job"},
},
"tidb_connection_count": {
PromQL: "tidb_server_connections{$LABEL_CONDITIONS}",
Labels: []string{"instance"},
Comment: "TiDB current connection counts",
},
"tidb_connection_idle_duration": {
PromQL: `histogram_quantile($QUANTILE, sum(rate(tidb_server_conn_idle_duration_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,in_txn,instance))`,
Labels: []string{"instance", "in_txn"},
Quantile: 0.90,
Comment: "The quantile of TiDB connection idle durations(second)",
},
"tidb_connection_idle_total_count": {
PromQL: `sum(increase(tidb_server_conn_idle_duration_seconds_count{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (in_txn,instance)`,
Labels: []string{"instance", "in_txn"},
Comment: "The total count of TiDB connection idle",
},
"tidb_connection_idle_total_time": {
PromQL: `sum(increase(tidb_server_conn_idle_duration_seconds_sum{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (in_txn,instance)`,
Labels: []string{"instance", "in_txn"},
Comment: "The total time of TiDB connection idle",
},
"node_process_open_fd_count": {
PromQL: "process_open_fds{$LABEL_CONDITIONS}",
Labels: []string{"instance", "job"},
Comment: "Process opened file descriptors count",
},
"goroutines_count": {
PromQL: " go_goroutines{$LABEL_CONDITIONS}",
Labels: []string{"instance", "job"},
Comment: "Process current goroutines count)",
},
"go_gc_duration": {
PromQL: "rate(go_gc_duration_seconds_sum{$LABEL_CONDITIONS}[$RANGE_DURATION])",
Labels: []string{"instance", "job"},
Comment: "Go garbage collection STW pause duration(second)",
},
"go_threads": {
PromQL: "go_threads{$LABEL_CONDITIONS}",
Labels: []string{"instance", "job"},
Comment: "Total threads TiDB/PD process created currently",
},
"go_gc_count": {
PromQL: " rate(go_gc_duration_seconds_count{$LABEL_CONDITIONS}[$RANGE_DURATION])",
Labels: []string{"instance", "job"},
Comment: "The Go garbage collection counts per second",
},
"go_gc_cpu_usage": {
PromQL: "go_memstats_gc_cpu_fraction{$LABEL_CONDITIONS}",
Labels: []string{"instance", "job"},
Comment: "The fraction of TiDB/PD available CPU time used by the GC since the program started.",
},
"tidb_event_opm": {
PromQL: "increase(tidb_server_event_total{$LABEL_CONDITIONS}[$RANGE_DURATION])",
Labels: []string{"instance", "type"},
Comment: "TiDB Server critical events total, including start/close/shutdown/hang etc",
},
"tidb_keep_alive_opm": {
PromQL: "sum(increase(tidb_monitor_keep_alive_total{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance)",
Labels: []string{"instance"},
Comment: "TiDB instance monitor average keep alive times",
},
"tidb_prepared_statement_count": {
PromQL: "tidb_server_prepared_stmts{$LABEL_CONDITIONS}",
Labels: []string{"instance"},
Comment: "TiDB prepare statements count",
},
"tidb_time_jump_back_ops": {
PromQL: "sum(increase(tidb_monitor_time_jump_back_total{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance)",
Labels: []string{"instance"},
Comment: "TiDB monitor time jump back count",
},
"tidb_panic_count": {
Comment: "TiDB instance panic count",
PromQL: "increase(tidb_server_panic_total{$LABEL_CONDITIONS}[$RANGE_DURATION])",
Labels: []string{"instance"},
},
"tidb_panic_count_total_count": {
Comment: "The total count of TiDB instance panic",
PromQL: "sum(increase(tidb_server_panic_total{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance)",
Labels: []string{"instance"},
},
"tidb_binlog_error_count": {
Comment: "TiDB write binlog error, skip binlog count",
PromQL: "tidb_server_critical_error_total{$LABEL_CONDITIONS}",
Labels: []string{"instance"},
},
"tidb_binlog_error_total_count": {
PromQL: "sum(increase(tidb_server_critical_error_total{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance)",
Labels: []string{"instance"},
Comment: "The total count of TiDB write binlog error and skip binlog",
},
"tidb_get_token_duration": {
PromQL: "histogram_quantile($QUANTILE, sum(rate(tidb_server_get_token_duration_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,instance))",
Labels: []string{"instance"},
Quantile: 0.99,
Comment: " The quantile of Duration (us) for getting token, it should be small until concurrency limit is reached(microsecond)",
},
"tidb_handshake_error_opm": {
PromQL: "sum(increase(tidb_server_handshake_error_total{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance)",
Labels: []string{"instance"},
Comment: "The OPM of TiDB processing handshake error",
},
"tidb_handshake_error_total_count": {
PromQL: "sum(increase(tidb_server_handshake_error_total{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance)",
Labels: []string{"instance"},
Comment: "The total count of TiDB processing handshake error",
},
"tidb_transaction_ops": {
PromQL: "sum(rate(tidb_session_transaction_duration_seconds_count{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (type,sql_type,instance)",
Labels: []string{"instance", "type", "sql_type"},
Comment: "TiDB transaction processing counts by type and source. Internal means TiDB inner transaction calls",
},
"tidb_transaction_duration": {
PromQL: "histogram_quantile($QUANTILE, sum(rate(tidb_session_transaction_duration_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,type,sql_type,instance))",
Labels: []string{"instance", "type", "sql_type"},
Quantile: 0.95,
Comment: "The quantile of transaction execution durations, including retry(second)",
},
"tidb_transaction_retry_num": {
PromQL: "histogram_quantile($QUANTILE, sum(rate(tidb_session_retry_num_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,instance))",
Labels: []string{"instance"},
Comment: "The quantile of TiDB transaction retry num",
Quantile: 0.95,
},
"tidb_transaction_statement_num": {
PromQL: "histogram_quantile($QUANTILE, sum(rate(tidb_session_transaction_statement_num_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,instance,sql_type))",
Labels: []string{"instance", "sql_type"},
Comment: "The quantile of TiDB statements numbers within one transaction. Internal means TiDB inner transaction",
Quantile: 0.95,
},
"tidb_transaction_retry_error_ops": {
PromQL: "sum(rate(tidb_session_retry_error_total{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (type,sql_type,instance)",
Labels: []string{"instance", "type", "sql_type"},
Comment: "Error numbers of transaction retry",
},
"tidb_transaction_retry_error_total_count": {
PromQL: "sum(increase(tidb_session_retry_error_total{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (type,sql_type,instance)",
Labels: []string{"instance", "type", "sql_type"},
Comment: "The total count of transaction retry",
},
"tidb_transaction_local_latch_wait_duration": {
PromQL: "histogram_quantile($QUANTILE, sum(rate(tidb_tikvclient_local_latch_wait_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,instance))",
Labels: []string{"instance"},
Comment: "The quantile of TiDB transaction latch wait time on key value storage(second)",
Quantile: 0.95,
},
"tidb_parse_duration": {
PromQL: "histogram_quantile($QUANTILE, sum(rate(tidb_session_parse_duration_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,sql_type,instance))",
Labels: []string{"instance", "sql_type"},
Quantile: 0.95,
Comment: "The quantile time cost of parsing SQL to AST(second)",
},
"tidb_compile_duration": {
PromQL: "histogram_quantile($QUANTILE, sum(rate(tidb_session_compile_duration_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le, sql_type,instance))",
Labels: []string{"instance", "sql_type"},
Quantile: 0.95,
Comment: "The quantile time cost of building the query plan(second)",
},
"tidb_execute_duration": {
PromQL: "histogram_quantile($QUANTILE, sum(rate(tidb_session_execute_duration_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le, sql_type, instance))",
Labels: []string{"instance", "sql_type"},
Quantile: 0.95,
Comment: "The quantile time cost of executing the SQL which does not include the time to get the results of the query(second)",
},
"tidb_expensive_executors_ops": {
Comment: "TiDB executors using more cpu and memory resources",
PromQL: "sum(rate(tidb_executor_expensive_total{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (type,instance)",
Labels: []string{"instance", "type"},
},
"tidb_query_using_plan_cache_ops": {
PromQL: "sum(rate(tidb_server_plan_cache_total{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (type,instance)",
Labels: []string{"instance", "type"},
Comment: "TiDB plan cache hit ops",
},
"tidb_distsql_execution_duration": {
PromQL: "histogram_quantile($QUANTILE, sum(rate(tidb_distsql_handle_query_duration_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le, type, instance))",
Labels: []string{"instance", "type"},
Quantile: 0.95,
Comment: "The quantile durations of distsql execution(second)",
},
"tidb_distsql_qps": {
PromQL: "sum(rate(tidb_distsql_handle_query_duration_seconds_count{$LABEL_CONDITIONS}[$RANGE_DURATION]))",
Labels: []string{"instance", "type"},
Comment: "distsql query handling durations per second",
},
"tidb_distsql_partial_qps": {
PromQL: "sum(rate(tidb_distsql_scan_keys_partial_num_count{$LABEL_CONDITIONS}[$RANGE_DURATION]))",
Labels: []string{"instance"},
Comment: "the numebr of distsql partial scan numbers",
},
"tidb_distsql_scan_key_num": {
PromQL: "histogram_quantile($QUANTILE, sum(rate(tidb_distsql_scan_keys_num_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,instance))",
Labels: []string{"instance"},
Quantile: 0.95,
Comment: "The quantile numebr of distsql scan numbers",
},
"tidb_distsql_partial_scan_key_num": {
PromQL: "histogram_quantile($QUANTILE, sum(rate(tidb_distsql_scan_keys_partial_num_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,instance))",
Labels: []string{"instance"},
Quantile: 0.95,
Comment: "The quantile numebr of distsql partial scan key numbers",
},
"tidb_distsql_partial_num": {
PromQL: "histogram_quantile($QUANTILE, sum(rate(tidb_distsql_partial_num_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,instance))",
Labels: []string{"instance"},
Quantile: 0.95,
Comment: "The quantile of distsql partial numbers per query",
},
"tidb_cop_duration": {
PromQL: "histogram_quantile($QUANTILE, sum(rate(tidb_tikvclient_cop_duration_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le, instance))",
Labels: []string{"instance"},
Quantile: 0.95,
Comment: "The quantile of kv storage coprocessor processing durations",
},
"tidb_kv_backoff_duration": {
PromQL: "histogram_quantile($QUANTILE, sum(rate(tidb_tikvclient_backoff_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,type,instance))",
Labels: []string{"instance", "type"},
Quantile: 0.95,
Comment: "The quantile of kv backoff time durations(second)",
},
"tidb_kv_backoff_ops": {
PromQL: "sum(rate(tidb_tikvclient_backoff_seconds_count{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (type,instance)",
Labels: []string{"instance", "type"},
Comment: "kv storage backoff times",
},
"tidb_kv_region_error_ops": {
PromQL: "sum(rate(tidb_tikvclient_region_err_total{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (type,instance)",
Labels: []string{"instance", "type"},
Comment: "kv region error times",
},
"tidb_kv_region_error_total_count": {
PromQL: "sum(increase(tidb_tikvclient_region_err_total{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (type,instance)",
Labels: []string{"instance", "type"},
Comment: "The total count of kv region error",
},
"tidb_lock_resolver_ops": {
PromQL: "sum(rate(tidb_tikvclient_lock_resolver_actions_total{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (type,instance)",
Labels: []string{"instance", "type"},
Comment: "lock resolve times",
},
"tidb_lock_resolver_total_num": {
PromQL: "sum(increase(tidb_tikvclient_lock_resolver_actions_total{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (type,instance)",
Labels: []string{"instance", "type"},
Comment: "The total number of lock resolve",
},
"tidb_lock_cleanup_fail_ops": {
PromQL: "sum(rate(tidb_tikvclient_lock_cleanup_task_total{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (type,instance)",
Labels: []string{"instance", "type"},
Comment: "lock cleanup failed ops",
},
"tidb_load_safepoint_fail_ops": {
PromQL: "sum(rate(tidb_tikvclient_load_safepoint_total{$LABEL_CONDITIONS}[$RANGE_DURATION]))",
Labels: []string{"instance", "type"},
Comment: "safe point update ops",
},
"tidb_kv_request_ops": {
Comment: "kv request total by instance and command type",
PromQL: "sum(rate(tidb_tikvclient_request_seconds_count{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance, type)",
Labels: []string{"instance", "type"},
},
"tidb_kv_request_duration": {
PromQL: "histogram_quantile($QUANTILE, sum(rate(tidb_tikvclient_request_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,type,store,instance))",
Labels: []string{"instance", "type", "store"},
Quantile: 0.95,
Comment: "The quantile of kv requests durations by store",
},
"tidb_kv_txn_ops": {
Comment: "TiDB total kv transaction counts",
PromQL: "sum(rate(tidb_tikvclient_txn_total{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance)",
Labels: []string{"instance"},
},
"tidb_kv_write_num": {
PromQL: "histogram_quantile($QUANTILE, sum(rate(tidb_tikvclient_txn_write_kv_num_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le, instance))",
Labels: []string{"instance"},
Quantile: 1,
Comment: "The quantile of kv write count per transaction execution",
},
"tidb_kv_write_size": {
Comment: "The quantile of kv write size per transaction execution",
PromQL: "histogram_quantile($QUANTILE, sum(rate(tidb_tikvclient_txn_write_size_bytes_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le, instance))",
Labels: []string{"instance"},
Quantile: 1,
},
"tidb_txn_region_num": {
PromQL: "histogram_quantile($QUANTILE, sum(rate(tidb_tikvclient_txn_regions_num_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le, instance))",
Labels: []string{"instance"},
Comment: "The quantile of regions transaction operates on count",
Quantile: 0.95,
},
"tidb_load_safepoint_ops": {
PromQL: "sum(rate(tidb_tikvclient_load_safepoint_total{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance,type)",
Labels: []string{"instance", "type"},
Comment: "The OPS of load safe point loading",
},
"tidb_load_safepoint_total_num": {
PromQL: "sum(increase(tidb_tikvclient_load_safepoint_total{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance,type)",
Labels: []string{"instance", "type"},
Comment: "The total count of safe point loading",
},
"tidb_kv_snapshot_ops": {
Comment: "using snapshots total",
PromQL: "sum(rate(tidb_tikvclient_snapshot_total{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance)",
Labels: []string{"instance"},
},
"pd_client_cmd_ops": {
PromQL: "sum(rate(pd_client_cmd_handle_cmds_duration_seconds_count{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (type,instance)",
Labels: []string{"instance", "type"},
Comment: "pd client command ops",
},
"pd_client_cmd_duration": {
PromQL: "histogram_quantile($QUANTILE, sum(rate(pd_client_cmd_handle_cmds_duration_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le, type,instance))",
Labels: []string{"instance", "type"},
Quantile: 0.95,
Comment: "The quantile of pd client command durations",
},
"pd_cmd_fail_ops": {
PromQL: "sum(rate(pd_client_cmd_handle_failed_cmds_duration_seconds_count{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (type,instance)",
Labels: []string{"instance", "type"},
Comment: "pd client command fail count",
},
"pd_cmd_fail_total_count": {
PromQL: "sum(increase(pd_client_cmd_handle_failed_cmds_duration_seconds_count{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (type,instance)",
Labels: []string{"instance", "type"},
Comment: "The total count of pd client command fail",
},
"pd_request_rpc_ops": {
PromQL: "sum(rate(pd_client_request_handle_requests_duration_seconds_count{$LABEL_CONDITIONS}[$RANGE_DURATION]))",
Labels: []string{"instance", "type"},
Comment: "pd client handle request operation per second",
},
"pd_request_rpc_duration": {
Comment: "The quantile of pd client handle request duration(second)",
PromQL: "histogram_quantile($QUANTILE, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,type,instance))",
Labels: []string{"instance", "type"},
Quantile: 0.999,
},
"pd_tso_wait_duration": {
PromQL: "histogram_quantile($QUANTILE, sum(rate(pd_client_cmd_handle_cmds_duration_seconds_bucket{type=\"wait\"}[$RANGE_DURATION])) by (le,instance))",
Labels: []string{"instance"},
Quantile: 0.999,
Comment: "The quantile duration of a client starting to wait for the TS until received the TS result.",
},
"pd_tso_rpc_duration": {
Comment: "The quantile duration of a client sending TSO request until received the response.",
PromQL: "histogram_quantile($QUANTILE, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{type=\"tso\"}[$RANGE_DURATION])) by (le,instance))",
Labels: []string{"instance"},
Quantile: 0.999,
},
"pd_start_tso_wait_duration": {
PromQL: "histogram_quantile($QUANTILE, sum(rate(tidb_pdclient_ts_future_wait_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,instance))",
Labels: []string{"instance"},
Quantile: 0.999,
Comment: "The quantile duration of the waiting time for getting the start timestamp oracle",
},
"tidb_load_schema_duration": {
Comment: "The quantile of TiDB loading schema time durations by instance",
PromQL: "histogram_quantile($QUANTILE, sum(rate(tidb_domain_load_schema_duration_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le, instance))",
Labels: []string{"instance"},
Quantile: 0.99,
},
"tidb_load_schema_ops": {
Comment: "TiDB loading schema times including both failed and successful ones",
PromQL: "sum(rate(tidb_domain_load_schema_total{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance,type)",
Labels: []string{"instance", "type"},
},
"tidb_schema_lease_error_opm": {
Comment: "TiDB schema lease error counts",
PromQL: "sum(increase(tidb_session_schema_lease_error_total{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance)",
Labels: []string{"instance"},
},
"tidb_schema_lease_error_total_count": {
Comment: "The total count of TiDB schema lease error",
PromQL: "sum(increase(tidb_session_schema_lease_error_total{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance)",
Labels: []string{"instance"},
},
"tidb_load_privilege_ops": {
Comment: "TiDB load privilege counts",
PromQL: "sum(rate(tidb_domain_load_privilege_total{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance,type)",
Labels: []string{"instance", "type"},
},
"tidb_ddl_duration": {
Comment: "The quantile of TiDB DDL duration statistics",
PromQL: "histogram_quantile($QUANTILE, sum(rate(tidb_ddl_handle_job_duration_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le, type,instance))",
Labels: []string{"instance", "type"},
Quantile: 0.95,
},
"tidb_ddl_batch_add_index_duration": {
Comment: "The quantile of TiDB batch add index durations by histogram buckets",
PromQL: "histogram_quantile($QUANTILE, sum(rate(tidb_ddl_batch_add_idx_duration_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le, type, instance))",
Labels: []string{"instance", "type"},
Quantile: 0.95,
},
"tidb_ddl_add_index_speed": {
Comment: "TiDB add index speed",
PromQL: "sum(rate(tidb_ddl_add_index_total[$RANGE_DURATION])) by (type)",
},
"tidb_ddl_waiting_jobs_num": {
Comment: "TiDB ddl request in queue",
PromQL: "tidb_ddl_waiting_jobs{$LABEL_CONDITIONS}",
Labels: []string{"instance", "type"},
},
"tidb_ddl_meta_opm": {
Comment: "TiDB different ddl worker numbers",
PromQL: "increase(tidb_ddl_worker_operation_total{$LABEL_CONDITIONS}[$RANGE_DURATION])",
Labels: []string{"instance", "type"},
},
"tidb_ddl_worker_duration": {
Comment: "The quantile of TiDB ddl worker duration",
PromQL: "histogram_quantile($QUANTILE, sum(increase(tidb_ddl_worker_operation_duration_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le, type, action, result,instance))",
Labels: []string{"instance", "type", "result", "action"},
Quantile: 0.95,
},
"tidb_ddl_deploy_syncer_duration": {
PromQL: "histogram_quantile($QUANTILE, sum(rate(tidb_ddl_deploy_syncer_duration_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le, type, result,instance))",
Labels: []string{"instance", "type", "result"},
Quantile: 0.95,
Comment: "The quantile of TiDB ddl schema syncer statistics, including init, start, watch, clear function call time cost",
},
"tidb_owner_handle_syncer_duration": {
Comment: "The quantile of TiDB ddl owner time operations on etcd duration statistics ",
PromQL: "histogram_quantile($QUANTILE, sum(rate(tidb_ddl_owner_handle_syncer_duration_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le, type, result,instance))",
Labels: []string{"instance", "type", "result"},
Quantile: 0.95,
},
"tidb_ddl_update_self_version_duration": {
Comment: "The quantile of TiDB schema syncer version update time duration",
PromQL: "histogram_quantile($QUANTILE, sum(rate(tidb_ddl_update_self_ver_duration_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le, result,instance))",
Labels: []string{"instance", "result"},
Quantile: 0.95,
},
"tidb_ddl_opm": {
Comment: "The quantile of executed DDL jobs per minute",
PromQL: "sum(rate(tidb_ddl_handle_job_duration_seconds_count{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (type,instance)",
Labels: []string{"instance", "type"},
},
"tidb_statistics_auto_analyze_duration": {
Comment: "The quantile of TiDB auto analyze time durations within 95 percent histogram buckets",
PromQL: "histogram_quantile($QUANTILE, sum(rate(tidb_statistics_auto_analyze_duration_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,instance))",
Labels: []string{"instance"},
Quantile: 0.95,
},
"tidb_statistics_auto_analyze_ops": {
Comment: "TiDB auto analyze query per second",
PromQL: "sum(rate(tidb_statistics_auto_analyze_total{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (type,instance)",
Labels: []string{"instance", "type"},
},
"tidb_statistics_stats_inaccuracy_rate": {
Comment: "The quantile of TiDB statistics inaccurate rate",
PromQL: "histogram_quantile($QUANTILE, sum(rate(tidb_statistics_stats_inaccuracy_rate_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,instance))",
Labels: []string{"instance"},
Quantile: 0.95,
},
"tidb_statistics_pseudo_estimation_ops": {
Comment: "TiDB optimizer using pseudo estimation counts",
PromQL: "sum(rate(tidb_statistics_pseudo_estimation_total{$LABEL_CONDITIONS}[$RANGE_DURATION]))",
Labels: []string{"instance"},
},
"tidb_statistics_pseudo_estimation_total_count": {
Comment: "The total count of TiDB optimizer using pseudo estimation",
PromQL: "sum(increase(tidb_statistics_pseudo_estimation_total{$LABEL_CONDITIONS}[$RANGE_DURATION]))",
Labels: []string{"instance"},
},
"tidb_statistics_dump_feedback_ops": {
Comment: "TiDB dumping statistics back to kv storage times",
PromQL: "sum(rate(tidb_statistics_dump_feedback_total{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (type,instance)",
Labels: []string{"instance", "type"},
},
"tidb_statistics_dump_feedback_total_count": {
Comment: "The total count of operations that TiDB dumping statistics back to kv storage",
PromQL: "sum(increase(tidb_statistics_dump_feedback_total{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (type,instance)",
Labels: []string{"instance", "type"},
},
"tidb_statistics_store_query_feedback_qps": {
Comment: "TiDB store quering feedback counts",
PromQL: "sum(rate(tidb_statistics_store_query_feedback_total{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (type,instance) ",
Labels: []string{"instance", "type"},
},
"tidb_statistics_store_query_feedback_total_count": {
Comment: "The total count of TiDB store quering feedback",
PromQL: "sum(increase(tidb_statistics_store_query_feedback_total{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (type,instance) ",
Labels: []string{"instance", "type"},
},
"tidb_statistics_significant_feedback": {
Comment: "Counter of query feedback whose actual count is much different than calculated by current statistics",
PromQL: "sum(rate(tidb_statistics_high_error_rate_feedback_total{$LABEL_CONDITIONS}[$RANGE_DURATION]))",
Labels: []string{"instance"},
},
// Statistics update counter: per-second rate, grouped by type and instance.
"tidb_statistics_update_stats_ops": {
Comment: "TiDB updating statistics using feed back counts",
PromQL: "sum(rate(tidb_statistics_update_stats_total{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (type,instance)",
Labels: []string{"instance", "type"},
},
// Same counter as the entry above, using increase() for the total over the range.
"tidb_statistics_update_stats_total_count": {
Comment: "The total count of TiDB updating statistics using feed back",
PromQL: "sum(increase(tidb_statistics_update_stats_total{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (type,instance)",
Labels: []string{"instance", "type"},
},
// Histogram quantile entries: Quantile is the default substituted for $QUANTILE,
// and "le" must stay in the by-clause for histogram_quantile to work.
"tidb_statistics_fast_analyze_status": {
Comment: "The quantile of TiDB fast analyze statistics ",
PromQL: "histogram_quantile($QUANTILE, sum(rate(tidb_statistics_fast_analyze_status_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le, type,instance))",
Labels: []string{"instance", "type"},
Quantile: 0.95,
},
// Owner / etcd session metrics.
"tidb_new_etcd_session_duration": {
Comment: "The quantile of TiDB new session durations for new etcd sessions",
PromQL: "histogram_quantile($QUANTILE, sum(rate(tidb_owner_new_session_duration_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,type,result, instance))",
Labels: []string{"instance", "type", "result"},
Quantile: 0.95,
},
"tidb_owner_watcher_ops": {
Comment: "TiDB owner watcher counts",
PromQL: "sum(rate(tidb_owner_watch_owner_total{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (type, result, instance)",
Labels: []string{"instance", "type", "result"},
},
// Auto ID request rate, taken from the _count series of the duration histogram.
// The sum is grouped by instance so that the declared "instance" label is
// present in the result; a bare sum() would aggregate it away.
"tidb_auto_id_qps": {
Comment: "TiDB auto id requests per second including single table/global auto id processing and single table auto id rebase processing",
PromQL: "sum(rate(tidb_autoid_operation_duration_seconds_count{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance)",
Labels: []string{"instance"},
},
// Quantile of the auto ID operation duration histogram, per type and instance.
"tidb_auto_id_request_duration": {
Comment: "The quantile of TiDB auto id requests durations",
PromQL: "histogram_quantile($QUANTILE, sum(rate(tidb_autoid_operation_duration_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le, type,instance))",
Labels: []string{"instance", "type"},
Quantile: 0.95,
},
// Per-second rate of region cache operations, split by type and result.
"tidb_region_cache_ops": {
Comment: "TiDB region cache operations count",
PromQL: "sum(rate(tidb_tikvclient_region_cache_operations_total{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (type,result,instance)",
Labels: []string{"instance", "type", "result"},
},
// Quantile of the meta operation duration histogram, split by type and result.
"tidb_meta_operation_duration": {
Comment: "The quantile of TiDB meta operation durations including get/set schema and ddl jobs",
PromQL: "histogram_quantile($QUANTILE, sum(rate(tidb_meta_operation_duration_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le, type,result,instance))",
Labels: []string{"instance", "type", "result"},
Quantile: 0.95,
},
// Garbage collection (GC) metrics reported by the TiDB tikvclient.
// The *_opm entries use increase(), i.e. the growth of the counter over $RANGE_DURATION.
"tidb_gc_worker_action_opm": {
Comment: "kv storage garbage collection counts by type",
PromQL: "sum(increase(tidb_tikvclient_gc_worker_actions_total{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (type,instance)",
Labels: []string{"instance", "type"},
},
// Quantile of the GC duration histogram, grouped by instance and stage.
"tidb_gc_duration": {
Comment: "The quantile of kv storage garbage collection time durations",
PromQL: "histogram_quantile($QUANTILE, sum(rate(tidb_tikvclient_gc_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,instance,stage))",
Labels: []string{"instance", "stage"},
Quantile: 0.95,
},
// Raw series, no aggregation and no range vector.
"tidb_gc_config": {
Comment: "kv storage garbage collection config including gc_life_time and gc_run_interval",
PromQL: "tidb_tikvclient_gc_config{$LABEL_CONDITIONS}",
Labels: []string{"instance", "type"},
},
"tidb_gc_fail_opm": {
Comment: "kv storage garbage collection failing counts",
PromQL: "sum(increase(tidb_tikvclient_gc_failure{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (type,instance)",
Labels: []string{"instance", "type"},
},
"tidb_gc_delete_range_fail_opm": {
Comment: "kv storage unsafe destroy range failed counts",
PromQL: "sum(increase(tidb_tikvclient_gc_unsafe_destroy_range_failures{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (type,instance)",
Labels: []string{"instance", "type"},
},
// NOTE(review): unlike its siblings this entry has no $LABEL_CONDITIONS
// placeholder, no by-clause and no Labels, so it yields a single unlabelled
// value. Confirm that this is intended.
"tidb_gc_too_many_locks_opm": {
Comment: "kv storage region garbage collection clean too many locks count",
PromQL: "sum(increase(tidb_tikvclient_gc_region_too_many_locks[$RANGE_DURATION]))",
},
"tidb_gc_action_result_opm": {
Comment: "kv storage garbage collection results including failed and successful ones",
PromQL: "sum(increase(tidb_tikvclient_gc_action_result{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (type,instance)",
Labels: []string{"instance", "type"},
},
// Sum of the current sample values (no range vector), by type, result and instance.
"tidb_gc_delete_range_task_status": {
Comment: "kv storage delete range task execution status by type",
PromQL: "sum(tidb_tikvclient_range_task_stats{$LABEL_CONDITIONS}) by (type, result,instance)",
Labels: []string{"instance", "type", "result"},
},
"tidb_gc_push_task_duration": {
Comment: "The quantile of kv storage range worker processing one task duration",
PromQL: "histogram_quantile($QUANTILE, sum(rate(tidb_tikvclient_range_task_push_duration_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,type,instance))",
Labels: []string{"instance", "type"},
Quantile: 0.95,
},
// Batch client metrics reported by the TiDB tikvclient.
// Sum of the current pending-request sample values, per store and instance.
"tidb_batch_client_pending_req_count": {
Comment: "kv storage batch requests in queue",
PromQL: "sum(tidb_tikvclient_pending_batch_requests{$LABEL_CONDITIONS}) by (store,instance)",
Labels: []string{"instance", "store"},
},
"tidb_batch_client_wait_duration": {
Comment: "The quantile of kv storage batch processing durations, the unit is nanosecond",
PromQL: "histogram_quantile($QUANTILE, sum(rate(tidb_tikvclient_batch_wait_duration_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le, instance))",
Labels: []string{"instance"},
Quantile: 0.95,
},
// The next three entries read the _bucket, _count and _sum series of the same
// connection-establish histogram: quantile, total count and total time.
"tidb_batch_client_wait_conn_duration": {
PromQL: "histogram_quantile($QUANTILE, sum(rate(tidb_tikvclient_batch_client_wait_connection_establish_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le, instance))",
Labels: []string{"instance"},
Quantile: 0.95,
Comment: "The quantile of batch client wait new connection establish durations",
},
"tidb_batch_client_wait_conn_total_count": {
PromQL: "sum(increase(tidb_tikvclient_batch_client_wait_connection_establish_count{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance)",
Labels: []string{"instance"},
Comment: "The total count of batch client wait new connection establish",
},
"tidb_batch_client_wait_conn_total_time": {
PromQL: "sum(increase(tidb_tikvclient_batch_client_wait_connection_establish_sum{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance)",
Labels: []string{"instance"},
Comment: "The total time of batch client wait new connection establish",
},
// Quantile of the batch client unavailable-seconds histogram, per instance.
"tidb_batch_client_unavailable_duration": {
Comment: "The quantile of kv storage batch processing unavailable durations",
PromQL: "histogram_quantile($QUANTILE, sum(rate(tidb_tikvclient_batch_client_unavailable_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le, instance))",
Labels: []string{"instance"},
Quantile: 0.95,
},
// Process-level metrics that carry a "job" label.
// uptime is the current evaluation time minus the process start timestamp.
"uptime": {
PromQL: "(time() - process_start_time_seconds{$LABEL_CONDITIONS})",
Labels: []string{"instance", "job"},
Comment: "TiDB uptime since last restart(second)",
},
"up": {
PromQL: `up{$LABEL_CONDITIONS}`,
Labels: []string{"instance", "job"},
Comment: "whether the instance is up. 1 is up, 0 is down(off-line)",
},
// "> bool 0" turns the comparison into a 0/1 value instead of filtering series.
// NOTE(review): the selector hard-codes type="save" and has no $LABEL_CONDITIONS
// placeholder; presumably label filters cannot be injected into this query. Confirm.
"pd_role": {
PromQL: `delta(pd_tso_events{type="save"}[$RANGE_DURATION]) > bool 0`,
Labels: []string{"instance"},
Comment: "It indicates whether the current PD is the leader or a follower.",
},
// Reads the "store_up_count" series of pd_cluster_status, summed per instance.
"normal_stores": {
PromQL: `sum(pd_cluster_status{type="store_up_count"}) by (instance)`,
Labels: []string{"instance"},
Comment: "The count of healthy stores",
},
// Store counts of pd_cluster_status for every non-healthy state listed in the
// type matcher. The sum is grouped by instance and type so that the declared
// labels are present in the result; a bare sum() would collapse all states and
// instances into one unlabelled value.
"abnormal_stores": {
PromQL: `sum(pd_cluster_status{ type=~"store_disconnected_count|store_unhealth_count|store_low_space_count|store_down_count|store_offline_count|store_tombstone_count"}) by (instance, type)`,
Labels: []string{"instance", "type"},
},
// PD cluster and config tables. Most entries expose the raw series and only
// apply the label filter; no range vector is involved.
"pd_scheduler_config": {
PromQL: `pd_config_status{$LABEL_CONDITIONS}`,
Labels: []string{"type"},
},
"pd_region_label_isolation_level": {
PromQL: `pd_regions_label_level{$LABEL_CONDITIONS}`,
Labels: []string{"instance", "type"},
},
"pd_label_distribution": {
PromQL: `pd_cluster_placement_status{$LABEL_CONDITIONS}`,
Labels: []string{"name"},
},
// Current values summed per instance and type.
"pd_cluster_status": {
PromQL: `sum(pd_cluster_status{$LABEL_CONDITIONS}) by (instance, type)`,
Labels: []string{"instance", "type"},
},
"pd_cluster_metadata": {
PromQL: `pd_cluster_metadata{$LABEL_CONDITIONS}`,
Labels: []string{"instance", "type"},
},
"pd_region_health": {
PromQL: `sum(pd_regions_status{$LABEL_CONDITIONS}) by (instance, type)`,
Labels: []string{"instance", "type"},
Comment: "It records the unusual Regions' count which may include pending peers, down peers, extra peers, offline peers, missing peers or learner peers",
},
// PD operator metrics. The first two entries read the same series, once with
// delta() and once with increase(), grouped by type, event and instance.
"pd_schedule_operator": {
PromQL: `sum(delta(pd_schedule_operators_count{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (type,event,instance)`,
Labels: []string{"instance", "type", "event"},
Comment: "The number of different operators",
},
"pd_schedule_operator_total_num": {
PromQL: `sum(increase(pd_schedule_operators_count{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (type,event,instance)`,
Labels: []string{"instance", "type", "event"},
Comment: "The total number of different operators",
},
// Operator duration quantiles, grouped by type only (no instance label).
"pd_operator_finish_duration": {
PromQL: `histogram_quantile($QUANTILE, sum(rate(pd_schedule_finish_operators_duration_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,type))`,
Labels: []string{"type"},
Quantile: 0.99,
Comment: "The quantile time consumed when the operator is finished",
},
"pd_operator_step_finish_duration": {
PromQL: `histogram_quantile($QUANTILE, sum(rate(pd_schedule_finish_operator_steps_duration_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,type))`,
Labels: []string{"type"},
Quantile: 0.99,
Comment: "The quantile time consumed when the operator step is finished",
},
// Raw per-store scheduler status series.
"pd_scheduler_store_status": {
PromQL: `pd_scheduler_store_status{$LABEL_CONDITIONS}`,
Labels: []string{"instance", "address", "store", "type"},
},
// The two ratios below divide one pd_scheduler_store_status type by another,
// matched per (address, store). Their selectors hard-code the type and carry
// no $LABEL_CONDITIONS placeholder.
"store_available_ratio": {
PromQL: `sum(pd_scheduler_store_status{type="store_available"}) by (address, store) / sum(pd_scheduler_store_status{type="store_capacity"}) by (address, store)`,
Labels: []string{"address", "store"},
Comment: "It is equal to Store available capacity size over Store capacity size for each TiKV instance",
},
// NOTE(review): the trailing "* 2^20" presumably converts between MiB and bytes
// so that both operands share a unit; confirm against the PD metric definitions.
"store_size_amplification": {
PromQL: `sum(pd_scheduler_store_status{type="region_size"}) by (address, store) / sum(pd_scheduler_store_status{type="store_used"}) by (address, store) * 2^20`,
Labels: []string{"address", "store"},
Comment: "The size amplification, which is equal to Store Region size over Store used capacity size, of each TiKV instance",
},
"pd_scheduler_op_influence": {
PromQL: `pd_scheduler_op_influence{$LABEL_CONDITIONS}`,
Labels: []string{"instance", "scheduler", "store", "type"},
},
"pd_scheduler_tolerant_resource": {
PromQL: `pd_scheduler_tolerant_resource{$LABEL_CONDITIONS}`,
Labels: []string{"instance", "scheduler", "source", "target"},
},
"pd_hotspot_status": {
PromQL: `pd_hotspot_status{$LABEL_CONDITIONS}`,
Labels: []string{"instance", "address", "store", "type"},
},
// PD scheduler and checker tables. Entries with a range vector use delta(),
// i.e. the change of the series over $RANGE_DURATION.
"pd_scheduler_status": {
PromQL: `pd_scheduler_status{$LABEL_CONDITIONS}`,
Labels: []string{"instance", "kind", "type"},
},
"pd_scheduler_balance_leader": {
PromQL: `sum(delta(pd_scheduler_balance_leader{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (address,store,instance,type)`,
Labels: []string{"instance", "address", "store", "type"},
Comment: "The leader movement details among TiKV instances",
},
"pd_scheduler_balance_region": {
PromQL: `sum(delta(pd_scheduler_balance_region{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (address,store,instance,type)`,
Labels: []string{"instance", "address", "store", "type"},
Comment: "The Region movement details among TiKV instances",
},
// Reads pd_scheduler_event_count for whatever scheduler the label filter selects.
"pd_balance_scheduler_status": {
PromQL: `sum(delta(pd_scheduler_event_count{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance,type,name)`,
Labels: []string{"instance", "name", "type"},
Comment: "The inner status of balance leader scheduler",
},
"pd_checker_event_count": {
PromQL: `sum(delta(pd_checker_event_count{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (name,instance,type)`,
Labels: []string{"instance", "name", "type"},
Comment: "The replica/region checker's status",
},
"pd_schedule_filter": {
PromQL: `sum(delta(pd_schedule_filter{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (store, type, scope, instance)`,
Labels: []string{"instance", "scope", "store", "type"},
},
"pd_scheduler_balance_direction": {
PromQL: `sum(delta(pd_scheduler_balance_direction{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (type,source,target,instance)`,
Labels: []string{"instance", "source", "target", "type"},
},
// Raw series, label filter only.
"pd_schedule_store_limit": {
PromQL: `pd_schedule_store_limit{$LABEL_CONDITIONS}`,
Labels: []string{"instance", "store", "type"},
},
// gRPC server handling histogram: the _count series gives the completion rate
// and the _bucket series gives the latency quantile, per gRPC method and instance.
"pd_grpc_completed_commands_rate": {
PromQL: `sum(rate(grpc_server_handling_seconds_count{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (grpc_method,instance)`,
Labels: []string{"instance", "grpc_method"},
Comment: "The rate of completing each kind of gRPC commands",
},
"pd_grpc_completed_commands_duration": {
PromQL: `histogram_quantile($QUANTILE, sum(rate(grpc_server_handling_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,grpc_method,instance))`,
Labels: []string{"instance", "grpc_method"},
Quantile: 0.99,
Comment: "The quantile time consumed of completing each kind of gRPC commands",
},
// PD transaction handling histogram: rate and latency quantile, per instance and result.
"pd_handle_transactions_rate": {
PromQL: `sum(rate(pd_txn_handle_txns_duration_seconds_count{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance, result)`,
Labels: []string{"instance", "result"},
Comment: "The rate of handling etcd transactions",
},
"pd_handle_transactions_duration": {
PromQL: `histogram_quantile($QUANTILE, sum(rate(pd_txn_handle_txns_duration_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le, instance, result))`,
Labels: []string{"instance", "result"},
Quantile: 0.99,
Comment: "The quantile time consumed of handling etcd transactions",
},
// Quantile of the etcd WAL fsync duration histogram, per instance.
"etcd_wal_fsync_duration": {
PromQL: `histogram_quantile($QUANTILE, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,instance))`,
Labels: []string{"instance"},
Quantile: 0.99,
Comment: "The quantile time consumed of writing WAL into the persistent storage",
},
// Quantile of the etcd peer round trip time histogram, per instance and
// destination peer ("To"). The quantile comes from $QUANTILE (default 0.99),
// so the comment does not name a fixed percentile.
"pd_peer_round_trip_duration": {
PromQL: `histogram_quantile($QUANTILE, sum(rate(etcd_network_peer_round_trip_time_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,instance,To))`,
Labels: []string{"instance", "To"},
Quantile: 0.99,
Comment: "The quantile of the network round trip latency between peers",
},
// NOTE(review): this applies delta() to the _count series of a histogram, which
// is a counter, while both the table name and the comment say "rate". rate() or
// increase() is the usual choice for counters. Confirm the intended semantics.
"etcd_disk_wal_fsync_rate": {
PromQL: `delta(etcd_disk_wal_fsync_duration_seconds_count{$LABEL_CONDITIONS}[$RANGE_DURATION])`,
Labels: []string{"instance"},
Comment: "The rate of writing WAL into the persistent storage",
},
// Raw series, label filter only.
"pd_server_etcd_state": {
PromQL: `pd_server_etcd_state{$LABEL_CONDITIONS}`,
Labels: []string{"instance", "type"},
Comment: "The current term of Raft",
},
// Average request duration: rate of the histogram _sum divided by rate of its
// _count, each averaged per type.
"pd_request_rpc_duration_avg": {
PromQL: `avg(rate(pd_client_request_handle_requests_duration_seconds_sum{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (type) / avg(rate(pd_client_request_handle_requests_duration_seconds_count{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (type)`,
Labels: []string{"type"},
},
// Quantile of the region heartbeat latency histogram, per store address.
// The result is passed through round(v, 1000), which rounds to the nearest
// multiple of 1000.
"pd_region_heartbeat_duration": {
PromQL: `round(histogram_quantile($QUANTILE, sum(rate(pd_scheduler_region_heartbeat_latency_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,address, store)), 1000)`,
Labels: []string{"address", "store"},
Quantile: 0.99,
Comment: "The quantile of heartbeat latency of each TiKV instance",
},
// The per-second rate is multiplied by 60 to get a per-minute value before summing.
"pd_scheduler_region_heartbeat": {
PromQL: `sum(rate(pd_scheduler_region_heartbeat{$LABEL_CONDITIONS}[$RANGE_DURATION])*60) by (address,instance, store, status,type)`,
Labels: []string{"instance", "address", "status", "store", "type"},
},
// Raw series, label filter only.
"pd_region_syncer_status": {
PromQL: `pd_region_syncer_status{$LABEL_CONDITIONS}`,
Labels: []string{"instance", "type"},
},
// TiKV instance-level tables: storage size, CPU, memory, I/O and gRPC traffic.
"tikv_engine_size": {
PromQL: `sum(tikv_engine_size_bytes{$LABEL_CONDITIONS}) by (instance, type, db)`,
Labels: []string{"instance", "type", "db"},
Comment: "The storage size per TiKV instance",
},
"tikv_store_size": {
PromQL: `sum(tikv_store_size_bytes{$LABEL_CONDITIONS}) by (instance,type)`,
Labels: []string{"instance", "type"},
Comment: "The available or capacity size of each TiKV instance",
},
// CPU seconds consumed per second, grouped by instance and thread name.
"tikv_thread_cpu": {
PromQL: `sum(rate(tikv_thread_cpu_seconds_total{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance,name)`,
Labels: []string{"instance", "name"},
Comment: "The CPU usage of each TiKV instance",
},
// NOTE(review): process_resident_memory_bytes is not TiKV-specific by name;
// presumably callers narrow it to TiKV through the label filter. Confirm.
"tikv_memory": {
PromQL: `avg(process_resident_memory_bytes{$LABEL_CONDITIONS}) by (instance)`,
Labels: []string{"instance"},
Comment: "The memory usage per TiKV instance",
},
// No aggregation: one row per series, including the "device" label.
"tikv_io_utilization": {
PromQL: `rate(node_disk_io_time_seconds_total{$LABEL_CONDITIONS}[$RANGE_DURATION])`,
Labels: []string{"instance", "device"},
Comment: "The I/O utilization per TiKV instance",
},
// NOTE(review): the name says "mbps" but the query is a plain rate of a bytes
// counter with no unit conversion; confirm the unit callers expect.
"tikv_flow_mbps": {
PromQL: `sum(rate(tikv_engine_flow_bytes{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance,type,db)`,
Labels: []string{"instance", "type", "db"},
Comment: "The total bytes of read and write in each TiKV instance",
},
"tikv_grpc_qps": {
PromQL: `sum(rate(tikv_grpc_msg_duration_seconds_count{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance,type)`,
Labels: []string{"instance", "type"},
Comment: "The QPS per command in each TiKV instance",
},
// Error counters come in pairs: rate() for operations per second and
// increase() for the total over $RANGE_DURATION.
"tikv_grpc_errors": {
PromQL: `sum(rate(tikv_grpc_msg_fail_total{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance,type)`,
Labels: []string{"instance", "type"},
Comment: "The OPS of the gRPC message failures",
},
"tikv_grpc_error_total_count": {
PromQL: `sum(increase(tikv_grpc_msg_fail_total{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance,type)`,
Labels: []string{"instance", "type"},
Comment: "The total count of the gRPC message failures",
},
"tikv_critical_error": {
PromQL: `sum(rate(tikv_critical_error_total{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance, type)`,
Labels: []string{"instance", "type"},
Comment: "The OPS of the TiKV critical error",
},
"tikv_critical_error_total_count": {
PromQL: `sum(increase(tikv_critical_error_total{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance, type)`,
Labels: []string{"instance", "type"},
Comment: "The total number of the TiKV critical error",
},
// Change of the PD heartbeat message counter over $RANGE_DURATION, per
// instance and type. The comment describes heartbeat messages; the previous
// text was copied from the gRPC failure entry.
"tikv_pd_heartbeat": {
PromQL: `sum(delta(tikv_pd_heartbeat_message_total{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance,type)`,
Labels: []string{"instance", "type"},
Comment: "The total number of the PD heartbeat messages",
},
// Current region count per instance and type (no range vector).
"tikv_region_count": {
PromQL: `sum(tikv_raftstore_region_count{$LABEL_CONDITIONS}) by (instance,type)`,
Labels: []string{"instance", "type"},
Comment: "The number of regions on each TiKV instance",
},
// "Busy" events come in pairs: rate() for events per second and increase()
// for the total over $RANGE_DURATION.
"tikv_scheduler_is_busy": {
PromQL: `sum(rate(tikv_scheduler_too_busy_total{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance,db,type,stage)`,
Labels: []string{"instance", "db", "type", "stage"},
Comment: "Indicates occurrences of Scheduler Busy events that make the TiKV instance unavailable temporarily",
},
"tikv_scheduler_is_busy_total_count": {
PromQL: `sum(increase(tikv_scheduler_too_busy_total{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance,db,type,stage)`,
Labels: []string{"instance", "db", "type", "stage"},
Comment: "The total count of Scheduler Busy events that make the TiKV instance unavailable temporarily",
},
"tikv_channel_full": {
PromQL: `sum(rate(tikv_channel_full_total{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance,type,db)`,
Labels: []string{"instance", "db", "type"},
Comment: "The ops of channel full errors on each TiKV instance, it will make the TiKV instance unavailable temporarily",
},
"tikv_channel_full_total_count": {
PromQL: `sum(increase(tikv_channel_full_total{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance,type,db)`,
Labels: []string{"instance", "db", "type"},
Comment: "The total number of channel full errors on each TiKV instance, it will make the TiKV instance unavailable temporarily",
},
// NOTE(review): the two coprocessor entries hard-code type='full' without a
// $LABEL_CONDITIONS placeholder, and group by "type" although Labels omits it.
// Confirm whether label filters are meant to apply here.
"tikv_coprocessor_is_busy": {
PromQL: `sum(rate(tikv_coprocessor_request_error{type='full'}[$RANGE_DURATION])) by (instance,db,type)`,
Labels: []string{"instance", "db"},
Comment: "The ops of Coprocessor Full events that make the TiKV instance unavailable temporarily",
},
"tikv_coprocessor_is_busy_total_count": {
PromQL: `sum(increase(tikv_coprocessor_request_error{type='full'}[$RANGE_DURATION])) by (instance,db,type)`,
Labels: []string{"instance", "db"},
Comment: "The total count of Coprocessor Full events that make the TiKV instance unavailable temporarily",
},
// Average of the "write_stall_percentile99" series per instance and db; the
// selector is fixed and takes no label filter.
"tikv_engine_write_stall": {
PromQL: `avg(tikv_engine_write_stall{type="write_stall_percentile99"}) by (instance, db)`,
Labels: []string{"instance", "db"},
Comment: "Indicates occurrences of Write Stall events that make the TiKV instance unavailable temporarily",
},
// Per-second rate of reported failure messages, grouped by type, instance and
// store_id. The comment says "OPS" because the query uses rate(), matching the
// other rate-based entries such as tikv_grpc_errors.
"tikv_server_report_failures": {
PromQL: `sum(rate(tikv_server_report_failure_msg_total{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (type,instance,store_id)`,
Labels: []string{"instance", "store_id", "type"},
Comment: "The OPS of reported failure messages",
},
"tikv_server_report_failures_total_count": {
PromQL: `sum(increase(tikv_server_report_failure_msg_total{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (type,instance,store_id)`,
Labels: []string{"instance", "store_id", "type"},