Skip to content

Commit

Permalink
Fix inaccurate APISIX metrics (apache#12108)
Browse files Browse the repository at this point in the history
  • Loading branch information
pg-yang authored Apr 16, 2024
1 parent bf19d64 commit bea7ed3
Show file tree
Hide file tree
Showing 6 changed files with 31 additions and 33 deletions.
3 changes: 0 additions & 3 deletions docs/en/api/metrics-query-expression.md
Original file line number Diff line number Diff line change
Expand Up @@ -251,9 +251,6 @@ The order of the new label values should be the same as the order of the label v
For example:
If we want to query the `service_percentile` metric with the label values `50,75,90,95,99`, and rename the label name to `percentile` and the label values to `P50,P75,P90,P95,P99`, we can use the following expression:

```text
and rename the label values to `P50,P75,P90,P95,P99`, we can use the following expression:
```text
relabel(service_percentile{p='50,75,90,95,99'}, p='50,75,90,95,99', percentile='P50,P75,P90,P95,P99')
```
Expand Down
1 change: 1 addition & 0 deletions docs/en/changes/changes.md
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@
- `memory_swap_percentage` -> `memory_virtual_memory_percentage`
* Fix/Change UI init setting for Windows Swap -> Virtual Memory
* Fix `Memory Swap Usage`/`Virtual Memory Usage` display with UI init.(Linux/Windows)
* Fix inaccurate APISIX metrics

#### UI

Expand Down
24 changes: 12 additions & 12 deletions oap-server/server-starter/src/main/resources/otel-rules/apisix.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,24 +36,24 @@ metricsRules:
# Service
# Exclude http_connections samples whose state is `accepted` or `handled`, because their actual type is counter
- name: sv_http_connections
exp: apisix_nginx_http_current_connections.tagNotMatch('state','accepted|handled').sum(['state','service_name']).service(['service_name'] , Layer.APISIX)
exp: apisix_nginx_http_current_connections.tagNotMatch('state','accepted|handled').sum(['state','service_name','node']).service(['service_name'] , Layer.APISIX)
- name: sv_http_requests
exp: apisix_http_requests_total.sum(['service_instance_id','service_name']).rate('PT1M').service(['service_name'] , Layer.APISIX)
exp: apisix_http_requests_total.sum(['service_instance_id','service_name','node']).rate('PT1M').service(['service_name'] , Layer.APISIX)
# Not match any route
# Refer to https://apisix.apache.org/docs/apisix/plugins/prometheus/
- name: sv_bandwidth_unmatched
exp: apisix_bandwidth.tagEqual('route' , '' , 'node' , '').sum(['type','service_name']).rate('PT1M').service(['service_name'] , Layer.APISIX)
exp: apisix_bandwidth.tagEqual('route' , '' , 'node' , '').sum(['type','service_name','node']).rate('PT1M').service(['service_name'] , Layer.APISIX)
- name: sv_http_status_unmatched
exp: apisix_http_status.tagEqual('route' , '' , 'node' , '').sum(['code','service_name']).rate('PT1M').service(['service_name'] , Layer.APISIX)
exp: apisix_http_status.tagEqual('route' , '' , 'node' , '').sum(['code','service_name','node']).rate('PT1M').service(['service_name'] , Layer.APISIX)
- name: sv_http_latency_unmatched
exp: apisix_http_latency.tagEqual('route' , '' , 'node' , '').sum(['type','le','service_name']).histogram().histogram_percentile([50,70,90,99]).service(['service_name'] , Layer.APISIX)
exp: apisix_http_latency.tagEqual('route' , '' , 'node' , '').sum(['type','le','service_name','node']).histogram().histogram_percentile([50,70,90,99]).service(['service_name'] , Layer.APISIX)
# Match a route
- name: sv_bandwidth_matched
exp: apisix_bandwidth.tagNotEqual('route' , '' , 'node' , '').sum(['type','service_name']).rate('PT1M').service(['service_name'] , Layer.APISIX)
exp: apisix_bandwidth.tagNotEqual('route' , '' , 'node' , '').sum(['type','service_name','node']).rate('PT1M').service(['service_name'] , Layer.APISIX)
- name: sv_http_status_matched
exp: apisix_http_status.tagNotEqual('route' , '' , 'node' , '').sum(['code','service_name']).rate('PT1M').service(['service_name'] , Layer.APISIX)
exp: apisix_http_status.tagNotEqual('route' , '' , 'node' , '').sum(['code','service_name','node']).rate('PT1M').service(['service_name'] , Layer.APISIX)
- name: sv_http_latency_matched
exp: apisix_http_latency.tagNotEqual('route' , '' , 'node' , '').sum(['type','le','service_name']).histogram().histogram_percentile([50,70,90,99]).service(['service_name'] , Layer.APISIX)
exp: apisix_http_latency.tagNotEqual('route' , '' , 'node' , '').sum(['type','le','service_name','node']).histogram().histogram_percentile([50,70,90,99]).service(['service_name'] , Layer.APISIX)

# Instance
- name: instance_shared_dict_capacity_bytes
Expand Down Expand Up @@ -88,15 +88,15 @@ metricsRules:
# Endpoint
# Reorganize metrics that have a `route` label into endpoints, with the endpoint name formatted as `route/{route}`
- name: endpoint_http_status
exp: apisix_http_status.tagNotEqual('route','').tag({tags->tags.route = 'route/'+tags['route']}).sum(['code','service_name','route']).rate('PT1M').endpoint(['service_name'],['route'], Layer.APISIX)
exp: apisix_http_status.tagNotEqual('route','').tag({tags->tags.route = 'route/'+tags['route']}).sum(['code','service_name','route','node']).rate('PT1M').endpoint(['service_name'],['route'], Layer.APISIX)
- name: endpoint_bandwidth
exp: apisix_bandwidth.tagNotEqual('route','').tag({tags->tags.route = 'route/'+tags['route']}).sum(['type','service_name','route']).rate('PT1M').endpoint(['service_name'],['route'], Layer.APISIX)
exp: apisix_bandwidth.tagNotEqual('route','').tag({tags->tags.route = 'route/'+tags['route']}).sum(['type','service_name','route','node']).rate('PT1M').endpoint(['service_name'],['route'], Layer.APISIX)
- name: endpoint_http_latency
exp: apisix_http_latency.tagNotEqual('route','').tag({tags->tags.route = 'route/'+tags['route']}).sum(['type','le','service_name','route']).histogram().histogram_percentile([50,70,90,99]).endpoint(['service_name'],['route'], Layer.APISIX)
exp: apisix_http_latency.tagNotEqual('route','').tag({tags->tags.route = 'route/'+tags['route']}).sum(['type','le','service_name','route','node']).histogram().histogram_percentile([50,70,90,99]).endpoint(['service_name'],['route'], Layer.APISIX)
# Reorganize metrics that have a `node` label into endpoints, with the endpoint name formatted as `upstream/{node}`
- name: endpoint_http_status
exp: apisix_http_status.tagNotEqual('node','').tag({tags->tags.node = 'upstream/'+tags['node']}).sum(['code','service_name','node']).rate('PT1M').endpoint(['service_name'],['node'], Layer.APISIX)
- name: endpoint_bandwidth
exp: apisix_bandwidth.tagNotEqual('node','').tag({tags->tags.node = 'upstream/'+tags['node']}).sum(['type','service_name','node']).rate('PT1M').endpoint(['service_name'],['node'], Layer.APISIX)
- name: endpoint_http_latency
exp: apisix_http_latency.tagNotEqual('node','').tag({tags->tags.node = 'upstream/'+tags['node']})sum(['type','le','service_name','node']).histogram().histogram_percentile([50,70,90,99]).endpoint(['service_name'],['node'], Layer.APISIX)
exp: apisix_http_latency.tagNotEqual('node','').tag({tags->tags.node = 'upstream/'+tags['node']}).sum(['type','le','service_name','node']).histogram().histogram_percentile([50,70,90,99]).endpoint(['service_name'],['node'], Layer.APISIX)
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
"showYAxis": true
},
"expressions": [
"meter_apisix_endpoint_http_status"
"aggregate_labels(meter_apisix_endpoint_http_status,sum(code))"
],
"associate": [
{
Expand Down Expand Up @@ -55,7 +55,7 @@
"showYAxis": true
},
"expressions": [
"meter_apisix_endpoint_http_latency"
"aggregate_labels(meter_apisix_endpoint_http_latency,avg(type,p))"
],
"associate": [
{
Expand Down Expand Up @@ -88,7 +88,7 @@
"showYAxis": true
},
"expressions": [
"meter_apisix_endpoint_bandwidth/1024"
"aggregate_labels(meter_apisix_endpoint_bandwidth/1024,sum(type))"
],
"associate": [
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
"showYAxis": true
},
"expressions": [
"meter_apisix_sv_http_requests"
"aggregate_labels(meter_apisix_sv_http_requests,sum(service_instance_id))"
]
},
{
Expand All @@ -44,7 +44,7 @@
"showYAxis": true
},
"expressions": [
"meter_apisix_sv_http_status_matched"
"aggregate_labels(meter_apisix_sv_http_status_matched,sum(code))"
],
"associate": [
{
Expand Down Expand Up @@ -89,7 +89,7 @@
"showYAxis": true
},
"expressions": [
"meter_apisix_sv_http_latency_matched"
"aggregate_labels(meter_apisix_sv_http_latency_matched,avg(type,p))"
],
"associate": [
{
Expand Down Expand Up @@ -134,7 +134,7 @@
"showYAxis": true
},
"expressions": [
"meter_apisix_sv_bandwidth_matched/1024"
"aggregate_labels(meter_apisix_sv_bandwidth_matched/1024,sum(type))"
],
"associate": [
{
Expand Down Expand Up @@ -168,7 +168,7 @@
"i": "5",
"type": "Widget",
"expressions": [
"meter_apisix_sv_http_connections"
"aggregate_labels(meter_apisix_sv_http_connections,sum(state))"
],
"graph": {
"type": "Line",
Expand Down Expand Up @@ -224,7 +224,7 @@
"showYAxis": true
},
"expressions": [
"meter_apisix_sv_http_status_unmatched"
"aggregate_labels(meter_apisix_sv_http_status_unmatched,sum(code))"
],
"associate": [
{
Expand Down Expand Up @@ -269,7 +269,7 @@
"showYAxis": true
},
"expressions": [
"meter_apisix_sv_http_latency_unmatched"
"aggregate_labels(meter_apisix_sv_http_latency_unmatched,avg(type,p))"
],
"associate": [
{
Expand Down Expand Up @@ -314,7 +314,7 @@
"showYAxis": true
},
"expressions": [
"meter_apisix_sv_bandwidth_unmatched/1024"
"aggregate_labels(meter_apisix_sv_bandwidth_unmatched/1024,sum(type))"
],
"associate": [
{
Expand Down Expand Up @@ -379,11 +379,11 @@
"isRoot": false,
"isDefault": true,
"expressions": [
"avg(meter_apisix_sv_http_status_matched{code='200'})",
"avg(meter_apisix_sv_http_status_matched{code='304'})",
"avg(meter_apisix_sv_http_status_matched{code='404'})",
"avg(meter_apisix_sv_http_status_matched{code='499'})",
"avg(meter_apisix_sv_http_status_matched{code='503'})"
"avg(aggregate_labels(meter_apisix_sv_http_status_matched{code='200'},sum(code)))",
"avg(aggregate_labels(meter_apisix_sv_http_status_matched{code='304'},sum(code)))",
"avg(aggregate_labels(meter_apisix_sv_http_status_matched{code='404'},sum(code)))",
"avg(aggregate_labels(meter_apisix_sv_http_status_matched{code='499'},sum(code)))",
"avg(aggregate_labels(meter_apisix_sv_http_status_matched{code='503'},sum(code)))"
],
"expressionsConfig": [
{
Expand Down
4 changes: 2 additions & 2 deletions test/e2e-v2/cases/apisix/apisix-cases.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
- query: swctl --display yaml --base-url=http://${oap_host}:${oap_12800}/graphql instance ls --service-name=APISIX::showcase-apisix-service
expected: expected/instance.yml
# service metrics
- query: swctl --display yaml --base-url=http://${oap_host}:${oap_12800}/graphql metrics exec --expression=meter_apisix_sv_http_connections --service-name=APISIX::showcase-apisix-service
- query: swctl --display yaml --base-url=http://${oap_host}:${oap_12800}/graphql metrics exec --expression='aggregate_labels(meter_apisix_sv_http_connections,sum(state))' --service-name=APISIX::showcase-apisix-service
expected: expected/metrics-has-connection-value-label.yml
# instance metrics
- query: |
Expand All @@ -31,5 +31,5 @@
)
expected: expected/metrics-has-status-value-label.yml
# endpoint metrics
- query: swctl --display yaml --base-url=http://${oap_host}:${oap_12800}/graphql metrics exec --expression=meter_apisix_endpoint_http_latency --endpoint-name='route/routes#1' --service-name=APISIX::showcase-apisix-service
- query: swctl --display yaml --base-url=http://${oap_host}:${oap_12800}/graphql metrics exec --expression='aggregate_labels(meter_apisix_endpoint_http_latency,avg(type,p))' --endpoint-name='route/routes#1' --service-name=APISIX::showcase-apisix-service
expected: expected/metrics-has-latency-value-label.yml

0 comments on commit bea7ed3

Please sign in to comment.