mirror of
https://github.com/samber/awesome-prometheus-alerts.git
synced 2026-06-24 18:36:59 +08:00
Replace `+ 1` denominator hack with `and ... > 0` filter in upstream timeout rate and upstream 5xx error rate queries for mathematical correctness and repo consistency.
This commit is contained in:
parent
6bec57ae96
commit
281142567c
1 changed file with 2 additions and 6 deletions
|
|
@@ -2363,18 +2363,14 @@ groups:
|
||||||
for: 5m
|
for: 5m
|
||||||
- name: Envoy high cluster upstream request timeout rate
|
- name: Envoy high cluster upstream request timeout rate
|
||||||
description: "More than 5% of upstream requests are timing out in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}"
|
description: "More than 5% of upstream requests are timing out in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}"
|
||||||
query: "increase(envoy_cluster_upstream_rq_timeout[5m]) / (increase(envoy_cluster_upstream_rq_completed[5m]) + 1) * 100 > 5"
|
query: "increase(envoy_cluster_upstream_rq_timeout[5m]) / increase(envoy_cluster_upstream_rq_completed[5m]) * 100 > 5 and increase(envoy_cluster_upstream_rq_completed[5m]) > 0"
|
||||||
severity: warning
|
severity: warning
|
||||||
for: 5m
|
for: 5m
|
||||||
comments: |
|
|
||||||
The +1 in the denominator guards against division by zero.
|
|
||||||
- name: Envoy high cluster upstream 5xx error rate
|
- name: Envoy high cluster upstream 5xx error rate
|
||||||
description: "More than 5% of upstream requests return 5xx in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}"
|
description: "More than 5% of upstream requests return 5xx in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}"
|
||||||
query: 'increase(envoy_cluster_upstream_rq_xx{envoy_response_code_class="5"}[5m]) / (increase(envoy_cluster_upstream_rq_completed[5m]) + 1) * 100 > 5'
|
query: 'increase(envoy_cluster_upstream_rq_xx{envoy_response_code_class="5"}[5m]) / increase(envoy_cluster_upstream_rq_completed[5m]) * 100 > 5 and increase(envoy_cluster_upstream_rq_completed[5m]) > 0'
|
||||||
severity: critical
|
severity: critical
|
||||||
for: 1m
|
for: 1m
|
||||||
comments: |
|
|
||||||
The +1 in the denominator guards against division by zero.
|
|
||||||
- name: Envoy cluster health check failures
|
- name: Envoy cluster health check failures
|
||||||
description: "Health checks are consistently failing in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}"
|
description: "Health checks are consistently failing in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}"
|
||||||
query: "increase(envoy_cluster_health_check_failure[5m]) > 5"
|
query: "increase(envoy_cluster_health_check_failure[5m]) > 5"
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue