From 281142567c7b02b6c0467a5da312ca50ba11f348 Mon Sep 17 00:00:00 2001
From: Samuel Berthe
Date: Mon, 16 Mar 2026 03:25:27 +0100
Subject: [PATCH] fix: use proper zero-traffic guard in Envoy ratio alerts (#511) (#513)

Replace `+ 1` denominator hack with `and ... > 0` filter in upstream
timeout rate and upstream 5xx error rate queries for mathematical
correctness and repo consistency.
---
Review notes (this section is not part of the commit message):

Line breaks were restored from a whitespace-collapsed copy of this patch.
The YAML indentation inside the hunk is reconstructed, not recovered, and
must be checked against _data/rules.yml before applying.

The 5xx query divides with ignoring(envoy_response_code_class). The
numerator selects on that label; the denominator series is presumed not to
carry it (confirm against the Envoy stats output), in which case default
one-to-one vector matching pairs nothing and the alert can never fire.
The division result drops the ignored label, so the trailing
`and ... > 0` guard matches without a modifier. The index line below still
records the blob ids of the change as originally committed.

 _data/rules.yml | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/_data/rules.yml b/_data/rules.yml
index 0e78127..4e989f7 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -2363,18 +2363,14 @@ groups:
                 for: 5m
               - name: Envoy high cluster upstream request timeout rate
                 description: "More than 5% of upstream requests are timing out in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}"
-                query: "increase(envoy_cluster_upstream_rq_timeout[5m]) / (increase(envoy_cluster_upstream_rq_completed[5m]) + 1) * 100 > 5"
+                query: "increase(envoy_cluster_upstream_rq_timeout[5m]) / increase(envoy_cluster_upstream_rq_completed[5m]) * 100 > 5 and increase(envoy_cluster_upstream_rq_completed[5m]) > 0"
                 severity: warning
                 for: 5m
-                comments: |
-                  The +1 in the denominator guards against division by zero.
               - name: Envoy high cluster upstream 5xx error rate
                 description: "More than 5% of upstream requests return 5xx in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}"
-                query: 'increase(envoy_cluster_upstream_rq_xx{envoy_response_code_class="5"}[5m]) / (increase(envoy_cluster_upstream_rq_completed[5m]) + 1) * 100 > 5'
+                query: 'increase(envoy_cluster_upstream_rq_xx{envoy_response_code_class="5"}[5m]) / ignoring(envoy_response_code_class) increase(envoy_cluster_upstream_rq_completed[5m]) * 100 > 5 and increase(envoy_cluster_upstream_rq_completed[5m]) > 0'
                 severity: critical
                 for: 1m
-                comments: |
-                  The +1 in the denominator guards against division by zero.
               - name: Envoy cluster health check failures
                 description: "Health checks are consistently failing in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}"
                 query: "increase(envoy_cluster_health_check_failure[5m]) > 5"