From f974552ef126524c5b9098035269af156b7d3e24 Mon Sep 17 00:00:00 2001 From: Samuel Berthe Date: Mon, 16 Mar 2026 14:09:03 +0100 Subject: [PATCH] Feat/jaeger alerting rules (#521) * Add .worktrees/ to .gitignore * feat: add Jaeger alerting rules (8 rules from official jaeger-mixin) Rules cover agent HTTP errors, RPC errors, client/agent/collector span drops, sampling update failures, throttling update failures, and query request failures. All rules sourced from https://github.com/jaegertracing/jaeger/tree/main/monitoring/jaeger-mixin * fix: rename Jaeger agent RPC alert to Jaeger client RPC The jaeger_client_jaeger_rpc_http_requests metric is client-side, not agent-side. Rename alert to match the actual metric source. --- .gitignore | 3 ++- README.md | 1 + _data/rules.yml | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 50 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 66a746a..451be5c 100644 --- a/.gitignore +++ b/.gitignore @@ -4,4 +4,5 @@ _site/ .jekyll-metadata _data/rules.json test/rules/ -/node_modules \ No newline at end of file +/node_modules +.worktrees/ \ No newline at end of file diff --git a/README.md b/README.md index 64781ca..0d3a44b 100644 --- a/README.md +++ b/README.md @@ -141,6 +141,7 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts - [Jenkins](https://samber.github.io/awesome-prometheus-alerts/rules#jenkins) - [GitLab](https://samber.github.io/awesome-prometheus-alerts/rules#gitlab) - [Graph Node](https://samber.github.io/awesome-prometheus-alerts/rules#graph-node) +- [Jaeger](https://samber.github.io/awesome-prometheus-alerts/rules#jaeger) ## 🤝 Contributing diff --git a/_data/rules.yml b/_data/rules.yml index d097b18..48cc40d 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -4819,3 +4819,50 @@ groups: comments: | When the circuit breaker trips to "open" state, Git operations (push, pull, clone) will fail. Check Gitaly service health and logs. 
+
+      - name: Jaeger  # every rule below fires on a failure ratio above 1 (percent)
+        exporters:
+          - name: Embedded exporter
+            slug: embedded-exporter
+            doc_url: https://www.jaegertracing.io/docs/latest/monitoring/
+            rules:  # query shape: 100 * failed / total, grouped by (instance, job, namespace)
+              - name: Jaeger agent HTTp server errors
+                description: 'Jaeger agent on {{ $labels.instance }} is experiencing {{ $value | humanize }}% HTTP server errors.'
+                query: '100 * sum(rate(jaeger_agent_http_server_errors_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_agent_http_server_total[1m])) by (instance, job, namespace) > 1'
+                severity: warning
+                for: 15m
+              - name: Jaeger client RPC request errors  # failed = status_code 4xx or 5xx
+                description: 'Jaeger client on {{ $labels.instance }} is experiencing {{ $value | humanize }}% RPC HTTP errors.'
+                query: '100 * sum(rate(jaeger_client_jaeger_rpc_http_requests{status_code=~"4xx|5xx"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_client_jaeger_rpc_http_requests[1m])) by (instance, job, namespace) > 1'
+                severity: warning
+                for: 15m
+              - name: Jaeger client spans dropped  # failed = result dropped or err
+                description: 'Jaeger client on {{ $labels.instance }} is dropping {{ $value | humanize }}% of spans.'
+                query: '100 * sum(rate(jaeger_reporter_spans{result=~"dropped|err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_reporter_spans[1m])) by (instance, job, namespace) > 1'
+                severity: warning
+                for: 15m
+              - name: Jaeger agent spans dropped  # batches_failures_total vs batches_submitted_total
+                description: 'Jaeger agent on {{ $labels.instance }} is dropping {{ $value | humanize }}% of span batches.'
+                query: '100 * sum(rate(jaeger_agent_reporter_batches_failures_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_agent_reporter_batches_submitted_total[1m])) by (instance, job, namespace) > 1'
+                severity: warning
+                for: 15m
+              - name: Jaeger collector dropping spans  # spans_dropped_total vs spans_received_total
+                description: 'Jaeger collector on {{ $labels.instance }} is dropping {{ $value | humanize }}% of spans.'
+                query: '100 * sum(rate(jaeger_collector_spans_dropped_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_collector_spans_received_total[1m])) by (instance, job, namespace) > 1'
+                severity: warning
+                for: 15m
+              - name: Jaeger sampling update failing  # failed = result "err"
+                description: 'Jaeger on {{ $labels.instance }} is failing {{ $value | humanize }}% of sampling policy updates.'
+                query: '100 * sum(rate(jaeger_sampler_queries{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_sampler_queries[1m])) by (instance, job, namespace) > 1'
+                severity: warning
+                for: 15m
+              - name: Jaeger throttling update failing  # failed = result "err"
+                description: 'Jaeger on {{ $labels.instance }} is failing {{ $value | humanize }}% of throttling policy updates.'
+                query: '100 * sum(rate(jaeger_throttler_updates{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_throttler_updates[1m])) by (instance, job, namespace) > 1'
+                severity: warning
+                for: 15m
+              - name: Jaeger query request failures  # failed = result "err"
+                description: 'Jaeger query on {{ $labels.instance }} is failing {{ $value | humanize }}% of requests.'
+                query: '100 * sum(rate(jaeger_query_requests_total{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_query_requests_total[1m])) by (instance, job, namespace) > 1'
+                severity: warning
+                for: 15m