Add Jaeger alerting rules.

* .gitignore: ignore `.worktrees/`; the file now ends with a newline.
* README.md: list Jaeger in the rule index.
* _data/rules.yml: add a "Jaeger" service (embedded exporter) with eight
  rules of severity `warning` and `for: 15m`. Every query is an error or
  drop ratio expressed in percent, built from 1m rate windows, aggregated
  by (instance, job, namespace) and compared against a threshold of 1.

NOTE(review): the post-image blob id on the .gitignore index line was
computed before the trailing newline was added, so it may be stale;
regenerate the patch if exact blob ids matter.
NOTE(review): YAML indentation below was reconstructed from the nesting;
confirm the context lines against _data/rules.yml before applying.

diff --git a/.gitignore b/.gitignore
index 66a746a..451be5c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,4 +4,5 @@ _site/
 .jekyll-metadata
 _data/rules.json
 test/rules/
-/node_modules
\ No newline at end of file
+/node_modules
+.worktrees/
diff --git a/README.md b/README.md
index 64781ca..0d3a44b 100644
--- a/README.md
+++ b/README.md
@@ -141,6 +141,7 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts
 - [Jenkins](https://samber.github.io/awesome-prometheus-alerts/rules#jenkins)
 - [GitLab](https://samber.github.io/awesome-prometheus-alerts/rules#gitlab)
 - [Graph Node](https://samber.github.io/awesome-prometheus-alerts/rules#graph-node)
+- [Jaeger](https://samber.github.io/awesome-prometheus-alerts/rules#jaeger)
 
 ## 🤝 Contributing
 
diff --git a/_data/rules.yml b/_data/rules.yml
index d097b18..48cc40d 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -4819,3 +4819,50 @@ groups:
                 comments: |
                   When the circuit breaker trips to "open" state, Git operations (push, pull, clone) will fail.
                   Check Gitaly service health and logs.
+
+      - name: Jaeger
+        exporters:
+          - name: Embedded exporter
+            slug: embedded-exporter
+            doc_url: https://www.jaegertracing.io/docs/latest/monitoring/
+            rules:
+              - name: Jaeger agent HTTP server errors
+                description: "Jaeger agent on {{ $labels.instance }} is experiencing {{ $value | humanize }}% HTTP server errors."
+                query: '100 * sum(rate(jaeger_agent_http_server_errors_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_agent_http_server_total[1m])) by (instance, job, namespace) > 1'
+                severity: warning
+                for: 15m
+              - name: Jaeger client RPC request errors
+                description: "Jaeger client on {{ $labels.instance }} is experiencing {{ $value | humanize }}% RPC HTTP errors."
+                query: '100 * sum(rate(jaeger_client_jaeger_rpc_http_requests{status_code=~"4xx|5xx"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_client_jaeger_rpc_http_requests[1m])) by (instance, job, namespace) > 1'
+                severity: warning
+                for: 15m
+              - name: Jaeger client spans dropped
+                description: "Jaeger client on {{ $labels.instance }} is dropping {{ $value | humanize }}% of spans."
+                query: '100 * sum(rate(jaeger_reporter_spans{result=~"dropped|err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_reporter_spans[1m])) by (instance, job, namespace) > 1'
+                severity: warning
+                for: 15m
+              - name: Jaeger agent spans dropped
+                description: "Jaeger agent on {{ $labels.instance }} is dropping {{ $value | humanize }}% of span batches."
+                query: '100 * sum(rate(jaeger_agent_reporter_batches_failures_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_agent_reporter_batches_submitted_total[1m])) by (instance, job, namespace) > 1'
+                severity: warning
+                for: 15m
+              - name: Jaeger collector dropping spans
+                description: "Jaeger collector on {{ $labels.instance }} is dropping {{ $value | humanize }}% of spans."
+                query: '100 * sum(rate(jaeger_collector_spans_dropped_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_collector_spans_received_total[1m])) by (instance, job, namespace) > 1'
+                severity: warning
+                for: 15m
+              - name: Jaeger sampling update failing
+                description: "Jaeger on {{ $labels.instance }} is failing {{ $value | humanize }}% of sampling policy updates."
+                query: '100 * sum(rate(jaeger_sampler_queries{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_sampler_queries[1m])) by (instance, job, namespace) > 1'
+                severity: warning
+                for: 15m
+              - name: Jaeger throttling update failing
+                description: "Jaeger on {{ $labels.instance }} is failing {{ $value | humanize }}% of throttling policy updates."
+                query: '100 * sum(rate(jaeger_throttler_updates{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_throttler_updates[1m])) by (instance, job, namespace) > 1'
+                severity: warning
+                for: 15m
+              - name: Jaeger query request failures
+                description: "Jaeger query on {{ $labels.instance }} is failing {{ $value | humanize }}% of requests."
+                query: '100 * sum(rate(jaeger_query_requests_total{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_query_requests_total[1m])) by (instance, job, namespace) > 1'
+                severity: warning
+                for: 15m