mirror of
https://github.com/samber/awesome-prometheus-alerts.git
synced 2026-06-22 09:27:26 +08:00
Feat/jaeger alerting rules (#521)
* Add .worktrees/ to .gitignore
* feat: add Jaeger alerting rules (8 rules from official jaeger-mixin)
  Rules cover agent HTTP errors, RPC errors, client/agent/collector span drops, sampling update failures, throttling update failures, and query request failures. All rules sourced from https://github.com/jaegertracing/jaeger/tree/main/monitoring/jaeger-mixin
* fix: rename Jaeger agent RPC alert to Jaeger client RPC
  The jaeger_client_jaeger_rpc_http_requests metric is client-side, not agent-side. Rename alert to match the actual metric source.
This commit is contained in:
parent
eeba1ebbaa
commit
f974552ef1
3 changed files with 50 additions and 1 deletions
3
.gitignore
vendored
3
.gitignore
vendored
|
|
@ -4,4 +4,5 @@ _site/
|
|||
.jekyll-metadata
|
||||
_data/rules.json
|
||||
test/rules/
|
||||
/node_modules
|
||||
/node_modules
|
||||
.worktrees/
|
||||
|
|
@ -141,6 +141,7 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts
|
|||
- [Jenkins](https://samber.github.io/awesome-prometheus-alerts/rules#jenkins)
|
||||
- [GitLab](https://samber.github.io/awesome-prometheus-alerts/rules#gitlab)
|
||||
- [Graph Node](https://samber.github.io/awesome-prometheus-alerts/rules#graph-node)
|
||||
- [Jaeger](https://samber.github.io/awesome-prometheus-alerts/rules#jaeger)
|
||||
|
||||
## 🤝 Contributing
|
||||
|
||||
|
|
|
|||
|
|
@ -4819,3 +4819,50 @@ groups:
|
|||
comments: |
|
||||
When the circuit breaker trips to "open" state, Git operations (push, pull, clone) will fail.
|
||||
Check Gitaly service health and logs.
|
||||
|
||||
- name: Jaeger
|
||||
exporters:
|
||||
- name: Embedded exporter
|
||||
slug: embedded-exporter
|
||||
doc_url: https://www.jaegertracing.io/docs/latest/monitoring/
|
||||
rules:
|
||||
- name: Jaeger agent HTTP server errors
|
||||
description: "Jaeger agent on {{ $labels.instance }} is experiencing {{ $value | humanize }}% HTTP server errors."
|
||||
query: '100 * sum(rate(jaeger_agent_http_server_errors_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_agent_http_server_total[1m])) by (instance, job, namespace) > 1'
|
||||
severity: warning
|
||||
for: 15m
|
||||
- name: Jaeger client RPC request errors
|
||||
description: "Jaeger client on {{ $labels.instance }} is experiencing {{ $value | humanize }}% RPC HTTP errors."
|
||||
query: '100 * sum(rate(jaeger_client_jaeger_rpc_http_requests{status_code=~"4xx|5xx"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_client_jaeger_rpc_http_requests[1m])) by (instance, job, namespace) > 1'
|
||||
severity: warning
|
||||
for: 15m
|
||||
- name: Jaeger client spans dropped
|
||||
description: "Jaeger client on {{ $labels.instance }} is dropping {{ $value | humanize }}% of spans."
|
||||
query: '100 * sum(rate(jaeger_reporter_spans{result=~"dropped|err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_reporter_spans[1m])) by (instance, job, namespace) > 1'
|
||||
severity: warning
|
||||
for: 15m
|
||||
- name: Jaeger agent spans dropped
|
||||
description: "Jaeger agent on {{ $labels.instance }} is dropping {{ $value | humanize }}% of span batches."
|
||||
query: '100 * sum(rate(jaeger_agent_reporter_batches_failures_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_agent_reporter_batches_submitted_total[1m])) by (instance, job, namespace) > 1'
|
||||
severity: warning
|
||||
for: 15m
|
||||
- name: Jaeger collector dropping spans
|
||||
description: "Jaeger collector on {{ $labels.instance }} is dropping {{ $value | humanize }}% of spans."
|
||||
query: '100 * sum(rate(jaeger_collector_spans_dropped_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_collector_spans_received_total[1m])) by (instance, job, namespace) > 1'
|
||||
severity: warning
|
||||
for: 15m
|
||||
- name: Jaeger sampling update failing
|
||||
description: "Jaeger on {{ $labels.instance }} is failing {{ $value | humanize }}% of sampling policy updates."
|
||||
query: '100 * sum(rate(jaeger_sampler_queries{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_sampler_queries[1m])) by (instance, job, namespace) > 1'
|
||||
severity: warning
|
||||
for: 15m
|
||||
- name: Jaeger throttling update failing
|
||||
description: "Jaeger on {{ $labels.instance }} is failing {{ $value | humanize }}% of throttling policy updates."
|
||||
query: '100 * sum(rate(jaeger_throttler_updates{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_throttler_updates[1m])) by (instance, job, namespace) > 1'
|
||||
severity: warning
|
||||
for: 15m
|
||||
- name: Jaeger query request failures
|
||||
description: "Jaeger query on {{ $labels.instance }} is failing {{ $value | humanize }}% of requests."
|
||||
query: '100 * sum(rate(jaeger_query_requests_total{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_query_requests_total[1m])) by (instance, job, namespace) > 1'
|
||||
severity: warning
|
||||
for: 15m
|
||||
|
|
|
|||
Loading…
Reference in a new issue