mirror of
https://github.com/samber/awesome-prometheus-alerts.git
synced 2026-06-22 01:17:19 +08:00
94 lines
6.5 KiB
YAML
94 lines
6.5 KiB
YAML
groups:
|
|
|
|
- name: EmbeddedExporter
|
|
|
|
# Jaeger v2 is built on OpenTelemetry Collector and exposes metrics on port 8888 (/metrics).
|
|
# It emits standard otelcol_* pipeline metrics alongside Jaeger-specific storage and query metrics.
|
|
# For span ingestion pipeline alerts (refused spans, export failures, queue saturation),
|
|
# use the OpenTelemetry Collector rules instead.
|
|
|
|
rules:
|
|
|
|
# Warns when more than 1% of storage requests fail, evaluated per
# instance/job/namespace/operation over a 1m rate window.
# The trailing "> 0" clause limits the alert to series that actually saw traffic.
- alert: JaegerHighStorageErrorRate
  expr: >-
    100 * sum(rate(jaeger_storage_requests_total{result="err"}[1m]))
    by (instance, job, namespace, operation)
    / sum(rate(jaeger_storage_requests_total[1m]))
    by (instance, job, namespace, operation)
    > 1
    and sum(rate(jaeger_storage_requests_total[1m]))
    by (instance, job, namespace, operation)
    > 0
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'Jaeger high storage error rate (instance {{ $labels.instance }})'
    description: "Jaeger on {{ $labels.instance }} is experiencing {{ $value | humanize }}% storage errors on {{ $labels.operation }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
|
|
|
# p99 storage latency per operation, computed from the
# jaeger_storage_latency_seconds histogram over a 5m window.
# The 1s limit is only a rough starting point: tune it to your storage backend and data volume.
- alert: JaegerSlowStorageOperations
  expr: >-
    histogram_quantile(0.99,
    sum(rate(jaeger_storage_latency_seconds_bucket[5m]))
    by (le, instance, job, namespace, operation))
    > 1
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'Jaeger slow storage operations (instance {{ $labels.instance }})'
    description: "Jaeger on {{ $labels.instance }} storage p99 latency for {{ $labels.operation }} is {{ $value | humanizeDuration }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
|
|
|
# Scoped to http_route="/api/traces", i.e. the trace search endpoint.
# http_server_request_duration_seconds comes from the otelhttp middleware
# used by the Jaeger query service.
# Warns when more than 1% of those requests return an HTTP 5xx status.
- alert: JaegerQueryServiceHighErrorRate
  expr: >-
    100 * sum(rate(http_server_request_duration_seconds_count{http_route="/api/traces",http_response_status_code=~"5.."}[1m]))
    by (instance, job, namespace)
    / sum(rate(http_server_request_duration_seconds_count{http_route="/api/traces"}[1m]))
    by (instance, job, namespace)
    > 1
    and sum(rate(http_server_request_duration_seconds_count{http_route="/api/traces"}[1m]))
    by (instance, job, namespace)
    > 0
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'Jaeger query service high error rate (instance {{ $labels.instance }})'
    description: "Jaeger query service on {{ $labels.instance }} is returning {{ $value | humanize }}% HTTP 5xx errors.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
|
|
|
# p99 response latency of the trace search endpoint (http_route="/api/traces") over a 5m window.
# The 2s limit is only a rough starting point: tune it to your storage backend and data volume.
- alert: JaegerQueryServiceSlowResponses
  expr: >-
    histogram_quantile(0.99,
    sum(rate(http_server_request_duration_seconds_bucket{http_route="/api/traces"}[5m]))
    by (le, instance, job, namespace))
    > 2
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'Jaeger query service slow responses (instance {{ $labels.instance }})'
    description: "Jaeger query service on {{ $labels.instance }} p99 response latency is {{ $value | humanizeDuration }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
|
|
|
# Fires when storage operations of a given type are failing and none are succeeding.
# Indicates the storage backend (Cassandra, Elasticsearch, etc.) is likely unreachable or misconfigured.
# "unless <ok rate> > 0" is used rather than "and <ok rate> == 0": the "and" form needs a
# matching result="ok" series to exist, so it stays silent if no successful request
# has ever been recorded for the operation (e.g. storage broken since startup).
# "unless" covers both cases: ok series absent, or present with a zero rate.
# VALUE is the per-second error request rate.
- alert: JaegerStorageCompletelyUnavailable
  expr: >-
    sum(rate(jaeger_storage_requests_total{result="err"}[1m]))
    by (instance, job, namespace, operation)
    > 0
    unless sum(rate(jaeger_storage_requests_total{result="ok"}[1m]))
    by (instance, job, namespace, operation)
    > 0
  for: 2m
  labels:
    severity: critical
  annotations:
    summary: 'Jaeger storage completely unavailable (instance {{ $labels.instance }})'
    description: "Jaeger on {{ $labels.instance }} has 100% storage errors for {{ $labels.operation }} — storage backend may be down.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
|
|
|
# Fetching one trace (/api/traces/{traceID}) can take longer than a search,
# particularly when the trace is large, hence the looser limit.
# The 5s threshold is a rough default.
- alert: JaegerSlowSingleTraceRetrieval
  expr: >-
    histogram_quantile(0.99,
    sum(rate(http_server_request_duration_seconds_bucket{http_route="/api/traces/{traceID}"}[5m]))
    by (le, instance, job, namespace))
    > 5
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'Jaeger slow single trace retrieval (instance {{ $labels.instance }})'
    description: "Jaeger on {{ $labels.instance }} p99 latency for single trace retrieval is {{ $value | humanizeDuration }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
|
|
|
# 5xx responses on /api/services mean the storage backend cannot return the list of
# instrumented services, which breaks the service selector in the Jaeger UI.
# Warns when more than 1% of requests to that route fail.
- alert: JaegerServiceDiscoveryErrors
  expr: >-
    100 * sum(rate(http_server_request_duration_seconds_count{http_route="/api/services",http_response_status_code=~"5.."}[1m]))
    by (instance, job, namespace)
    / sum(rate(http_server_request_duration_seconds_count{http_route="/api/services"}[1m]))
    by (instance, job, namespace)
    > 1
    and sum(rate(http_server_request_duration_seconds_count{http_route="/api/services"}[1m]))
    by (instance, job, namespace)
    > 0
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'Jaeger service discovery errors (instance {{ $labels.instance }})'
    description: "Jaeger on {{ $labels.instance }} is returning {{ $value | humanize }}% HTTP 5xx errors on the services endpoint.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
|
|
|
# Fires when an operation (e.g. find_traces, get_services) has received requests but none succeeded.
# May indicate a persistent storage error or a backend that is slow to recover.
# Written as "<all requests> > 0 unless <ok requests> > 0" so that:
#   - it still fires when no result="ok" series exists for the operation
#     (an "and ... == 0" form needs that series to be present), and
#   - VALUE is the number of requests seen in the 15m window instead of a constant 0.
- alert: JaegerNoStorageReadsSucceeding
  expr: >-
    sum(increase(jaeger_storage_requests_total[15m]))
    by (instance, job, namespace, operation)
    > 0
    unless sum(increase(jaeger_storage_requests_total{result="ok"}[15m]))
    by (instance, job, namespace, operation)
    > 0
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'Jaeger no storage reads succeeding (instance {{ $labels.instance }})'
    description: "Jaeger on {{ $labels.instance }} has no successful storage reads for {{ $labels.operation }} in the past 15 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|