mirror of
https://github.com/samber/awesome-prometheus-alerts.git
synced 2026-06-21 08:57:19 +08:00
77 lines
5 KiB
YAML
77 lines
5 KiB
YAML
# Prometheus alerting rules for the Thanos Query component.
# A single rule group named "ThanosQuery"; its alert rules follow under `rules`.
groups:
  - name: ThanosQuery
    rules:
|
|
|
|
# Fires when, per job, more than 5% of HTTP requests to the "query" handler
# returned a 5xx status code (5-minute rate window), sustained for 5 minutes.
- alert: ThanosQueryHttpRequestQueryErrorRateHigh
  expr: '(sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*", handler="query"}[5m]))/ sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query"}[5m]))) * 100 > 5'
  for: 5m
  labels:
    severity: critical
  annotations:
    # The expression aggregates with `sum by (job)`, so `job` is the only
    # label left on the result; `$labels.instance` would render empty.
    summary: 'Thanos Query Http Request Query Error Rate High (job {{ $labels.job }})'
    description: "Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of \"query\" requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
|
|
|
# Fires when, per job, more than 5% of HTTP requests to the "query_range"
# handler returned a 5xx status code (5-minute rate window), sustained for
# 5 minutes.
- alert: ThanosQueryHttpRequestQueryRangeErrorRateHigh
  expr: '(sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*", handler="query_range"}[5m]))/ sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query_range"}[5m]))) * 100 > 5'
  for: 5m
  labels:
    severity: critical
  annotations:
    # The expression aggregates with `sum by (job)`, so `job` is the only
    # label left on the result; `$labels.instance` would render empty.
    summary: 'Thanos Query Http Request Query Range Error Rate High (job {{ $labels.job }})'
    description: "Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of \"query_range\" requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
|
|
|
# Fires when, per job, gRPC server calls handled with one of the listed error
# codes exceed 5% of the calls started (5-minute rate window), sustained for
# 5 minutes.
- alert: ThanosQueryGrpcServerErrorRate
  expr: '(sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-query.*"}[5m]))/ sum by (job) (rate(grpc_server_started_total{job=~".*thanos-query.*"}[5m])) * 100 > 5)'
  for: 5m
  labels:
    severity: warning
  annotations:
    # The expression aggregates with `sum by (job)`, so `job` is the only
    # label left on the result; `$labels.instance` would render empty.
    summary: 'Thanos Query Grpc Server Error Rate (job {{ $labels.job }})'
    description: "Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
|
|
|
# Fires when, per job, gRPC client calls that finished with any code other
# than "OK" exceed 5% of the calls started (5-minute rate window), sustained
# for 5 minutes.
- alert: ThanosQueryGrpcClientErrorRate
  expr: '(sum by (job) (rate(grpc_client_handled_total{grpc_code!="OK", job=~".*thanos-query.*"}[5m])) / sum by (job) (rate(grpc_client_started_total{job=~".*thanos-query.*"}[5m]))) * 100 > 5'
  for: 5m
  labels:
    severity: warning
  annotations:
    # The expression aggregates with `sum by (job)`, so `job` is the only
    # label left on the result; `$labels.instance` would render empty.
    summary: 'Thanos Query Grpc Client Error Rate (job {{ $labels.job }})'
    description: "Thanos Query {{$labels.job}} is failing to send {{$value | humanize}}% of requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
|
|
|
# Fires when, per job, more than 1% of store-API DNS lookups failed
# (5-minute rate window), sustained for 15 minutes.
- alert: ThanosQueryHighDNSFailures
  expr: '(sum by (job) (rate(thanos_query_store_apis_dns_failures_total{job=~".*thanos-query.*"}[5m])) / sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~".*thanos-query.*"}[5m]))) * 100 > 1'
  for: 15m
  labels:
    severity: warning
  annotations:
    # The expression aggregates with `sum by (job)`, so `job` is the only
    # label left on the result; `$labels.instance` would render empty.
    summary: 'Thanos Query High DNS Failures (job {{ $labels.job }})'
    description: "Thanos Query {{$labels.job}} has {{$value | humanize}}% of failing DNS queries for store endpoints.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
|
|
|
# Fires when, per job, the 99th-percentile latency of "query" handler requests
# is above 40 seconds (5-minute rate window) while that handler is receiving
# traffic, sustained for 10 minutes.
- alert: ThanosQueryInstantLatencyHigh
  # The right-hand side of `and` only checks that requests are arriving. It
  # reads the histogram's `_count` series, matching the range-latency rule,
  # instead of summing every cumulative `_bucket` series.
  expr: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m]))) > 40 and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-query.*", handler="query"}[5m])) > 0)'
  for: 10m
  labels:
    severity: critical
  annotations:
    # Both sides of the expression are aggregated down to `job`, so
    # `$labels.instance` would render empty; report the job instead.
    summary: 'Thanos Query Instant Latency High (job {{ $labels.job }})'
    description: "Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for instant queries.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
|
|
|
# Fires when, per job, the 99th-percentile latency of "query_range" handler
# requests is above 90 seconds (5-minute rate window) while that handler is
# receiving traffic, sustained for 10 minutes.
- alert: ThanosQueryRangeLatencyHigh
  expr: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query_range"}[5m]))) > 90 and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-query.*", handler="query_range"}[5m])) > 0)'
  for: 10m
  labels:
    severity: critical
  annotations:
    # Both sides of the expression are aggregated down to `job`, so
    # `$labels.instance` would render empty; report the job instead.
    summary: 'Thanos Query Range Latency High (job {{ $labels.job }})'
    description: "Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for range queries.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
|
|
|
# Fires when the 5-minute maximum of the concurrent-query gate limit minus the
# 5-minute average of in-flight gate queries is below 1, sustained for
# 15 minutes.
- alert: ThanosQueryOverload
  expr: '(max_over_time(thanos_query_concurrent_gate_queries_max[5m]) - avg_over_time(thanos_query_concurrent_gate_queries_in_flight[5m]) < 1)'
  for: 15m
  labels:
    severity: warning
  annotations:
    # No aggregation in the expression, so the series' own labels are kept
    # on the result and remain available to the templates below.
    summary: 'Thanos Query Overload (instance {{ $labels.instance }})'
    description: "Thanos Query {{$labels.job}} has been overloaded for more than 15 minutes. This may be a symptom of excessive simultaneous complex requests, low performance of the Prometheus API, or failures within these components. Assess the health of the Thanos query instances, the connected Prometheus instances, look for potential senders of these requests and then contact support.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|