awesome-prometheus-alerts/dist/rules/thanos/thanos-query.yml
2025-11-05 16:04:56 +00:00

78 lines
5 KiB
YAML

# Prometheus alerting rule group "ThanosQuery"; the entries that follow are its alert rules.
# NOTE(review): indentation appears to have been stripped from this copy. For
# Prometheus to load the file, `name` and `rules` must belong to the same list
# item under `groups`, and every `- alert` entry must nest under `rules` —
# restore the nesting before use.
groups:
- name: ThanosQuery
rules:
# Fires when, for a thanos-query job, more than 5% of HTTP requests to the
# "query" handler returned a 5xx status code, sustained for 5 minutes.
- alert: ThanosQueryHttpRequestQueryErrorRateHigh
  # Per-job ratio of 5xx request rate to total request rate (5m window), as a percentage.
  expr: '(sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*", handler="query"}[5m]))/ sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query"}[5m]))) * 100 > 5'
  for: 5m
  labels:
    severity: critical
  annotations:
    # `sum by (job)` keeps only the `job` label, so `$labels.instance` would
    # always render empty here; identify the job instead.
    summary: Thanos Query Http Request Query Error Rate High (job {{ $labels.job }})
    description: "Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of \"query\" requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when, for a thanos-query job, more than 5% of HTTP requests to the
# "query_range" handler returned a 5xx status code, sustained for 5 minutes.
- alert: ThanosQueryHttpRequestQueryRangeErrorRateHigh
  # Per-job ratio of 5xx request rate to total request rate (5m window), as a percentage.
  expr: '(sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*", handler="query_range"}[5m]))/ sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query_range"}[5m]))) * 100 > 5'
  for: 5m
  labels:
    severity: critical
  annotations:
    # `sum by (job)` keeps only the `job` label, so `$labels.instance` would
    # always render empty here; identify the job instead.
    summary: Thanos Query Http Request Query Range Error Rate High (job {{ $labels.job }})
    description: "Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of \"query_range\" requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when, for a thanos-query job, more than 5% of started gRPC server
# calls finished with one of the listed error codes, sustained for 5 minutes.
- alert: ThanosQueryGrpcServerErrorRate
  # Per-job ratio of errored handled calls to started calls (5m window), as a
  # percentage. The ratio is parenthesised explicitly before scaling and
  # comparing; this is equivalent to the implicit `/`, `*`, `>` precedence.
  expr: '(sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-query.*"}[5m]))/ sum by (job) (rate(grpc_server_started_total{job=~".*thanos-query.*"}[5m]))) * 100 > 5'
  for: 5m
  labels:
    severity: warning
  annotations:
    # `sum by (job)` keeps only the `job` label, so `$labels.instance` would
    # always render empty here; identify the job instead.
    summary: Thanos Query Grpc Server Error Rate (job {{ $labels.job }})
    description: "Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when, for a thanos-query job, more than 5% of started gRPC client
# calls finished with a code other than "OK", sustained for 5 minutes.
- alert: ThanosQueryGrpcClientErrorRate
  # Per-job ratio of non-OK handled calls to started calls (5m window), as a percentage.
  expr: '(sum by (job) (rate(grpc_client_handled_total{grpc_code!="OK", job=~".*thanos-query.*"}[5m])) / sum by (job) (rate(grpc_client_started_total{job=~".*thanos-query.*"}[5m]))) * 100 > 5'
  for: 5m
  labels:
    severity: warning
  annotations:
    # `sum by (job)` keeps only the `job` label, so `$labels.instance` would
    # always render empty here; identify the job instead.
    summary: Thanos Query Grpc Client Error Rate (job {{ $labels.job }})
    description: "Thanos Query {{$labels.job}} is failing to send {{$value | humanize}}% of requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when, for a thanos-query job, more than 1% of store-API DNS lookups
# failed, sustained for 15 minutes.
- alert: ThanosQueryHighDNSFailures
  # Per-job ratio of DNS failure rate to DNS lookup rate (5m window), as a percentage.
  expr: '(sum by (job) (rate(thanos_query_store_apis_dns_failures_total{job=~".*thanos-query.*"}[5m])) / sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~".*thanos-query.*"}[5m]))) * 100 > 1'
  for: 15m
  labels:
    severity: warning
  annotations:
    # `sum by (job)` keeps only the `job` label, so `$labels.instance` would
    # always render empty here; identify the job instead.
    summary: Thanos Query High DNS Failures (job {{ $labels.job }})
    description: "Thanos Query {{$labels.job}} has {{$value | humanize}}% of failing DNS queries for store endpoints.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when, for a thanos-query job, the 99th-percentile duration of HTTP
# requests to the "query" handler exceeds 40 seconds, sustained for 10 minutes.
- alert: ThanosQueryInstantLatencyHigh
  # Left side: per-job p99 computed from the duration histogram buckets.
  # Right side of `and`: traffic guard so the alert only fires while requests
  # are actually being observed. It uses the histogram's `_count` series (one
  # series per label set) rather than summing every bucket; the `> 0` result is
  # the same and it matches the guard used by ThanosQueryRangeLatencyHigh.
  expr: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m]))) > 40 and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-query.*", handler="query"}[5m])) > 0)'
  for: 10m
  labels:
    severity: critical
  annotations:
    # The aggregation keeps only the `job` label, so `$labels.instance` would
    # always render empty here; identify the job instead.
    summary: Thanos Query Instant Latency High (job {{ $labels.job }})
    description: "Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for instant queries.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when, for a thanos-query job, the 99th-percentile duration of HTTP
# requests to the "query_range" handler exceeds 90 seconds, sustained for
# 10 minutes.
- alert: ThanosQueryRangeLatencyHigh
  # Left side: per-job p99 computed from the duration histogram buckets.
  # Right side of `and`: traffic guard so the alert only fires while requests
  # are actually being observed.
  expr: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query_range"}[5m]))) > 90 and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-query.*", handler="query_range"}[5m])) > 0)'
  for: 10m
  labels:
    severity: critical
  annotations:
    # The aggregation keeps only the `job` label, so `$labels.instance` would
    # always render empty here; identify the job instead.
    summary: Thanos Query Range Latency High (job {{ $labels.job }})
    description: "Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for range queries.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when the gap between the concurrent-query gate maximum and the average
# number of in-flight gated queries stays below 1 for 15 minutes.
- alert: ThanosQueryOverload
  # Headroom over the last 5 minutes: highest observed gate limit minus the
  # average in-flight count. No aggregation is applied, so the series' own
  # labels are carried into the alert.
  # NOTE(review): unlike the other rules in this group there is no `job`
  # matcher, so any target exposing these metrics is evaluated — confirm that
  # is intended.
  expr: '(max_over_time(thanos_query_concurrent_gate_queries_max[5m]) - avg_over_time(thanos_query_concurrent_gate_queries_in_flight[5m]) < 1)'
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: Thanos Query Overload (instance {{ $labels.instance }})
    description: "Thanos Query {{$labels.job}} has been overloaded for more than 15 minutes. This may be a symptom of excessive simultaneous complex requests, low performance of the Prometheus API, or failures within these components. Assess the health of the Thanos query instances, the connected Prometheus instances, look for potential senders of these requests and then contact support.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"