---
# Prometheus alerting rules for the Thanos Query component.
#
# Most expressions below aggregate with `sum by (job)`, so the only label left
# on the resulting alert is `job`; their annotations therefore reference
# `$labels.job`. ThanosQueryOverload is not aggregated and keeps the labels of
# the underlying series.
groups:
  - name: ThanosQuery
    rules:
      # More than 5% of HTTP requests to the "query" handler returned a 5xx
      # status code (5m rate), sustained for 5 minutes.
      - alert: ThanosQueryHttpRequestQueryErrorRateHigh
        expr: |-
          (
            sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*", handler="query"}[5m]))
          /
            sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query"}[5m]))
          ) * 100 > 5
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'Thanos Query Http Request Query Error Rate High (job {{ $labels.job }})'
          description: "Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of \"query\" requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Same check as above for the "query_range" handler.
      - alert: ThanosQueryHttpRequestQueryRangeErrorRateHigh
        expr: |-
          (
            sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*", handler="query_range"}[5m]))
          /
            sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query_range"}[5m]))
          ) * 100 > 5
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'Thanos Query Http Request Query Range Error Rate High (job {{ $labels.job }})'
          description: "Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of \"query_range\" requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # More than 5% of started gRPC server calls finished with one of the
      # listed error codes (5m rate), sustained for 5 minutes.
      - alert: ThanosQueryGrpcServerErrorRate
        expr: |-
          (
            sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-query.*"}[5m]))
          /
            sum by (job) (rate(grpc_server_started_total{job=~".*thanos-query.*"}[5m]))
          ) * 100 > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Thanos Query Grpc Server Error Rate (job {{ $labels.job }})'
          description: "Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # More than 5% of started gRPC client calls finished with a code other
      # than "OK" (5m rate), sustained for 5 minutes.
      - alert: ThanosQueryGrpcClientErrorRate
        expr: |-
          (
            sum by (job) (rate(grpc_client_handled_total{grpc_code!="OK", job=~".*thanos-query.*"}[5m]))
          /
            sum by (job) (rate(grpc_client_started_total{job=~".*thanos-query.*"}[5m]))
          ) * 100 > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Thanos Query Grpc Client Error Rate (job {{ $labels.job }})'
          description: "Thanos Query {{$labels.job}} is failing to send {{$value | humanize}}% of requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # More than 1% of store-API DNS lookups failed (5m rate), sustained for
      # 15 minutes.
      - alert: ThanosQueryHighDNSFailures
        expr: |-
          (
            sum by (job) (rate(thanos_query_store_apis_dns_failures_total{job=~".*thanos-query.*"}[5m]))
          /
            sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~".*thanos-query.*"}[5m]))
          ) * 100 > 1
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: 'Thanos Query High D N S Failures (job {{ $labels.job }})'
          description: "Thanos Query {{$labels.job}} have {{$value | humanize}}% of failing DNS queries for store endpoints.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 99th percentile latency of the "query" handler is above 40 seconds.
      # The `and ... > 0` clause restricts the alert to jobs that are actually
      # receiving requests; it uses the histogram's `_count` series, matching
      # ThanosQueryRangeLatencyHigh below.
      - alert: ThanosQueryInstantLatencyHigh
        expr: |-
          (
            histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m]))) > 40
          and
            sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-query.*", handler="query"}[5m])) > 0
          )
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: 'Thanos Query Instant Latency High (job {{ $labels.job }})'
          description: "Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for instant queries.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 99th percentile latency of the "query_range" handler is above 90
      # seconds, for jobs that are actually receiving requests.
      - alert: ThanosQueryRangeLatencyHigh
        expr: |-
          (
            histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query_range"}[5m]))) > 90
          and
            sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-query.*", handler="query_range"}[5m])) > 0
          )
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: 'Thanos Query Range Latency High (job {{ $labels.job }})'
          description: "Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for range queries.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # The 5m maximum of the concurrent-query gate limit minus the 5m average
      # of in-flight queries is below 1, sustained for 15 minutes. This
      # expression is not aggregated, so per-series labels such as `instance`
      # remain available to the annotations.
      - alert: ThanosQueryOverload
        expr: |-
          (
            max_over_time(thanos_query_concurrent_gate_queries_max[5m])
          -
            avg_over_time(thanos_query_concurrent_gate_queries_in_flight[5m])
          < 1
          )
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: 'Thanos Query Overload (instance {{ $labels.instance }})'
          description: "Thanos Query {{$labels.job}} has been overloaded for more than 15 minutes. This may be a symptom of excessive simultanous complex requests, low performance of the Prometheus API, or failures within these components. Assess the health of the Thanos query instances, the connnected Prometheus instances, look for potential senders of these requests and then contact support.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"