# awesome-prometheus-alerts/dist/rules/thanos/thanos-query.yml

groups:

- name: ThanosQuery

  rules:

    # Fires when, per job, more than 5% of HTTP requests to the "query"
    # handler returned a 5xx status code over the last 5 minutes, and this
    # has held for 5 minutes.
    - alert: ThanosQueryHttpRequestQueryErrorRateHigh
      expr: |-
        (
          sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*", handler="query"}[5m]))
        /
          sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query"}[5m]))
        ) * 100 > 5
      for: 5m
      labels:
        severity: critical
      annotations:
        # The expression aggregates with `sum by (job)`, so `job` is the only
        # series label that survives; `instance` would render as an empty string.
        summary: Thanos Query Http Request Query Error Rate High (job {{ $labels.job }})
        description: "Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of \"query\" requests.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when, per job, more than 5% of HTTP requests to the "query_range"
    # handler returned a 5xx status code over the last 5 minutes, and this
    # has held for 5 minutes.
    - alert: ThanosQueryHttpRequestQueryRangeErrorRateHigh
      expr: |-
        (
          sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*", handler="query_range"}[5m]))
        /
          sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query_range"}[5m]))
        ) * 100 > 5
      for: 5m
      labels:
        severity: critical
      annotations:
        # The expression aggregates with `sum by (job)`, so `job` is the only
        # series label that survives; `instance` would render as an empty string.
        summary: Thanos Query Http Request Query Range Error Rate High (job {{ $labels.job }})
        description: "Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of \"query_range\" requests.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when, per job, the rate of gRPC server calls handled with one of
    # the listed error codes exceeds 5% of the rate of started calls over the
    # last 5 minutes, and this has held for 5 minutes.
    - alert: ThanosQueryGrpcServerErrorRate
      # Parenthesised as (errors / started) * 100 > 5, matching the other
      # ratio rules in this group; `/` and `*` share precedence and are
      # left-associative, so the result is the same as the ungrouped form.
      expr: |-
        (
          sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-query.*"}[5m]))
        /
          sum by (job) (rate(grpc_server_started_total{job=~".*thanos-query.*"}[5m]))
        ) * 100 > 5
      for: 5m
      labels:
        severity: warning
      annotations:
        # The expression aggregates with `sum by (job)`, so `job` is the only
        # series label that survives; `instance` would render as an empty string.
        summary: Thanos Query Grpc Server Error Rate (job {{ $labels.job }})
        description: "Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of requests.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when, per job, the rate of gRPC client calls that finished with a
    # code other than "OK" exceeds 5% of the rate of started calls over the
    # last 5 minutes, and this has held for 5 minutes.
    - alert: ThanosQueryGrpcClientErrorRate
      expr: |-
        (
          sum by (job) (rate(grpc_client_handled_total{grpc_code!="OK", job=~".*thanos-query.*"}[5m]))
        /
          sum by (job) (rate(grpc_client_started_total{job=~".*thanos-query.*"}[5m]))
        ) * 100 > 5
      for: 5m
      labels:
        severity: warning
      annotations:
        # The expression aggregates with `sum by (job)`, so `job` is the only
        # series label that survives; `instance` would render as an empty string.
        summary: Thanos Query Grpc Client Error Rate (job {{ $labels.job }})
        description: "Thanos Query {{$labels.job}} is failing to send {{$value | humanize}}% of requests.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when, per job, more than 1% of store-API DNS lookups failed over
    # the last 5 minutes, and this has held for 15 minutes.
    - alert: ThanosQueryHighDNSFailures
      expr: |-
        (
          sum by (job) (rate(thanos_query_store_apis_dns_failures_total{job=~".*thanos-query.*"}[5m]))
        /
          sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~".*thanos-query.*"}[5m]))
        ) * 100 > 1
      for: 15m
      labels:
        severity: warning
      annotations:
        # The expression aggregates with `sum by (job)`, so `job` is the only
        # series label that survives; `instance` would render as an empty string.
        summary: Thanos Query High DNS Failures (job {{ $labels.job }})
        description: "Thanos Query {{$labels.job}} has {{$value | humanize}}% of failing DNS queries for store endpoints.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when, per job, the 99th-percentile duration of "query" handler
    # requests (estimated from the histogram buckets over 5 minutes) is above
    # 40 seconds while requests are actually being served, for 10 minutes.
    - alert: ThanosQueryInstantLatencyHigh
      # The right-hand side of `and` is only a "there is traffic" guard. It
      # uses the `_count` series, like ThanosQueryRangeLatencyHigh, instead of
      # summing every `_bucket` series: each observation increments both, so
      # the `> 0` condition is unchanged while fewer series are read.
      expr: |-
        (
          histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m]))) > 40
        and
          sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-query.*", handler="query"}[5m])) > 0
        )
      for: 10m
      labels:
        severity: critical
      annotations:
        # The expression aggregates by `job` (and `le`, consumed by
        # histogram_quantile), so `instance` would render as an empty string.
        summary: Thanos Query Instant Latency High (job {{ $labels.job }})
        description: "Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for instant queries.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when, per job, the 99th-percentile duration of "query_range"
    # handler requests (estimated from the histogram buckets over 5 minutes)
    # is above 90 seconds while requests are actually being served, for
    # 10 minutes.
    - alert: ThanosQueryRangeLatencyHigh
      # The right-hand side of `and` is only a "there is traffic" guard; the
      # alert value comes from the histogram_quantile on the left.
      expr: |-
        (
          histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query_range"}[5m]))) > 90
        and
          sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-query.*", handler="query_range"}[5m])) > 0
        )
      for: 10m
      labels:
        severity: critical
      annotations:
        # The expression aggregates by `job` (and `le`, consumed by
        # histogram_quantile), so `instance` would render as an empty string.
        summary: Thanos Query Range Latency High (job {{ $labels.job }})
        description: "Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for range queries.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Compares the 5-minute maximum of thanos_query_concurrent_gate_queries_max
    # with the 5-minute average of thanos_query_concurrent_gate_queries_in_flight
    # and fires when the difference stays below 1 for 15 minutes.
    - alert: ThanosQueryOverload
      # No aggregation here, so the original series labels (including
      # `instance` and `job`) remain available to the annotations below.
      expr: |-
        (
          max_over_time(thanos_query_concurrent_gate_queries_max[5m])
        -
          avg_over_time(thanos_query_concurrent_gate_queries_in_flight[5m])
        < 1
        )
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: Thanos Query Overload (instance {{ $labels.instance }})
        description: "Thanos Query {{$labels.job}} has been overloaded for more than 15 minutes. This may be a symptom of excessive simultaneous complex requests, low performance of the Prometheus API, or failures within these components. Assess the health of the Thanos query instances, the connected Prometheus instances, look for potential senders of these requests and then contact support.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"