# awesome-prometheus-alerts/dist/rules/thanos/thanos-ruler.yml

# Rule groups, laid out as groups -> name -> rules.
groups:

# Single group of alerting rules; every expression in it selects series whose
# `job` label matches the regex ".*thanos-rule.*".
- name: ThanosRuler

  rules:

    # Fires (critical) when the 5m rate of thanos_alert_queue_alerts_dropped_total,
    # summed per (job, instance), stays above zero for 5 minutes.
    - alert: ThanosRuleQueueIsDroppingAlerts
      expr: >-
        sum by (job, instance)
        (rate(thanos_alert_queue_alerts_dropped_total{job=~".*thanos-rule.*"}[5m]))
        > 0
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: 'Thanos Rule Queue Is Dropping Alerts (instance {{ $labels.instance }})'
        # Literal block: the two-space indent on the VALUE/LABELS lines is part
        # of the rendered text.
        description: |-
          Thanos Rule {{$labels.instance}} is failing to queue alerts.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires (critical) when the 5m rate of thanos_alert_sender_alerts_dropped_total,
    # summed per (job, instance), stays above zero for 5 minutes.
    - alert: ThanosRuleSenderIsFailingAlerts
      expr: >-
        sum by (job, instance)
        (rate(thanos_alert_sender_alerts_dropped_total{job=~".*thanos-rule.*"}[5m]))
        > 0
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: 'Thanos Rule Sender Is Failing Alerts (instance {{ $labels.instance }})'
        # Literal block: the two-space indent on the VALUE/LABELS lines is part
        # of the rendered text.
        description: |-
          Thanos Rule {{$labels.instance}} is failing to send alerts to alertmanager.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires (critical) when, per (job, instance), failed rule evaluations exceed
    # 5% of all rule evaluations (both as 5m rates) for 5 minutes.
    - alert: ThanosRuleHighRuleEvaluationFailures
      expr: >-
        (sum by (job, instance)
        (rate(prometheus_rule_evaluation_failures_total{job=~".*thanos-rule.*"}[5m]))
        / sum by (job, instance)
        (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m]))
        * 100 > 5)
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: 'Thanos Rule High Rule Evaluation Failures (instance {{ $labels.instance }})'
        # Literal block: the two-space indent on the VALUE/LABELS lines is part
        # of the rendered text.
        description: |-
          Thanos Rule {{$labels.instance}} is failing to evaluate rules.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires (info) when the 5m rate of thanos_rule_evaluation_with_warnings_total,
    # summed per (job, instance), stays above zero for 15 minutes.
    - alert: ThanosRuleHighRuleEvaluationWarnings
      expr: >-
        sum by (job, instance)
        (rate(thanos_rule_evaluation_with_warnings_total{job=~".*thanos-rule.*"}[5m]))
        > 0
      for: 15m
      labels:
        severity: info
      annotations:
        summary: 'Thanos Rule High Rule Evaluation Warnings (instance {{ $labels.instance }})'
        # Literal block: the two-space indent on the VALUE/LABELS lines is part
        # of the rendered text.
        description: |-
          Thanos Rule {{$labels.instance}} has high number of evaluation warnings.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires (warning) when, per (job, instance, rule_group), the last evaluation
    # duration is greater than the group's configured interval for 5 minutes.
    - alert: ThanosRuleRuleEvaluationLatencyHigh
      expr: >-
        (sum by (job, instance, rule_group)
        (prometheus_rule_group_last_duration_seconds{job=~".*thanos-rule.*"})
        > sum by (job, instance, rule_group)
        (prometheus_rule_group_interval_seconds{job=~".*thanos-rule.*"}))
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: 'Thanos Rule Rule Evaluation Latency High (instance {{ $labels.instance }})'
        # Literal block: the two-space indent on the VALUE/LABELS lines is part
        # of the rendered text.
        description: |-
          Thanos Rule {{$labels.instance}} has higher evaluation latency than interval for {{$labels.rule_group}}.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires (warning) when, per (job, instance), gRPC requests handled with one
    # of the listed error codes exceed 5% of started requests (both as 5m rates)
    # for 5 minutes.
    - alert: ThanosRuleGrpcErrorRate
      # Folded scalar: line breaks fold to single spaces; the spacing around
      # "/" is kept exactly as in the original expression.
      expr: >-
        (sum by (job, instance)
        (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded",
        job=~".*thanos-rule.*"}[5m]))/  sum by (job, instance)
        (rate(grpc_server_started_total{job=~".*thanos-rule.*"}[5m]))
        * 100 > 5)
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: 'Thanos Rule Grpc Error Rate (instance {{ $labels.instance }})'
        # Literal block: the two-space indent on the VALUE/LABELS lines is part
        # of the rendered text.
        description: |-
          Thanos Rule {{$labels.job}} is failing to handle {{$value | humanize}}% of requests.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires (info) when the per-(job, instance) average of
    # thanos_rule_config_last_reload_successful differs from 1 for 5 minutes.
    - alert: ThanosRuleConfigReloadFailure
      expr: >-
        avg by (job, instance)
        (thanos_rule_config_last_reload_successful{job=~".*thanos-rule.*"})
        != 1
      for: 5m
      labels:
        severity: info
      annotations:
        summary: 'Thanos Rule Config Reload Failure (instance {{ $labels.instance }})'
        # Literal block: the two-space indent on the VALUE/LABELS lines is part
        # of the rendered text.
        description: |-
          Thanos Rule {{$labels.job}} has not been able to reload its configuration.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires (warning) when, per (job, instance), failed DNS lookups for query
    # API endpoints exceed 1% of all such lookups (both as 5m rates) for
    # 15 minutes.
    - alert: ThanosRuleQueryHighDNSFailures
      expr: >-
        (sum by (job, instance)
        (rate(thanos_rule_query_apis_dns_failures_total{job=~".*thanos-rule.*"}[5m]))
        / sum by (job, instance)
        (rate(thanos_rule_query_apis_dns_lookups_total{job=~".*thanos-rule.*"}[5m]))
        * 100 > 1)
      for: 15m
      labels:
        severity: warning
      annotations:
        # "DNS" is written as one acronym (the title previously read "D N S").
        summary: 'Thanos Rule Query High DNS Failures (instance {{ $labels.instance }})'
        # Literal block: the two-space indent on the VALUE/LABELS lines is part
        # of the rendered text.
        description: |-
          Thanos Rule {{$labels.job}} has {{$value | humanize}}% of failing DNS queries for query endpoints.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires (warning) when, per (job, instance), failed DNS lookups for
    # Alertmanager endpoints exceed 1% of all such lookups (both as 5m rates)
    # for 15 minutes.
    - alert: ThanosRuleAlertmanagerHighDNSFailures
      expr: >-
        (sum by (job, instance)
        (rate(thanos_rule_alertmanagers_dns_failures_total{job=~".*thanos-rule.*"}[5m]))
        / sum by (job, instance)
        (rate(thanos_rule_alertmanagers_dns_lookups_total{job=~".*thanos-rule.*"}[5m]))
        * 100 > 1)
      for: 15m
      labels:
        severity: warning
      annotations:
        # "DNS" is written as one acronym (the title previously read "D N S").
        summary: 'Thanos Rule Alertmanager High DNS Failures (instance {{ $labels.instance }})'
        # Literal block: the two-space indent on the VALUE/LABELS lines is part
        # of the rendered text.
        description: |-
          Thanos Rule {{$labels.instance}} has {{$value | humanize}}% of failing DNS queries for Alertmanager endpoints.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires (info) when, for 5 minutes, the time since a rule group's last
    # evaluation timestamp exceeds 10x that group's configured interval.
    #
    # Aggregation is per (job, instance, rule_group), the same grouping label
    # the ThanosRuleRuleEvaluationLatencyHigh rule uses for
    # prometheus_rule_group_interval_seconds. Grouping by a label that is not
    # on the series would collapse all rule groups of an instance into one
    # max(), letting any healthy group mask a stale one.
    - alert: ThanosRuleNoEvaluationFor10Intervals
      expr: >-
        time() - max by (job, instance, rule_group)
        (prometheus_rule_group_last_evaluation_timestamp_seconds{job=~".*thanos-rule.*"})
        > 10 * max by (job, instance, rule_group)
        (prometheus_rule_group_interval_seconds{job=~".*thanos-rule.*"})
      for: 5m
      labels:
        severity: info
      annotations:
        # Title spacing fixed: "For 10 Intervals" (previously "For10 Intervals").
        summary: 'Thanos Rule No Evaluation For 10 Intervals (instance {{ $labels.instance }})'
        # Literal block: the two-space indent on the VALUE/LABELS lines is part
        # of the rendered text.
        description: |-
          Thanos Rule {{$labels.job}} has rule groups that did not evaluate for at least 10x of their expected interval.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires (critical) when, per (job, instance), the 5m rate of rule
    # evaluations is <= 0 while thanos_rule_loaded_rules is above zero, for
    # 5 minutes.
    - alert: ThanosNoRuleEvaluations
      # Folded scalar: line breaks fold to single spaces; the spacing before
      # "and" is kept exactly as in the original expression.
      expr: >-
        sum by (job, instance)
        (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m]))
        <= 0  and sum by (job, instance)
        (thanos_rule_loaded_rules{job=~".*thanos-rule.*"})
        > 0
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: 'Thanos No Rule Evaluations (instance {{ $labels.instance }})'
        # Literal block: the two-space indent on the VALUE/LABELS lines is part
        # of the rendered text.
        description: |-
          Thanos Rule {{$labels.instance}} did not perform any rule evaluations in the past 10 minutes.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}