# Prometheus-format alerting rules for the Thanos Ruler component.
# Every expression selects series whose `job` label matches `.*thanos-rule.*`.
groups:
  - name: ThanosRuler
    rules:
      # Per-instance rate of alerts dropped from the queue over 5m is non-zero.
      - alert: ThanosRuleQueueIsDroppingAlerts
        expr: 'sum by (job, instance) (rate(thanos_alert_queue_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'Thanos Rule Queue Is Dropping Alerts (instance {{ $labels.instance }})'
          description: "Thanos Rule {{$labels.instance}} is failing to queue alerts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Per-instance rate of alerts dropped by the sender over 5m is non-zero.
      - alert: ThanosRuleSenderIsFailingAlerts
        expr: 'sum by (job, instance) (rate(thanos_alert_sender_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'Thanos Rule Sender Is Failing Alerts (instance {{ $labels.instance }})'
          description: "Thanos Rule {{$labels.instance}} is failing to send alerts to alertmanager.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # More than 5% of rule evaluations failed (failures / evaluations * 100).
      - alert: ThanosRuleHighRuleEvaluationFailures
        expr: '(sum by (job, instance) (rate(prometheus_rule_evaluation_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) * 100 > 5)'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'Thanos Rule High Rule Evaluation Failures (instance {{ $labels.instance }})'
          description: "Thanos Rule {{$labels.instance}} is failing to evaluate rules.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Any evaluations completing with warnings, sustained for 15m.
      - alert: ThanosRuleHighRuleEvaluationWarnings
        expr: 'sum by (job, instance) (rate(thanos_rule_evaluation_with_warnings_total{job=~".*thanos-rule.*"}[5m])) > 0'
        for: 15m
        labels:
          severity: info
        annotations:
          summary: 'Thanos Rule High Rule Evaluation Warnings (instance {{ $labels.instance }})'
          description: "Thanos Rule {{$labels.instance}} has high number of evaluation warnings.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # A rule group's last evaluation duration exceeds its configured interval.
      - alert: ThanosRuleRuleEvaluationLatencyHigh
        expr: '(sum by (job, instance, rule_group) (prometheus_rule_group_last_duration_seconds{job=~".*thanos-rule.*"}) > sum by (job, instance, rule_group) (prometheus_rule_group_interval_seconds{job=~".*thanos-rule.*"}))'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Thanos Rule Rule Evaluation Latency High (instance {{ $labels.instance }})'
          description: "Thanos Rule {{$labels.instance}} has higher evaluation latency than interval for {{$labels.rule_group}}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # More than 5% of started gRPC requests were handled with one of the
      # listed error codes.
      - alert: ThanosRuleGrpcErrorRate
        expr: '(sum by (job, instance) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-rule.*"}[5m]))/ sum by (job, instance) (rate(grpc_server_started_total{job=~".*thanos-rule.*"}[5m])) * 100 > 5)'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Thanos Rule Grpc Error Rate (instance {{ $labels.instance }})'
          description: "Thanos Rule {{$labels.job}} is failing to handle {{$value | humanize}}% of requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # The last-reload-successful gauge is not 1 for the instance.
      - alert: ThanosRuleConfigReloadFailure
        expr: 'avg by (job, instance) (thanos_rule_config_last_reload_successful{job=~".*thanos-rule.*"}) != 1'
        for: 5m
        labels:
          severity: info
        annotations:
          summary: 'Thanos Rule Config Reload Failure (instance {{ $labels.instance }})'
          description: "Thanos Rule {{$labels.job}} has not been able to reload its configuration.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # More than 1% of DNS lookups for query API endpoints failed, for 15m.
      - alert: ThanosRuleQueryHighDNSFailures
        expr: '(sum by (job, instance) (rate(thanos_rule_query_apis_dns_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) * 100 > 1)'
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: 'Thanos Rule Query High DNS Failures (instance {{ $labels.instance }})'
          description: "Thanos Rule {{$labels.job}} has {{$value | humanize}}% of failing DNS queries for query endpoints.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # More than 1% of DNS lookups for Alertmanager endpoints failed, for 15m.
      - alert: ThanosRuleAlertmanagerHighDNSFailures
        expr: '(sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) * 100 > 1)'
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: 'Thanos Rule Alertmanager High DNS Failures (instance {{ $labels.instance }})'
          description: "Thanos Rule {{$labels.instance}} has {{$value | humanize}}% of failing DNS queries for Alertmanager endpoints.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Time since a rule group's last evaluation exceeds 10x its interval.
      # Aggregated per `rule_group` (the same label the latency alert above
      # uses) so one stalled group is not masked by other groups on the same
      # instance that are still evaluating.
      - alert: ThanosRuleNoEvaluationFor10Intervals
        expr: 'time() - max by (job, instance, rule_group) (prometheus_rule_group_last_evaluation_timestamp_seconds{job=~".*thanos-rule.*"})>10 * max by (job, instance, rule_group) (prometheus_rule_group_interval_seconds{job=~".*thanos-rule.*"})'
        for: 5m
        labels:
          severity: info
        annotations:
          summary: 'Thanos Rule No Evaluation For 10 Intervals (instance {{ $labels.instance }})'
          description: "Thanos Rule {{$labels.job}} has rule groups that did not evaluate for at least 10x of their expected interval.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Rules are loaded (loaded_rules > 0) but the 5m evaluation rate is zero.
      - alert: ThanosNoRuleEvaluations
        expr: 'sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) <= 0 and sum by (job, instance) (thanos_rule_loaded_rules{job=~".*thanos-rule.*"}) > 0'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'Thanos No Rule Evaluations (instance {{ $labels.instance }})'
          description: "Thanos Rule {{$labels.instance}} did not perform any rule evaluations in the past 10 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"