# awesome-prometheus-alerts/dist/rules/thanos/thanos-ruler.yml
# 2023-03-15 17:27:02 +00:00
#
# 104 lines
# 6 KiB
# YAML

# Alert rule group "ThanosRuler"; every rule that follows selects series with job=~".*thanos-rule.*".
# NOTE(review): nesting indentation looks stripped in this copy; `rules:` presumably belongs
# under the `- name: ThanosRuler` entry — confirm against the upstream file.
groups:
- name: ThanosRuler
rules:
- alert: ThanosRuleQueueIsDroppingAlerts
  # Per (job, instance): 5m rate of thanos_alert_queue_alerts_dropped_total above zero.
  expr: >-
    sum by (job, instance) (rate(thanos_alert_queue_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: 'Thanos Rule Queue Is Dropping Alerts (instance {{ $labels.instance }})'
    # Literal block: the single leading space on the VALUE/LABELS lines is part of the text.
    description: |-
      Thanos Rule {{$labels.instance}} is failing to queue alerts.
       VALUE = {{ $value }}
       LABELS = {{ $labels }}
- alert: ThanosRuleSenderIsFailingAlerts
  # Per (job, instance): 5m rate of thanos_alert_sender_alerts_dropped_total above zero.
  expr: >-
    sum by (job, instance) (rate(thanos_alert_sender_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: 'Thanos Rule Sender Is Failing Alerts (instance {{ $labels.instance }})'
    # Literal block: the single leading space on the VALUE/LABELS lines is part of the text.
    description: |-
      Thanos Rule {{$labels.instance}} is failing to send alerts to alertmanager.
       VALUE = {{ $value }}
       LABELS = {{ $labels }}
- alert: ThanosRuleHighRuleEvaluationFailures
  # Failed evaluations as a percentage of all evaluations (5m rates, per job/instance) above 5.
  expr: >-
    (sum by (job, instance) (rate(prometheus_rule_evaluation_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) * 100 > 5)
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: 'Thanos Rule High Rule Evaluation Failures (instance {{ $labels.instance }})'
    # Literal block: the single leading space on the VALUE/LABELS lines is part of the text.
    description: |-
      Thanos Rule {{$labels.instance}} is failing to evaluate rules.
       VALUE = {{ $value }}
       LABELS = {{ $labels }}
- alert: ThanosRuleHighRuleEvaluationWarnings
  # Per (job, instance): 5m rate of thanos_rule_evaluation_with_warnings_total above zero.
  expr: >-
    sum by (job, instance) (rate(thanos_rule_evaluation_with_warnings_total{job=~".*thanos-rule.*"}[5m])) > 0
  for: 15m
  labels:
    severity: info
  annotations:
    summary: 'Thanos Rule High Rule Evaluation Warnings (instance {{ $labels.instance }})'
    # Literal block: the single leading space on the VALUE/LABELS lines is part of the text.
    description: |-
      Thanos Rule {{$labels.instance}} has high number of evaluation warnings.
       VALUE = {{ $value }}
       LABELS = {{ $labels }}
- alert: ThanosRuleRuleEvaluationLatencyHigh
  # Per (job, instance, rule_group): last-duration gauge compared against the interval gauge.
  expr: >-
    (sum by (job, instance, rule_group) (prometheus_rule_group_last_duration_seconds{job=~".*thanos-rule.*"}) > sum by (job, instance, rule_group) (prometheus_rule_group_interval_seconds{job=~".*thanos-rule.*"}))
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'Thanos Rule Rule Evaluation Latency High (instance {{ $labels.instance }})'
    # Literal block: the single leading space on the VALUE/LABELS lines is part of the text.
    description: |-
      Thanos Rule {{$labels.instance}} has higher evaluation latency than interval for {{$labels.rule_group}}.
       VALUE = {{ $value }}
       LABELS = {{ $labels }}
- alert: ThanosRuleGrpcErrorRate
  # Requests handled with one of the listed gRPC error codes, as a percentage of requests
  # started (5m rates, per job/instance), above 5.
  expr: >-
    (sum by (job, instance) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-rule.*"}[5m]))/ sum by (job, instance) (rate(grpc_server_started_total{job=~".*thanos-rule.*"}[5m])) * 100 > 5)
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'Thanos Rule Grpc Error Rate (instance {{ $labels.instance }})'
    # Literal block: the single leading space on the VALUE/LABELS lines is part of the text.
    description: |-
      Thanos Rule {{$labels.job}} is failing to handle {{$value | humanize}}% of requests.
       VALUE = {{ $value }}
       LABELS = {{ $labels }}
- alert: ThanosRuleConfigReloadFailure
  # Per (job, instance): average of thanos_rule_config_last_reload_successful differs from 1.
  expr: >-
    avg by (job, instance) (thanos_rule_config_last_reload_successful{job=~".*thanos-rule.*"}) != 1
  for: 5m
  labels:
    severity: info
  annotations:
    summary: 'Thanos Rule Config Reload Failure (instance {{ $labels.instance }})'
    # Literal block: the single leading space on the VALUE/LABELS lines is part of the text.
    description: |-
      Thanos Rule {{$labels.job}} has not been able to reload its configuration.
       VALUE = {{ $value }}
       LABELS = {{ $labels }}
- alert: ThanosRuleQueryHighDNSFailures
  # DNS failures as a percentage of DNS lookups for the query APIs
  # (5m rates, per job/instance), above 1.
  expr: >-
    (sum by (job, instance) (rate(thanos_rule_query_apis_dns_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) * 100 > 1)
  for: 15m
  labels:
    severity: warning
  annotations:
    # "DNS" written as one acronym (the original summary had it split into "D N S").
    summary: 'Thanos Rule Query High DNS Failures (instance {{ $labels.instance }})'
    # Literal block: the single leading space on the VALUE/LABELS lines is part of the text.
    description: |-
      Thanos Rule {{$labels.job}} has {{$value | humanize}}% of failing DNS queries for query endpoints.
       VALUE = {{ $value }}
       LABELS = {{ $labels }}
- alert: ThanosRuleAlertmanagerHighDNSFailures
  # DNS failures as a percentage of DNS lookups for the Alertmanager endpoints
  # (5m rates, per job/instance), above 1.
  expr: >-
    (sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) * 100 > 1)
  for: 15m
  labels:
    severity: warning
  annotations:
    # "DNS" written as one acronym (the original summary had it split into "D N S").
    summary: 'Thanos Rule Alertmanager High DNS Failures (instance {{ $labels.instance }})'
    # Literal block: the single leading space on the VALUE/LABELS lines is part of the text.
    description: |-
      Thanos Rule {{$labels.instance}} has {{$value | humanize}}% of failing DNS queries for Alertmanager endpoints.
       VALUE = {{ $value }}
       LABELS = {{ $labels }}
- alert: ThanosRuleNoEvaluationFor10Intervals
  # Seconds since a rule group's last evaluation exceed 10x that group's interval.
  # Aggregates on `rule_group`, the label ThanosRuleRuleEvaluationLatencyHigh uses on the same
  # prometheus_rule_group_interval_seconds metric. The previous `group` label is not the one
  # used there; grouping by a label the series do not carry merges all groups of an instance,
  # so a single stalled group would be masked by any healthy one.
  # NOTE(review): confirm the rule-group metrics in your deployment expose `rule_group`.
  expr: >-
    time() - max by (job, instance, rule_group) (prometheus_rule_group_last_evaluation_timestamp_seconds{job=~".*thanos-rule.*"}) > 10 * max by (job, instance, rule_group) (prometheus_rule_group_interval_seconds{job=~".*thanos-rule.*"})
  for: 5m
  labels:
    severity: info
  annotations:
    # Space restored between "For" and "10".
    summary: 'Thanos Rule No Evaluation For 10 Intervals (instance {{ $labels.instance }})'
    # Literal block: the single leading space on the VALUE/LABELS lines is part of the text.
    description: |-
      Thanos Rule {{$labels.job}} has rule groups that did not evaluate for at least 10x of their expected interval.
       VALUE = {{ $value }}
       LABELS = {{ $labels }}
- alert: ThanosNoRuleEvaluations
  # Per (job, instance): 5m evaluation rate is <= 0 while thanos_rule_loaded_rules sums above 0.
  expr: >-
    sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) <= 0 and sum by (job, instance) (thanos_rule_loaded_rules{job=~".*thanos-rule.*"}) > 0
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: 'Thanos No Rule Evaluations (instance {{ $labels.instance }})'
    # Literal block: the single leading space on the VALUE/LABELS lines is part of the text.
    description: |-
      Thanos Rule {{$labels.instance}} did not perform any rule evaluations in the past 10 minutes.
       VALUE = {{ $value }}
       LABELS = {{ $labels }}