# Mirror of https://github.com/samber/awesome-prometheus-alerts.git
# Synced 2026-06-21 08:57:19 +08:00
# 104 lines, 6 KiB, YAML
# Prometheus alerting rules for the Thanos Rule (ruler) component.
# Every selector matches series whose `job` label matches `.*thanos-rule.*`.
groups:

- name: ThanosRuler

  rules:

    # Alerts are being dropped from the ruler's alert queue
    # (non-zero 5m drop rate, sustained for 5 minutes).
    - alert: ThanosRuleQueueIsDroppingAlerts
      expr: 'sum by (job, instance) (rate(thanos_alert_queue_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Thanos Rule Queue Is Dropping Alerts (instance {{ $labels.instance }})
        description: "Thanos Rule {{$labels.instance}} is failing to queue alerts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    # Alerts are being dropped by the alert sender
    # (non-zero 5m drop rate, sustained for 5 minutes).
    - alert: ThanosRuleSenderIsFailingAlerts
      expr: 'sum by (job, instance) (rate(thanos_alert_sender_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Thanos Rule Sender Is Failing Alerts (instance {{ $labels.instance }})
        description: "Thanos Rule {{$labels.instance}} is failing to send alerts to alertmanager.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    # More than 5% of rule evaluations are failing (failures / evaluations, 5m rates).
    - alert: ThanosRuleHighRuleEvaluationFailures
      expr: '(sum by (job, instance) (rate(prometheus_rule_evaluation_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) * 100 > 5)'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Thanos Rule High Rule Evaluation Failures (instance {{ $labels.instance }})
        description: "Thanos Rule {{$labels.instance}} is failing to evaluate rules.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    # Rule evaluations are completing with warnings
    # (non-zero 5m rate, sustained for 15 minutes).
    - alert: ThanosRuleHighRuleEvaluationWarnings
      expr: 'sum by (job, instance) (rate(thanos_rule_evaluation_with_warnings_total{job=~".*thanos-rule.*"}[5m])) > 0'
      for: 15m
      labels:
        severity: info
      annotations:
        summary: Thanos Rule High Rule Evaluation Warnings (instance {{ $labels.instance }})
        description: "Thanos Rule {{$labels.instance}} has high number of evaluation warnings.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    # A rule group's last evaluation took longer than its configured interval.
    - alert: ThanosRuleRuleEvaluationLatencyHigh
      expr: '(sum by (job, instance, rule_group) (prometheus_rule_group_last_duration_seconds{job=~".*thanos-rule.*"}) > sum by (job, instance, rule_group) (prometheus_rule_group_interval_seconds{job=~".*thanos-rule.*"}))'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Thanos Rule Rule Evaluation Latency High (instance {{ $labels.instance }})
        description: "Thanos Rule {{$labels.instance}} has higher evaluation latency than interval for {{$labels.rule_group}}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    # More than 5% of started gRPC requests finish with one of the listed error codes.
    - alert: ThanosRuleGrpcErrorRate
      expr: '(sum by (job, instance) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-rule.*"}[5m]))/ sum by (job, instance) (rate(grpc_server_started_total{job=~".*thanos-rule.*"}[5m])) * 100 > 5)'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Thanos Rule Grpc Error Rate (instance {{ $labels.instance }})
        description: "Thanos Rule {{$labels.job}} is failing to handle {{$value | humanize}}% of requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    # The "last reload successful" gauge is not 1 for 5 minutes.
    - alert: ThanosRuleConfigReloadFailure
      expr: 'avg by (job, instance) (thanos_rule_config_last_reload_successful{job=~".*thanos-rule.*"}) != 1'
      for: 5m
      labels:
        severity: info
      annotations:
        summary: Thanos Rule Config Reload Failure (instance {{ $labels.instance }})
        description: "Thanos Rule {{$labels.job}} has not been able to reload its configuration.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    # More than 1% of DNS lookups for query API endpoints are failing.
    - alert: ThanosRuleQueryHighDNSFailures
      expr: '(sum by (job, instance) (rate(thanos_rule_query_apis_dns_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) * 100 > 1)'
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: Thanos Rule Query High DNS Failures (instance {{ $labels.instance }})
        description: "Thanos Rule {{$labels.job}} has {{$value | humanize}}% of failing DNS queries for query endpoints.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    # More than 1% of DNS lookups for Alertmanager endpoints are failing.
    - alert: ThanosRuleAlertmanagerHighDNSFailures
      expr: '(sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) * 100 > 1)'
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: Thanos Rule Alertmanager High DNS Failures (instance {{ $labels.instance }})
        description: "Thanos Rule {{$labels.instance}} has {{$value | humanize}}% of failing DNS queries for Alertmanager endpoints.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    # A rule group has not been evaluated for more than 10x its interval.
    # Grouped by `rule_group` (same label as ThanosRuleRuleEvaluationLatencyHigh)
    # so each rule group is compared against its own interval rather than
    # being collapsed into a single per-instance maximum.
    - alert: ThanosRuleNoEvaluationFor10Intervals
      expr: 'time() - max by (job, instance, rule_group) (prometheus_rule_group_last_evaluation_timestamp_seconds{job=~".*thanos-rule.*"}) > 10 * max by (job, instance, rule_group) (prometheus_rule_group_interval_seconds{job=~".*thanos-rule.*"})'
      for: 5m
      labels:
        severity: info
      annotations:
        summary: Thanos Rule No Evaluation For 10 Intervals (instance {{ $labels.instance }})
        description: "Thanos Rule {{$labels.job}} has rule groups that did not evaluate for at least 10x of their expected interval.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    # Rules are loaded but the 5m evaluation rate is zero
    # (5m rate window + 5m `for` = the 10 minutes quoted in the description).
    - alert: ThanosNoRuleEvaluations
      expr: 'sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) <= 0 and sum by (job, instance) (thanos_rule_loaded_rules{job=~".*thanos-rule.*"}) > 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Thanos No Rule Evaluations (instance {{ $labels.instance }})
        description: "Thanos Rule {{$labels.instance}} did not perform any rule evaluations in the past 10 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"