awesome-prometheus-alerts/dist/rules/thanos/thanos-bucket-replicate.yml
2026-04-06 18:38:45 +00:00

24 lines
1.3 KiB
YAML

---
# Prometheus alerting rules for the Thanos bucket replicate component.
# Both rules aggregate by `job`, so `job` is the only series label available
# to the annotation templates.
groups:
  - name: ThanosBucketReplicate
    rules:
      # Fires when, per job, at least 10% of replication runs measured over a
      # 5-minute rate window ended with result="error", and the condition has
      # held for 5 minutes. The trailing `and ... > 0` clause restricts the
      # alert to jobs that actually recorded runs in the window.
      - alert: ThanosBucketReplicateErrorRate
        # Folded scalar: the lines below are joined with single spaces into
        # one PromQL expression. Keep every line at the same indentation.
        expr: >-
          (sum by (job) (rate(thanos_replicate_replication_runs_total{result="error"}[5m]))
          / on (job) group_left
          sum by (job) (rate(thanos_replicate_replication_runs_total[5m]))) * 100 >= 10
          and sum by (job) (rate(thanos_replicate_replication_runs_total[5m])) > 0
        for: 5m
        labels:
          severity: critical
        annotations:
          # Uses `job` rather than `instance`: the expression is aggregated
          # with `sum by (job)`, so no `instance` label exists on the alert.
          summary: 'Thanos Bucket Replicate Error Rate (job {{ $labels.job }})'
          # $value is the left-hand side of the `and`, i.e. the error percentage.
          description: "Thanos Replicate is failing to run, {{$value | humanize}}% of attempts failed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # Fires when, per job, the 99th percentile of replication run duration
      # (estimated from the histogram buckets over a 5-minute rate window)
      # exceeds 20 seconds for 5 minutes, and the job recorded runs in the
      # window.
      - alert: ThanosBucketReplicateRunLatency
        # Folded scalar: the lines below are joined with single spaces into
        # one PromQL expression. Keep every line at the same indentation.
        expr: >-
          (histogram_quantile(0.99,
          sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket[5m]))) > 20
          and sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_count[5m])) > 0)
        for: 5m
        labels:
          severity: critical
        annotations:
          # Uses `job` rather than `instance`: histogram_quantile drops `le`
          # and the inner aggregation keeps only `job`.
          summary: 'Thanos Bucket Replicate Run Latency (job {{ $labels.job }})'
          # $value is the left-hand side of the `and`, i.e. the p99 in seconds.
          description: "Thanos Replicate {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for the replicate operations.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"