# Source: awesome-prometheus-alerts/dist/rules/thanos/thanos-receiver.yml
# Timestamp: 2023-03-15 17:27:02 +00:00
# Listing info: 68 lines, 4.2 KiB, YAML

# Prometheus alerting rules for Thanos Receive.
# Every selector targets jobs whose name matches ".*thanos-receive.*".
groups:
  - name: ThanosReceiver
    rules:
      # Share of "receive"-handler HTTP requests answered with a 5xx code,
      # computed per job over a 5m rate window; fires above 5%.
      - alert: ThanosReceiveHttpRequestErrorRateHigh
        expr: '(sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-receive.*", handler="receive"}[5m]))/ sum by (job) (rate(http_requests_total{job=~".*thanos-receive.*", handler="receive"}[5m]))) * 100 > 5'
        for: 5m
        labels:
          severity: critical
        annotations:
          # The expression aggregates by (job), so only the job label is
          # available to the templates.
          summary: 'Thanos Receive Http Request Error Rate High (job {{ $labels.job }})'
          description: "Thanos Receive {{$labels.job}} is failing to handle {{$value | humanize}}% of requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 99th percentile of "receive"-handler request duration above 10 seconds,
      # evaluated only for jobs with a non-zero request rate.
      - alert: ThanosReceiveHttpRequestLatencyHigh
        expr: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-receive.*", handler="receive"}[5m]))) > 10 and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-receive.*", handler="receive"}[5m])) > 0)'
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: 'Thanos Receive Http Request Latency High (job {{ $labels.job }})'
          description: "Thanos Receive {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Replication error ratio above floor((replication_factor + 1) / 2) /
      # hashring_nodes, reported as a percentage.
      # The ratio is the left operand of `and`, so $value is the failure
      # percentage that the description prints. The replication-factor guard is
      # aggregated by (job) and matched `on (job)`: the ratio only carries the
      # job label, so matching against the raw per-target series would never
      # find an identical label set.
      - alert: ThanosReceiveHighReplicationFailures
        expr: '((sum by (job) (rate(thanos_receive_replications_total{result="error", job=~".*thanos-receive.*"}[5m])) / sum by (job) (rate(thanos_receive_replications_total{job=~".*thanos-receive.*"}[5m]))) > (max by (job) (floor((thanos_receive_replication_factor{job=~".*thanos-receive.*"}+1)/ 2)) / max by (job) (thanos_receive_hashring_nodes{job=~".*thanos-receive.*"})) and on (job) (max by (job) (thanos_receive_replication_factor{job=~".*thanos-receive.*"}) > 1)) * 100'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Thanos Receive High Replication Failures (job {{ $labels.job }})'
          description: "Thanos Receive {{$labels.job}} is failing to replicate {{$value | humanize}}% of requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Share of forward requests with result="error", per job; fires above 20%.
      - alert: ThanosReceiveHighForwardRequestFailures
        expr: '(sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~".*thanos-receive.*"}[5m]))/ sum by (job) (rate(thanos_receive_forward_requests_total{job=~".*thanos-receive.*"}[5m]))) * 100 > 20'
        for: 5m
        labels:
          severity: info
        annotations:
          summary: 'Thanos Receive High Forward Request Failures (job {{ $labels.job }})'
          description: "Thanos Receive {{$labels.job}} is failing to forward {{$value | humanize}}% of requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Ratio of hashring file errors to hashring file refreshes, per job;
      # any value above zero sustained for 15m fires. $value is a ratio, not a
      # percentage.
      - alert: ThanosReceiveHighHashringFileRefreshFailures
        expr: '(sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~".*thanos-receive.*"}[5m])) / sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total{job=~".*thanos-receive.*"}[5m])) > 0)'
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: 'Thanos Receive High Hashring File Refresh Failures (job {{ $labels.job }})'
          description: "Thanos Receive {{$labels.job}} is failing to refresh hashring file, {{$value | humanize}} of attempts failed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Per-job average of thanos_receive_config_last_reload_successful differs
      # from 1, i.e. at least one series in the job does not report 1.
      - alert: ThanosReceiveConfigReloadFailure
        expr: 'avg by (job) (thanos_receive_config_last_reload_successful{job=~".*thanos-receive.*"}) != 1'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Thanos Receive Config Reload Failure (job {{ $labels.job }})'
          description: "Thanos Receive {{$labels.job}} has not been able to reload hashring configurations.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Yields a sample for every (job, instance) whose shipper upload counter
      # did not increase over the last 3h. The `on (job, instance)` match keeps
      # the instance label, so it is available to the templates here.
      - alert: ThanosReceiveNoUpload
        expr: '(up{job=~".*thanos-receive.*"} - 1) + on (job, instance) (sum by (job, instance) (increase(thanos_shipper_uploads_total{job=~".*thanos-receive.*"}[3h])) == 0)'
        for: 3h
        labels:
          severity: critical
        annotations:
          summary: 'Thanos Receive No Upload (instance {{ $labels.instance }})'
          description: "Thanos Receive {{$labels.instance}} has not uploaded latest data to object storage.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"