mirror of
https://github.com/samber/awesome-prometheus-alerts.git
synced 2026-06-22 01:17:19 +08:00
Publish
This commit is contained in:
parent
aa2988693b
commit
2ead3bcbd8
9 changed files with 445 additions and 32 deletions
32
dist/rules/thanos/embedded-exporter.yml
vendored
32
dist/rules/thanos/embedded-exporter.yml
vendored
|
|
@ -1,32 +0,0 @@
|
|||
groups:
|
||||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
rules:
|
||||
|
||||
- alert: ThanosCompactionHalted
|
||||
expr: 'thanos_compact_halted == 1'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Thanos compaction halted (instance {{ $labels.instance }})
|
||||
description: "Thanos compaction has failed to run and is now halted.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ThanosCompactBucketOperationFailure
|
||||
expr: 'rate(thanos_objstore_bucket_operation_failures_total[1m]) > 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Thanos compact bucket operation failure (instance {{ $labels.instance }})
|
||||
description: "Thanos compaction has failing storage operations\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ThanosCompactNotRun
|
||||
expr: '(time() - thanos_objstore_bucket_last_successful_upload_time) > 24*60*60'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Thanos compact not run (instance {{ $labels.instance }})
|
||||
description: "Thanos compaction has not run in 24 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
23
dist/rules/thanos/thanos-bucket-replicate.yml
vendored
Normal file
23
dist/rules/thanos/thanos-bucket-replicate.yml
vendored
Normal file
|
|
@ -0,0 +1,23 @@
|
|||
groups:
|
||||
|
||||
- name: ThanosBucketReplicate
|
||||
|
||||
rules:
|
||||
|
||||
- alert: ThanosBucketReplicateErrorRate
|
||||
expr: '(sum by (job) (rate(thanos_replicate_replication_runs_total{result="error", job=~".*thanos-bucket-replicate.*"}[5m]))/ on (job) group_left sum by (job) (rate(thanos_replicate_replication_runs_total{job=~".*thanos-bucket-replicate.*"}[5m]))) * 100 >= 10'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Thanos Bucket Replicate Error Rate (instance {{ $labels.instance }})
|
||||
description: "Thanos Replicate is failing to run, {{$value | humanize}}% of attempts failed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ThanosBucketReplicateRunLatency
|
||||
expr: '(histogram_quantile(0.99, sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m]))) > 20 and sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m])) > 0)'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Thanos Bucket Replicate Run Latency (instance {{ $labels.instance }})
|
||||
description: "Thanos Replicate {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for the replicate operations.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
50
dist/rules/thanos/thanos-compactor.yml
vendored
Normal file
50
dist/rules/thanos/thanos-compactor.yml
vendored
Normal file
|
|
@ -0,0 +1,50 @@
|
|||
groups:
|
||||
|
||||
- name: ThanosCompactor
|
||||
|
||||
rules:
|
||||
|
||||
- alert: ThanosCompactorMultipleRunning
|
||||
expr: 'sum by (job) (up{job=~".*thanos-compact.*"}) > 1'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Thanos Compactor Multiple Running (instance {{ $labels.instance }})
|
||||
description: "No more than one Thanos Compact instance should be running at once. There are {{$value}} instances running.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ThanosCompactorHalted
|
||||
expr: 'thanos_compact_halted{job=~".*thanos-compact.*"} == 1'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Thanos Compactor Halted (instance {{ $labels.instance }})
|
||||
description: "Thanos Compact {{$labels.job}} has failed to run and now is halted.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ThanosCompactorHighCompactionFailures
|
||||
expr: '(sum by (job) (rate(thanos_compact_group_compactions_failures_total{job=~".*thanos-compact.*"}[5m])) / sum by (job) (rate(thanos_compact_group_compactions_total{job=~".*thanos-compact.*"}[5m])) * 100 > 5)'
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Thanos Compactor High Compaction Failures (instance {{ $labels.instance }})
|
||||
description: "Thanos Compact {{$labels.job}} is failing to execute {{$value | humanize}}% of compactions.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ThanosCompactBucketHighOperationFailures
|
||||
expr: '(sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-compact.*"}[5m])) / sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-compact.*"}[5m])) * 100 > 5)'
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Thanos Compact Bucket High Operation Failures (instance {{ $labels.instance }})
|
||||
description: "Thanos Compact {{$labels.job}} Bucket is failing to execute {{$value | humanize}}% of operations.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ThanosCompactHasNotRun
|
||||
expr: '(time() - max by (job) (max_over_time(thanos_objstore_bucket_last_successful_upload_time{job=~".*thanos-compact.*"}[24h]))) / 60 / 60 > 24'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Thanos Compact Has Not Run (instance {{ $labels.instance }})
|
||||
description: "Thanos Compact {{$labels.job}} has not uploaded anything for 24 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
59
dist/rules/thanos/thanos-component-absent.yml
vendored
Normal file
59
dist/rules/thanos/thanos-component-absent.yml
vendored
Normal file
|
|
@ -0,0 +1,59 @@
|
|||
groups:
|
||||
|
||||
- name: ThanosComponentAbsent
|
||||
|
||||
rules:
|
||||
|
||||
- alert: ThanosCompactIsDown
|
||||
expr: 'absent(up{job=~".*thanos-compact.*"} == 1)'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Thanos Compact Is Down (instance {{ $labels.instance }})
|
||||
description: "ThanosCompact has disappeared. Prometheus target for the component cannot be discovered.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ThanosQueryIsDown
|
||||
expr: 'absent(up{job=~".*thanos-query.*"} == 1)'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Thanos Query Is Down (instance {{ $labels.instance }})
|
||||
description: "ThanosQuery has disappeared. Prometheus target for the component cannot be discovered.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ThanosReceiveIsDown
|
||||
expr: 'absent(up{job=~".*thanos-receive.*"} == 1)'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Thanos Receive Is Down (instance {{ $labels.instance }})
|
||||
description: "ThanosReceive has disappeared. Prometheus target for the component cannot be discovered.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ThanosRuleIsDown
|
||||
expr: 'absent(up{job=~".*thanos-rule.*"} == 1)'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Thanos Rule Is Down (instance {{ $labels.instance }})
|
||||
description: "ThanosRule has disappeared. Prometheus target for the component cannot be discovered.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ThanosSidecarIsDown
|
||||
expr: 'absent(up{job=~".*thanos-sidecar.*"} == 1)'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Thanos Sidecar Is Down (instance {{ $labels.instance }})
|
||||
description: "ThanosSidecar has disappeared. Prometheus target for the component cannot be discovered.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ThanosStoreIsDown
|
||||
expr: 'absent(up{job=~".*thanos-store.*"} == 1)'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Thanos Store Is Down (instance {{ $labels.instance }})
|
||||
description: "ThanosStore has disappeared. Prometheus target for the component cannot be discovered.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
77
dist/rules/thanos/thanos-query.yml
vendored
Normal file
77
dist/rules/thanos/thanos-query.yml
vendored
Normal file
|
|
@ -0,0 +1,77 @@
|
|||
groups:
|
||||
|
||||
- name: ThanosQuery
|
||||
|
||||
rules:
|
||||
|
||||
- alert: ThanosQueryHttpRequestQueryErrorRateHigh
|
||||
expr: '(sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*", handler="query"}[5m]))/ sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query"}[5m]))) * 100 > 5'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Thanos Query Http Request Query Error Rate High (instance {{ $labels.instance }})
|
||||
description: "Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of \"query\" requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ThanosQueryHttpRequestQueryRangeErrorRateHigh
|
||||
expr: '(sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*", handler="query_range"}[5m]))/ sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query_range"}[5m]))) * 100 > 5'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Thanos Query Http Request Query Range Error Rate High (instance {{ $labels.instance }})
|
||||
description: "Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of \"query_range\" requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ThanosQueryGrpcServerErrorRate
|
||||
expr: '(sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-query.*"}[5m]))/ sum by (job) (rate(grpc_server_started_total{job=~".*thanos-query.*"}[5m])) * 100 > 5)'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Thanos Query Grpc Server Error Rate (instance {{ $labels.instance }})
|
||||
description: "Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ThanosQueryGrpcClientErrorRate
|
||||
expr: '(sum by (job) (rate(grpc_client_handled_total{grpc_code!="OK", job=~".*thanos-query.*"}[5m])) / sum by (job) (rate(grpc_client_started_total{job=~".*thanos-query.*"}[5m]))) * 100 > 5'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Thanos Query Grpc Client Error Rate (instance {{ $labels.instance }})
|
||||
description: "Thanos Query {{$labels.job}} is failing to send {{$value | humanize}}% of requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ThanosQueryHighDNSFailures
|
||||
expr: '(sum by (job) (rate(thanos_query_store_apis_dns_failures_total{job=~".*thanos-query.*"}[5m])) / sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~".*thanos-query.*"}[5m]))) * 100 > 1'
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Thanos Query High D N S Failures (instance {{ $labels.instance }})
|
||||
description: "Thanos Query {{$labels.job}} have {{$value | humanize}}% of failing DNS queries for store endpoints.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ThanosQueryInstantLatencyHigh
|
||||
expr: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m]))) > 40 and sum by (job) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m])) > 0)'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Thanos Query Instant Latency High (instance {{ $labels.instance }})
|
||||
description: "Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for instant queries.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ThanosQueryRangeLatencyHigh
|
||||
expr: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query_range"}[5m]))) > 90 and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-query.*", handler="query_range"}[5m])) > 0)'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Thanos Query Range Latency High (instance {{ $labels.instance }})
|
||||
description: "Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for range queries.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ThanosQueryOverload
|
||||
expr: '(max_over_time(thanos_query_concurrent_gate_queries_max[5m]) - avg_over_time(thanos_query_concurrent_gate_queries_in_flight[5m]) < 1)'
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Thanos Query Overload (instance {{ $labels.instance }})
|
||||
description: "Thanos Query {{$labels.job}} has been overloaded for more than 15 minutes. This may be a symptom of excessive simultanous complex requests, low performance of the Prometheus API, or failures within these components. Assess the health of the Thanos query instances, the connnected Prometheus instances, look for potential senders of these requests and then contact support.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
68
dist/rules/thanos/thanos-receiver.yml
vendored
Normal file
68
dist/rules/thanos/thanos-receiver.yml
vendored
Normal file
|
|
@ -0,0 +1,68 @@
|
|||
groups:
|
||||
|
||||
- name: ThanosReceiver
|
||||
|
||||
rules:
|
||||
|
||||
- alert: ThanosReceiveHttpRequestErrorRateHigh
|
||||
expr: '(sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-receive.*", handler="receive"}[5m]))/ sum by (job) (rate(http_requests_total{job=~".*thanos-receive.*", handler="receive"}[5m]))) * 100 > 5'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Thanos Receive Http Request Error Rate High (instance {{ $labels.instance }})
|
||||
description: "Thanos Receive {{$labels.job}} is failing to handle {{$value | humanize}}% of requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ThanosReceiveHttpRequestLatencyHigh
|
||||
expr: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-receive.*", handler="receive"}[5m]))) > 10 and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-receive.*", handler="receive"}[5m])) > 0)'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Thanos Receive Http Request Latency High (instance {{ $labels.instance }})
|
||||
description: "Thanos Receive {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ThanosReceiveHighReplicationFailures
|
||||
expr: 'thanos_receive_replication_factor > 1 and ((sum by (job) (rate(thanos_receive_replications_total{result="error", job=~".*thanos-receive.*"}[5m])) / sum by (job) (rate(thanos_receive_replications_total{job=~".*thanos-receive.*"}[5m]))) > (max by (job) (floor((thanos_receive_replication_factor{job=~".*thanos-receive.*"}+1)/ 2)) / max by (job) (thanos_receive_hashring_nodes{job=~".*thanos-receive.*"}))) * 100'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Thanos Receive High Replication Failures (instance {{ $labels.instance }})
|
||||
description: "Thanos Receive {{$labels.job}} is failing to replicate {{$value | humanize}}% of requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ThanosReceiveHighForwardRequestFailures
|
||||
expr: '(sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~".*thanos-receive.*"}[5m]))/ sum by (job) (rate(thanos_receive_forward_requests_total{job=~".*thanos-receive.*"}[5m]))) * 100 > 20'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Thanos Receive High Forward Request Failures (instance {{ $labels.instance }})
|
||||
description: "Thanos Receive {{$labels.job}} is failing to forward {{$value | humanize}}% of requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ThanosReceiveHighHashringFileRefreshFailures
|
||||
expr: '(sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~".*thanos-receive.*"}[5m])) / sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total{job=~".*thanos-receive.*"}[5m])) > 0)'
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Thanos Receive High Hashring File Refresh Failures (instance {{ $labels.instance }})
|
||||
description: "Thanos Receive {{$labels.job}} is failing to refresh hashring file, {{$value | humanize}} of attempts failed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ThanosReceiveConfigReloadFailure
|
||||
expr: 'avg by (job) (thanos_receive_config_last_reload_successful{job=~".*thanos-receive.*"}) != 1'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Thanos Receive Config Reload Failure (instance {{ $labels.instance }})
|
||||
description: "Thanos Receive {{$labels.job}} has not been able to reload hashring configurations.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ThanosReceiveNoUpload
|
||||
expr: '(up{job=~".*thanos-receive.*"} - 1) + on (job, instance) (sum by (job, instance) (increase(thanos_shipper_uploads_total{job=~".*thanos-receive.*"}[3h])) == 0)'
|
||||
for: 3h
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Thanos Receive No Upload (instance {{ $labels.instance }})
|
||||
description: "Thanos Receive {{$labels.instance}} has not uploaded latest data to object storage.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
104
dist/rules/thanos/thanos-ruler.yml
vendored
Normal file
104
dist/rules/thanos/thanos-ruler.yml
vendored
Normal file
|
|
@ -0,0 +1,104 @@
|
|||
groups:
|
||||
|
||||
- name: ThanosRuler
|
||||
|
||||
rules:
|
||||
|
||||
- alert: ThanosRuleQueueIsDroppingAlerts
|
||||
expr: 'sum by (job, instance) (rate(thanos_alert_queue_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Thanos Rule Queue Is Dropping Alerts (instance {{ $labels.instance }})
|
||||
description: "Thanos Rule {{$labels.instance}} is failing to queue alerts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ThanosRuleSenderIsFailingAlerts
|
||||
expr: 'sum by (job, instance) (rate(thanos_alert_sender_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Thanos Rule Sender Is Failing Alerts (instance {{ $labels.instance }})
|
||||
description: "Thanos Rule {{$labels.instance}} is failing to send alerts to alertmanager.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ThanosRuleHighRuleEvaluationFailures
|
||||
expr: '(sum by (job, instance) (rate(prometheus_rule_evaluation_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) * 100 > 5)'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Thanos Rule High Rule Evaluation Failures (instance {{ $labels.instance }})
|
||||
description: "Thanos Rule {{$labels.instance}} is failing to evaluate rules.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ThanosRuleHighRuleEvaluationWarnings
|
||||
expr: 'sum by (job, instance) (rate(thanos_rule_evaluation_with_warnings_total{job=~".*thanos-rule.*"}[5m])) > 0'
|
||||
for: 15m
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Thanos Rule High Rule Evaluation Warnings (instance {{ $labels.instance }})
|
||||
description: "Thanos Rule {{$labels.instance}} has high number of evaluation warnings.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ThanosRuleRuleEvaluationLatencyHigh
|
||||
expr: '(sum by (job, instance, rule_group) (prometheus_rule_group_last_duration_seconds{job=~".*thanos-rule.*"}) > sum by (job, instance, rule_group) (prometheus_rule_group_interval_seconds{job=~".*thanos-rule.*"}))'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Thanos Rule Rule Evaluation Latency High (instance {{ $labels.instance }})
|
||||
description: "Thanos Rule {{$labels.instance}} has higher evaluation latency than interval for {{$labels.rule_group}}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ThanosRuleGrpcErrorRate
|
||||
expr: '(sum by (job, instance) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-rule.*"}[5m]))/ sum by (job, instance) (rate(grpc_server_started_total{job=~".*thanos-rule.*"}[5m])) * 100 > 5)'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Thanos Rule Grpc Error Rate (instance {{ $labels.instance }})
|
||||
description: "Thanos Rule {{$labels.job}} is failing to handle {{$value | humanize}}% of requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ThanosRuleConfigReloadFailure
|
||||
expr: 'avg by (job, instance) (thanos_rule_config_last_reload_successful{job=~".*thanos-rule.*"}) != 1'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Thanos Rule Config Reload Failure (instance {{ $labels.instance }})
|
||||
description: "Thanos Rule {{$labels.job}} has not been able to reload its configuration.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ThanosRuleQueryHighDNSFailures
|
||||
expr: '(sum by (job, instance) (rate(thanos_rule_query_apis_dns_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) * 100 > 1)'
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Thanos Rule Query High D N S Failures (instance {{ $labels.instance }})
|
||||
description: "Thanos Rule {{$labels.job}} has {{$value | humanize}}% of failing DNS queries for query endpoints.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ThanosRuleAlertmanagerHighDNSFailures
|
||||
expr: '(sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) * 100 > 1)'
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Thanos Rule Alertmanager High D N S Failures (instance {{ $labels.instance }})
|
||||
description: "Thanos Rule {{$labels.instance}} has {{$value | humanize}}% of failing DNS queries for Alertmanager endpoints.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ThanosRuleNoEvaluationFor10Intervals
|
||||
expr: 'time() - max by (job, instance, group) (prometheus_rule_group_last_evaluation_timestamp_seconds{job=~".*thanos-rule.*"})>10 * max by (job, instance, group) (prometheus_rule_group_interval_seconds{job=~".*thanos-rule.*"})'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Thanos Rule No Evaluation For10 Intervals (instance {{ $labels.instance }})
|
||||
description: "Thanos Rule {{$labels.job}} has rule groups that did not evaluate for at least 10x of their expected interval.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ThanosNoRuleEvaluations
|
||||
expr: 'sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) <= 0 and sum by (job, instance) (thanos_rule_loaded_rules{job=~".*thanos-rule.*"}) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Thanos No Rule Evaluations (instance {{ $labels.instance }})
|
||||
description: "Thanos Rule {{$labels.instance}} did not perform any rule evaluations in the past 10 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
23
dist/rules/thanos/thanos-sidecar.yml
vendored
Normal file
23
dist/rules/thanos/thanos-sidecar.yml
vendored
Normal file
|
|
@ -0,0 +1,23 @@
|
|||
groups:
|
||||
|
||||
- name: ThanosSidecar
|
||||
|
||||
rules:
|
||||
|
||||
- alert: ThanosSidecarBucketOperationsFailed
|
||||
expr: 'sum by (job, instance) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-sidecar.*"}[5m])) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Thanos Sidecar Bucket Operations Failed (instance {{ $labels.instance }})
|
||||
description: "Thanos Sidecar {{$labels.instance}} bucket operations are failing\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ThanosSidecarNoConnectionToStartedPrometheus
|
||||
expr: 'thanos_sidecar_prometheus_up{job=~".*thanos-sidecar.*"} == 0 and on (namespace, pod)prometheus_tsdb_data_replay_duration_seconds != 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Thanos Sidecar No Connection To Started Prometheus (instance {{ $labels.instance }})
|
||||
description: "Thanos Sidecar {{$labels.instance}} is unhealthy.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
41
dist/rules/thanos/thanos-store.yml
vendored
Normal file
41
dist/rules/thanos/thanos-store.yml
vendored
Normal file
|
|
@ -0,0 +1,41 @@
|
|||
groups:
|
||||
|
||||
- name: ThanosStore
|
||||
|
||||
rules:
|
||||
|
||||
- alert: ThanosStoreGrpcErrorRate
|
||||
expr: '(sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-store.*"}[5m]))/ sum by (job) (rate(grpc_server_started_total{job=~".*thanos-store.*"}[5m])) * 100 > 5)'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Thanos Store Grpc Error Rate (instance {{ $labels.instance }})
|
||||
description: "Thanos Store {{$labels.job}} is failing to handle {{$value | humanize}}% of requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ThanosStoreSeriesGateLatencyHigh
|
||||
expr: '(histogram_quantile(0.99, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{job=~".*thanos-store.*"}[5m]))) > 2 and sum by (job) (rate(thanos_bucket_store_series_gate_duration_seconds_count{job=~".*thanos-store.*"}[5m])) > 0)'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Thanos Store Series Gate Latency High (instance {{ $labels.instance }})
|
||||
description: "Thanos Store {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for store series gate requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ThanosStoreBucketHighOperationFailures
|
||||
expr: '(sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-store.*"}[5m])) / sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-store.*"}[5m])) * 100 > 5)'
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Thanos Store Bucket High Operation Failures (instance {{ $labels.instance }})
|
||||
description: "Thanos Store {{$labels.job}} Bucket is failing to execute {{$value | humanize}}% of operations.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ThanosStoreObjstoreOperationLatencyHigh
|
||||
expr: '(histogram_quantile(0.99, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~".*thanos-store.*"}[5m]))) > 2 and sum by (job) (rate(thanos_objstore_bucket_operation_duration_seconds_count{job=~".*thanos-store.*"}[5m])) > 0)'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Thanos Store Objstore Operation Latency High (instance {{ $labels.instance }})
|
||||
description: "Thanos Store {{$labels.job}} Bucket has a 99th percentile latency of {{$value}} seconds for the bucket operations.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
Loading…
Reference in a new issue