mirror of
https://github.com/samber/awesome-prometheus-alerts.git
synced 2026-06-26 19:37:27 +08:00
lint
This commit is contained in:
parent
e7db9b5b25
commit
689c923cf2
1 changed files with 89 additions and 194 deletions
283
_data/rules.yml
283
_data/rules.yml
|
|
@ -2273,353 +2273,248 @@ groups:
|
||||||
slug: thanos-compactor
|
slug: thanos-compactor
|
||||||
rules:
|
rules:
|
||||||
- name: Thanos Compactor Multiple Running
|
- name: Thanos Compactor Multiple Running
|
||||||
description: No more than one Thanos Compact instance should be running at once.
|
description: 'No more than one Thanos Compact instance should be running at once. There are {{$value}} instances running.'
|
||||||
There are {{$value}} instances running.
|
query: 'sum by (job) (up{job=~".*thanos-compact.*"}) > 1'
|
||||||
query: sum by (job) (up{job=~".*thanos-compact.*"}) > 1
|
|
||||||
severity: warning
|
severity: warning
|
||||||
for: 5m
|
for: 5m
|
||||||
- name: Thanos Compactor Halted
|
- name: Thanos Compactor Halted
|
||||||
description: Thanos Compact {{$labels.job}} has failed to run and now is halted.
|
description: 'Thanos Compact {{$labels.job}} has failed to run and now is halted.'
|
||||||
query: thanos_compact_halted{job=~".*thanos-compact.*"} == 1
|
query: 'thanos_compact_halted{job=~".*thanos-compact.*"} == 1'
|
||||||
severity: warning
|
severity: warning
|
||||||
for: 5m
|
for: 5m
|
||||||
- name: Thanos Compactor High Compaction Failures
|
- name: Thanos Compactor High Compaction Failures
|
||||||
description: Thanos Compact {{$labels.job}} is failing to execute {{$value | humanize}}%
|
description: 'Thanos Compact {{$labels.job}} is failing to execute {{$value | humanize}}% of compactions.'
|
||||||
of compactions.
|
query: '(sum by (job) (rate(thanos_compact_group_compactions_failures_total{job=~".*thanos-compact.*"}[5m])) / sum by (job) (rate(thanos_compact_group_compactions_total{job=~".*thanos-compact.*"}[5m])) * 100 > 5)'
|
||||||
query: ( sum by (job) (rate(thanos_compact_group_compactions_failures_total{job=~".*thanos-compact.*"}[5m]))/ sum
|
|
||||||
by (job) (rate(thanos_compact_group_compactions_total{job=~".*thanos-compact.*"}[5m]))*
|
|
||||||
100 > 5)
|
|
||||||
severity: warning
|
severity: warning
|
||||||
for: 15m
|
for: 15m
|
||||||
- name: Thanos Compact Bucket High Operation Failures
|
- name: Thanos Compact Bucket High Operation Failures
|
||||||
description: Thanos Compact {{$labels.job}} Bucket is failing to execute {{$value
|
description: 'Thanos Compact {{$labels.job}} Bucket is failing to execute {{$value | humanize}}% of operations.'
|
||||||
| humanize}}% of operations.
|
query: '(sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-compact.*"}[5m])) / sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-compact.*"}[5m])) * 100 > 5)'
|
||||||
query: ( sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-compact.*"}[5m]))/ sum
|
|
||||||
by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-compact.*"}[5m]))*
|
|
||||||
100 > 5)
|
|
||||||
severity: warning
|
severity: warning
|
||||||
for: 15m
|
for: 15m
|
||||||
- name: Thanos Compact Has Not Run
|
- name: Thanos Compact Has Not Run
|
||||||
description: Thanos Compact {{$labels.job}} has not uploaded anything for 24 hours.
|
description: 'Thanos Compact {{$labels.job}} has not uploaded anything for 24 hours.'
|
||||||
query: (time() - max by (job) (max_over_time(thanos_objstore_bucket_last_successful_upload_time{job=~".*thanos-compact.*"}[24h])))
|
query: '(time() - max by (job) (max_over_time(thanos_objstore_bucket_last_successful_upload_time{job=~".*thanos-compact.*"}[24h]))) / 60 / 60 > 24'
|
||||||
/ 60 / 60 > 24
|
|
||||||
severity: warning
|
severity: warning
|
||||||
for: 0m
|
for: 0m
|
||||||
- name: Thanos Query
|
- name: Thanos Query
|
||||||
slug: thanos-query
|
slug: thanos-query
|
||||||
rules:
|
rules:
|
||||||
- name: Thanos Query Http Request Query Error Rate High
|
- name: Thanos Query Http Request Query Error Rate High
|
||||||
description: Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}%
|
description: 'Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of "query" requests.'
|
||||||
of "query" requests.
|
query: '(sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*", handler="query"}[5m]))/ sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query"}[5m]))) * 100 > 5'
|
||||||
query: ( sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*",
|
|
||||||
handler="query"}[5m]))/ sum by (job) (rate(http_requests_total{job=~".*thanos-query.*",
|
|
||||||
handler="query"}[5m]))) * 100 > 5
|
|
||||||
severity: critical
|
severity: critical
|
||||||
for: 5m
|
for: 5m
|
||||||
- name: Thanos Query Http Request Query Range Error Rate High
|
- name: Thanos Query Http Request Query Range Error Rate High
|
||||||
description: Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}%
|
description: 'Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of "query_range" requests.'
|
||||||
of "query_range" requests.
|
query: '(sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*", handler="query_range"}[5m]))/ sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query_range"}[5m]))) * 100 > 5'
|
||||||
query: ( sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*",
|
|
||||||
handler="query_range"}[5m]))/ sum by (job) (rate(http_requests_total{job=~".*thanos-query.*",
|
|
||||||
handler="query_range"}[5m]))) * 100 > 5
|
|
||||||
severity: critical
|
severity: critical
|
||||||
for: 5m
|
for: 5m
|
||||||
- name: Thanos Query Grpc Server Error Rate
|
- name: Thanos Query Grpc Server Error Rate
|
||||||
description: Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}%
|
description: 'Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of requests.'
|
||||||
of requests.
|
query: '(sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-query.*"}[5m]))/ sum by (job) (rate(grpc_server_started_total{job=~".*thanos-query.*"}[5m])) * 100 > 5)'
|
||||||
query: ( sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded",
|
|
||||||
job=~".*thanos-query.*"}[5m]))/ sum by (job) (rate(grpc_server_started_total{job=~".*thanos-query.*"}[5m]))*
|
|
||||||
100 > 5)
|
|
||||||
severity: warning
|
severity: warning
|
||||||
for: 5m
|
for: 5m
|
||||||
- name: Thanos Query Grpc Client Error Rate
|
- name: Thanos Query Grpc Client Error Rate
|
||||||
description: Thanos Query {{$labels.job}} is failing to send {{$value | humanize}}%
|
description: 'Thanos Query {{$labels.job}} is failing to send {{$value | humanize}}% of requests.'
|
||||||
of requests.
|
query: '(sum by (job) (rate(grpc_client_handled_total{grpc_code!="OK", job=~".*thanos-query.*"}[5m])) / sum by (job) (rate(grpc_client_started_total{job=~".*thanos-query.*"}[5m]))) * 100 > 5'
|
||||||
query: ( sum by (job) (rate(grpc_client_handled_total{grpc_code!="OK", job=~".*thanos-query.*"}[5m]))/ sum
|
|
||||||
by (job) (rate(grpc_client_started_total{job=~".*thanos-query.*"}[5m]))) * 100
|
|
||||||
> 5
|
|
||||||
severity: warning
|
severity: warning
|
||||||
for: 5m
|
for: 5m
|
||||||
- name: Thanos Query High D N S Failures
|
- name: Thanos Query High D N S Failures
|
||||||
description: Thanos Query {{$labels.job}} have {{$value | humanize}}% of failing
|
description: 'Thanos Query {{$labels.job}} have {{$value | humanize}}% of failing DNS queries for store endpoints.'
|
||||||
DNS queries for store endpoints.
|
query: '(sum by (job) (rate(thanos_query_store_apis_dns_failures_total{job=~".*thanos-query.*"}[5m])) / sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~".*thanos-query.*"}[5m]))) * 100 > 1'
|
||||||
query: ( sum by (job) (rate(thanos_query_store_apis_dns_failures_total{job=~".*thanos-query.*"}[5m]))/ sum
|
|
||||||
by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~".*thanos-query.*"}[5m])))
|
|
||||||
* 100 > 1
|
|
||||||
severity: warning
|
severity: warning
|
||||||
for: 15m
|
for: 15m
|
||||||
- name: Thanos Query Instant Latency High
|
- name: Thanos Query Instant Latency High
|
||||||
description: Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}}
|
description: 'Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for instant queries.'
|
||||||
seconds for instant queries.
|
query: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m]))) > 40 and sum by (job) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m])) > 0)'
|
||||||
query: ( histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*",
|
|
||||||
handler="query"}[5m]))) > 40and sum by (job) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*",
|
|
||||||
handler="query"}[5m])) > 0)
|
|
||||||
severity: critical
|
severity: critical
|
||||||
for: 10m
|
for: 10m
|
||||||
- name: Thanos Query Range Latency High
|
- name: Thanos Query Range Latency High
|
||||||
description: Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}}
|
description: 'Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for range queries.'
|
||||||
seconds for range queries.
|
query: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query_range"}[5m]))) > 90 and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-query.*", handler="query_range"}[5m])) > 0)'
|
||||||
query: ( histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*",
|
|
||||||
handler="query_range"}[5m]))) > 90and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-query.*",
|
|
||||||
handler="query_range"}[5m])) > 0)
|
|
||||||
severity: critical
|
severity: critical
|
||||||
for: 10m
|
for: 10m
|
||||||
- name: Thanos Query Overload
|
- name: Thanos Query Overload
|
||||||
description: Thanos Query {{$labels.job}} has been overloaded for more than 15
|
description: 'Thanos Query {{$labels.job}} has been overloaded for more than 15 minutes. This may be a symptom of excessive simultaneous complex requests, low performance of the Prometheus API, or failures within these components. Assess the health of the Thanos query instances, the connected Prometheus instances, look for potential senders of these requests and then contact support.'
|
||||||
minutes. This may be a symptom of excessive simultanous complex requests, low
|
query: '(max_over_time(thanos_query_concurrent_gate_queries_max[5m]) - avg_over_time(thanos_query_concurrent_gate_queries_in_flight[5m]) < 1)'
|
||||||
performance of the Prometheus API, or failures within these components. Assess
|
|
||||||
the health of the Thanos query instances, the connnected Prometheus instances,
|
|
||||||
look for potential senders of these requests and then contact support.
|
|
||||||
query: "( max_over_time(thanos_query_concurrent_gate_queries_max[5m]) - avg_over_time(thanos_query_concurrent_gate_queries_in_flight[5m])
|
|
||||||
< 1)"
|
|
||||||
severity: warning
|
severity: warning
|
||||||
for: 15m
|
for: 15m
|
||||||
- name: Thanos Receiver
|
- name: Thanos Receiver
|
||||||
slug: thanos-receiver
|
slug: thanos-receiver
|
||||||
rules:
|
rules:
|
||||||
- name: Thanos Receive Http Request Error Rate High
|
- name: Thanos Receive Http Request Error Rate High
|
||||||
description: Thanos Receive {{$labels.job}} is failing to handle {{$value | humanize}}%
|
description: 'Thanos Receive {{$labels.job}} is failing to handle {{$value | humanize}}% of requests.'
|
||||||
of requests.
|
query: '(sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-receive.*", handler="receive"}[5m]))/ sum by (job) (rate(http_requests_total{job=~".*thanos-receive.*", handler="receive"}[5m]))) * 100 > 5'
|
||||||
query: ( sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-receive.*",
|
|
||||||
handler="receive"}[5m]))/ sum by (job) (rate(http_requests_total{job=~".*thanos-receive.*",
|
|
||||||
handler="receive"}[5m]))) * 100 > 5
|
|
||||||
severity: critical
|
severity: critical
|
||||||
for: 5m
|
for: 5m
|
||||||
- name: Thanos Receive Http Request Latency High
|
- name: Thanos Receive Http Request Latency High
|
||||||
description: Thanos Receive {{$labels.job}} has a 99th percentile latency of {{
|
description: 'Thanos Receive {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for requests.'
|
||||||
$value }} seconds for requests.
|
query: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-receive.*", handler="receive"}[5m]))) > 10 and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-receive.*", handler="receive"}[5m])) > 0)'
|
||||||
query: ( histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-receive.*",
|
|
||||||
handler="receive"}[5m]))) > 10and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-receive.*",
|
|
||||||
handler="receive"}[5m])) > 0)
|
|
||||||
severity: critical
|
severity: critical
|
||||||
for: 10m
|
for: 10m
|
||||||
- name: Thanos Receive High Replication Failures
|
- name: Thanos Receive High Replication Failures
|
||||||
description: Thanos Receive {{$labels.job}} is failing to replicate {{$value |
|
description: 'Thanos Receive {{$labels.job}} is failing to replicate {{$value | humanize}}% of requests.'
|
||||||
humanize}}% of requests.
|
query: 'thanos_receive_replication_factor > 1 and ((sum by (job) (rate(thanos_receive_replications_total{result="error", job=~".*thanos-receive.*"}[5m])) / sum by (job) (rate(thanos_receive_replications_total{job=~".*thanos-receive.*"}[5m]))) > (max by (job) (floor((thanos_receive_replication_factor{job=~".*thanos-receive.*"}+1)/ 2)) / max by (job) (thanos_receive_hashring_nodes{job=~".*thanos-receive.*"}))) * 100'
|
||||||
query: thanos_receive_replication_factor > 1 and( ( sum by (job) (rate(thanos_receive_replications_total{result="error",
|
|
||||||
job=~".*thanos-receive.*"}[5m])) / sum by (job) (rate(thanos_receive_replications_total{job=~".*thanos-receive.*"}[5m])) ) > ( max
|
|
||||||
by (job) (floor((thanos_receive_replication_factor{job=~".*thanos-receive.*"}+1)
|
|
||||||
/ 2)) / max by (job) (thanos_receive_hashring_nodes{job=~".*thanos-receive.*"}) ))
|
|
||||||
* 100
|
|
||||||
severity: warning
|
severity: warning
|
||||||
for: 5m
|
for: 5m
|
||||||
- name: Thanos Receive High Forward Request Failures
|
- name: Thanos Receive High Forward Request Failures
|
||||||
description: Thanos Receive {{$labels.job}} is failing to forward {{$value | humanize}}%
|
description: 'Thanos Receive {{$labels.job}} is failing to forward {{$value | humanize}}% of requests.'
|
||||||
of requests.
|
query: '(sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~".*thanos-receive.*"}[5m]))/ sum by (job) (rate(thanos_receive_forward_requests_total{job=~".*thanos-receive.*"}[5m]))) * 100 > 20'
|
||||||
query: ( sum by (job) (rate(thanos_receive_forward_requests_total{result="error",
|
|
||||||
job=~".*thanos-receive.*"}[5m]))/ sum by (job) (rate(thanos_receive_forward_requests_total{job=~".*thanos-receive.*"}[5m])))
|
|
||||||
* 100 > 20
|
|
||||||
severity: info
|
severity: info
|
||||||
for: 5m
|
for: 5m
|
||||||
- name: Thanos Receive High Hashring File Refresh Failures
|
- name: Thanos Receive High Hashring File Refresh Failures
|
||||||
description: Thanos Receive {{$labels.job}} is failing to refresh hashring file,
|
description: 'Thanos Receive {{$labels.job}} is failing to refresh hashring file, {{$value | humanize}} of attempts failed.'
|
||||||
{{$value | humanize}} of attempts failed.
|
query: '(sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~".*thanos-receive.*"}[5m])) / sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total{job=~".*thanos-receive.*"}[5m])) > 0)'
|
||||||
query: ( sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~".*thanos-receive.*"}[5m]))/ sum
|
|
||||||
by (job) (rate(thanos_receive_hashrings_file_refreshes_total{job=~".*thanos-receive.*"}[5m]))>
|
|
||||||
0)
|
|
||||||
severity: warning
|
severity: warning
|
||||||
for: 15m
|
for: 15m
|
||||||
- name: Thanos Receive Config Reload Failure
|
- name: Thanos Receive Config Reload Failure
|
||||||
description: Thanos Receive {{$labels.job}} has not been able to reload hashring
|
description: 'Thanos Receive {{$labels.job}} has not been able to reload hashring configurations.'
|
||||||
configurations.
|
query: 'avg by (job) (thanos_receive_config_last_reload_successful{job=~".*thanos-receive.*"}) != 1'
|
||||||
query: avg by (job) (thanos_receive_config_last_reload_successful{job=~".*thanos-receive.*"})
|
|
||||||
!= 1
|
|
||||||
severity: warning
|
severity: warning
|
||||||
for: 5m
|
for: 5m
|
||||||
- name: Thanos Receive No Upload
|
- name: Thanos Receive No Upload
|
||||||
description: Thanos Receive {{$labels.instance}} has not uploaded latest data
|
description: 'Thanos Receive {{$labels.instance}} has not uploaded latest data to object storage.'
|
||||||
to object storage.
|
query: '(up{job=~".*thanos-receive.*"} - 1) + on (job, instance) (sum by (job, instance) (increase(thanos_shipper_uploads_total{job=~".*thanos-receive.*"}[3h])) == 0)'
|
||||||
query: '(up{job=~".*thanos-receive.*"} - 1)+ on (job, instance) # filters to only
|
|
||||||
alert on current instance last 3h(sum by (job, instance) (increase(thanos_shipper_uploads_total{job=~".*thanos-receive.*"}[3h]))
|
|
||||||
== 0)'
|
|
||||||
severity: critical
|
severity: critical
|
||||||
for: 3h
|
for: 3h
|
||||||
- name: Thanos Sidecar
|
- name: Thanos Sidecar
|
||||||
slug: thanos-sidecar
|
slug: thanos-sidecar
|
||||||
rules:
|
rules:
|
||||||
- name: Thanos Sidecar Bucket Operations Failed
|
- name: Thanos Sidecar Bucket Operations Failed
|
||||||
description: Thanos Sidecar {{$labels.instance}} bucket operations are failing
|
description: 'Thanos Sidecar {{$labels.instance}} bucket operations are failing'
|
||||||
query: sum by (job, instance) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-sidecar.*"}[5m]))
|
query: 'sum by (job, instance) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-sidecar.*"}[5m])) > 0'
|
||||||
> 0
|
|
||||||
severity: critical
|
severity: critical
|
||||||
for: 5m
|
for: 5m
|
||||||
- name: Thanos Sidecar No Connection To Started Prometheus
|
- name: Thanos Sidecar No Connection To Started Prometheus
|
||||||
description: Thanos Sidecar {{$labels.instance}} is unhealthy.
|
description: 'Thanos Sidecar {{$labels.instance}} is unhealthy.'
|
||||||
query: thanos_sidecar_prometheus_up{job=~".*thanos-sidecar.*"} == 0AND on (namespace,
|
query: 'thanos_sidecar_prometheus_up{job=~".*thanos-sidecar.*"} == 0 AND on (namespace, pod) prometheus_tsdb_data_replay_duration_seconds != 0'
|
||||||
pod)prometheus_tsdb_data_replay_duration_seconds != 0
|
|
||||||
severity: critical
|
severity: critical
|
||||||
for: 5m
|
for: 5m
|
||||||
- name: Thanos Store
|
- name: Thanos Store
|
||||||
slug: thanos-store
|
slug: thanos-store
|
||||||
rules:
|
rules:
|
||||||
- name: Thanos Store Grpc Error Rate
|
- name: Thanos Store Grpc Error Rate
|
||||||
description: Thanos Store {{$labels.job}} is failing to handle {{$value | humanize}}%
|
description: 'Thanos Store {{$labels.job}} is failing to handle {{$value | humanize}}% of requests.'
|
||||||
of requests.
|
query: '(sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-store.*"}[5m]))/ sum by (job) (rate(grpc_server_started_total{job=~".*thanos-store.*"}[5m])) * 100 > 5)'
|
||||||
query: ( sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded",
|
|
||||||
job=~".*thanos-store.*"}[5m]))/ sum by (job) (rate(grpc_server_started_total{job=~".*thanos-store.*"}[5m]))*
|
|
||||||
100 > 5)
|
|
||||||
severity: warning
|
severity: warning
|
||||||
for: 5m
|
for: 5m
|
||||||
- name: Thanos Store Series Gate Latency High
|
- name: Thanos Store Series Gate Latency High
|
||||||
description: Thanos Store {{$labels.job}} has a 99th percentile latency of {{$value}}
|
description: 'Thanos Store {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for store series gate requests.'
|
||||||
seconds for store series gate requests.
|
query: '(histogram_quantile(0.99, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{job=~".*thanos-store.*"}[5m]))) > 2 and sum by (job) (rate(thanos_bucket_store_series_gate_duration_seconds_count{job=~".*thanos-store.*"}[5m])) > 0)'
|
||||||
query: ( histogram_quantile(0.99, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{job=~".*thanos-store.*"}[5m])))
|
|
||||||
> 2and sum by (job) (rate(thanos_bucket_store_series_gate_duration_seconds_count{job=~".*thanos-store.*"}[5m]))
|
|
||||||
> 0)
|
|
||||||
severity: warning
|
severity: warning
|
||||||
for: 10m
|
for: 10m
|
||||||
- name: Thanos Store Bucket High Operation Failures
|
- name: Thanos Store Bucket High Operation Failures
|
||||||
description: Thanos Store {{$labels.job}} Bucket is failing to execute {{$value
|
description: 'Thanos Store {{$labels.job}} Bucket is failing to execute {{$value | humanize}}% of operations.'
|
||||||
| humanize}}% of operations.
|
query: '(sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-store.*"}[5m])) / sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-store.*"}[5m])) * 100 > 5)'
|
||||||
query: ( sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-store.*"}[5m]))/ sum
|
|
||||||
by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-store.*"}[5m]))*
|
|
||||||
100 > 5)
|
|
||||||
severity: warning
|
severity: warning
|
||||||
for: 15m
|
for: 15m
|
||||||
- name: Thanos Store Objstore Operation Latency High
|
- name: Thanos Store Objstore Operation Latency High
|
||||||
description: Thanos Store {{$labels.job}} Bucket has a 99th percentile latency
|
description: 'Thanos Store {{$labels.job}} Bucket has a 99th percentile latency of {{$value}} seconds for the bucket operations.'
|
||||||
of {{$value}} seconds for the bucket operations.
|
query: '(histogram_quantile(0.99, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~".*thanos-store.*"}[5m]))) > 2 and sum by (job) (rate(thanos_objstore_bucket_operation_duration_seconds_count{job=~".*thanos-store.*"}[5m])) > 0)'
|
||||||
query: ( histogram_quantile(0.99, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~".*thanos-store.*"}[5m])))
|
|
||||||
> 2and sum by (job) (rate(thanos_objstore_bucket_operation_duration_seconds_count{job=~".*thanos-store.*"}[5m]))
|
|
||||||
> 0)
|
|
||||||
severity: warning
|
severity: warning
|
||||||
for: 10m
|
for: 10m
|
||||||
- name: Thanos Ruler
|
- name: Thanos Ruler
|
||||||
slug: thanos-ruler
|
slug: thanos-ruler
|
||||||
rules:
|
rules:
|
||||||
- name: Thanos Rule Queue Is Dropping Alerts
|
- name: Thanos Rule Queue Is Dropping Alerts
|
||||||
description: Thanos Rule {{$labels.instance}} is failing to queue alerts.
|
description: 'Thanos Rule {{$labels.instance}} is failing to queue alerts.'
|
||||||
query: sum by (job, instance) (rate(thanos_alert_queue_alerts_dropped_total{job=~".*thanos-rule.*"}[5m]))
|
query: 'sum by (job, instance) (rate(thanos_alert_queue_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0'
|
||||||
> 0
|
|
||||||
severity: critical
|
severity: critical
|
||||||
for: 5m
|
for: 5m
|
||||||
- name: Thanos Rule Sender Is Failing Alerts
|
- name: Thanos Rule Sender Is Failing Alerts
|
||||||
description: Thanos Rule {{$labels.instance}} is failing to send alerts to alertmanager.
|
description: 'Thanos Rule {{$labels.instance}} is failing to send alerts to alertmanager.'
|
||||||
query: sum by (job, instance) (rate(thanos_alert_sender_alerts_dropped_total{job=~".*thanos-rule.*"}[5m]))
|
query: 'sum by (job, instance) (rate(thanos_alert_sender_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0'
|
||||||
> 0
|
|
||||||
severity: critical
|
severity: critical
|
||||||
for: 5m
|
for: 5m
|
||||||
- name: Thanos Rule High Rule Evaluation Failures
|
- name: Thanos Rule High Rule Evaluation Failures
|
||||||
description: Thanos Rule {{$labels.instance}} is failing to evaluate rules.
|
description: 'Thanos Rule {{$labels.instance}} is failing to evaluate rules.'
|
||||||
query: ( sum by (job, instance) (rate(prometheus_rule_evaluation_failures_total{job=~".*thanos-rule.*"}[5m]))/ sum
|
query: '(sum by (job, instance) (rate(prometheus_rule_evaluation_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) * 100 > 5)'
|
||||||
by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m]))*
|
|
||||||
100 > 5)
|
|
||||||
severity: critical
|
severity: critical
|
||||||
for: 5m
|
for: 5m
|
||||||
- name: Thanos Rule High Rule Evaluation Warnings
|
- name: Thanos Rule High Rule Evaluation Warnings
|
||||||
description: Thanos Rule {{$labels.instance}} has high number of evaluation warnings.
|
description: 'Thanos Rule {{$labels.instance}} has high number of evaluation warnings.'
|
||||||
query: sum by (job, instance) (rate(thanos_rule_evaluation_with_warnings_total{job=~".*thanos-rule.*"}[5m]))
|
query: 'sum by (job, instance) (rate(thanos_rule_evaluation_with_warnings_total{job=~".*thanos-rule.*"}[5m])) > 0'
|
||||||
> 0
|
|
||||||
severity: info
|
severity: info
|
||||||
for: 15m
|
for: 15m
|
||||||
- name: Thanos Rule Rule Evaluation Latency High
|
- name: Thanos Rule Rule Evaluation Latency High
|
||||||
description: Thanos Rule {{$labels.instance}} has higher evaluation latency than
|
description: 'Thanos Rule {{$labels.instance}} has higher evaluation latency than interval for {{$labels.rule_group}}.'
|
||||||
interval for {{$labels.rule_group}}.
|
query: '(sum by (job, instance, rule_group) (prometheus_rule_group_last_duration_seconds{job=~".*thanos-rule.*"}) > sum by (job, instance, rule_group) (prometheus_rule_group_interval_seconds{job=~".*thanos-rule.*"}))'
|
||||||
query: ( sum by (job, instance, rule_group) (prometheus_rule_group_last_duration_seconds{job=~".*thanos-rule.*"})> sum
|
|
||||||
by (job, instance, rule_group) (prometheus_rule_group_interval_seconds{job=~".*thanos-rule.*"}))
|
|
||||||
severity: warning
|
severity: warning
|
||||||
for: 5m
|
for: 5m
|
||||||
- name: Thanos Rule Grpc Error Rate
|
- name: Thanos Rule Grpc Error Rate
|
||||||
description: Thanos Rule {{$labels.job}} is failing to handle {{$value | humanize}}%
|
description: 'Thanos Rule {{$labels.job}} is failing to handle {{$value | humanize}}% of requests.'
|
||||||
of requests.
|
query: '(sum by (job, instance) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-rule.*"}[5m]))/ sum by (job, instance) (rate(grpc_server_started_total{job=~".*thanos-rule.*"}[5m])) * 100 > 5)'
|
||||||
query: ( sum by (job, instance) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded",
|
|
||||||
job=~".*thanos-rule.*"}[5m]))/ sum by (job, instance) (rate(grpc_server_started_total{job=~".*thanos-rule.*"}[5m]))*
|
|
||||||
100 > 5)
|
|
||||||
severity: warning
|
severity: warning
|
||||||
for: 5m
|
for: 5m
|
||||||
- name: Thanos Rule Config Reload Failure
|
- name: Thanos Rule Config Reload Failure
|
||||||
description: Thanos Rule {{$labels.job}} has not been able to reload its configuration.
|
description: 'Thanos Rule {{$labels.job}} has not been able to reload its configuration.'
|
||||||
query: avg by (job, instance) (thanos_rule_config_last_reload_successful{job=~".*thanos-rule.*"})
|
query: 'avg by (job, instance) (thanos_rule_config_last_reload_successful{job=~".*thanos-rule.*"}) != 1'
|
||||||
!= 1
|
|
||||||
severity: info
|
severity: info
|
||||||
for: 5m
|
for: 5m
|
||||||
- name: Thanos Rule Query High D N S Failures
|
- name: Thanos Rule Query High D N S Failures
|
||||||
description: Thanos Rule {{$labels.job}} has {{$value | humanize}}% of failing
|
description: 'Thanos Rule {{$labels.job}} has {{$value | humanize}}% of failing DNS queries for query endpoints.'
|
||||||
DNS queries for query endpoints.
|
query: '(sum by (job, instance) (rate(thanos_rule_query_apis_dns_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) * 100 > 1)'
|
||||||
query: ( sum by (job, instance) (rate(thanos_rule_query_apis_dns_failures_total{job=~".*thanos-rule.*"}[5m]))/ sum
|
|
||||||
by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total{job=~".*thanos-rule.*"}[5m]))*
|
|
||||||
100 > 1)
|
|
||||||
severity: warning
|
severity: warning
|
||||||
for: 15m
|
for: 15m
|
||||||
- name: Thanos Rule Alertmanager High D N S Failures
|
- name: Thanos Rule Alertmanager High D N S Failures
|
||||||
description: Thanos Rule {{$labels.instance}} has {{$value | humanize}}% of failing
|
description: 'Thanos Rule {{$labels.instance}} has {{$value | humanize}}% of failing DNS queries for Alertmanager endpoints.'
|
||||||
DNS queries for Alertmanager endpoints.
|
query: '(sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) * 100 > 1)'
|
||||||
query: ( sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_failures_total{job=~".*thanos-rule.*"}[5m]))/ sum
|
|
||||||
by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=~".*thanos-rule.*"}[5m]))*
|
|
||||||
100 > 1)
|
|
||||||
severity: warning
|
severity: warning
|
||||||
for: 15m
|
for: 15m
|
||||||
- name: Thanos Rule No Evaluation For10 Intervals
|
- name: Thanos Rule No Evaluation For10 Intervals
|
||||||
description: Thanos Rule {{$labels.job}} has rule groups that did not evaluate
|
description: 'Thanos Rule {{$labels.job}} has rule groups that did not evaluate for at least 10x of their expected interval.'
|
||||||
for at least 10x of their expected interval.
|
query: 'time() - max by (job, instance, group) (prometheus_rule_group_last_evaluation_timestamp_seconds{job=~".*thanos-rule.*"})>10 * max by (job, instance, group) (prometheus_rule_group_interval_seconds{job=~".*thanos-rule.*"})'
|
||||||
query: time() - max by (job, instance, group) (prometheus_rule_group_last_evaluation_timestamp_seconds{job=~".*thanos-rule.*"})>10
|
|
||||||
* max by (job, instance, group) (prometheus_rule_group_interval_seconds{job=~".*thanos-rule.*"})
|
|
||||||
severity: info
|
severity: info
|
||||||
for: 5m
|
for: 5m
|
||||||
- name: Thanos No Rule Evaluations
|
- name: Thanos No Rule Evaluations
|
||||||
description: Thanos Rule {{$labels.instance}} did not perform any rule evaluations
|
description: 'Thanos Rule {{$labels.instance}} did not perform any rule evaluations in the past 10 minutes.'
|
||||||
in the past 10 minutes.
|
query: 'sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) <= 0 and sum by (job, instance) (thanos_rule_loaded_rules{job=~".*thanos-rule.*"}) > 0'
|
||||||
query: sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m]))
|
|
||||||
<= 0 and sum by (job, instance) (thanos_rule_loaded_rules{job=~".*thanos-rule.*"})
|
|
||||||
> 0
|
|
||||||
severity: critical
|
severity: critical
|
||||||
for: 5m
|
for: 5m
|
||||||
- name: Thanos Bucket Replicate
|
- name: Thanos Bucket Replicate
|
||||||
slug: thanos-bucket-replicate
|
slug: thanos-bucket-replicate
|
||||||
rules:
|
rules:
|
||||||
- name: Thanos Bucket Replicate Error Rate
|
- name: Thanos Bucket Replicate Error Rate
|
||||||
description: Thanos Replicate is failing to run, {{$value | humanize}}% of attempts
|
description: 'Thanos Replicate is failing to run, {{$value | humanize}}% of attempts failed.'
|
||||||
failed.
|
query: '(sum by (job) (rate(thanos_replicate_replication_runs_total{result="error", job=~".*thanos-bucket-replicate.*"}[5m]))/ on (job) group_left sum by (job) (rate(thanos_replicate_replication_runs_total{job=~".*thanos-bucket-replicate.*"}[5m]))) * 100 >= 10'
|
||||||
query: ( sum by (job) (rate(thanos_replicate_replication_runs_total{result="error",
|
|
||||||
job=~".*thanos-bucket-replicate.*"}[5m]))/ on (job) group_left sum by (job)
|
|
||||||
(rate(thanos_replicate_replication_runs_total{job=~".*thanos-bucket-replicate.*"}[5m])))
|
|
||||||
* 100 >= 10
|
|
||||||
severity: critical
|
severity: critical
|
||||||
for: 5m
|
for: 5m
|
||||||
- name: Thanos Bucket Replicate Run Latency
|
- name: Thanos Bucket Replicate Run Latency
|
||||||
description: Thanos Replicate {{$labels.job}} has a 99th percentile latency of
|
description: 'Thanos Replicate {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for the replicate operations.'
|
||||||
{{$value}} seconds for the replicate operations.
|
query: '(histogram_quantile(0.99, sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m]))) > 20 and sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m])) > 0)'
|
||||||
query: ( histogram_quantile(0.99, sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m])))
|
|
||||||
> 20 and sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m]))
|
|
||||||
> 0)
|
|
||||||
severity: critical
|
severity: critical
|
||||||
for: 5m
|
for: 5m
|
||||||
- name: Thanos Component Absent
|
- name: Thanos Component Absent
|
||||||
slug: thanos-component-absent
|
slug: thanos-component-absent
|
||||||
rules:
|
rules:
|
||||||
- name: Thanos Compact Is Down
|
- name: Thanos Compact Is Down
|
||||||
description: ThanosCompact has disappeared. Prometheus target for the component
|
description: 'ThanosCompact has disappeared. Prometheus target for the component cannot be discovered.'
|
||||||
cannot be discovered.
|
query: 'absent(up{job=~".*thanos-compact.*"} == 1)'
|
||||||
query: absent(up{job=~".*thanos-compact.*"} == 1)
|
|
||||||
severity: critical
|
severity: critical
|
||||||
for: 5m
|
for: 5m
|
||||||
- name: Thanos Query Is Down
|
- name: Thanos Query Is Down
|
||||||
description: ThanosQuery has disappeared. Prometheus target for the component
|
description: 'ThanosQuery has disappeared. Prometheus target for the component cannot be discovered.'
|
||||||
cannot be discovered.
|
query: 'absent(up{job=~".*thanos-query.*"} == 1)'
|
||||||
query: absent(up{job=~".*thanos-query.*"} == 1)
|
|
||||||
severity: critical
|
severity: critical
|
||||||
for: 5m
|
for: 5m
|
||||||
- name: Thanos Receive Is Down
|
- name: Thanos Receive Is Down
|
||||||
description: ThanosReceive has disappeared. Prometheus target for the component
|
description: 'ThanosReceive has disappeared. Prometheus target for the component cannot be discovered.'
|
||||||
cannot be discovered.
|
query: 'absent(up{job=~".*thanos-receive.*"} == 1)'
|
||||||
query: absent(up{job=~".*thanos-receive.*"} == 1)
|
|
||||||
severity: critical
|
severity: critical
|
||||||
for: 5m
|
for: 5m
|
||||||
- name: Thanos Rule Is Down
|
- name: Thanos Rule Is Down
|
||||||
description: ThanosRule has disappeared. Prometheus target for the component cannot
|
description: 'ThanosRule has disappeared. Prometheus target for the component cannot be discovered.'
|
||||||
be discovered.
|
query: 'absent(up{job=~".*thanos-rule.*"} == 1)'
|
||||||
query: absent(up{job=~".*thanos-rule.*"} == 1)
|
|
||||||
severity: critical
|
severity: critical
|
||||||
for: 5m
|
for: 5m
|
||||||
- name: Thanos Sidecar Is Down
|
- name: Thanos Sidecar Is Down
|
||||||
description: ThanosSidecar has disappeared. Prometheus target for the component
|
description: 'ThanosSidecar has disappeared. Prometheus target for the component cannot be discovered.'
|
||||||
cannot be discovered.
|
query: 'absent(up{job=~".*thanos-sidecar.*"} == 1)'
|
||||||
query: absent(up{job=~".*thanos-sidecar.*"} == 1)
|
|
||||||
severity: critical
|
severity: critical
|
||||||
for: 5m
|
for: 5m
|
||||||
- name: Thanos Store Is Down
|
- name: Thanos Store Is Down
|
||||||
description: ThanosStore has disappeared. Prometheus target for the component
|
description: 'ThanosStore has disappeared. Prometheus target for the component cannot be discovered.'
|
||||||
cannot be discovered.
|
|
||||||
query: absent(up{job=~".*thanos-store.*"} == 1)
|
query: absent(up{job=~".*thanos-store.*"} == 1)
|
||||||
severity: critical
|
severity: critical
|
||||||
for: 5m
|
for: 5m
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue