mirror of
https://github.com/samber/awesome-prometheus-alerts.git
synced 2026-06-26 11:27:00 +08:00
Adding more rules for Thanos Components Monitoring
This commit is contained in:
parent
293aba1437
commit
e7db9b5b25
1 changed files with 354 additions and 14 deletions
368
_data/rules.yml
368
_data/rules.yml
|
|
@ -2269,21 +2269,361 @@ groups:
|
||||||
services:
|
services:
|
||||||
- name: Thanos
|
- name: Thanos
|
||||||
exporters:
|
exporters:
|
||||||
- name: Embedded exporter
|
- name: Thanos Compactor
|
||||||
slug: embedded-exporter
|
slug: thanos-compactor
|
||||||
rules:
|
rules:
|
||||||
- name: Thanos compaction halted
|
- name: Thanos Compactor Multiple Running
|
||||||
description: Thanos compaction has failed to run and is now halted.
|
description: No more than one Thanos Compact instance should be running at once.
|
||||||
query: 'thanos_compact_halted == 1'
|
There are {{$value}} instances running.
|
||||||
severity: critical
|
query: sum by (job) (up{job=~".*thanos-compact.*"}) > 1
|
||||||
- name: Thanos compact bucket operation failure
|
severity: warning
|
||||||
description: Thanos compaction has failing storage operations
|
for: 5m
|
||||||
query: 'rate(thanos_objstore_bucket_operation_failures_total[1m]) > 0'
|
- name: Thanos Compactor Halted
|
||||||
severity: critical
|
description: Thanos Compact {{$labels.job}} has failed to run and now is halted.
|
||||||
- name: Thanos compact not run
|
query: thanos_compact_halted{job=~".*thanos-compact.*"} == 1
|
||||||
description: Thanos compaction has not run in 24 hours.
|
severity: warning
|
||||||
query: '(time() - thanos_objstore_bucket_last_successful_upload_time) > 24*60*60'
|
for: 5m
|
||||||
severity: critical
|
- name: Thanos Compactor High Compaction Failures
|
||||||
|
description: Thanos Compact {{$labels.job}} is failing to execute {{$value | humanize}}%
|
||||||
|
of compactions.
|
||||||
|
query: ( sum by (job) (rate(thanos_compact_group_compactions_failures_total{job=~".*thanos-compact.*"}[5m]))/ sum
|
||||||
|
by (job) (rate(thanos_compact_group_compactions_total{job=~".*thanos-compact.*"}[5m]))*
|
||||||
|
100 > 5)
|
||||||
|
severity: warning
|
||||||
|
for: 15m
|
||||||
|
- name: Thanos Compact Bucket High Operation Failures
|
||||||
|
description: Thanos Compact {{$labels.job}} Bucket is failing to execute {{$value
|
||||||
|
| humanize}}% of operations.
|
||||||
|
query: ( sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-compact.*"}[5m]))/ sum
|
||||||
|
by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-compact.*"}[5m]))*
|
||||||
|
100 > 5)
|
||||||
|
severity: warning
|
||||||
|
for: 15m
|
||||||
|
- name: Thanos Compact Has Not Run
|
||||||
|
description: Thanos Compact {{$labels.job}} has not uploaded anything for 24 hours.
|
||||||
|
query: (time() - max by (job) (max_over_time(thanos_objstore_bucket_last_successful_upload_time{job=~".*thanos-compact.*"}[24h])))
|
||||||
|
/ 60 / 60 > 24
|
||||||
|
severity: warning
|
||||||
|
for: 0m
|
||||||
|
- name: Thanos Query
|
||||||
|
slug: thanos-query
|
||||||
|
rules:
|
||||||
|
- name: Thanos Query Http Request Query Error Rate High
|
||||||
|
description: Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}%
|
||||||
|
of "query" requests.
|
||||||
|
query: ( sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*",
|
||||||
|
handler="query"}[5m]))/ sum by (job) (rate(http_requests_total{job=~".*thanos-query.*",
|
||||||
|
handler="query"}[5m]))) * 100 > 5
|
||||||
|
severity: critical
|
||||||
|
for: 5m
|
||||||
|
- name: Thanos Query Http Request Query Range Error Rate High
|
||||||
|
description: Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}%
|
||||||
|
of "query_range" requests.
|
||||||
|
query: ( sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*",
|
||||||
|
handler="query_range"}[5m]))/ sum by (job) (rate(http_requests_total{job=~".*thanos-query.*",
|
||||||
|
handler="query_range"}[5m]))) * 100 > 5
|
||||||
|
severity: critical
|
||||||
|
for: 5m
|
||||||
|
- name: Thanos Query Grpc Server Error Rate
|
||||||
|
description: Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}%
|
||||||
|
of requests.
|
||||||
|
query: ( sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded",
|
||||||
|
job=~".*thanos-query.*"}[5m]))/ sum by (job) (rate(grpc_server_started_total{job=~".*thanos-query.*"}[5m]))*
|
||||||
|
100 > 5)
|
||||||
|
severity: warning
|
||||||
|
for: 5m
|
||||||
|
- name: Thanos Query Grpc Client Error Rate
|
||||||
|
description: Thanos Query {{$labels.job}} is failing to send {{$value | humanize}}%
|
||||||
|
of requests.
|
||||||
|
query: ( sum by (job) (rate(grpc_client_handled_total{grpc_code!="OK", job=~".*thanos-query.*"}[5m]))/ sum
|
||||||
|
by (job) (rate(grpc_client_started_total{job=~".*thanos-query.*"}[5m]))) * 100
|
||||||
|
> 5
|
||||||
|
severity: warning
|
||||||
|
for: 5m
|
||||||
|
- name: Thanos Query High DNS Failures
|
||||||
|
description: Thanos Query {{$labels.job}} has {{$value | humanize}}% of failing
|
||||||
|
DNS queries for store endpoints.
|
||||||
|
query: ( sum by (job) (rate(thanos_query_store_apis_dns_failures_total{job=~".*thanos-query.*"}[5m]))/ sum
|
||||||
|
by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~".*thanos-query.*"}[5m])))
|
||||||
|
* 100 > 1
|
||||||
|
severity: warning
|
||||||
|
for: 15m
|
||||||
|
- name: Thanos Query Instant Latency High
|
||||||
|
description: Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}}
|
||||||
|
seconds for instant queries.
|
||||||
|
query: ( histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*",
|
||||||
|
handler="query"}[5m]))) > 40 and sum by (job) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*",
|
||||||
|
handler="query"}[5m])) > 0)
|
||||||
|
severity: critical
|
||||||
|
for: 10m
|
||||||
|
- name: Thanos Query Range Latency High
|
||||||
|
description: Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}}
|
||||||
|
seconds for range queries.
|
||||||
|
query: ( histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*",
|
||||||
|
handler="query_range"}[5m]))) > 90 and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-query.*",
|
||||||
|
handler="query_range"}[5m])) > 0)
|
||||||
|
severity: critical
|
||||||
|
for: 10m
|
||||||
|
- name: Thanos Query Overload
|
||||||
|
description: Thanos Query {{$labels.job}} has been overloaded for more than 15
|
||||||
|
minutes. This may be a symptom of excessive simultaneous complex requests, low
|
||||||
|
performance of the Prometheus API, or failures within these components. Assess
|
||||||
|
the health of the Thanos query instances, the connected Prometheus instances,
|
||||||
|
look for potential senders of these requests and then contact support.
|
||||||
|
query: "( max_over_time(thanos_query_concurrent_gate_queries_max[5m]) - avg_over_time(thanos_query_concurrent_gate_queries_in_flight[5m])
|
||||||
|
< 1)"
|
||||||
|
severity: warning
|
||||||
|
for: 15m
|
||||||
|
- name: Thanos Receiver
|
||||||
|
slug: thanos-receiver
|
||||||
|
rules:
|
||||||
|
- name: Thanos Receive Http Request Error Rate High
|
||||||
|
description: Thanos Receive {{$labels.job}} is failing to handle {{$value | humanize}}%
|
||||||
|
of requests.
|
||||||
|
query: ( sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-receive.*",
|
||||||
|
handler="receive"}[5m]))/ sum by (job) (rate(http_requests_total{job=~".*thanos-receive.*",
|
||||||
|
handler="receive"}[5m]))) * 100 > 5
|
||||||
|
severity: critical
|
||||||
|
for: 5m
|
||||||
|
- name: Thanos Receive Http Request Latency High
|
||||||
|
description: Thanos Receive {{$labels.job}} has a 99th percentile latency of {{
|
||||||
|
$value }} seconds for requests.
|
||||||
|
query: ( histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-receive.*",
|
||||||
|
handler="receive"}[5m]))) > 10 and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-receive.*",
|
||||||
|
handler="receive"}[5m])) > 0)
|
||||||
|
severity: critical
|
||||||
|
for: 10m
|
||||||
|
- name: Thanos Receive High Replication Failures
|
||||||
|
description: Thanos Receive {{$labels.job}} is failing to replicate {{$value |
|
||||||
|
humanize}}% of requests.
|
||||||
|
query: thanos_receive_replication_factor > 1 and( ( sum by (job) (rate(thanos_receive_replications_total{result="error",
|
||||||
|
job=~".*thanos-receive.*"}[5m])) / sum by (job) (rate(thanos_receive_replications_total{job=~".*thanos-receive.*"}[5m])) ) > ( max
|
||||||
|
by (job) (floor((thanos_receive_replication_factor{job=~".*thanos-receive.*"}+1)
|
||||||
|
/ 2)) / max by (job) (thanos_receive_hashring_nodes{job=~".*thanos-receive.*"}) ))
|
||||||
|
* 100
|
||||||
|
severity: warning
|
||||||
|
for: 5m
|
||||||
|
- name: Thanos Receive High Forward Request Failures
|
||||||
|
description: Thanos Receive {{$labels.job}} is failing to forward {{$value | humanize}}%
|
||||||
|
of requests.
|
||||||
|
query: ( sum by (job) (rate(thanos_receive_forward_requests_total{result="error",
|
||||||
|
job=~".*thanos-receive.*"}[5m]))/ sum by (job) (rate(thanos_receive_forward_requests_total{job=~".*thanos-receive.*"}[5m])))
|
||||||
|
* 100 > 20
|
||||||
|
severity: info
|
||||||
|
for: 5m
|
||||||
|
- name: Thanos Receive High Hashring File Refresh Failures
|
||||||
|
description: Thanos Receive {{$labels.job}} is failing to refresh hashring file,
|
||||||
|
{{$value | humanize}} of attempts failed.
|
||||||
|
query: ( sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~".*thanos-receive.*"}[5m]))/ sum
|
||||||
|
by (job) (rate(thanos_receive_hashrings_file_refreshes_total{job=~".*thanos-receive.*"}[5m]))>
|
||||||
|
0)
|
||||||
|
severity: warning
|
||||||
|
for: 15m
|
||||||
|
- name: Thanos Receive Config Reload Failure
|
||||||
|
description: Thanos Receive {{$labels.job}} has not been able to reload hashring
|
||||||
|
configurations.
|
||||||
|
query: avg by (job) (thanos_receive_config_last_reload_successful{job=~".*thanos-receive.*"})
|
||||||
|
!= 1
|
||||||
|
severity: warning
|
||||||
|
for: 5m
|
||||||
|
- name: Thanos Receive No Upload
|
||||||
|
description: Thanos Receive {{$labels.instance}} has not uploaded latest data
|
||||||
|
to object storage.
|
||||||
|
query: '(up{job=~".*thanos-receive.*"} - 1) + on (job, instance)
|
||||||
|
(sum by (job, instance) (increase(thanos_shipper_uploads_total{job=~".*thanos-receive.*"}[3h]))
|
||||||
|
== 0) # filters to only alert on current instance last 3h'
|
||||||
|
severity: critical
|
||||||
|
for: 3h
|
||||||
|
- name: Thanos Sidecar
|
||||||
|
slug: thanos-sidecar
|
||||||
|
rules:
|
||||||
|
- name: Thanos Sidecar Bucket Operations Failed
|
||||||
|
description: Thanos Sidecar {{$labels.instance}} bucket operations are failing
|
||||||
|
query: sum by (job, instance) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-sidecar.*"}[5m]))
|
||||||
|
> 0
|
||||||
|
severity: critical
|
||||||
|
for: 5m
|
||||||
|
- name: Thanos Sidecar No Connection To Started Prometheus
|
||||||
|
description: Thanos Sidecar {{$labels.instance}} is unhealthy.
|
||||||
|
query: thanos_sidecar_prometheus_up{job=~".*thanos-sidecar.*"} == 0 AND on (namespace,
|
||||||
|
pod)prometheus_tsdb_data_replay_duration_seconds != 0
|
||||||
|
severity: critical
|
||||||
|
for: 5m
|
||||||
|
- name: Thanos Store
|
||||||
|
slug: thanos-store
|
||||||
|
rules:
|
||||||
|
- name: Thanos Store Grpc Error Rate
|
||||||
|
description: Thanos Store {{$labels.job}} is failing to handle {{$value | humanize}}%
|
||||||
|
of requests.
|
||||||
|
query: ( sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded",
|
||||||
|
job=~".*thanos-store.*"}[5m]))/ sum by (job) (rate(grpc_server_started_total{job=~".*thanos-store.*"}[5m]))*
|
||||||
|
100 > 5)
|
||||||
|
severity: warning
|
||||||
|
for: 5m
|
||||||
|
- name: Thanos Store Series Gate Latency High
|
||||||
|
description: Thanos Store {{$labels.job}} has a 99th percentile latency of {{$value}}
|
||||||
|
seconds for store series gate requests.
|
||||||
|
query: ( histogram_quantile(0.99, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{job=~".*thanos-store.*"}[5m])))
|
||||||
|
> 2 and sum by (job) (rate(thanos_bucket_store_series_gate_duration_seconds_count{job=~".*thanos-store.*"}[5m]))
|
||||||
|
> 0)
|
||||||
|
severity: warning
|
||||||
|
for: 10m
|
||||||
|
- name: Thanos Store Bucket High Operation Failures
|
||||||
|
description: Thanos Store {{$labels.job}} Bucket is failing to execute {{$value
|
||||||
|
| humanize}}% of operations.
|
||||||
|
query: ( sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-store.*"}[5m]))/ sum
|
||||||
|
by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-store.*"}[5m]))*
|
||||||
|
100 > 5)
|
||||||
|
severity: warning
|
||||||
|
for: 15m
|
||||||
|
- name: Thanos Store Objstore Operation Latency High
|
||||||
|
description: Thanos Store {{$labels.job}} Bucket has a 99th percentile latency
|
||||||
|
of {{$value}} seconds for the bucket operations.
|
||||||
|
query: ( histogram_quantile(0.99, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~".*thanos-store.*"}[5m])))
|
||||||
|
> 2 and sum by (job) (rate(thanos_objstore_bucket_operation_duration_seconds_count{job=~".*thanos-store.*"}[5m]))
|
||||||
|
> 0)
|
||||||
|
severity: warning
|
||||||
|
for: 10m
|
||||||
|
- name: Thanos Ruler
|
||||||
|
slug: thanos-ruler
|
||||||
|
rules:
|
||||||
|
- name: Thanos Rule Queue Is Dropping Alerts
|
||||||
|
description: Thanos Rule {{$labels.instance}} is failing to queue alerts.
|
||||||
|
query: sum by (job, instance) (rate(thanos_alert_queue_alerts_dropped_total{job=~".*thanos-rule.*"}[5m]))
|
||||||
|
> 0
|
||||||
|
severity: critical
|
||||||
|
for: 5m
|
||||||
|
- name: Thanos Rule Sender Is Failing Alerts
|
||||||
|
description: Thanos Rule {{$labels.instance}} is failing to send alerts to alertmanager.
|
||||||
|
query: sum by (job, instance) (rate(thanos_alert_sender_alerts_dropped_total{job=~".*thanos-rule.*"}[5m]))
|
||||||
|
> 0
|
||||||
|
severity: critical
|
||||||
|
for: 5m
|
||||||
|
- name: Thanos Rule High Rule Evaluation Failures
|
||||||
|
description: Thanos Rule {{$labels.instance}} is failing to evaluate rules.
|
||||||
|
query: ( sum by (job, instance) (rate(prometheus_rule_evaluation_failures_total{job=~".*thanos-rule.*"}[5m]))/ sum
|
||||||
|
by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m]))*
|
||||||
|
100 > 5)
|
||||||
|
severity: critical
|
||||||
|
for: 5m
|
||||||
|
- name: Thanos Rule High Rule Evaluation Warnings
|
||||||
|
description: Thanos Rule {{$labels.instance}} has high number of evaluation warnings.
|
||||||
|
query: sum by (job, instance) (rate(thanos_rule_evaluation_with_warnings_total{job=~".*thanos-rule.*"}[5m]))
|
||||||
|
> 0
|
||||||
|
severity: info
|
||||||
|
for: 15m
|
||||||
|
- name: Thanos Rule Rule Evaluation Latency High
|
||||||
|
description: Thanos Rule {{$labels.instance}} has higher evaluation latency than
|
||||||
|
interval for {{$labels.rule_group}}.
|
||||||
|
query: ( sum by (job, instance, rule_group) (prometheus_rule_group_last_duration_seconds{job=~".*thanos-rule.*"})> sum
|
||||||
|
by (job, instance, rule_group) (prometheus_rule_group_interval_seconds{job=~".*thanos-rule.*"}))
|
||||||
|
severity: warning
|
||||||
|
for: 5m
|
||||||
|
- name: Thanos Rule Grpc Error Rate
|
||||||
|
description: Thanos Rule {{$labels.job}} is failing to handle {{$value | humanize}}%
|
||||||
|
of requests.
|
||||||
|
query: ( sum by (job, instance) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded",
|
||||||
|
job=~".*thanos-rule.*"}[5m]))/ sum by (job, instance) (rate(grpc_server_started_total{job=~".*thanos-rule.*"}[5m]))*
|
||||||
|
100 > 5)
|
||||||
|
severity: warning
|
||||||
|
for: 5m
|
||||||
|
- name: Thanos Rule Config Reload Failure
|
||||||
|
description: Thanos Rule {{$labels.job}} has not been able to reload its configuration.
|
||||||
|
query: avg by (job, instance) (thanos_rule_config_last_reload_successful{job=~".*thanos-rule.*"})
|
||||||
|
!= 1
|
||||||
|
severity: info
|
||||||
|
for: 5m
|
||||||
|
- name: Thanos Rule Query High DNS Failures
|
||||||
|
description: Thanos Rule {{$labels.job}} has {{$value | humanize}}% of failing
|
||||||
|
DNS queries for query endpoints.
|
||||||
|
query: ( sum by (job, instance) (rate(thanos_rule_query_apis_dns_failures_total{job=~".*thanos-rule.*"}[5m]))/ sum
|
||||||
|
by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total{job=~".*thanos-rule.*"}[5m]))*
|
||||||
|
100 > 1)
|
||||||
|
severity: warning
|
||||||
|
for: 15m
|
||||||
|
- name: Thanos Rule Alertmanager High DNS Failures
|
||||||
|
description: Thanos Rule {{$labels.instance}} has {{$value | humanize}}% of failing
|
||||||
|
DNS queries for Alertmanager endpoints.
|
||||||
|
query: ( sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_failures_total{job=~".*thanos-rule.*"}[5m]))/ sum
|
||||||
|
by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=~".*thanos-rule.*"}[5m]))*
|
||||||
|
100 > 1)
|
||||||
|
severity: warning
|
||||||
|
for: 15m
|
||||||
|
- name: Thanos Rule No Evaluation For 10 Intervals
|
||||||
|
description: Thanos Rule {{$labels.job}} has rule groups that did not evaluate
|
||||||
|
for at least 10x of their expected interval.
|
||||||
|
query: time() - max by (job, instance, group) (prometheus_rule_group_last_evaluation_timestamp_seconds{job=~".*thanos-rule.*"})>10
|
||||||
|
* max by (job, instance, group) (prometheus_rule_group_interval_seconds{job=~".*thanos-rule.*"})
|
||||||
|
severity: info
|
||||||
|
for: 5m
|
||||||
|
- name: Thanos No Rule Evaluations
|
||||||
|
description: Thanos Rule {{$labels.instance}} did not perform any rule evaluations
|
||||||
|
in the past 10 minutes.
|
||||||
|
query: sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m]))
|
||||||
|
<= 0 and sum by (job, instance) (thanos_rule_loaded_rules{job=~".*thanos-rule.*"})
|
||||||
|
> 0
|
||||||
|
severity: critical
|
||||||
|
for: 5m
|
||||||
|
- name: Thanos Bucket Replicate
|
||||||
|
slug: thanos-bucket-replicate
|
||||||
|
rules:
|
||||||
|
- name: Thanos Bucket Replicate Error Rate
|
||||||
|
description: Thanos Replicate is failing to run, {{$value | humanize}}% of attempts
|
||||||
|
failed.
|
||||||
|
query: ( sum by (job) (rate(thanos_replicate_replication_runs_total{result="error",
|
||||||
|
job=~".*thanos-bucket-replicate.*"}[5m]))/ on (job) group_left sum by (job)
|
||||||
|
(rate(thanos_replicate_replication_runs_total{job=~".*thanos-bucket-replicate.*"}[5m])))
|
||||||
|
* 100 >= 10
|
||||||
|
severity: critical
|
||||||
|
for: 5m
|
||||||
|
- name: Thanos Bucket Replicate Run Latency
|
||||||
|
description: Thanos Replicate {{$labels.job}} has a 99th percentile latency of
|
||||||
|
{{$value}} seconds for the replicate operations.
|
||||||
|
query: ( histogram_quantile(0.99, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m])))
|
||||||
|
> 20 and sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m]))
|
||||||
|
> 0)
|
||||||
|
severity: critical
|
||||||
|
for: 5m
|
||||||
|
- name: Thanos Component Absent
|
||||||
|
slug: thanos-component-absent
|
||||||
|
rules:
|
||||||
|
- name: Thanos Compact Is Down
|
||||||
|
description: ThanosCompact has disappeared. Prometheus target for the component
|
||||||
|
cannot be discovered.
|
||||||
|
query: absent(up{job=~".*thanos-compact.*"} == 1)
|
||||||
|
severity: critical
|
||||||
|
for: 5m
|
||||||
|
- name: Thanos Query Is Down
|
||||||
|
description: ThanosQuery has disappeared. Prometheus target for the component
|
||||||
|
cannot be discovered.
|
||||||
|
query: absent(up{job=~".*thanos-query.*"} == 1)
|
||||||
|
severity: critical
|
||||||
|
for: 5m
|
||||||
|
- name: Thanos Receive Is Down
|
||||||
|
description: ThanosReceive has disappeared. Prometheus target for the component
|
||||||
|
cannot be discovered.
|
||||||
|
query: absent(up{job=~".*thanos-receive.*"} == 1)
|
||||||
|
severity: critical
|
||||||
|
for: 5m
|
||||||
|
- name: Thanos Rule Is Down
|
||||||
|
description: ThanosRule has disappeared. Prometheus target for the component cannot
|
||||||
|
be discovered.
|
||||||
|
query: absent(up{job=~".*thanos-rule.*"} == 1)
|
||||||
|
severity: critical
|
||||||
|
for: 5m
|
||||||
|
- name: Thanos Sidecar Is Down
|
||||||
|
description: ThanosSidecar has disappeared. Prometheus target for the component
|
||||||
|
cannot be discovered.
|
||||||
|
query: absent(up{job=~".*thanos-sidecar.*"} == 1)
|
||||||
|
severity: critical
|
||||||
|
for: 5m
|
||||||
|
- name: Thanos Store Is Down
|
||||||
|
description: ThanosStore has disappeared. Prometheus target for the component
|
||||||
|
cannot be discovered.
|
||||||
|
query: absent(up{job=~".*thanos-store.*"} == 1)
|
||||||
|
severity: critical
|
||||||
|
for: 5m
|
||||||
|
|
||||||
- name: Loki
|
- name: Loki
|
||||||
exporters:
|
exporters:
|
||||||
- name: Embedded exporter
|
- name: Embedded exporter
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue