mirror of
https://github.com/samber/awesome-prometheus-alerts.git
synced 2026-06-26 19:37:27 +08:00
lint
This commit is contained in:
parent
f0798b42ba
commit
f042883cf2
1 changed file with 8 additions and 8 deletions
|
|
@@ -2327,12 +2327,12 @@ groups:
|
||||||
for: 15m
|
for: 15m
|
||||||
- name: Thanos Query Instant Latency High
|
- name: Thanos Query Instant Latency High
|
||||||
description: 'Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for instant queries.'
|
description: 'Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for instant queries.'
|
||||||
query: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m]))) > 40and sum by (job) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m])) > 0)'
|
query: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m]))) > 40 and sum by (job) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m])) > 0)'
|
||||||
severity: critical
|
severity: critical
|
||||||
for: 10m
|
for: 10m
|
||||||
- name: Thanos Query Range Latency High
|
- name: Thanos Query Range Latency High
|
||||||
description: 'Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for range queries.'
|
description: 'Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for range queries.'
|
||||||
query: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query_range"}[5m]))) > 90and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-query.*", handler="query_range"}[5m])) > 0)'
|
query: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query_range"}[5m]))) > 90 and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-query.*", handler="query_range"}[5m])) > 0)'
|
||||||
severity: critical
|
severity: critical
|
||||||
for: 10m
|
for: 10m
|
||||||
- name: Thanos Query Overload
|
- name: Thanos Query Overload
|
||||||
|
|
@@ -2350,12 +2350,12 @@ groups:
|
||||||
for: 5m
|
for: 5m
|
||||||
- name: Thanos Receive Http Request Latency High
|
- name: Thanos Receive Http Request Latency High
|
||||||
description: 'Thanos Receive {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for requests.'
|
description: 'Thanos Receive {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for requests.'
|
||||||
query: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-receive.*", handler="receive"}[5m]))) > 10and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-receive.*", handler="receive"}[5m])) > 0)'
|
query: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-receive.*", handler="receive"}[5m]))) > 10 and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-receive.*", handler="receive"}[5m])) > 0)'
|
||||||
severity: critical
|
severity: critical
|
||||||
for: 10m
|
for: 10m
|
||||||
- name: Thanos Receive High Replication Failures
|
- name: Thanos Receive High Replication Failures
|
||||||
description: 'Thanos Receive {{$labels.job}} is failing to replicate {{$value | humanize}}% of requests.'
|
description: 'Thanos Receive {{$labels.job}} is failing to replicate {{$value | humanize}}% of requests.'
|
||||||
query: 'thanos_receive_replication_factor > 1 and ((sum by (job) (rate(thanos_receive_replications_total{result="error", job=~".*thanos-receive.*"}[5m])) / sum by (job) (rate(thanos_receive_replications_total{job=~".*thanos-receive.*"}[5m]))) > (max by (job) (floor((thanos_receive_replication_factor{job=~".*thanos-receive.*"}+1)/ 2)) / max by (job) (thanos_receive_hashring_nodes{job=~".*thanos-receive.*"}))) * 100'
|
query: 'thanos_receive_replication_factor > 1 and ((sum by (job) (rate(thanos_receive_replications_total{result="error", job=~".*thanos-receive.*"}[5m])) / sum by (job) (rate(thanos_receive_replications_total{job=~".*thanos-receive.*"}[5m]))) > (max by (job) (floor((thanos_receive_replication_factor{job=~".*thanos-receive.*"}+1)/ 2)) / max by (job) (thanos_receive_hashring_nodes{job=~".*thanos-receive.*"}))) * 100'
|
||||||
severity: warning
|
severity: warning
|
||||||
for: 5m
|
for: 5m
|
||||||
- name: Thanos Receive High Forward Request Failures
|
- name: Thanos Receive High Forward Request Failures
|
||||||
|
|
@@ -2388,7 +2388,7 @@ groups:
|
||||||
for: 5m
|
for: 5m
|
||||||
- name: Thanos Sidecar No Connection To Started Prometheus
|
- name: Thanos Sidecar No Connection To Started Prometheus
|
||||||
description: 'Thanos Sidecar {{$labels.instance}} is unhealthy.'
|
description: 'Thanos Sidecar {{$labels.instance}} is unhealthy.'
|
||||||
query: 'thanos_sidecar_prometheus_up{job=~".*thanos-sidecar.*"} == 0AND on (namespace, pod)prometheus_tsdb_data_replay_duration_seconds != 0'
|
query: 'thanos_sidecar_prometheus_up{job=~".*thanos-sidecar.*"} == 0 and on (namespace, pod)prometheus_tsdb_data_replay_duration_seconds != 0'
|
||||||
severity: critical
|
severity: critical
|
||||||
for: 5m
|
for: 5m
|
||||||
- name: Thanos Store
|
- name: Thanos Store
|
||||||
|
|
@@ -2411,7 +2411,7 @@ groups:
|
||||||
for: 15m
|
for: 15m
|
||||||
- name: Thanos Store Objstore Operation Latency High
|
- name: Thanos Store Objstore Operation Latency High
|
||||||
description: 'Thanos Store {{$labels.job}} Bucket has a 99th percentile latency of {{$value}} seconds for the bucket operations.'
|
description: 'Thanos Store {{$labels.job}} Bucket has a 99th percentile latency of {{$value}} seconds for the bucket operations.'
|
||||||
query: '(histogram_quantile(0.99, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~".*thanos-store.*"}[5m]))) > 2and sum by (job) (rate(thanos_objstore_bucket_operation_duration_seconds_count{job=~".*thanos-store.*"}[5m])) > 0)'
|
query: '(histogram_quantile(0.99, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~".*thanos-store.*"}[5m]))) > 2 and sum by (job) (rate(thanos_objstore_bucket_operation_duration_seconds_count{job=~".*thanos-store.*"}[5m])) > 0)'
|
||||||
severity: warning
|
severity: warning
|
||||||
for: 10m
|
for: 10m
|
||||||
- name: Thanos Ruler
|
- name: Thanos Ruler
|
||||||
|
|
@@ -2469,7 +2469,7 @@ groups:
|
||||||
for: 5m
|
for: 5m
|
||||||
- name: Thanos No Rule Evaluations
|
- name: Thanos No Rule Evaluations
|
||||||
description: 'Thanos Rule {{$labels.instance}} did not perform any rule evaluations in the past 10 minutes.'
|
description: 'Thanos Rule {{$labels.instance}} did not perform any rule evaluations in the past 10 minutes.'
|
||||||
query: 'sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) <= 0 andsum by (job, instance) (thanos_rule_loaded_rules{job=~".*thanos-rule.*"}) > 0'
|
query: 'sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) <= 0 and sum by (job, instance) (thanos_rule_loaded_rules{job=~".*thanos-rule.*"}) > 0'
|
||||||
severity: critical
|
severity: critical
|
||||||
for: 5m
|
for: 5m
|
||||||
- name: Thanos Bucket Replicate
|
- name: Thanos Bucket Replicate
|
||||||
|
|
@@ -2482,7 +2482,7 @@ groups:
|
||||||
for: 5m
|
for: 5m
|
||||||
- name: Thanos Bucket Replicate Run Latency
|
- name: Thanos Bucket Replicate Run Latency
|
||||||
description: 'Thanos Replicate {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for the replicate operations.'
|
description: 'Thanos Replicate {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for the replicate operations.'
|
||||||
query: '(histogram_quantile(0.99, sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m]))) > 20and sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m])) > 0)'
|
query: '(histogram_quantile(0.99, sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m]))) > 20 and sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m])) > 0)'
|
||||||
severity: critical
|
severity: critical
|
||||||
for: 5m
|
for: 5m
|
||||||
- name: Thanos Component Absent
|
- name: Thanos Component Absent
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue