diff --git a/_data/rules.yml b/_data/rules.yml index e3d7484..c3780d0 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -2327,12 +2327,12 @@ groups: for: 15m - name: Thanos Query Instant Latency High description: 'Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for instant queries.' - query: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m]))) > 40and sum by (job) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m])) > 0)' + query: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m]))) > 40 and sum by (job) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m])) > 0)' severity: critical for: 10m - name: Thanos Query Range Latency High description: 'Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for range queries.' - query: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query_range"}[5m]))) > 90and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-query.*", handler="query_range"}[5m])) > 0)' + query: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query_range"}[5m]))) > 90 and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-query.*", handler="query_range"}[5m])) > 0)' severity: critical for: 10m - name: Thanos Query Overload @@ -2350,12 +2350,12 @@ groups: for: 5m - name: Thanos Receive Http Request Latency High description: 'Thanos Receive {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for requests.' - query: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-receive.*", handler="receive"}[5m]))) > 10and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-receive.*", handler="receive"}[5m])) > 0)' + query: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-receive.*", handler="receive"}[5m]))) > 10 and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-receive.*", handler="receive"}[5m])) > 0)' severity: critical for: 10m - name: Thanos Receive High Replication Failures description: 'Thanos Receive {{$labels.job}} is failing to replicate {{$value | humanize}}% of requests.' - query: 'thanos_receive_replication_factor > 1 and ((sum by (job) (rate(thanos_receive_replications_total{result="error", job=~".*thanos-receive.*"}[5m])) / sum by (job) (rate(thanos_receive_replications_total{job=~".*thanos-receive.*"}[5m]))) > (max by (job) (floor((thanos_receive_replication_factor{job=~".*thanos-receive.*"}+1)/ 2)) / max by (job) (thanos_receive_hashring_nodes{job=~".*thanos-receive.*"}))) * 100' + query: 'thanos_receive_replication_factor > 1 and ((sum by (job) (rate(thanos_receive_replications_total{result="error", job=~".*thanos-receive.*"}[5m])) / sum by (job) (rate(thanos_receive_replications_total{job=~".*thanos-receive.*"}[5m]))) > (max by (job) (floor((thanos_receive_replication_factor{job=~".*thanos-receive.*"}+1)/ 2)) / max by (job) (thanos_receive_hashring_nodes{job=~".*thanos-receive.*"}))) * 100' severity: warning for: 5m - name: Thanos Receive High Forward Request Failures @@ -2388,7 +2388,7 @@ groups: for: 5m - name: Thanos Sidecar No Connection To Started Prometheus description: 'Thanos Sidecar {{$labels.instance}} is unhealthy.' - query: 'thanos_sidecar_prometheus_up{job=~".*thanos-sidecar.*"} == 0AND on (namespace, pod)prometheus_tsdb_data_replay_duration_seconds != 0' + query: 'thanos_sidecar_prometheus_up{job=~".*thanos-sidecar.*"} == 0 and on (namespace, pod)prometheus_tsdb_data_replay_duration_seconds != 0' severity: critical for: 5m - name: Thanos Store @@ -2411,7 +2411,7 @@ groups: for: 15m - name: Thanos Store Objstore Operation Latency High description: 'Thanos Store {{$labels.job}} Bucket has a 99th percentile latency of {{$value}} seconds for the bucket operations.' - query: '(histogram_quantile(0.99, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~".*thanos-store.*"}[5m]))) > 2and sum by (job) (rate(thanos_objstore_bucket_operation_duration_seconds_count{job=~".*thanos-store.*"}[5m])) > 0)' + query: '(histogram_quantile(0.99, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~".*thanos-store.*"}[5m]))) > 2 and sum by (job) (rate(thanos_objstore_bucket_operation_duration_seconds_count{job=~".*thanos-store.*"}[5m])) > 0)' severity: warning for: 10m - name: Thanos Ruler @@ -2469,7 +2469,7 @@ groups: for: 5m - name: Thanos No Rule Evaluations description: 'Thanos Rule {{$labels.instance}} did not perform any rule evaluations in the past 10 minutes.' - query: 'sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) <= 0 andsum by (job, instance) (thanos_rule_loaded_rules{job=~".*thanos-rule.*"}) > 0' + query: 'sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) <= 0 and sum by (job, instance) (thanos_rule_loaded_rules{job=~".*thanos-rule.*"}) > 0' severity: critical for: 5m - name: Thanos Bucket Replicate @@ -2482,7 +2482,7 @@ groups: for: 5m - name: Thanos Bucket Replicate Run Latency description: 'Thanos Replicate {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for the replicate operations.' - query: '(histogram_quantile(0.99, sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m]))) > 20and sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m])) > 0)' + query: '(histogram_quantile(0.99, sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m]))) > 20 and sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m])) > 0)' severity: critical for: 5m - name: Thanos Component Absent