This commit is contained in:
Samuel Berthe 2023-03-15 18:20:05 +01:00 committed by GitHub
parent f0798b42ba
commit f042883cf2
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@@ -2327,12 +2327,12 @@ groups:
     for: 15m
   - name: Thanos Query Instant Latency High
     description: 'Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for instant queries.'
-    query: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m]))) > 40and sum by (job) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m])) > 0)'
+    query: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m]))) > 40 and sum by (job) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m])) > 0)'
     severity: critical
     for: 10m
   - name: Thanos Query Range Latency High
     description: 'Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for range queries.'
-    query: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query_range"}[5m]))) > 90and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-query.*", handler="query_range"}[5m])) > 0)'
+    query: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query_range"}[5m]))) > 90 and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-query.*", handler="query_range"}[5m])) > 0)'
     severity: critical
     for: 10m
   - name: Thanos Query Overload
@@ -2350,7 +2350,7 @@ groups:
     for: 5m
   - name: Thanos Receive Http Request Latency High
     description: 'Thanos Receive {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for requests.'
-    query: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-receive.*", handler="receive"}[5m]))) > 10and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-receive.*", handler="receive"}[5m])) > 0)'
+    query: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-receive.*", handler="receive"}[5m]))) > 10 and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-receive.*", handler="receive"}[5m])) > 0)'
     severity: critical
     for: 10m
   - name: Thanos Receive High Replication Failures
@@ -2388,7 +2388,7 @@ groups:
     for: 5m
   - name: Thanos Sidecar No Connection To Started Prometheus
     description: 'Thanos Sidecar {{$labels.instance}} is unhealthy.'
-    query: 'thanos_sidecar_prometheus_up{job=~".*thanos-sidecar.*"} == 0AND on (namespace, pod)prometheus_tsdb_data_replay_duration_seconds != 0'
+    query: 'thanos_sidecar_prometheus_up{job=~".*thanos-sidecar.*"} == 0 and on (namespace, pod)prometheus_tsdb_data_replay_duration_seconds != 0'
     severity: critical
     for: 5m
   - name: Thanos Store
@@ -2411,7 +2411,7 @@ groups:
     for: 15m
   - name: Thanos Store Objstore Operation Latency High
     description: 'Thanos Store {{$labels.job}} Bucket has a 99th percentile latency of {{$value}} seconds for the bucket operations.'
-    query: '(histogram_quantile(0.99, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~".*thanos-store.*"}[5m]))) > 2and sum by (job) (rate(thanos_objstore_bucket_operation_duration_seconds_count{job=~".*thanos-store.*"}[5m])) > 0)'
+    query: '(histogram_quantile(0.99, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~".*thanos-store.*"}[5m]))) > 2 and sum by (job) (rate(thanos_objstore_bucket_operation_duration_seconds_count{job=~".*thanos-store.*"}[5m])) > 0)'
     severity: warning
     for: 10m
   - name: Thanos Ruler
@@ -2469,7 +2469,7 @@ groups:
     for: 5m
   - name: Thanos No Rule Evaluations
     description: 'Thanos Rule {{$labels.instance}} did not perform any rule evaluations in the past 10 minutes.'
-    query: 'sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) <= 0 andsum by (job, instance) (thanos_rule_loaded_rules{job=~".*thanos-rule.*"}) > 0'
+    query: 'sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) <= 0 and sum by (job, instance) (thanos_rule_loaded_rules{job=~".*thanos-rule.*"}) > 0'
     severity: critical
     for: 5m
   - name: Thanos Bucket Replicate
@@ -2482,7 +2482,7 @@ groups:
     for: 5m
   - name: Thanos Bucket Replicate Run Latency
     description: 'Thanos Replicate {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for the replicate operations.'
-    query: '(histogram_quantile(0.99, sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m]))) > 20and sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m])) > 0)'
+    query: '(histogram_quantile(0.99, sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m]))) > 20 and sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m])) > 0)'
     severity: critical
     for: 5m
   - name: Thanos Component Absent