This commit is contained in:
samber 2023-03-15 17:27:02 +00:00
parent aa2988693b
commit 2ead3bcbd8
9 changed files with 445 additions and 32 deletions

View file

@ -1,32 +0,0 @@
# Thanos compaction alerts built on raw Thanos metrics, without a job selector.
# Every rule here sets `for: 0m`, so no pending period is configured, and every
# rule is labelled `severity: critical`.
groups:
  - name: EmbeddedExporter
    rules:
      # The compactor reports itself as halted (gauge equals 1).
      - alert: ThanosCompactionHalted
        expr: |-
          thanos_compact_halted == 1
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: 'Thanos compaction halted (instance {{ $labels.instance }})'
          description: "Thanos compaction has failed to run and is now halted.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Any non-zero per-second rate of bucket operation failures over 1m.
      - alert: ThanosCompactBucketOperationFailure
        expr: |-
          rate(thanos_objstore_bucket_operation_failures_total[1m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: 'Thanos compact bucket operation failure (instance {{ $labels.instance }})'
          description: "Thanos compaction has failing storage operations\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Last successful upload timestamp is more than 24*60*60 seconds old.
      - alert: ThanosCompactNotRun
        expr: |-
          (time() - thanos_objstore_bucket_last_successful_upload_time) > 24*60*60
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: 'Thanos compact not run (instance {{ $labels.instance }})'
          description: "Thanos compaction has not run in 24 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -0,0 +1,23 @@
# Alerts for the Thanos bucket replication job (job label matching
# `.*thanos-bucket-replicate.*`). Both rules aggregate per `job`, so the
# resulting alert carries no `instance` label.
groups:
  - name: ThanosBucketReplicate
    rules:
      # Share of replication runs ending in result="error" over 5m, as a
      # percentage; fires at 10% or more, sustained for 5m.
      - alert: ThanosBucketReplicateErrorRate
        expr: |-
          (sum by (job) (rate(thanos_replicate_replication_runs_total{result="error", job=~".*thanos-bucket-replicate.*"}[5m]))/ on (job) group_left sum by (job) (rate(thanos_replicate_replication_runs_total{job=~".*thanos-bucket-replicate.*"}[5m]))) * 100 >= 10
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'Thanos Bucket Replicate Error Rate (instance {{ $labels.instance }})'
          description: "Thanos Replicate is failing to run, {{$value | humanize}}% of attempts failed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # p99 replication run duration above 20 seconds while runs are happening.
      # The bucket series must be aggregated `by (job, le)`: histogram_quantile
      # needs the `le` label to compute a quantile, so aggregating by `job`
      # alone leaves it nothing to work with and the alert can never fire.
      # The "is there traffic" guard uses the histogram's `_count` series, as
      # the other latency alerts in this rule set do.
      - alert: ThanosBucketReplicateRunLatency
        expr: |-
          (histogram_quantile(0.99, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m]))) > 20 and sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_count{job=~".*thanos-bucket-replicate.*"}[5m])) > 0)
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'Thanos Bucket Replicate Run Latency (instance {{ $labels.instance }})'
          description: "Thanos Replicate {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for the replicate operations.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

50
dist/rules/thanos/thanos-compactor.yml vendored Normal file
View file

@ -0,0 +1,50 @@
# Alerts for the Thanos compactor (job label matching `.*thanos-compact.*`).
# All rules are `severity: warning`.
# NOTE: every rule except ThanosCompactorHalted aggregates `by (job)`, so the
# alert has no `instance` label and the `(instance ...)` suffix in its summary
# renders empty.
groups:
  - name: ThanosCompactor
    rules:
      # More than one compactor target reporting up=1 for the same job.
      - alert: ThanosCompactorMultipleRunning
        expr: |-
          sum by (job) (up{job=~".*thanos-compact.*"}) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Thanos Compactor Multiple Running (instance {{ $labels.instance }})'
          description: "No more than one Thanos Compact instance should be running at once. There are {{$value}} instances running.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Compactor halted gauge equals 1 for 5m.
      - alert: ThanosCompactorHalted
        expr: |-
          thanos_compact_halted{job=~".*thanos-compact.*"} == 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Thanos Compactor Halted (instance {{ $labels.instance }})'
          description: "Thanos Compact {{$labels.job}} has failed to run and now is halted.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Failed group compactions as a percentage of all group compactions
      # (5m rates); fires above 5% for 15m.
      - alert: ThanosCompactorHighCompactionFailures
        expr: |-
          (sum by (job) (rate(thanos_compact_group_compactions_failures_total{job=~".*thanos-compact.*"}[5m])) / sum by (job) (rate(thanos_compact_group_compactions_total{job=~".*thanos-compact.*"}[5m])) * 100 > 5)
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: 'Thanos Compactor High Compaction Failures (instance {{ $labels.instance }})'
          description: "Thanos Compact {{$labels.job}} is failing to execute {{$value | humanize}}% of compactions.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Failed bucket operations as a percentage of all bucket operations
      # (5m rates); fires above 5% for 15m.
      - alert: ThanosCompactBucketHighOperationFailures
        expr: |-
          (sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-compact.*"}[5m])) / sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-compact.*"}[5m])) * 100 > 5)
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: 'Thanos Compact Bucket High Operation Failures (instance {{ $labels.instance }})'
          description: "Thanos Compact {{$labels.job}} Bucket is failing to execute {{$value | humanize}}% of operations.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Hours since the newest successful upload timestamp seen in the last
      # 24h exceed 24. No pending period (`for: 0m`).
      - alert: ThanosCompactHasNotRun
        expr: |-
          (time() - max by (job) (max_over_time(thanos_objstore_bucket_last_successful_upload_time{job=~".*thanos-compact.*"}[24h]))) / 60 / 60 > 24
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 'Thanos Compact Has Not Run (instance {{ $labels.instance }})'
          description: "Thanos Compact {{$labels.job}} has not uploaded anything for 24 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -0,0 +1,59 @@
# One "component is down" alert per Thanos component. Each rule has the same
# shape: `absent(up{job=~"<pattern>"} == 1)` held for 5m at critical severity,
# i.e. it fires when no target matching the job pattern reports up == 1.
groups:
  - name: ThanosComponentAbsent
    rules:
      # Compactor: job pattern `.*thanos-compact.*`.
      - alert: ThanosCompactIsDown
        expr: |-
          absent(up{job=~".*thanos-compact.*"} == 1)
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'Thanos Compact Is Down (instance {{ $labels.instance }})'
          description: "ThanosCompact has disappeared. Prometheus target for the component cannot be discovered.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Querier: job pattern `.*thanos-query.*`.
      - alert: ThanosQueryIsDown
        expr: |-
          absent(up{job=~".*thanos-query.*"} == 1)
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'Thanos Query Is Down (instance {{ $labels.instance }})'
          description: "ThanosQuery has disappeared. Prometheus target for the component cannot be discovered.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Receiver: job pattern `.*thanos-receive.*`.
      - alert: ThanosReceiveIsDown
        expr: |-
          absent(up{job=~".*thanos-receive.*"} == 1)
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'Thanos Receive Is Down (instance {{ $labels.instance }})'
          description: "ThanosReceive has disappeared. Prometheus target for the component cannot be discovered.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Ruler: job pattern `.*thanos-rule.*`.
      - alert: ThanosRuleIsDown
        expr: |-
          absent(up{job=~".*thanos-rule.*"} == 1)
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'Thanos Rule Is Down (instance {{ $labels.instance }})'
          description: "ThanosRule has disappeared. Prometheus target for the component cannot be discovered.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Sidecar: job pattern `.*thanos-sidecar.*`.
      - alert: ThanosSidecarIsDown
        expr: |-
          absent(up{job=~".*thanos-sidecar.*"} == 1)
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'Thanos Sidecar Is Down (instance {{ $labels.instance }})'
          description: "ThanosSidecar has disappeared. Prometheus target for the component cannot be discovered.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Store gateway: job pattern `.*thanos-store.*`.
      - alert: ThanosStoreIsDown
        expr: |-
          absent(up{job=~".*thanos-store.*"} == 1)
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'Thanos Store Is Down (instance {{ $labels.instance }})'
          description: "ThanosStore has disappeared. Prometheus target for the component cannot be discovered.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

77
dist/rules/thanos/thanos-query.yml vendored Normal file
View file

@ -0,0 +1,77 @@
# Alerts for Thanos Query (job label matching `.*thanos-query.*`).
# Rules that aggregate `by (job)` produce alerts without an `instance` label.
groups:
  - name: ThanosQuery
    rules:
      # 5xx share of HTTP requests to the "query" handler (5m rates), in
      # percent; fires above 5% for 5m.
      - alert: ThanosQueryHttpRequestQueryErrorRateHigh
        expr: |-
          (sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*", handler="query"}[5m]))/ sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query"}[5m]))) * 100 > 5
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'Thanos Query Http Request Query Error Rate High (instance {{ $labels.instance }})'
          description: "Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of \"query\" requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Same as above for the "query_range" handler.
      - alert: ThanosQueryHttpRequestQueryRangeErrorRateHigh
        expr: |-
          (sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*", handler="query_range"}[5m]))/ sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query_range"}[5m]))) * 100 > 5
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'Thanos Query Http Request Query Range Error Rate High (instance {{ $labels.instance }})'
          description: "Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of \"query_range\" requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # gRPC server calls finishing with one of the listed error codes, as a
      # percentage of started calls; fires above 5% for 5m.
      - alert: ThanosQueryGrpcServerErrorRate
        expr: |-
          (sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-query.*"}[5m]))/ sum by (job) (rate(grpc_server_started_total{job=~".*thanos-query.*"}[5m])) * 100 > 5)
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Thanos Query Grpc Server Error Rate (instance {{ $labels.instance }})'
          description: "Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # gRPC client calls finishing with any code other than OK, as a
      # percentage of started calls; fires above 5% for 5m.
      - alert: ThanosQueryGrpcClientErrorRate
        expr: |-
          (sum by (job) (rate(grpc_client_handled_total{grpc_code!="OK", job=~".*thanos-query.*"}[5m])) / sum by (job) (rate(grpc_client_started_total{job=~".*thanos-query.*"}[5m]))) * 100 > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Thanos Query Grpc Client Error Rate (instance {{ $labels.instance }})'
          description: "Thanos Query {{$labels.job}} is failing to send {{$value | humanize}}% of requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Failed store-API DNS lookups as a percentage of all lookups; fires
      # above 1% for 15m.
      - alert: ThanosQueryHighDNSFailures
        expr: |-
          (sum by (job) (rate(thanos_query_store_apis_dns_failures_total{job=~".*thanos-query.*"}[5m])) / sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~".*thanos-query.*"}[5m]))) * 100 > 1
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: 'Thanos Query High DNS Failures (instance {{ $labels.instance }})'
          description: "Thanos Query {{$labels.job}} has {{$value | humanize}}% of failing DNS queries for store endpoints.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # p99 latency of the "query" handler above 40 seconds while requests are
      # being served. The traffic guard reads the histogram's `_count` series,
      # matching ThanosQueryRangeLatencyHigh below.
      - alert: ThanosQueryInstantLatencyHigh
        expr: |-
          (histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m]))) > 40 and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-query.*", handler="query"}[5m])) > 0)
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: 'Thanos Query Instant Latency High (instance {{ $labels.instance }})'
          description: "Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for instant queries.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # p99 latency of the "query_range" handler above 90 seconds while
      # requests are being served.
      - alert: ThanosQueryRangeLatencyHigh
        expr: |-
          (histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query_range"}[5m]))) > 90 and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-query.*", handler="query_range"}[5m])) > 0)
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: 'Thanos Query Range Latency High (instance {{ $labels.instance }})'
          description: "Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for range queries.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Headroom in the concurrent-query gate: the 5m maximum of the gate limit
      # minus the 5m average of in-flight queries is below 1, for 15m.
      # This expression has no job selector and no aggregation, so series
      # labels are kept as scraped.
      - alert: ThanosQueryOverload
        expr: |-
          (max_over_time(thanos_query_concurrent_gate_queries_max[5m]) - avg_over_time(thanos_query_concurrent_gate_queries_in_flight[5m]) < 1)
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: 'Thanos Query Overload (instance {{ $labels.instance }})'
          description: "Thanos Query {{$labels.job}} has been overloaded for more than 15 minutes. This may be a symptom of excessive simultaneous complex requests, low performance of the Prometheus API, or failures within these components. Assess the health of the Thanos query instances, the connected Prometheus instances, look for potential senders of these requests and then contact support.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

68
dist/rules/thanos/thanos-receiver.yml vendored Normal file
View file

@ -0,0 +1,68 @@
# Alerts for Thanos Receive (job label matching `.*thanos-receive.*`).
# Rules that aggregate `by (job)` produce alerts without an `instance` label.
groups:
  - name: ThanosReceiver
    rules:
      # 5xx share of HTTP requests to the "receive" handler (5m rates), in
      # percent; fires above 5% for 5m.
      - alert: ThanosReceiveHttpRequestErrorRateHigh
        expr: |-
          (sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-receive.*", handler="receive"}[5m]))/ sum by (job) (rate(http_requests_total{job=~".*thanos-receive.*", handler="receive"}[5m]))) * 100 > 5
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'Thanos Receive Http Request Error Rate High (instance {{ $labels.instance }})'
          description: "Thanos Receive {{$labels.job}} is failing to handle {{$value | humanize}}% of requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # p99 latency of the "receive" handler above 10 seconds while requests
      # are being served.
      - alert: ThanosReceiveHttpRequestLatencyHigh
        expr: |-
          (histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-receive.*", handler="receive"}[5m]))) > 10 and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-receive.*", handler="receive"}[5m])) > 0)
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: 'Thanos Receive Http Request Latency High (instance {{ $labels.instance }})'
          description: "Thanos Receive {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Replication error ratio above floor((replication_factor+1)/2) divided
      # by the number of hashring nodes, only where replication factor > 1.
      # The failure percentage is the left operand of `and`, so `$value` is the
      # percentage the description prints. The replication-factor gate is
      # reduced to one series per job and joined with `on (job)`; a bare `and`
      # against the raw per-instance gauge would require identical label sets
      # and never match the job-level ratio.
      - alert: ThanosReceiveHighReplicationFailures
        expr: |-
          ((sum by (job) (rate(thanos_receive_replications_total{result="error", job=~".*thanos-receive.*"}[5m])) / sum by (job) (rate(thanos_receive_replications_total{job=~".*thanos-receive.*"}[5m]))) > (max by (job) (floor((thanos_receive_replication_factor{job=~".*thanos-receive.*"}+1)/ 2)) / max by (job) (thanos_receive_hashring_nodes{job=~".*thanos-receive.*"}))) * 100 and on (job) (max by (job) (thanos_receive_replication_factor{job=~".*thanos-receive.*"}) > 1)
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Thanos Receive High Replication Failures (instance {{ $labels.instance }})'
          description: "Thanos Receive {{$labels.job}} is failing to replicate {{$value | humanize}}% of requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Forward requests ending in result="error", as a percentage of all
      # forward requests; fires above 20% for 5m.
      - alert: ThanosReceiveHighForwardRequestFailures
        expr: |-
          (sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~".*thanos-receive.*"}[5m]))/ sum by (job) (rate(thanos_receive_forward_requests_total{job=~".*thanos-receive.*"}[5m]))) * 100 > 20
        for: 5m
        labels:
          severity: info
        annotations:
          summary: 'Thanos Receive High Forward Request Failures (instance {{ $labels.instance }})'
          description: "Thanos Receive {{$labels.job}} is failing to forward {{$value | humanize}}% of requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Any hashring file errors relative to refreshes (ratio > 0) for 15m.
      # The value is a plain ratio, not a percentage.
      - alert: ThanosReceiveHighHashringFileRefreshFailures
        expr: |-
          (sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~".*thanos-receive.*"}[5m])) / sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total{job=~".*thanos-receive.*"}[5m])) > 0)
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: 'Thanos Receive High Hashring File Refresh Failures (instance {{ $labels.instance }})'
          description: "Thanos Receive {{$labels.job}} is failing to refresh hashring file, {{$value | humanize}} of attempts failed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Per-job average of the "last reload successful" gauge differs from 1.
      - alert: ThanosReceiveConfigReloadFailure
        expr: |-
          avg by (job) (thanos_receive_config_last_reload_successful{job=~".*thanos-receive.*"}) != 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Thanos Receive Config Reload Failure (instance {{ $labels.instance }})'
          description: "Thanos Receive {{$labels.job}} has not been able to reload hashring configurations.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Zero shipper uploads over 3h, per (job, instance). Adding `up - 1`
      # makes the value -1 when the target is down and 0 when it is up; the
      # alert fires on either, since any returned sample is firing.
      - alert: ThanosReceiveNoUpload
        expr: |-
          (up{job=~".*thanos-receive.*"} - 1) + on (job, instance) (sum by (job, instance) (increase(thanos_shipper_uploads_total{job=~".*thanos-receive.*"}[3h])) == 0)
        for: 3h
        labels:
          severity: critical
        annotations:
          summary: 'Thanos Receive No Upload (instance {{ $labels.instance }})'
          description: "Thanos Receive {{$labels.instance}} has not uploaded latest data to object storage.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

104
dist/rules/thanos/thanos-ruler.yml vendored Normal file
View file

@ -0,0 +1,104 @@
# Alerts for Thanos Rule (job label matching `.*thanos-rule.*`).
# Expressions here aggregate `by (job, instance)` (some also by rule group),
# so the `instance` label is available to the annotations.
groups:
  - name: ThanosRuler
    rules:
      # Any alerts dropped from the ruler's alert queue (5m rate > 0).
      - alert: ThanosRuleQueueIsDroppingAlerts
        expr: |-
          sum by (job, instance) (rate(thanos_alert_queue_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'Thanos Rule Queue Is Dropping Alerts (instance {{ $labels.instance }})'
          description: "Thanos Rule {{$labels.instance}} is failing to queue alerts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Any alerts dropped by the alert sender (5m rate > 0).
      - alert: ThanosRuleSenderIsFailingAlerts
        expr: |-
          sum by (job, instance) (rate(thanos_alert_sender_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'Thanos Rule Sender Is Failing Alerts (instance {{ $labels.instance }})'
          description: "Thanos Rule {{$labels.instance}} is failing to send alerts to alertmanager.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Failed rule evaluations as a percentage of all evaluations; fires
      # above 5% for 5m.
      - alert: ThanosRuleHighRuleEvaluationFailures
        expr: |-
          (sum by (job, instance) (rate(prometheus_rule_evaluation_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) * 100 > 5)
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'Thanos Rule High Rule Evaluation Failures (instance {{ $labels.instance }})'
          description: "Thanos Rule {{$labels.instance}} is failing to evaluate rules.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Any evaluations completing with warnings (5m rate > 0) for 15m.
      - alert: ThanosRuleHighRuleEvaluationWarnings
        expr: |-
          sum by (job, instance) (rate(thanos_rule_evaluation_with_warnings_total{job=~".*thanos-rule.*"}[5m])) > 0
        for: 15m
        labels:
          severity: info
        annotations:
          summary: 'Thanos Rule High Rule Evaluation Warnings (instance {{ $labels.instance }})'
          description: "Thanos Rule {{$labels.instance}} has high number of evaluation warnings.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # A rule group's last evaluation took longer than its interval.
      - alert: ThanosRuleRuleEvaluationLatencyHigh
        expr: |-
          (sum by (job, instance, rule_group) (prometheus_rule_group_last_duration_seconds{job=~".*thanos-rule.*"}) > sum by (job, instance, rule_group) (prometheus_rule_group_interval_seconds{job=~".*thanos-rule.*"}))
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Thanos Rule Rule Evaluation Latency High (instance {{ $labels.instance }})'
          description: "Thanos Rule {{$labels.instance}} has higher evaluation latency than interval for {{$labels.rule_group}}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # gRPC server calls finishing with one of the listed error codes, as a
      # percentage of started calls; fires above 5% for 5m.
      - alert: ThanosRuleGrpcErrorRate
        expr: |-
          (sum by (job, instance) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-rule.*"}[5m]))/ sum by (job, instance) (rate(grpc_server_started_total{job=~".*thanos-rule.*"}[5m])) * 100 > 5)
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Thanos Rule Grpc Error Rate (instance {{ $labels.instance }})'
          description: "Thanos Rule {{$labels.job}} is failing to handle {{$value | humanize}}% of requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Average of the "last reload successful" gauge differs from 1.
      - alert: ThanosRuleConfigReloadFailure
        expr: |-
          avg by (job, instance) (thanos_rule_config_last_reload_successful{job=~".*thanos-rule.*"}) != 1
        for: 5m
        labels:
          severity: info
        annotations:
          summary: 'Thanos Rule Config Reload Failure (instance {{ $labels.instance }})'
          description: "Thanos Rule {{$labels.job}} has not been able to reload its configuration.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Failed query-API DNS lookups as a percentage of all lookups; fires
      # above 1% for 15m.
      - alert: ThanosRuleQueryHighDNSFailures
        expr: |-
          (sum by (job, instance) (rate(thanos_rule_query_apis_dns_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) * 100 > 1)
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: 'Thanos Rule Query High DNS Failures (instance {{ $labels.instance }})'
          description: "Thanos Rule {{$labels.job}} has {{$value | humanize}}% of failing DNS queries for query endpoints.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Failed Alertmanager DNS lookups as a percentage of all lookups; fires
      # above 1% for 15m.
      - alert: ThanosRuleAlertmanagerHighDNSFailures
        expr: |-
          (sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) * 100 > 1)
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: 'Thanos Rule Alertmanager High DNS Failures (instance {{ $labels.instance }})'
          description: "Thanos Rule {{$labels.instance}} has {{$value | humanize}}% of failing DNS queries for Alertmanager endpoints.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # A rule group's last evaluation is older than 10x its interval.
      # Grouping uses `rule_group`, the same label
      # ThanosRuleRuleEvaluationLatencyHigh uses for this metric family, so
      # each rule group is checked against its own interval instead of all
      # groups of an instance being collapsed into one series.
      - alert: ThanosRuleNoEvaluationFor10Intervals
        expr: |-
          time() - max by (job, instance, rule_group) (prometheus_rule_group_last_evaluation_timestamp_seconds{job=~".*thanos-rule.*"})>10 * max by (job, instance, rule_group) (prometheus_rule_group_interval_seconds{job=~".*thanos-rule.*"})
        for: 5m
        labels:
          severity: info
        annotations:
          summary: 'Thanos Rule No Evaluation For 10 Intervals (instance {{ $labels.instance }})'
          description: "Thanos Rule {{$labels.job}} has rule groups that did not evaluate for at least 10x of their expected interval.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Evaluation rate is zero over 5m although rules are loaded; with the
      # 5m pending period that spans the 10 minutes the description mentions.
      - alert: ThanosNoRuleEvaluations
        expr: |-
          sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) <= 0 and sum by (job, instance) (thanos_rule_loaded_rules{job=~".*thanos-rule.*"}) > 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'Thanos No Rule Evaluations (instance {{ $labels.instance }})'
          description: "Thanos Rule {{$labels.instance}} did not perform any rule evaluations in the past 10 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

23
dist/rules/thanos/thanos-sidecar.yml vendored Normal file
View file

@ -0,0 +1,23 @@
# Alerts for the Thanos sidecar (job label matching `.*thanos-sidecar.*`).
# Both rules are critical with a 5m pending period.
groups:
  - name: ThanosSidecar
    rules:
      # Any failing bucket operations per (job, instance) (5m rate > 0).
      - alert: ThanosSidecarBucketOperationsFailed
        expr: |-
          sum by (job, instance) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-sidecar.*"}[5m])) > 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'Thanos Sidecar Bucket Operations Failed (instance {{ $labels.instance }})'
          description: "Thanos Sidecar {{$labels.instance}} bucket operations are failing\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Sidecar reports Prometheus as not up (gauge == 0) while a series with
      # the same (namespace, pod) has a non-zero TSDB data replay duration.
      - alert: ThanosSidecarNoConnectionToStartedPrometheus
        expr: |-
          thanos_sidecar_prometheus_up{job=~".*thanos-sidecar.*"} == 0 and on (namespace, pod)prometheus_tsdb_data_replay_duration_seconds != 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'Thanos Sidecar No Connection To Started Prometheus (instance {{ $labels.instance }})'
          description: "Thanos Sidecar {{$labels.instance}} is unhealthy.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

41
dist/rules/thanos/thanos-store.yml vendored Normal file
View file

@ -0,0 +1,41 @@
# Alerts for the Thanos store gateway (job label matching `.*thanos-store.*`).
# All rules are `severity: warning` and aggregate `by (job)`, so the resulting
# alerts carry no `instance` label.
groups:
  - name: ThanosStore
    rules:
      # gRPC server calls finishing with one of the listed error codes, as a
      # percentage of started calls; fires above 5% for 5m.
      - alert: ThanosStoreGrpcErrorRate
        expr: |-
          (sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-store.*"}[5m]))/ sum by (job) (rate(grpc_server_started_total{job=~".*thanos-store.*"}[5m])) * 100 > 5)
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Thanos Store Grpc Error Rate (instance {{ $labels.instance }})'
          description: "Thanos Store {{$labels.job}} is failing to handle {{$value | humanize}}% of requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # p99 of the series-gate duration histogram above 2 seconds while the
      # histogram is receiving observations.
      - alert: ThanosStoreSeriesGateLatencyHigh
        expr: |-
          (histogram_quantile(0.99, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{job=~".*thanos-store.*"}[5m]))) > 2 and sum by (job) (rate(thanos_bucket_store_series_gate_duration_seconds_count{job=~".*thanos-store.*"}[5m])) > 0)
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: 'Thanos Store Series Gate Latency High (instance {{ $labels.instance }})'
          description: "Thanos Store {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for store series gate requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Failed bucket operations as a percentage of all bucket operations;
      # fires above 5% for 15m.
      - alert: ThanosStoreBucketHighOperationFailures
        expr: |-
          (sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-store.*"}[5m])) / sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-store.*"}[5m])) * 100 > 5)
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: 'Thanos Store Bucket High Operation Failures (instance {{ $labels.instance }})'
          description: "Thanos Store {{$labels.job}} Bucket is failing to execute {{$value | humanize}}% of operations.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # p99 of the bucket-operation duration histogram above 2 seconds while
      # the histogram is receiving observations.
      - alert: ThanosStoreObjstoreOperationLatencyHigh
        expr: |-
          (histogram_quantile(0.99, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~".*thanos-store.*"}[5m]))) > 2 and sum by (job) (rate(thanos_objstore_bucket_operation_duration_seconds_count{job=~".*thanos-store.*"}[5m])) > 0)
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: 'Thanos Store Objstore Operation Latency High (instance {{ $labels.instance }})'
          description: "Thanos Store {{$labels.job}} Bucket has a 99th percentile latency of {{$value}} seconds for the bucket operations.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"