From e7db9b5b25a18d62295cfb3f802f4474c6cbcb4e Mon Sep 17 00:00:00 2001 From: Kratik Jain Date: Tue, 14 Mar 2023 14:00:53 +0530 Subject: [PATCH] Adding more rules for Thanos Components Monitoring --- _data/rules.yml | 368 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 354 insertions(+), 14 deletions(-) diff --git a/_data/rules.yml b/_data/rules.yml index 4a2501a..32cb564 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -2269,21 +2269,361 @@ groups: services: - name: Thanos exporters: - - name: Embedded exporter - slug: embedded-exporter + - name: Thanos Compactor + slug: thanos-compactor rules: - - name: Thanos compaction halted - description: Thanos compaction has failed to run and is now halted. - query: 'thanos_compact_halted == 1' - severity: critical - - name: Thanos compact bucket operation failure - description: Thanos compaction has failing storage operations - query: 'rate(thanos_objstore_bucket_operation_failures_total[1m]) > 0' - severity: critical - - name: Thanos compact not run - description: Thanos compaction has not run in 24 hours. - query: '(time() - thanos_objstore_bucket_last_successful_upload_time) > 24*60*60' - severity: critical + - name: Thanos Compactor Multiple Running + description: No more than one Thanos Compact instance should be running at once. + There are {{$value}} instances running. + query: sum by (job) (up{job=~".*thanos-compact.*"}) > 1 + severity: warning + for: 5m + - name: Thanos Compactor Halted + description: Thanos Compact {{$labels.job}} has failed to run and now is halted. + query: thanos_compact_halted{job=~".*thanos-compact.*"} == 1 + severity: warning + for: 5m + - name: Thanos Compactor High Compaction Failures + description: Thanos Compact {{$labels.job}} is failing to execute {{$value | humanize}}% + of compactions. 
+ query: ( sum by (job) (rate(thanos_compact_group_compactions_failures_total{job=~".*thanos-compact.*"}[5m]))/ sum + by (job) (rate(thanos_compact_group_compactions_total{job=~".*thanos-compact.*"}[5m]))* + 100 > 5) + severity: warning + for: 15m + - name: Thanos Compact Bucket High Operation Failures + description: Thanos Compact {{$labels.job}} Bucket is failing to execute {{$value + | humanize}}% of operations. + query: ( sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-compact.*"}[5m]))/ sum + by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-compact.*"}[5m]))* + 100 > 5) + severity: warning + for: 15m + - name: Thanos Compact Has Not Run + description: Thanos Compact {{$labels.job}} has not uploaded anything for 24 hours. + query: (time() - max by (job) (max_over_time(thanos_objstore_bucket_last_successful_upload_time{job=~".*thanos-compact.*"}[24h]))) + / 60 / 60 > 24 + severity: warning + for: 0m + - name: Thanos Query + slug: thanos-query + rules: + - name: Thanos Query Http Request Query Error Rate High + description: Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% + of "query" requests. + query: ( sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*", + handler="query"}[5m]))/ sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", + handler="query"}[5m]))) * 100 > 5 + severity: critical + for: 5m + - name: Thanos Query Http Request Query Range Error Rate High + description: Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% + of "query_range" requests. 
+ query: ( sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*", + handler="query_range"}[5m]))/ sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", + handler="query_range"}[5m]))) * 100 > 5 + severity: critical + for: 5m + - name: Thanos Query Grpc Server Error Rate + description: Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% + of requests. + query: ( sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", + job=~".*thanos-query.*"}[5m]))/ sum by (job) (rate(grpc_server_started_total{job=~".*thanos-query.*"}[5m]))* + 100 > 5) + severity: warning + for: 5m + - name: Thanos Query Grpc Client Error Rate + description: Thanos Query {{$labels.job}} is failing to send {{$value | humanize}}% + of requests. + query: ( sum by (job) (rate(grpc_client_handled_total{grpc_code!="OK", job=~".*thanos-query.*"}[5m]))/ sum + by (job) (rate(grpc_client_started_total{job=~".*thanos-query.*"}[5m]))) * 100 + > 5 + severity: warning + for: 5m + - name: Thanos Query High D N S Failures + description: Thanos Query {{$labels.job}} have {{$value | humanize}}% of failing + DNS queries for store endpoints. + query: ( sum by (job) (rate(thanos_query_store_apis_dns_failures_total{job=~".*thanos-query.*"}[5m]))/ sum + by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~".*thanos-query.*"}[5m]))) + * 100 > 1 + severity: warning + for: 15m + - name: Thanos Query Instant Latency High + description: Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} + seconds for instant queries. 
+ query: ( histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", + handler="query"}[5m]))) > 40 and sum by (job) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", + handler="query"}[5m])) > 0) + severity: critical + for: 10m + - name: Thanos Query Range Latency High + description: Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} + seconds for range queries. + query: ( histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", + handler="query_range"}[5m]))) > 90 and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-query.*", + handler="query_range"}[5m])) > 0) + severity: critical + for: 10m + - name: Thanos Query Overload + description: Thanos Query {{$labels.job}} has been overloaded for more than 15 + minutes. This may be a symptom of excessive simultaneous complex requests, low + performance of the Prometheus API, or failures within these components. Assess + the health of the Thanos query instances, the connected Prometheus instances, + look for potential senders of these requests and then contact support. + query: "( max_over_time(thanos_query_concurrent_gate_queries_max[5m]) - avg_over_time(thanos_query_concurrent_gate_queries_in_flight[5m]) + < 1)" + severity: warning + for: 15m + - name: Thanos Receiver + slug: thanos-receiver + rules: + - name: Thanos Receive Http Request Error Rate High + description: Thanos Receive {{$labels.job}} is failing to handle {{$value | humanize}}% + of requests. 
+ query: ( sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-receive.*", + handler="receive"}[5m]))/ sum by (job) (rate(http_requests_total{job=~".*thanos-receive.*", + handler="receive"}[5m]))) * 100 > 5 + severity: critical + for: 5m + - name: Thanos Receive Http Request Latency High + description: Thanos Receive {{$labels.job}} has a 99th percentile latency of {{ + $value }} seconds for requests. + query: ( histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-receive.*", + handler="receive"}[5m]))) > 10 and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-receive.*", + handler="receive"}[5m])) > 0) + severity: critical + for: 10m + - name: Thanos Receive High Replication Failures + description: Thanos Receive {{$labels.job}} is failing to replicate {{$value | + humanize}}% of requests. + query: thanos_receive_replication_factor > 1 and ( ( sum by (job) (rate(thanos_receive_replications_total{result="error", + job=~".*thanos-receive.*"}[5m])) / sum by (job) (rate(thanos_receive_replications_total{job=~".*thanos-receive.*"}[5m])) ) > ( max + by (job) (floor((thanos_receive_replication_factor{job=~".*thanos-receive.*"}+1) + / 2)) / max by (job) (thanos_receive_hashring_nodes{job=~".*thanos-receive.*"}) )) + * 100 + severity: warning + for: 5m + - name: Thanos Receive High Forward Request Failures + description: Thanos Receive {{$labels.job}} is failing to forward {{$value | humanize}}% + of requests. + query: ( sum by (job) (rate(thanos_receive_forward_requests_total{result="error", + job=~".*thanos-receive.*"}[5m]))/ sum by (job) (rate(thanos_receive_forward_requests_total{job=~".*thanos-receive.*"}[5m]))) + * 100 > 20 + severity: info + for: 5m + - name: Thanos Receive High Hashring File Refresh Failures + description: Thanos Receive {{$labels.job}} is failing to refresh hashring file, + {{$value | humanize}} of attempts failed. 
+ query: ( sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~".*thanos-receive.*"}[5m]))/ sum + by (job) (rate(thanos_receive_hashrings_file_refreshes_total{job=~".*thanos-receive.*"}[5m]))> + 0) + severity: warning + for: 15m + - name: Thanos Receive Config Reload Failure + description: Thanos Receive {{$labels.job}} has not been able to reload hashring + configurations. + query: avg by (job) (thanos_receive_config_last_reload_successful{job=~".*thanos-receive.*"}) + != 1 + severity: warning + for: 5m + - name: Thanos Receive No Upload + description: Thanos Receive {{$labels.instance}} has not uploaded latest data + to object storage. + query: '(up{job=~".*thanos-receive.*"} - 1)+ on (job, instance) + (sum by (job, instance) (increase(thanos_shipper_uploads_total{job=~".*thanos-receive.*"}[3h])) + == 0)' + severity: critical + for: 3h + - name: Thanos Sidecar + slug: thanos-sidecar + rules: + - name: Thanos Sidecar Bucket Operations Failed + description: Thanos Sidecar {{$labels.instance}} bucket operations are failing + query: sum by (job, instance) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-sidecar.*"}[5m])) + > 0 + severity: critical + for: 5m + - name: Thanos Sidecar No Connection To Started Prometheus + description: Thanos Sidecar {{$labels.instance}} is unhealthy. + query: thanos_sidecar_prometheus_up{job=~".*thanos-sidecar.*"} == 0 AND on (namespace, + pod) prometheus_tsdb_data_replay_duration_seconds != 0 + severity: critical + for: 5m + - name: Thanos Store + slug: thanos-store + rules: + - name: Thanos Store Grpc Error Rate + description: Thanos Store {{$labels.job}} is failing to handle {{$value | humanize}}% + of requests. 
+ query: ( sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", + job=~".*thanos-store.*"}[5m]))/ sum by (job) (rate(grpc_server_started_total{job=~".*thanos-store.*"}[5m]))* + 100 > 5) + severity: warning + for: 5m + - name: Thanos Store Series Gate Latency High + description: Thanos Store {{$labels.job}} has a 99th percentile latency of {{$value}} + seconds for store series gate requests. + query: ( histogram_quantile(0.99, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{job=~".*thanos-store.*"}[5m]))) + > 2 and sum by (job) (rate(thanos_bucket_store_series_gate_duration_seconds_count{job=~".*thanos-store.*"}[5m])) + > 0) + severity: warning + for: 10m + - name: Thanos Store Bucket High Operation Failures + description: Thanos Store {{$labels.job}} Bucket is failing to execute {{$value + | humanize}}% of operations. + query: ( sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-store.*"}[5m]))/ sum + by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-store.*"}[5m]))* + 100 > 5) + severity: warning + for: 15m + - name: Thanos Store Objstore Operation Latency High + description: Thanos Store {{$labels.job}} Bucket has a 99th percentile latency + of {{$value}} seconds for the bucket operations. + query: ( histogram_quantile(0.99, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~".*thanos-store.*"}[5m]))) + > 2 and sum by (job) (rate(thanos_objstore_bucket_operation_duration_seconds_count{job=~".*thanos-store.*"}[5m])) + > 0) + severity: warning + for: 10m + - name: Thanos Ruler + slug: thanos-ruler + rules: + - name: Thanos Rule Queue Is Dropping Alerts + description: Thanos Rule {{$labels.instance}} is failing to queue alerts. 
+ query: sum by (job, instance) (rate(thanos_alert_queue_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) + > 0 + severity: critical + for: 5m + - name: Thanos Rule Sender Is Failing Alerts + description: Thanos Rule {{$labels.instance}} is failing to send alerts to alertmanager. + query: sum by (job, instance) (rate(thanos_alert_sender_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) + > 0 + severity: critical + for: 5m + - name: Thanos Rule High Rule Evaluation Failures + description: Thanos Rule {{$labels.instance}} is failing to evaluate rules. + query: ( sum by (job, instance) (rate(prometheus_rule_evaluation_failures_total{job=~".*thanos-rule.*"}[5m]))/ sum + by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m]))* + 100 > 5) + severity: critical + for: 5m + - name: Thanos Rule High Rule Evaluation Warnings + description: Thanos Rule {{$labels.instance}} has high number of evaluation warnings. + query: sum by (job, instance) (rate(thanos_rule_evaluation_with_warnings_total{job=~".*thanos-rule.*"}[5m])) + > 0 + severity: info + for: 15m + - name: Thanos Rule Rule Evaluation Latency High + description: Thanos Rule {{$labels.instance}} has higher evaluation latency than + interval for {{$labels.rule_group}}. + query: ( sum by (job, instance, rule_group) (prometheus_rule_group_last_duration_seconds{job=~".*thanos-rule.*"})> sum + by (job, instance, rule_group) (prometheus_rule_group_interval_seconds{job=~".*thanos-rule.*"})) + severity: warning + for: 5m + - name: Thanos Rule Grpc Error Rate + description: Thanos Rule {{$labels.job}} is failing to handle {{$value | humanize}}% + of requests. 
+ query: ( sum by (job, instance) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", + job=~".*thanos-rule.*"}[5m]))/ sum by (job, instance) (rate(grpc_server_started_total{job=~".*thanos-rule.*"}[5m]))* + 100 > 5) + severity: warning + for: 5m + - name: Thanos Rule Config Reload Failure + description: Thanos Rule {{$labels.job}} has not been able to reload its configuration. + query: avg by (job, instance) (thanos_rule_config_last_reload_successful{job=~".*thanos-rule.*"}) + != 1 + severity: info + for: 5m + - name: Thanos Rule Query High D N S Failures + description: Thanos Rule {{$labels.job}} has {{$value | humanize}}% of failing + DNS queries for query endpoints. + query: ( sum by (job, instance) (rate(thanos_rule_query_apis_dns_failures_total{job=~".*thanos-rule.*"}[5m]))/ sum + by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total{job=~".*thanos-rule.*"}[5m]))* + 100 > 1) + severity: warning + for: 15m + - name: Thanos Rule Alertmanager High D N S Failures + description: Thanos Rule {{$labels.instance}} has {{$value | humanize}}% of failing + DNS queries for Alertmanager endpoints. + query: ( sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_failures_total{job=~".*thanos-rule.*"}[5m]))/ sum + by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=~".*thanos-rule.*"}[5m]))* + 100 > 1) + severity: warning + for: 15m + - name: Thanos Rule No Evaluation For10 Intervals + description: Thanos Rule {{$labels.job}} has rule groups that did not evaluate + for at least 10x of their expected interval. 
+ query: time() - max by (job, instance, group) (prometheus_rule_group_last_evaluation_timestamp_seconds{job=~".*thanos-rule.*"})>10 + * max by (job, instance, group) (prometheus_rule_group_interval_seconds{job=~".*thanos-rule.*"}) + severity: info + for: 5m + - name: Thanos No Rule Evaluations + description: Thanos Rule {{$labels.instance}} did not perform any rule evaluations + in the past 10 minutes. + query: sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) + <= 0 and sum by (job, instance) (thanos_rule_loaded_rules{job=~".*thanos-rule.*"}) + > 0 + severity: critical + for: 5m + - name: Thanos Bucket Replicate + slug: thanos-bucket-replicate + rules: + - name: Thanos Bucket Replicate Error Rate + description: Thanos Replicate is failing to run, {{$value | humanize}}% of attempts + failed. + query: ( sum by (job) (rate(thanos_replicate_replication_runs_total{result="error", + job=~".*thanos-bucket-replicate.*"}[5m]))/ on (job) group_left sum by (job) + (rate(thanos_replicate_replication_runs_total{job=~".*thanos-bucket-replicate.*"}[5m]))) + * 100 >= 10 + severity: critical + for: 5m + - name: Thanos Bucket Replicate Run Latency + description: Thanos Replicate {{$labels.job}} has a 99th percentile latency of + {{$value}} seconds for the replicate operations. + query: ( histogram_quantile(0.99, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m]))) + > 20 and sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m])) + > 0) + severity: critical + for: 5m + - name: Thanos Component Absent + slug: thanos-component-absent + rules: + - name: Thanos Compact Is Down + description: ThanosCompact has disappeared. Prometheus target for the component + cannot be discovered. 
+ query: absent(up{job=~".*thanos-compact.*"} == 1) + severity: critical + for: 5m + - name: Thanos Query Is Down + description: ThanosQuery has disappeared. Prometheus target for the component + cannot be discovered. + query: absent(up{job=~".*thanos-query.*"} == 1) + severity: critical + for: 5m + - name: Thanos Receive Is Down + description: ThanosReceive has disappeared. Prometheus target for the component + cannot be discovered. + query: absent(up{job=~".*thanos-receive.*"} == 1) + severity: critical + for: 5m + - name: Thanos Rule Is Down + description: ThanosRule has disappeared. Prometheus target for the component cannot + be discovered. + query: absent(up{job=~".*thanos-rule.*"} == 1) + severity: critical + for: 5m + - name: Thanos Sidecar Is Down + description: ThanosSidecar has disappeared. Prometheus target for the component + cannot be discovered. + query: absent(up{job=~".*thanos-sidecar.*"} == 1) + severity: critical + for: 5m + - name: Thanos Store Is Down + description: ThanosStore has disappeared. Prometheus target for the component + cannot be discovered. + query: absent(up{job=~".*thanos-store.*"} == 1) + severity: critical + for: 5m + - name: Loki exporters: - name: Embedded exporter