diff --git a/_data/rules.yml b/_data/rules.yml index bb68445..83dfae5 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -82,7 +82,7 @@ groups: severity: warning - name: Prometheus AlertManager notification failing description: "Alertmanager is failing sending notifications ({{ $value }} notifications/s)" - query: "rate(alertmanager_notifications_failed_total[1m]) > 0" + query: "rate(alertmanager_notifications_failed_total[3m]) > 0.05" severity: critical - name: Prometheus target empty description: Prometheus has no target in service discovery @@ -148,8 +148,10 @@ groups: for: 2m - name: Host memory under memory pressure description: "The node is under heavy memory pressure. High rate of major page faults ({{ $value }}/s)." - query: "(rate(node_vmstat_pgmajfault[5m]) > 1000)" + query: "(deriv(node_vmstat_pgmajfault[5m]) > 1000)" severity: warning + comments: | + node_vmstat_pgmajfault is exposed as untyped/gauge by node_exporter (from /proc/vmstat), so deriv() is used instead of rate(). - name: Host Memory is underutilized description: "Node memory usage is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})" query: "min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * .8" @@ -232,13 +234,13 @@ groups: query: 'avg without (cpu) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10' severity: warning - name: Host unusual disk IO - description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities. Check storage for issues." + description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities." 
query: "rate(node_disk_io_time_seconds_total[5m]) > 0.8" severity: warning for: 5m - name: Host context switching high description: Context switching is growing on the node (twice the daily average during the last 15m) - query: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2' + query: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2 and rate(node_context_switches_total[1d]) > 0' severity: warning comments: | x2 context switches is an arbitrary number. @@ -266,7 +268,7 @@ groups: description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} has insufficient drives remaining." query: '((node_md_disks_required - ignoring(state) node_md_disks{state="active"}) > 0)' comments: | - Uses ignoring(state) to handle additional labels on node_md_disks. Matches the official node-exporter mixin. + Uses ignoring(state) to handle additional labels on node_md_disks. severity: critical - name: Host software RAID disk failure description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} needs attention." @@ -279,16 +281,18 @@ groups: severity: info - name: Host OOM kill detected description: OOM kill detected - query: "(increase(node_vmstat_oom_kill[30m]) > 0)" + query: "(delta(node_vmstat_oom_kill[30m]) > 0)" severity: warning comments: | When a machine runs out of memory, the node exporter can become unresponsive for several minutes. Even if the system takes 15–20 minutes to recover, the alert should still trigger. + node_vmstat_oom_kill is exposed as untyped/gauge by node_exporter (from /proc/vmstat), + so delta() is used instead of increase(). 
- name: Host EDAC Correctable Errors detected - description: 'Host {{ $labels.instance }} has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last 5 minutes.' + description: 'Host {{ $labels.instance }} has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last 1 minute.' query: "(increase(node_edac_correctable_errors_total[1m]) > 0)" severity: info - name: Host EDAC Uncorrectable Errors detected - description: 'Host {{ $labels.instance }} has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.' + description: 'Host {{ $labels.instance }} has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC.' query: "(node_edac_uncorrectable_errors_total > 0)" severity: warning - name: Host Network Receive Errors @@ -337,27 +341,27 @@ groups: query: '(max_over_time(smartctl_device_temperature{temperature_type="current"} [5m]) unless on (instance, device) smartctl_device_temperature{temperature_type="drive_trip"}) > 70' severity: critical - name: SMART device temperature over trip value - description: Device temperature over trip value on {{ $labels.instance }} drive {{ $labels.device }}) + description: Device temperature over trip value on {{ $labels.instance }} drive {{ $labels.device }} query: 'max_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) >= on(device, instance) smartctl_device_temperature{temperature_type="drive_trip"}' severity: critical - name: SMART device temperature nearing trip value - description: Device temperature at 80% of trip value on {{ $labels.instance }} drive {{ $labels.device }}) + description: Device temperature at 80% of trip value on {{ $labels.instance }} drive {{ $labels.device }} query: 'max_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) >= on(device, instance) (smartctl_device_temperature{temperature_type="drive_trip"} * .80)' severity: warning - name: SMART 
status - description: Device has a SMART status failure on {{ $labels.instance }} drive {{ $labels.device }}) + description: Device has a SMART status failure on {{ $labels.instance }} drive {{ $labels.device }} query: "smartctl_device_smart_status != 1" severity: critical - name: SMART critical warning - description: Disk controller has critical warning on {{ $labels.instance }} drive {{ $labels.device }}) + description: Disk controller has critical warning on {{ $labels.instance }} drive {{ $labels.device }} query: "smartctl_device_critical_warning > 0" severity: critical - name: SMART media errors - description: Disk controller detected media errors on {{ $labels.instance }} drive {{ $labels.device }}) + description: Disk controller detected media errors on {{ $labels.instance }} drive {{ $labels.device }} query: "smartctl_device_media_errors > 0" severity: critical - name: SMART Wearout Indicator - description: Device is wearing out on {{ $labels.instance }} drive {{ $labels.device }}) + description: Device is wearing out on {{ $labels.instance }} drive {{ $labels.device }} query: "smartctl_device_available_spare < smartctl_device_available_spare_threshold" severity: critical @@ -505,7 +509,7 @@ groups: severity: info - name: Container Low CPU utilization description: 'Container CPU utilization is under 20% for 1 week. Consider reducing the allocated CPU. 
(current: {{ $value | printf "%.2f" }}%)' - query: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) < 20' + query: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) < 20 and sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) > 0' severity: info for: 7d - name: Container Low Memory usage @@ -524,6 +528,7 @@ groups: description: Probe failed query: probe_success == 0 severity: critical + for: 1m - name: Blackbox configuration reload failure description: Blackbox configuration reload failure query: "blackbox_exporter_config_last_reload_successful != 1" @@ -537,6 +542,7 @@ groups: description: HTTP status code is not 200-399 query: "probe_http_status_code <= 199 OR probe_http_status_code >= 400" severity: critical + for: 1m - name: Blackbox SSL certificate will expire soon description: SSL certificate expires in less than 20 days query: "3 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 20" @@ -743,7 +749,7 @@ groups: The exporter uses loose attachment: if a program fails to load (missing BTF, kernel incompatibility), it sets this metric to 0 and continues running. - name: eBPF exporter decoder errors description: "eBPF exporter is experiencing decoder errors for config {{ $labels.config }}. Kernel data is not being correctly transformed into labels. 
(instance {{ $labels.instance }})" - query: 'rate(ebpf_exporter_decoder_errors_total[5m]) > 0' + query: 'rate(ebpf_exporter_decoder_errors_total[5m]) > 0.05' severity: warning for: 5m - name: eBPF exporter no enabled configs @@ -850,7 +856,9 @@ groups: for: 5m - name: Systemd socket refused connections description: "Systemd socket {{ $labels.name }} is refusing connections. ({{ $value }} refused in last 5m, instance {{ $labels.instance }})" - query: 'increase(systemd_socket_refused_connections_total[5m]) > 0' + query: 'delta(systemd_socket_refused_connections_total[5m]) > 3' + comments: | + systemd_socket_refused_connections_total is declared as Gauge by the exporter despite the _total suffix, so delta() is used instead of increase(). severity: warning for: 2m - name: Systemd socket high connections @@ -918,13 +926,17 @@ groups: severity: critical for: 1m - name: MySQL slow queries - description: "MySQL server mysql has some new slow query ({{ $value }} in the last minute)." - query: increase(mysql_global_status_slow_queries[1m]) > 0 + description: "MySQL server has some new slow queries ({{ $value }} in the last minute)." + query: delta(mysql_global_status_slow_queries[1m]) > 0 + comments: | + mysqld_exporter exposes SHOW GLOBAL STATUS variables as untyped/gauge, so delta() is used instead of increase(). severity: warning for: 2m - name: MySQL InnoDB log waits description: "MySQL innodb log writes stalling ({{ $value }} waits/s)" - query: rate(mysql_global_status_innodb_log_waits[15m]) > 10 + query: deriv(mysql_global_status_innodb_log_waits[15m]) > 10 + comments: | + mysqld_exporter exposes SHOW GLOBAL STATUS variables as untyped/gauge, so deriv() is used instead of rate(). severity: warning - name: MySQL restarted description: MySQL has just been restarted, less than one minute ago on {{ $labels.instance }}. @@ -932,7 +944,9 @@ groups: severity: info - name: MySQL High QPS description: MySQL is being overload with unusual QPS (> 10k QPS). 
- query: "irate(mysql_global_status_questions[1m]) > 10000" + query: "deriv(mysql_global_status_questions[1m]) > 10000" + comments: | + mysqld_exporter exposes SHOW GLOBAL STATUS variables as untyped/gauge, so deriv() is used instead of irate(). severity: info for: 2m - name: MySQL too many open files @@ -992,11 +1006,11 @@ groups: for: 2m - name: Postgresql dead locks description: "PostgreSQL has dead-locks ({{ $value }} in the last minute)" - query: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5' + query: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres",datid!="0"}[1m]) > 5' severity: warning - name: Postgresql high rollback rate description: Ratio of transactions being aborted compared to committed is > 2 % - query: 'sum by (namespace,datname) ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) / ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) + (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m])))) > 0.02' + query: 'sum by (namespace,datname,instance) (rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) / (sum by (namespace,datname,instance) (rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) + sum by (namespace,datname,instance) (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m]))) > 0.02 and (sum by (namespace,datname,instance) (rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) + sum by (namespace,datname,instance) (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m]))) > 0' severity: warning - name: Postgresql commit rate low description: Postgresql seems to be processing very few transactions @@ -1008,6 +1022,8 @@ groups: query: "rate(pg_txid_current[1m]) < 5" severity: warning for: 2m + comments: | + pg_txid_current is not a default 
postgres_exporter metric. You need to define a custom query. See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737 - name: Postgresql unused replication slot description: Unused Replication Slots query: "(pg_replication_slots_active == 0) and (pg_replication_is_replica == 0)" @@ -1026,6 +1042,8 @@ groups: description: Database allows connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`. query: "sum by (instance) (pg_stat_ssl_compression) > 0" severity: warning + comments: | + pg_stat_ssl_compression is not a default postgres_exporter metric and is only available on PostgreSQL 9.5-13 (removed in PG 14). See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737 - name: Postgresql too many locks acquired description: Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction. query: "((sum by (instance) (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20 and (pg_settings_max_locks_per_transaction * pg_settings_max_connections) > 0" @@ -1262,8 +1280,8 @@ groups: severity: critical for: 2m - name: Memcached out of memory errors - description: "Memcached is returning out-of-memory errors on {{ $labels.instance }}" - query: "sum without (slab) (rate(memcached_slab_items_outofmemory_total[5m])) > 0" + description: "Memcached is returning out-of-memory errors on {{ $labels.instance }} ({{ $value }} errors/s)" + query: "sum without (slab) (rate(memcached_slab_items_outofmemory_total[5m])) > 0.05" severity: warning for: 5m - name: Memcached memory usage high (> 90%) @@ -1289,12 +1307,12 @@ groups: A low hit rate may indicate poor cache utilization, incorrect cache keys, or TTLs that are too short. 
Threshold of 80% is a rough default — adjust based on your workload and access patterns. - name: Memcached connections rejected description: "Memcached is rejecting connections on {{ $labels.instance }} ({{ $value }} rejections in the last 5m)" - query: "increase(memcached_connections_rejected_total[5m]) > 0" + query: "increase(memcached_connections_rejected_total[5m]) > 3" severity: warning for: 5m - name: Memcached items too large description: "Memcached is rejecting items exceeding max-item-size on {{ $labels.instance }} ({{ $value }} rejections in the last 5m)" - query: "increase(memcached_item_too_large_total[5m]) > 0" + query: "increase(memcached_item_too_large_total[5m]) > 3" severity: info for: 5m @@ -1477,19 +1495,25 @@ groups: severity: warning - name: Elasticsearch High Indexing Latency description: "The indexing latency on Elasticsearch cluster is higher than the threshold (current value: {{ $value }}s)." - query: "rate(elasticsearch_indices_indexing_index_time_seconds_total[1m]) / rate(elasticsearch_indices_indexing_index_total[1m]) > 0.0005 and rate(elasticsearch_indices_indexing_index_total[1m]) > 0" + query: "rate(elasticsearch_indices_indexing_index_time_seconds_total[5m]) / rate(elasticsearch_indices_indexing_index_total[5m]) > 0.01 and rate(elasticsearch_indices_indexing_index_total[5m]) > 0" severity: warning for: 10m + comments: | + Threshold of 10ms (0.01s) per indexing operation is a rough default. Adjust based on your document size and cluster performance. - name: Elasticsearch High Indexing Rate description: "The indexing rate on Elasticsearch cluster is higher than the threshold." query: "sum(rate(elasticsearch_indices_indexing_index_total[1m]))> 10000" severity: warning for: 5m + comments: | + Threshold of 10000 ops/s is a rough default. Adjust based on your cluster capacity and expected workload. - name: Elasticsearch High Query Rate description: "The query rate on Elasticsearch cluster is higher than the threshold." 
query: "sum(rate(elasticsearch_indices_search_query_total[1m])) > 100" severity: warning for: 5m + comments: | + Threshold of 100 queries/s is very low for most production clusters. Adjust based on your expected query volume. - name: Elasticsearch High Query Latency description: "The query latency on Elasticsearch cluster is higher than the threshold (current value: {{ $value }}s)." query: "rate(elasticsearch_indices_search_query_time_seconds[1m]) / rate(elasticsearch_indices_search_query_total[1m]) > 1 and rate(elasticsearch_indices_search_query_total[1m]) > 0" @@ -1606,12 +1630,12 @@ groups: severity: critical - name: "Cassandra client request write failure (Instaclustr)" description: "Write failures have occurred, ensure there are not too many unavailable nodes - {{ $labels.cassandra_cluster }}" - query: 'increase(cassandra_client_request_failures_total{operation="write"}[1m]) > 0' + query: 'increase(cassandra_client_request_failures_total{operation="write"}[1m]) > 5' for: 2m severity: critical - name: "Cassandra client request read failure (Instaclustr)" description: "Read failures have occurred, ensure there are not too many unavailable nodes - {{ $labels.cassandra_cluster }}" - query: 'increase(cassandra_client_request_failures_total{operation="read"}[1m]) > 0' + query: 'increase(cassandra_client_request_failures_total{operation="read"}[1m]) > 5' for: 2m severity: critical @@ -1635,7 +1659,7 @@ groups: for: 2m - name: Cassandra authentication failures description: Increase of Cassandra authentication failures - query: 'rate(cassandra_stats{name="org:apache:cassandra:metrics:client:authfailure:count"}[1m]) > 5' + query: 'delta(cassandra_stats{name="org:apache:cassandra:metrics:client:authfailure:count"}[1m]) > 5' severity: warning for: 2m - name: Cassandra node down @@ -1672,7 +1696,7 @@ groups: for: 2m - name: Cassandra connection timeouts total (Criteo) description: Some connection between nodes are ending in timeout - query: 
'rate(cassandra_stats{name="org:apache:cassandra:metrics:connection:totaltimeouts:count"}[1m]) > 5' + query: 'delta(cassandra_stats{name="org:apache:cassandra:metrics:connection:totaltimeouts:count"}[1m]) > 5' severity: critical for: 2m - name: Cassandra storage exceptions (Criteo) @@ -1693,17 +1717,19 @@ groups: severity: critical - name: Cassandra client request write failure (Criteo) description: A lot of write failures encountered. A write failure is a non-timeout exception encountered during a write request. Examine the reason map to find to the root cause. The most common cause for this type of error is when batch sizes are too large. - query: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:write:failures:oneminuterate"} > 0' + query: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:write:failures:oneminuterate"} > 0.05' severity: critical - name: Cassandra client request read failure (Criteo) description: A lot of read failures encountered. A read failure is a non-timeout exception encountered during a read request. Examine the reason map to find to the root cause. The most common cause for this type of error is when batch sizes are too large. - query: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:read:failures:oneminuterate"} > 0' + query: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:read:failures:oneminuterate"} > 0.05' severity: critical - name: Cassandra cache hit rate key cache description: Key cache hit rate is below 85% query: 'cassandra_stats{name="org:apache:cassandra:metrics:cache:keycache:hitrate:value"} < .85' - severity: critical + severity: warning for: 2m + comments: | + A low key cache hit rate increases disk I/O. Threshold is workload-dependent — adjust based on your data access patterns. - name: Clickhouse exporters: @@ -1789,12 +1815,12 @@ groups: - name: ClickHouse rejected insert queries description: "INSERTs rejected due to too many active data parts. 
Reduce insert frequency." - query: "increase(ClickHouseProfileEvents_RejectedInserts[1m]) > 0" + query: "increase(ClickHouseProfileEvents_RejectedInserts[1m]) > 2" severity: warning for: 1m - name: ClickHouse delayed insert queries description: "INSERTs delayed due to high number of active parts." - query: "increase(ClickHouseProfileEvents_DelayedInserts[5m]) > 0" + query: "increase(ClickHouseProfileEvents_DelayedInserts[5m]) > 10" severity: warning for: 2m - name: ClickHouse zookeeper hardware exception @@ -1811,7 +1837,7 @@ groups: Please replace the threshold with an appropriate value - name: ClickHouse distributed rejected inserts description: "INSERTs into Distributed tables rejected due to pending bytes limit." - query: "increase(ClickHouseProfileEvents_DistributedRejectedInserts[5m]) > 0" + query: "increase(ClickHouseProfileEvents_DistributedRejectedInserts[5m]) > 3" severity: critical for: 2m @@ -1836,11 +1862,15 @@ groups: query: "couchdb_httpd_open_databases > 0.9 * 1000" severity: critical for: 5m + comments: | + The default max_dbs_open is 500. Adjust the threshold (currently 0.9 * 1000) to match your max_dbs_open setting. - name: CouchDB open OS files critical description: CouchDB is using more than 90% of allowed OS file descriptors, may fail to open new files query: "couchdb_httpd_open_os_files > 0.9 * 65535" severity: critical for: 5m + comments: | + Adjust 65535 to match your system's file descriptor limit (ulimit -n). 
- name: CouchDB 5xx error ratio high description: More than 5% of HTTP requests are returning 5xx errors query: "rate(couchdb_httpd_status_codes{code=~\"5..\"}[5m]) / rate(couchdb_httpd_requests[5m]) > 0.05 and rate(couchdb_httpd_requests[5m]) > 0" @@ -1908,7 +1938,7 @@ groups: for: 1m - name: CouchDB critical log entries description: Critical or error log entries detected in the last 5 minutes - query: "increase(couchdb_server_couch_log{level=~\"error|critical\"}[5m]) > 0" + query: "increase(couchdb_server_couch_log{level=~\"error|critical\"}[5m]) > 5" severity: critical for: 1m @@ -1952,7 +1982,7 @@ groups: comments: | 1m delay allows a restart without triggering an alert. - name: RabbitMQ node not distributed - description: Distribution link state is not 'up' + description: Distribution link to peer {{ $labels.peer }} is not 'up' (state {{ $value }}) query: "erlang_vm_dist_node_state < 3" severity: critical for: 1m @@ -1974,12 +2004,12 @@ groups: severity: warning for: 2m - name: RabbitMQ too many ready messages - description: RabbitMQ too many ready messages on {{ $labels.instance }} + description: RabbitMQ too many ready messages on queue {{ $labels.queue }} ({{ $value }}) query: "sum(rabbitmq_queue_messages_ready) BY (queue) > 1000" severity: warning for: 1m - name: RabbitMQ too many unack messages - description: Too many unacknowledged messages + description: Too many unacknowledged messages on queue {{ $labels.queue }} ({{ $value }}) query: "sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000" severity: warning for: 1m @@ -1994,10 +2024,12 @@ groups: severity: warning for: 1m # allows a short service restart - name: RabbitMQ unroutable messages - description: A queue has unroutable messages ({{ $value }} in the last 1m) - query: "increase(rabbitmq_channel_messages_unroutable_returned_total[1m]) > 0 or increase(rabbitmq_channel_messages_unroutable_dropped_total[1m]) > 0" + description: A queue has unroutable messages ({{ $value }} in the last 5m) + query: 
"increase(rabbitmq_channel_messages_unroutable_returned_total[5m]) > 3 or increase(rabbitmq_channel_messages_unroutable_dropped_total[5m]) > 3" severity: warning for: 2m + comments: | + Threshold of 3 avoids noise from occasional misroutes. Adjust based on your expected traffic patterns. - name: kbudde/rabbitmq-exporter slug: kbudde-rabbitmq-exporter @@ -2018,7 +2050,7 @@ groups: comments: | 1m delay allows a restart without triggering an alert. - name: RabbitMQ cluster partition - description: Cluster partition + description: RabbitMQ cluster has a network partition ({{ $value }} partitions detected). Messages may be lost or duplicated. query: "rabbitmq_partitions > 0" severity: critical - name: RabbitMQ out of memory @@ -2028,7 +2060,7 @@ groups: for: 2m - name: RabbitMQ instance too many connections description: RabbitMQ instance has too many connections (> 1000) - query: "rabbitmq_connectionsTotal > 1000" + query: "rabbitmq_connections > 1000" severity: warning for: 2m - name: RabbitMQ dead letter queue filling up @@ -2095,11 +2127,11 @@ groups: query: "sum(zk_server_leader) == 0" severity: critical - name: Zookeeper Too Many Leaders - description: "Zookeeper cluster has too many nodes marked as leader" + description: "Zookeeper cluster has {{ $value }} nodes marked as leader (expected 1), indicating a split-brain" query: "sum(zk_server_leader) > 1" severity: critical - name: Zookeeper Not Ok - description: "Zookeeper instance is not ok" + description: "Zookeeper instance {{ $labels.instance }} is not ok (ruok check failed)" query: "zk_ruok == 0" severity: warning for: 3m @@ -2111,7 +2143,7 @@ groups: doc_url: https://github.com/danielqsj/kafka_exporter rules: - name: Kafka topics replicas - description: Kafka topic in-sync partition + description: Kafka topic {{ $labels.topic }} has fewer than 3 in-sync replicas ({{ $value }}), data durability is at risk. 
query: "min(kafka_topic_partition_in_sync_replica) by (topic) < 3" severity: critical - name: Kafka consumer group lag @@ -2160,32 +2192,41 @@ groups: for: 1h severity: critical - name: Pulsar high write latency - description: "Messages cannot be written in a timely fashion" - query: sum(pulsar_storage_write_latency_overflow > 0) by (topic) + description: "Pulsar topic {{ $labels.topic }} has {{ $value }} storage write operations exceeding the maximum latency bucket (> 1000ms)" + query: sum(pulsar_storage_write_latency_le_overflow > 0) by (topic) for: 1h severity: critical + comments: | + pulsar_storage_write_latency_le_overflow is the overflow bucket of Pulsar's non-standard histogram. + It counts write operations exceeding all defined latency bounds (> 1000ms). - name: Pulsar large message payload - description: "Observing large message payload (> 1MB)" - query: sum(pulsar_entry_size_overflow > 0) by (topic) + description: "Pulsar topic {{ $labels.topic }} has {{ $value }} message entries exceeding the maximum size bucket (> 1MB)" + query: sum(pulsar_entry_size_le_overflow > 0) by (topic) for: 1h severity: warning + comments: | + pulsar_entry_size_le_overflow is the overflow bucket of Pulsar's non-standard histogram. + It counts message entries exceeding all defined size bounds. - name: Pulsar high ledger disk usage description: "Observing Ledger Disk Usage (> 75%)" query: sum(bookie_ledger_dir__pulsar_data_bookkeeper_ledgers_usage) by (kubernetes_pod_name) > 75 for: 1h severity: critical + comments: | + This metric name is path-dependent and may differ based on your BookKeeper data directory configuration. + Adjust the metric name to match your actual ledger directory path. 
- name: Pulsar read only bookies description: "Observing Readonly Bookies" query: count(bookie_SERVER_STATUS{} == 0) by (pod) for: 5m severity: critical - name: Pulsar high number of function errors - description: "Observing more than 10 Function errors per minute" + description: "Pulsar function {{ $labels.name }} has more than 10 errors per second ({{ $value | printf \"%.2f\" }}/s)" query: sum(rate(pulsar_function_user_exceptions_total[1m]) + rate(pulsar_function_system_exceptions_total[1m])) by (name) > 10 for: 1m severity: critical - name: Pulsar high number of sink errors - description: "Observing more than 10 Sink errors per minute" + description: "Pulsar sink {{ $labels.name }} has more than 10 errors per second ({{ $value | printf \"%.2f\" }}/s)" query: sum(rate(pulsar_sink_sink_exceptions_total[1m])) by (name) > 10 for: 1m severity: critical @@ -2216,6 +2257,8 @@ groups: query: 'absent(up{job="nats"})' severity: critical for: 5m + comments: | + Replace job="nats" with the actual job name in your Prometheus configuration. 
- name: Nats high CPU usage description: NATS server is using more than 80% CPU for the last 5 minutes query: "gnatsd_varz_cpu > 80" @@ -2240,7 +2283,7 @@ groups: for: 5m - name: Nats high number of subscriptions description: NATS server has more than 1000 active subscriptions - query: "gnatsd_connz_subscriptions > 1000" + query: "gnatsd_varz_subscriptions > 1000" severity: warning for: 5m - name: Nats high pending bytes @@ -2250,7 +2293,7 @@ groups: for: 5m - name: Nats too many errors description: NATS server has encountered {{ $value }} JetStream API errors in the last 5 minutes - query: "increase(gnatsd_varz_jetstream_stats_api_errors[5m]) > 0" + query: "increase(gnatsd_varz_jetstream_stats_api_errors[5m]) > 5" severity: warning for: 5m - name: Nats JetStream accounts exceeded @@ -2263,6 +2306,9 @@ groups: query: "gnatsd_varz_leafnodes == 0" severity: warning for: 5m + comments: | + Only enable this alert if your deployment requires leaf node connections. + This will fire spuriously if leaf nodes are not configured. - name: Proxies, load balancers and service meshes services: @@ -2306,7 +2352,7 @@ groups: - name: Apache restart description: Apache has just been restarted. 
query: "apache_uptime_seconds_total / 60 < 1" - severity: warning + severity: info - name: HaProxy exporters: @@ -2315,12 +2361,12 @@ groups: doc_url: https://github.com/haproxy/haproxy/tree/master/contrib/prometheus-exporter rules: - name: HAProxy high HTTP 4xx error rate backend - description: Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }} + description: Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.proxy }} query: ((sum by (proxy) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 and sum by (proxy) (rate(haproxy_server_http_responses_total[1m])) > 0 severity: critical for: 1m - name: HAProxy high HTTP 5xx error rate backend - description: Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }} + description: Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.proxy }} query: ((sum by (proxy) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 and sum by (proxy) (rate(haproxy_server_http_responses_total[1m])) > 0 severity: critical for: 1m @@ -2340,17 +2386,17 @@ groups: severity: critical for: 1m - name: HAProxy backend connection errors - description: Too many connection errors to {{ $labels.fqdn }}/{{ $labels.backend }} backend (> 100 req/s). Request throughput may be too high. + description: Too many connection errors to {{ $labels.proxy }} backend (> 100 req/s). Request throughput may be too high. query: (sum by (proxy) (rate(haproxy_backend_connection_errors_total[1m]))) > 100 severity: critical for: 1m - name: HAProxy server connection errors - description: Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high. + description: Too many connection errors to {{ $labels.proxy }} (> 100 req/s). 
Request throughput may be too high. query: (sum by (proxy) (rate(haproxy_server_connection_errors_total[1m]))) > 100 severity: critical - name: HAProxy backend max active session > 80% description: Session limit from backend {{ $labels.proxy }} reached 80% of limit - {{ $value | printf "%.2f"}}% - query: ((haproxy_backend_current_sessions >0) * 100) / (haproxy_backend_limit_sessions > 0) > 80 + query: (haproxy_backend_current_sessions / haproxy_backend_limit_sessions * 100) > 80 and haproxy_backend_limit_sessions > 0 severity: warning for: 2m - name: HAProxy pending requests @@ -2361,7 +2407,7 @@ groups: severity: warning for: 2m - name: HAProxy HTTP slowing down - description: Average request time is increasing - {{ $value | printf "%.2f"}} + description: HAProxy backend max total time is above 1s on {{ $labels.proxy }} - {{ $value | printf "%.2f"}}s query: avg by (instance, proxy) (haproxy_backend_max_total_time_seconds) > 1 severity: warning for: 1m @@ -2381,9 +2427,8 @@ groups: for: 2m - name: HAProxy server healthcheck failure description: Some server healthcheck are failing on {{ $labels.server }} ({{ $value }} in the last 1m) - query: increase(haproxy_server_check_failures_total[1m]) > 0 + query: increase(haproxy_server_check_failures_total[1m]) > 2 severity: warning - for: 1m - name: prometheus/haproxy_exporter (HAProxy < v2) slug: haproxy-exporter-v1 doc_url: https://github.com/prometheus/haproxy_exporter @@ -2393,13 +2438,13 @@ groups: query: "haproxy_up == 0" severity: critical - name: HAProxy high HTTP 4xx error rate backend (v1) - description: Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }} - query: 'sum by (backend) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 0' + description: Too many HTTP requests with status 4xx (> 5%) on backend {{ 
$labels.backend }} + query: 'sum by (backend) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) * 100 > 5 and sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 0' severity: critical for: 1m - name: HAProxy high HTTP 5xx error rate backend (v1) - description: Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }} - query: 'sum by (backend) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 0' + description: Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.backend }} + query: 'sum by (backend) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) * 100 > 5 and sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 0' severity: critical for: 1m - name: HAProxy high HTTP 4xx error rate server (v1) @@ -2418,7 +2463,7 @@ groups: severity: critical for: 1m - name: HAProxy backend connection errors (v1) - description: Too many connection errors to {{ $labels.fqdn }}/{{ $labels.backend }} backend (> 100 req/s). Request throughput may be too high. + description: Too many connection errors to {{ $labels.backend }} backend (> 100 req/s). Request throughput may be too high. query: "sum by (backend) (rate(haproxy_backend_connection_errors_total[1m])) > 100" severity: critical for: 1m @@ -2427,12 +2472,12 @@ groups: query: "sum by (server) (rate(haproxy_server_connection_errors_total[1m])) > 100" severity: critical - name: HAProxy backend max active session - description: HAproxy backend {{ $labels.fqdn }}/{{ $labels.backend }} is reaching session limit (> 80%). + description: HAProxy backend {{ $labels.backend }} is reaching session limit (> 80%). 
query: "((sum by (backend) (haproxy_backend_current_sessions * 100) / sum by (backend) (haproxy_backend_limit_sessions))) > 80 and sum by (backend) (haproxy_backend_limit_sessions) > 0" severity: warning for: 2m - name: HAProxy pending requests (v1) - description: Some HAProxy requests are pending on {{ $labels.fqdn }}/{{ $labels.backend }} backend + description: Some HAProxy requests are pending on {{ $labels.backend }} backend query: "sum by (backend) (haproxy_backend_current_queue) > 0" severity: warning for: 2m @@ -2442,7 +2487,7 @@ groups: severity: warning for: 1m - name: HAProxy retry high (v1) - description: High rate of retry on {{ $labels.fqdn }}/{{ $labels.backend }} backend + description: High rate of retry on {{ $labels.backend }} backend query: "sum by (backend) (rate(haproxy_backend_retry_warnings_total[1m])) > 10" severity: warning for: 2m @@ -2461,9 +2506,8 @@ groups: for: 2m - name: HAProxy server healthcheck failure (v1) description: Some server healthcheck are failing on {{ $labels.server }} ({{ $value }} in the last 1m) - query: "increase(haproxy_server_check_failures_total[1m]) > 0" + query: "increase(haproxy_server_check_failures_total[1m]) > 2" severity: warning - for: 1m - name: Traefik exporters: @@ -2511,8 +2555,8 @@ groups: doc_url: https://caddyserver.com/docs/metrics rules: - name: Caddy Reverse Proxy Down - description: "All Caddy reverse proxies are down" - query: "count(caddy_reverse_proxy_upstreams_healthy) by (upstream) == 0" + description: "Caddy reverse proxy upstream {{ $labels.upstream }} is unhealthy" + query: "caddy_reverse_proxy_upstreams_healthy == 0" severity: critical - name: Caddy high HTTP 4xx error rate service @@ -2562,7 +2606,7 @@ groups: severity: critical for: 1m - name: Envoy cluster membership degraded - description: "More than 25% of members in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} are unhealthy" + description: "Only {{ $value | printf \"%.1f\" }}% of members in cluster {{ 
$labels.envoy_cluster_name }} on {{ $labels.instance }} are healthy (threshold: 75%)" query: "envoy_cluster_membership_healthy / envoy_cluster_membership_total * 100 < 75 and envoy_cluster_membership_total > 0" severity: warning for: 5m @@ -2613,7 +2657,7 @@ groups: severity: critical - name: Envoy no healthy upstream description: "Upstream connection attempts failed because no healthy upstream was available in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} ({{ $value }} in the last 5m)" - query: "increase(envoy_cluster_upstream_cx_none_healthy[5m]) > 0" + query: "increase(envoy_cluster_upstream_cx_none_healthy[5m]) > 3" severity: critical - name: Envoy high downstream request timeout rate description: "Downstream requests are timing out on {{ $labels.instance }} ({{ $value }} in the last 5m)" @@ -2642,11 +2686,11 @@ groups: doc_url: https://istio.io/latest/docs/tasks/observability/metrics/querying-metrics/ rules: - name: Istio Kubernetes gateway availability drop - description: Gateway pods have dropped. Inbound traffic will likely be affected. + description: Istio ingress gateway has only {{ $value }} available pod(s). Inbound traffic will likely be affected. query: 'min(kube_deployment_status_replicas_available{deployment="istio-ingressgateway", namespace="istio-system"}) without (instance, pod) < 2' severity: warning for: 1m - - name: Istio Pilot high total request rate + - name: Istio Pilot high push error rate description: Number of Istio Pilot push errors is too high (> 5%). Envoy sidecars might have outdated configuration. query: "sum(rate(pilot_xds_push_errors[1m])) / sum(rate(pilot_xds_pushes[1m])) * 100 > 5 and sum(rate(pilot_xds_pushes[1m])) > 0" severity: warning @@ -2656,39 +2700,45 @@ groups: query: 'sum(rate(mixer_runtime_dispatches_total{adapter=~"prometheus"}[1m])) < 180' severity: warning for: 1m + comments: | + Mixer was deprecated in Istio 1.5 and removed in Istio 1.8+. This alert only applies to Istio < 1.8. 
- name: Istio high total request rate - description: Global request rate in the service mesh is unusually high. + description: Global request rate in the service mesh is unusually high ({{ $value | printf "%.2f" }} req/s). query: 'sum(rate(istio_requests_total{reporter="destination"}[5m])) > 1000' severity: warning for: 2m + comments: | + Threshold of 1000 req/s is a rough default. Adjust to your expected peak traffic. - name: Istio low total request rate - description: Global request rate in the service mesh is unusually low. + description: Global request rate in the service mesh is unusually low ({{ $value | printf "%.2f" }} req/s). query: 'sum(rate(istio_requests_total{reporter="destination"}[5m])) < 100' severity: warning for: 2m + comments: | + Threshold of 100 req/s is a rough default. Adjust to your expected baseline traffic. This alert may fire on startup or low-traffic environments. - name: Istio high 4xx error rate - description: High percentage of HTTP 4xx responses in Istio (> 5%). + description: High percentage of HTTP 4xx responses in Istio ({{ $value | printf "%.1f" }}% > 5%). query: 'sum(rate(istio_requests_total{reporter="destination", response_code=~"4.*"}[5m])) / sum(rate(istio_requests_total{reporter="destination"}[5m])) * 100 > 5 and sum(rate(istio_requests_total{reporter="destination"}[5m])) > 0' severity: warning for: 1m - name: Istio high 5xx error rate - description: High percentage of HTTP 5xx responses in Istio (> 5%). + description: High percentage of HTTP 5xx responses in Istio ({{ $value | printf "%.1f" }}% > 5%). query: 'sum(rate(istio_requests_total{reporter="destination", response_code=~"5.*"}[5m])) / sum(rate(istio_requests_total{reporter="destination"}[5m])) * 100 > 5 and sum(rate(istio_requests_total{reporter="destination"}[5m])) > 0' severity: warning for: 1m - name: Istio high request latency - description: Istio average requests execution is longer than 100ms. 
+ description: Istio average request duration is {{ $value }}ms (> 100ms). query: 'rate(istio_request_duration_milliseconds_sum{reporter="destination"}[1m]) / rate(istio_request_duration_milliseconds_count{reporter="destination"}[1m]) > 100 and rate(istio_request_duration_milliseconds_count{reporter="destination"}[1m]) > 0' severity: warning for: 1m - name: Istio latency 99 percentile - description: Istio 1% slowest requests are longer than 1000ms. - query: "histogram_quantile(0.99, sum(rate(istio_request_duration_milliseconds_bucket[1m])) by (destination_canonical_service, destination_workload_namespace, source_canonical_service, source_workload_namespace, le)) > 1000" + description: "Istio p99 request latency is {{ $value }}ms (threshold: 1000ms)." + query: "histogram_quantile(0.99, sum(rate(istio_request_duration_milliseconds_bucket[1m])) by (destination_canonical_service, destination_workload_namespace, le)) > 1000" severity: warning for: 1m - name: Istio Pilot Duplicate Entry - description: Istio pilot duplicate entry error. - query: "sum(rate(pilot_duplicate_envoy_clusters{}[5m])) > 0" + description: Istio Pilot has detected {{ $value }} duplicate Envoy cluster(s), indicating misconfigured DestinationRules or ServiceEntries. + query: "sum(pilot_duplicate_envoy_clusters{}) > 0" severity: critical - name: Runtimes @@ -2825,22 +2875,27 @@ groups: Threshold is a rough default. Adjust based on your application's normal object count. - name: Go GC CPU fraction high description: Go GC is consuming too much CPU (> 5%) - query: 'go_memstats_gc_cpu_fraction > 0.05' + query: 'rate(go_gc_duration_seconds_sum[5m]) > 0.05' severity: warning for: 5m comments: | - go_memstats_gc_cpu_fraction is deprecated since Go 1.20 and may return 0 in newer versions. - Consider using runtime/metrics-based alternatives if running Go >= 1.20. + rate(go_gc_duration_seconds_sum) measures the fraction of wall-clock time spent in GC stop-the-world pauses; concurrent GC work is not included, so it under-reports total GC CPU.
+ This replaces go_memstats_gc_cpu_fraction which was removed in client_golang v1.12+. - name: Go goroutine spike - description: Go goroutine count is growing rapidly - query: 'deriv(go_goroutines[5m]) > 100' + description: Go goroutine count is growing rapidly ({{ $value | printf "%.0f" }} goroutines/s) + query: 'deriv(go_goroutines[5m]) > 10' severity: warning for: 5m - - name: Go heap fragmentation - description: Go heap has high idle ratio (> 90%), indicating memory fragmentation - query: 'go_memstats_heap_idle_bytes / go_memstats_heap_sys_bytes > 0.9' + comments: | + A threshold of 100/s only catches catastrophic leaks (30k goroutines in 5m). 10/s catches gradual leaks (~3k in 5m). + Adjust based on your application's expected concurrency patterns. + - name: Go heap in-use growing + description: Go heap in-use memory is growing steadily, potential memory leak or under-sized heap + query: 'deriv(go_memstats_heap_inuse_bytes[10m]) > 1e7' severity: warning - for: 5m + comments: | + Alerts when heap in-use grows by more than 10MB/s sustained over 10 minutes. + Adjust threshold based on your workload. - name: Go memory leak description: Go application has sustained high allocation rate (> 1GB/s), potential memory leak query: 'rate(go_memstats_alloc_bytes_total[5m]) > 1e9' @@ -2872,11 +2927,11 @@ groups: for: 5m - name: Ruby major GC rate high description: Ruby is performing too many major GC cycles, indicating memory pressure - query: 'rate(ruby_major_gc_ops_total[5m]) > 5' + query: 'rate(ruby_major_gc_ops_total[5m]) > 2' severity: warning for: 5m comments: | - Major GC rate > 5/s is extremely high. Consider lowering to > 1 or > 2 for earlier detection. + Major GC rate > 5/s only fires if the app is essentially non-functional. Threshold of 2/s provides earlier detection. 
- name: Ruby RSS high description: Ruby process RSS is high (> 1GB) query: 'ruby_rss > 1e9' @@ -2895,8 +2950,8 @@ groups: doc_url: https://github.com/prometheus/client_python rules: - name: Python GC objects uncollectable - description: Python has uncollectable objects, potential memory leak via reference cycles - query: 'increase(python_gc_objects_uncollectable_total[5m]) > 0' + description: Python has uncollectable objects ({{ $value }}), potential memory leak via reference cycles + query: 'increase(python_gc_objects_uncollectable_total[5m]) > 1' severity: warning for: 5m - name: Python GC collections high @@ -2933,13 +2988,13 @@ groups: doc_url: https://github.com/Strech/sidekiq-prometheus-exporter rules: - name: Sidekiq queue size - description: Sidekiq queue {{ $labels.name }} is growing - query: "sidekiq_queue_size > 100" + description: Sidekiq queue {{ $labels.name }} is growing ({{ $value }} enqueued jobs) + query: "sidekiq_queue_enqueued_jobs > 100" severity: warning for: 1m - name: Sidekiq scheduling latency too high description: Sidekiq jobs are taking more than 1min to be picked up. Users may be seeing delays in background processing. - query: "max(sidekiq_queue_latency) > 60" + query: "max(sidekiq_queue_latency_seconds) > 60" severity: critical - name: Data engineering @@ -2969,14 +3024,16 @@ groups: This alert fires when there are no available task slots. Adjust the threshold if your cluster is expected to run at full capacity. - name: Flink job restart increasing description: "Flink job {{ $labels.job_name }} has restarted {{ $value }} times in the last 5 minutes." - query: "increase(flink_jobmanager_job_numRestarts[5m]) > 1" + query: "delta(flink_jobmanager_job_numRestarts[5m]) > 1" severity: warning for: 5m comments: | A single restart may be normal during deployments. Adjust threshold based on restart tolerance. + Flink exposes numRestarts as a gauge (cumulative count), + so delta() is used instead of increase().
- name: Flink checkpoint failures description: "Flink job {{ $labels.job_name }} has {{ $value }} failed checkpoints in the last 10 minutes." - query: "increase(flink_jobmanager_job_numberOfFailedCheckpoints[10m]) > 1" + query: "delta(flink_jobmanager_job_numberOfFailedCheckpoints[10m]) > 1" severity: warning for: 5m - name: Flink checkpoint duration high @@ -3001,24 +3058,27 @@ groups: Fires when a task spends more than 500ms/sec backpressured. This indicates the task cannot keep up with upstream data rate. - name: Flink TaskManager heap memory high description: "Flink TaskManager {{ $labels.instance }} heap memory usage is above 90%." - query: "flink_taskmanager_Status_JVM_Memory_Heap_Used / flink_taskmanager_Status_JVM_Memory_Heap_Max > 0.9" + query: "flink_taskmanager_Status_JVM_Memory_Heap_Used / flink_taskmanager_Status_JVM_Memory_Heap_Max > 0.9 and flink_taskmanager_Status_JVM_Memory_Heap_Max > 0" severity: warning for: 5m + comments: | + Flink TaskManagers manage their own memory pool. High JVM heap usage (outside managed memory) may indicate memory leaks or misconfiguration. - name: Flink JobManager heap memory high description: "Flink JobManager {{ $labels.instance }} heap memory usage is above 90%." - query: "flink_jobmanager_Status_JVM_Memory_Heap_Used / flink_jobmanager_Status_JVM_Memory_Heap_Max > 0.9" + query: "flink_jobmanager_Status_JVM_Memory_Heap_Used / flink_jobmanager_Status_JVM_Memory_Heap_Max > 0.9 and flink_jobmanager_Status_JVM_Memory_Heap_Max > 0" severity: warning for: 5m - name: Flink TaskManager GC time high description: "Flink TaskManager {{ $labels.instance }} is spending more than 10% of time in garbage collection." - query: "rate(flink_taskmanager_Status_JVM_GarbageCollector_All_Time[5m]) > 100" + query: "deriv(flink_taskmanager_Status_JVM_GarbageCollector_All_Time[5m]) > 100" severity: warning for: 5m comments: | + Flink exposes GC time as a gauge (cumulative milliseconds), so deriv() is used instead of rate(). 
Threshold: more than 100ms/sec of GC time (10% of wall clock). Adjust based on your workload. - name: Flink no records processed description: "Flink task {{ $labels.task_name }} has not processed any records in the last 5 minutes." - query: "rate(flink_taskmanager_job_task_numRecordsIn[5m]) == 0 and flink_taskmanager_job_task_numRecordsIn > 0" + query: "delta(flink_taskmanager_job_task_numRecordsIn[5m]) == 0 and flink_taskmanager_job_task_numRecordsIn > 0" severity: warning for: 5m comments: | @@ -3099,6 +3159,10 @@ groups: for: 5m severity: critical description: "The Hadoop NameNode service is unavailable." + comments: | + When targets are managed via service discovery, a disappeared target goes stale rather than reporting up==0, + so this alert may not fire. Prefer application-level availability metrics if available. + Rename job="hadoop-namenode" to match the actual job name in your Prometheus scrape config. # Alert rule for ResourceManager availability - name: Hadoop Resource Manager Down @@ -3106,6 +3170,10 @@ groups: for: 5m severity: critical description: "The Hadoop ResourceManager service is unavailable." + comments: | + When targets are managed via service discovery, a disappeared target goes stale rather than reporting up==0, + so this alert may not fire. Prefer application-level availability metrics if available. + Rename job="hadoop-resourcemanager" to match the actual job name in your Prometheus scrape config. # Alert rule for DataNode status - name: Hadoop Data Node Out Of Service @@ -3130,7 +3198,7 @@ groups: # Alert rule for high ResourceManager memory usage - name: Hadoop Resource Manager Memory High - query: hadoop_resourcemanager_memory_bytes / hadoop_resourcemanager_memory_max_bytes > 0.8 + query: hadoop_resourcemanager_memory_bytes / hadoop_resourcemanager_memory_max_bytes > 0.8 and hadoop_resourcemanager_memory_max_bytes > 0 for: 15m severity: warning description: "The Hadoop ResourceManager is approaching its memory limit." 
@@ -3151,7 +3219,7 @@ groups: # Alert rule for low HBase region server heap space - name: Hadoop HBase Region Server Heap Low - query: hadoop_hbase_region_server_heap_bytes / hadoop_hbase_region_server_max_heap_bytes > 0.8 + query: hadoop_hbase_region_server_heap_bytes / hadoop_hbase_region_server_max_heap_bytes > 0.8 and hadoop_hbase_region_server_max_heap_bytes > 0 for: 10m severity: warning description: "HBase Region Servers are running low on heap space." @@ -3240,7 +3308,7 @@ groups: severity: critical - name: Kubernetes PersistentVolume error description: "Persistent volume {{ $labels.persistentvolume }} is in bad state" - query: 'kube_persistentvolume_status_phase{phase=~"Failed|Pending", job="kube-state-metrics"} > 0' + query: 'kube_persistentvolume_status_phase{phase=~"Failed|Pending"} > 0' severity: critical - name: Kubernetes StatefulSet down description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} went down @@ -3356,20 +3424,20 @@ groups: slug: embedded-exporter rules: - name: Nomad job failed - description: Nomad job failed + description: "Nomad job {{ $labels.job }} has {{ $value }} failed allocations." query: "nomad_nomad_job_summary_failed > 0" severity: warning - name: Nomad job lost - description: Nomad job lost + description: "Nomad job {{ $labels.job }} has {{ $value }} lost allocations." query: "nomad_nomad_job_summary_lost > 0" severity: warning - name: Nomad job queued - description: Nomad job queued + description: "Nomad job {{ $labels.job }} has {{ $value }} queued allocations." query: "nomad_nomad_job_summary_queued > 0" severity: warning for: 2m - name: Nomad blocked evaluation - description: Nomad blocked evaluation + description: "Nomad has {{ $value }} blocked evaluations. The cluster may lack resources to place allocations." 
query: "nomad_nomad_blocked_evals_total_blocked > 0" severity: warning @@ -3429,24 +3497,29 @@ groups: query: 'histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{grpc_type="unary"}[1m])) by (grpc_service, grpc_method, le)) > 0.15' severity: warning for: 2m + # etcd_http_* metrics are from the v2 API and were removed in etcd 3.x. + # These rules only apply if you are running etcd 2.x. - name: Etcd high number of failed HTTP requests warning description: More than 1% HTTP failure detected in Etcd query: "sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.01 and sum(rate(etcd_http_received_total[1m])) BY (method) > 0" severity: warning for: 2m + comments: "These etcd_http_* metrics are from the etcd v2 API and do not exist in etcd 3.x. Remove these rules if running etcd 3.x." - name: Etcd high number of failed HTTP requests critical description: More than 5% HTTP failure detected in Etcd query: "sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.05 and sum(rate(etcd_http_received_total[1m])) BY (method) > 0" severity: critical for: 2m + comments: "These etcd_http_* metrics are from the etcd v2 API and do not exist in etcd 3.x. Remove these rules if running etcd 3.x." - name: Etcd HTTP requests slow description: HTTP requests slowing down, 99th percentile is over 0.15s query: "histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[1m])) > 0.15" severity: warning for: 2m + comments: "This etcd_http_* metric is from the etcd v2 API and does not exist in etcd 3.x. Remove this rule if running etcd 3.x." 
- name: Etcd member communication slow description: Etcd member communication slowing down, 99th percentile is over 0.15s - query: "histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) > 0.15" + query: "histogram_quantile(0.99, sum(rate(etcd_network_peer_round_trip_time_seconds_bucket[5m])) by (instance, le)) > 0.15" severity: warning for: 2m - name: Etcd high number of failed proposals @@ -3456,12 +3529,12 @@ groups: for: 2m - name: Etcd high fsync durations description: Etcd WAL fsync duration increasing, 99th percentile is over 0.5s - query: "histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) > 0.5" + query: "histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) by (instance, le)) > 0.5" severity: warning for: 2m - name: Etcd high commit durations description: Etcd commit duration increasing, 99th percentile is over 0.25s - query: "histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[1m])) > 0.25" + query: "histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) by (instance, le)) > 0.25" severity: warning for: 2m @@ -3476,6 +3549,8 @@ groups: query: 'up{job=~".*openstack.*"} == 0' severity: critical for: 2m + comments: | + Adjust the job label regex to match the actual job name in your Prometheus scrape config. - name: OpenStack Nova agent down description: "Nova agent {{ $labels.hostname }} ({{ $labels.service }}) is down in zone {{ $labels.zone }}" query: 'openstack_nova_agent_state{adminState="enabled"} == 0' @@ -3608,7 +3683,7 @@ groups: severity: critical - name: Jenkins run failure total description: "Job run failures: ({{$value}}) {{$labels.jenkins_job}}. 
Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})" - query: "delta(jenkins_runs_failure_total[1h]) > 100" + query: "increase(jenkins_runs_failure_total[1h]) > 100" severity: warning - name: Jenkins build tests failing description: "Last build tests failed: {{$labels.jenkins_job}}. Failed build Tests for job `{{$labels.jenkins_job}}` on {{$labels.instance}}/{{$labels.env}} ({{$labels.region}})" @@ -3770,7 +3845,7 @@ groups: for: 5m - name: GitLab CI pipeline failures increasing description: "GitLab CI pipeline failures are increasing on {{ $labels.instance }} ({{ $value }}/s)." - query: "rate(gitlab_ci_pipeline_failure_reasons[5m]) > 0" + query: "deriv(gitlab_ci_pipeline_failure_reasons[5m]) > 0.05" severity: warning for: 10m comments: | @@ -3802,7 +3877,7 @@ groups: # Uncaught errors - name: GitLab rack uncaught errors description: "GitLab is experiencing uncaught errors in the Rack layer on {{ $labels.instance }} ({{ $value }}/s)." - query: "rate(rack_uncaught_errors_total[5m]) > 0" + query: "rate(rack_uncaught_errors_total[5m]) > 0.05" severity: warning for: 5m # Application version / deployment @@ -3856,11 +3931,11 @@ groups: rules: - name: GitLab Gitaly high gRPC error rate description: "Gitaly on {{ $labels.instance }} is returning more than 5% gRPC errors." - query: 'sum(rate(grpc_server_handled_total{job="gitaly",grpc_code!="OK"}[5m])) / sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) * 100 > 5 and sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) > 0' + query: 'sum(rate(grpc_server_handled_total{job="gitaly",grpc_code=~"Internal|Unavailable|DeadlineExceeded|ResourceExhausted|Aborted|Unknown|DataLoss"}[5m])) / sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) * 100 > 5 and sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) > 0' severity: warning for: 5m comments: | - grpc_code!="OK" includes non-error codes like NotFound, AlreadyExists. 
Consider filtering to specific error codes for less noise. + Filters to actual error codes. grpc_code!="OK" includes benign codes like NotFound, AlreadyExists, and Cancelled. - name: GitLab Gitaly resource exhausted description: "Gitaly on {{ $labels.instance }} is returning ResourceExhausted errors, indicating overload ({{ $value }}%)." query: 'sum(rate(grpc_server_handled_total{job="gitaly",grpc_code="ResourceExhausted"}[5m])) / sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) * 100 > 1 and sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) > 0' @@ -3869,7 +3944,6 @@ groups: comments: | ResourceExhausted errors from Gitaly mean Git operations are being rejected due to concurrency limits. This directly impacts users trying to push, pull, or clone. - This alert is derived from the GitLab Omnibus default rules. - name: GitLab Gitaly high RPC latency description: "Gitaly on {{ $labels.instance }} p95 unary RPC latency exceeds 1 second ({{ $value }}s)." query: 'histogram_quantile(0.95, sum(rate(grpc_server_handling_seconds_bucket{job="gitaly",grpc_type="unary"}[5m])) by (le)) > 1' @@ -3877,12 +3951,14 @@ groups: for: 5m - name: GitLab Gitaly CPU throttled description: "Gitaly processes on {{ $labels.instance }} are being CPU throttled by cgroups." - query: "rate(gitaly_cgroup_cpu_cfs_throttled_seconds_total[5m]) > 0" + query: "rate(gitaly_cgroup_cpu_cfs_throttled_seconds_total[5m]) > 0.1" severity: warning for: 5m + comments: | + Brief throttling spikes are normal. Threshold of 0.1s/s (10% of CPU time throttled) filters out transient noise. - name: GitLab Gitaly authentication failures description: "Gitaly on {{ $labels.instance }} has authentication failures ({{ $value }})." 
- query: 'increase(gitaly_authentications_total{status="failed"}[5m]) > 0' + query: 'increase(gitaly_authentications_total{status="failed"}[5m]) > 3' severity: warning - name: GitLab Gitaly circuit breaker tripped description: "Gitaly circuit breaker has tripped on {{ $labels.instance }}. Git operations are failing." @@ -3919,13 +3995,13 @@ groups: comments: | The 30s threshold is a rough default. Adjust based on your pipeline SLOs. - name: Spinnaker dead messages - description: "Orca is producing dead-lettered messages ({{ $value }} per second). These are tasks that exhausted all retries and will not be executed." - query: 'rate(queue_dead_messages_total[5m]) > 0' + description: "Orca is producing dead-lettered messages ({{ $value | humanize }}/s). These are tasks that exhausted all retries and will not be executed." + query: 'rate(queue_dead_messages_total[5m]) > 0.05' severity: critical for: 2m - name: Spinnaker zombie executions - description: "{{ $value }} zombie pipeline executions detected. These are executions with no corresponding queue messages." - query: 'rate(queue_zombies_total[5m]) > 0' + description: "Zombie pipeline executions rate is {{ $value | humanize }}/s. These are executions with no corresponding queue messages." + query: 'rate(queue_zombies_total[5m]) > 0.05' severity: warning for: 5m comments: | @@ -3946,7 +4022,7 @@ groups: See https://kb.armory.io/s/article/Hitting-Igor-s-caching-thresholds - name: Spinnaker polling monitor failures description: "Igor polling monitor is experiencing failures ({{ $value }} per second). CI/SCM integrations may not trigger pipelines." - query: 'rate(pollingMonitor_failed_total[5m]) > 0' + query: 'rate(pollingMonitor_failed_total[5m]) > 0.05' severity: warning for: 5m - name: Spinnaker high API error rate @@ -3958,7 +4034,7 @@ groups: The 5% threshold is a rough default. Adjust based on your traffic patterns. 
- name: Spinnaker API rate limit throttling description: "Gate is actively throttling API requests on {{ $labels.instance }} ({{ $value }} throttled requests per second)." - query: 'rate(rateLimitThrottling_total[5m]) > 0' + query: 'rate(rateLimitThrottling_total[5m]) > 0.05' severity: warning for: 2m - name: Spinnaker Clouddriver high error rate @@ -4002,8 +4078,9 @@ groups: description: Failed to fetch SSL information {{ $labels.instance }} query: ssl_probe_success == 0 severity: critical - - name: SSL certificate OSCP status unknown - description: Failed to get the OSCP status {{ $labels.instance }} + for: 1m + - name: SSL certificate OCSP status unknown + description: Failed to get the OCSP status for {{ $labels.instance }} query: ssl_ocsp_response_status == 2 severity: warning - name: SSL certificate revoked @@ -4040,11 +4117,12 @@ groups: for: 10m - name: Cert-Manager hitting ACME rate limits description: Cert-Manager is being rate-limited by the ACME provider. Certificate issuance and renewal may be blocked for up to a week. - query: 'sum by (host) (rate(certmanager_http_acme_client_request_count{status="429"}[5m])) > 0' + query: 'sum by (host) (rate(certmanager_acme_client_request_count{status="429"}[5m])) > 0' severity: critical for: 5m comments: | - In cert-manager 1.19+, the metric was renamed (dropped http_ prefix). Verify metric name against your version. + Metric renamed in cert-manager v1.19+ (dropped the http_ prefix): certmanager_acme_client_request_count. + For cert-manager < v1.19, use: certmanager_http_acme_client_request_count. - name: Juniper exporters: @@ -4084,9 +4162,10 @@ groups: doc_url: https://pypi.org/project/prometheus-freeswitch-exporter rules: - name: Freeswitch down - description: Freeswitch is unresponsive + description: "Freeswitch {{ $labels.instance }} is unresponsive." 
query: "freeswitch_up == 0" severity: critical + for: 1m - name: Freeswitch Sessions Warning description: 'High sessions usage on {{ $labels.instance }}: {{ $value | printf "%.2f"}}%' query: "(freeswitch_session_active * 100 / freeswitch_session_limit) > 80 and freeswitch_session_limit > 0" @@ -4108,19 +4187,20 @@ groups: description: "Vault instance is sealed on {{ $labels.instance }}" query: "vault_core_unsealed == 0" severity: critical + for: 1m - name: Vault too many pending tokens - description: 'Too many pending tokens {{ $labels.instance }}: {{ $value | printf "%.2f"}}%' + description: "Too many pending tokens: on average {{ $value }} tokens per instance were created but not yet stored." query: "avg(vault_token_create_count - vault_token_store_count) > 0" severity: warning for: 5m - name: Vault too many infinity tokens - description: 'Too many infinity tokens {{ $labels.instance }}: {{ $value | printf "%.2f"}}%' + description: "Too many non-expiring tokens on {{ $labels.instance }}: {{ $value }} tokens with infinite TTL." query: 'vault_token_count_by_ttl{creation_ttl="+Inf"} > 3' severity: warning for: 5m - name: Vault cluster health - description: 'Vault cluster is not healthy {{ $labels.instance }}: {{ $value | printf "%.2f"}}%' - query: "sum(vault_core_active) / count(vault_core_active) <= 0.5" + description: "Vault cluster is not healthy: only {{ $value | humanizePercentage }} of nodes are active." + query: "sum(vault_core_active) / count(vault_core_active) <= 0.5 and count(vault_core_active) > 0" severity: critical - name: Keycloak @@ -4198,7 +4278,8 @@ groups: query: 'up{job=~"snmp.*"} == 0' severity: critical for: 5m - comments: From the official snmp-mixin. + comments: | + Rename job=~"snmp.*" to match the actual job name in your Prometheus scrape config. - name: SNMP interface down description: "Interface {{ $labels.ifDescr }} on {{ $labels.instance }} is operationally down while administratively up."
query: '(ifOperStatus{job=~"snmp.*"} == 2) and on(instance, job, ifIndex) (ifAdminStatus{job=~"snmp.*"} == 1)' @@ -4272,23 +4353,23 @@ groups: for: 5m - name: Cilium agent endpoint regeneration failures description: "Cilium agent {{ $labels.pod }} is failing to regenerate endpoints. Network policy enforcement may be stale." - query: 'sum(rate(cilium_endpoint_regenerations_total{outcome="fail"}[5m])) by (pod) > 0' + query: 'sum(rate(cilium_endpoint_regenerations_total{outcome="fail"}[5m])) by (pod) > 0.05' severity: warning for: 5m - name: Cilium agent endpoint update failure description: "Cilium agent {{ $labels.pod }} is failing K8s endpoint update API calls ({{ $labels.method }} {{ $labels.return_code }})." - query: 'sum(rate(cilium_k8s_client_api_calls_total{method=~"(PUT|POST|PATCH)", endpoint="endpoint", return_code!~"2[0-9][0-9]"}[5m])) by (pod, method, return_code) > 0' + query: 'sum(rate(cilium_k8s_client_api_calls_total{method=~"(PUT|POST|PATCH)", endpoint="endpoint", return_code!~"2[0-9][0-9]"}[5m])) by (pod, method, return_code) > 0.05' severity: warning for: 5m - name: Cilium agent endpoint create failure description: "Cilium agent {{ $labels.pod }} is failing CNI endpoint-create calls. New pods may fail to get networking." - query: 'sum(rate(cilium_api_limiter_processed_requests_total{api_call=~"endpoint-create", outcome="fail"}[1m])) by (pod, api_call) > 0' + query: 'sum(rate(cilium_api_limiter_processed_requests_total{api_call=~"endpoint-create", outcome="fail"}[1m])) by (pod, api_call) > 0.05' severity: info for: 5m # BPF maps - name: Cilium agent map operation failures description: "Cilium agent {{ $labels.pod }} has eBPF map operation failures on {{ $labels.map_name }}. Datapath may be degraded." 
- query: 'sum(rate(cilium_bpf_map_ops_total{outcome="fail"}[5m])) by (map_name, pod) > 0' + query: 'sum(rate(cilium_bpf_map_ops_total{outcome="fail"}[5m])) by (map_name, pod) > 0.05' severity: warning for: 5m - name: Cilium agent BPF map pressure @@ -4305,7 +4386,7 @@ groups: for: 5m - name: Cilium agent conntrack failed garbage collection description: "Cilium agent {{ $labels.pod }} conntrack garbage collection is failing. Stale entries may accumulate." - query: 'sum(rate(cilium_datapath_conntrack_gc_runs_total{status="uncompleted"}[5m])) by (pod) > 0' + query: 'sum(rate(cilium_datapath_conntrack_gc_runs_total{status="uncompleted"}[5m])) by (pod) > 0.05' severity: warning for: 5m - name: Cilium agent NAT table full @@ -4322,7 +4403,7 @@ groups: comments: Policy denials may be expected behavior. Investigate only if unexpected traffic is being blocked. - name: Cilium agent high drop rate description: "Cilium agent {{ $labels.pod }} is dropping packets for reason {{ $labels.reason }}. This indicates infrastructure issues." - query: 'sum(rate(cilium_drop_count_total{reason!~"Policy denied"}[5m])) by (pod, reason) > 0' + query: 'sum(rate(cilium_drop_count_total{reason!~"Policy denied"}[5m])) by (pod, reason) > 0.05' severity: warning for: 5m # Policy @@ -4333,12 +4414,12 @@ groups: for: 5m - name: Cilium agent policy import errors description: "Cilium agent {{ $labels.pod }} is failing to import network policies. Policy enforcement may be incomplete." - query: 'sum(rate(cilium_policy_change_total{outcome="fail"}[5m])) by (pod) > 0' + query: 'sum(rate(cilium_policy_change_total{outcome="fail"}[5m])) by (pod) > 0.05' severity: warning for: 5m - name: Cilium agent policy implementation delay description: "Cilium agent {{ $labels.pod }} P99 policy deployment latency exceeds 60 seconds. Endpoints may run with stale policies." 
- query: "histogram_quantile(0.99, sum(rate(cilium_policy_implementation_delay[5m])) by (le, pod)) > 60" + query: "histogram_quantile(0.99, sum(rate(cilium_policy_implementation_delay_bucket[5m])) by (le, pod)) > 60" severity: warning for: 5m comments: Threshold of 60s is a rough default. Adjust based on cluster size and policy complexity. @@ -4367,7 +4448,7 @@ groups: comments: Threshold of 90% is a rough default. Adjust based on your pod churn rate and IP pool size. - name: Cilium operator IPAM interface creation failures description: "Cilium operator is failing to create IPAM network interfaces. IP allocation may be impacted." - query: 'sum(rate(cilium_operator_ipam_interface_creation_ops{status!="success"}[5m])) by () > 0' + query: 'sum(rate(cilium_operator_ipam_interface_creation_ops{status!="success"}[5m])) by () > 0.05' severity: warning for: 10m comments: | @@ -4375,12 +4456,12 @@ groups: # API and K8s client - name: Cilium agent API errors description: "Cilium agent {{ $labels.pod }} API is returning 5xx errors ({{ $labels.return_code }}). Agent may be unhealthy." - query: 'sum(rate(cilium_agent_api_process_time_seconds_count{return_code=~"5[0-9][0-9]"}[5m])) by (pod, return_code) > 0' + query: 'sum(rate(cilium_agent_api_process_time_seconds_count{return_code=~"5[0-9][0-9]"}[5m])) by (pod, return_code) > 0.05' severity: warning for: 5m - name: Cilium agent Kubernetes client errors description: "Cilium agent {{ $labels.pod }} is receiving errors from K8s API for endpoint {{ $labels.endpoint }} ({{ $labels.return_code }})." 
- query: 'sum(rate(cilium_k8s_client_api_calls_total{endpoint!="metrics", return_code!~"2[0-9][0-9]"}[5m])) by (pod, endpoint, return_code) > 0' + query: 'sum(rate(cilium_k8s_client_api_calls_total{endpoint!="metrics", return_code!~"2[0-9][0-9]"}[5m])) by (pod, endpoint, return_code) > 0.05' severity: info for: 5m # ClusterMesh @@ -4390,8 +4471,8 @@ groups: severity: critical for: 5m - name: Cilium ClusterMesh remote cluster failing - description: "Cilium ClusterMesh connectivity to remote cluster {{ $labels.target_cluster }} from {{ $labels.source_cluster }} is failing." - query: "sum(rate(cilium_clustermesh_remote_cluster_failures[5m])) by (source_cluster, target_cluster) > 0" + description: "Cilium ClusterMesh connectivity to remote cluster {{ $labels.target_cluster }} from {{ $labels.source_cluster }} is failing ({{ $value }} failures)." + query: "sum(cilium_clustermesh_remote_cluster_failures) by (source_cluster, target_cluster) > 0" severity: critical for: 5m # KVStoreMesh @@ -4401,19 +4482,19 @@ groups: severity: critical for: 5m - name: Cilium KVStoreMesh remote cluster failing - description: "Cilium KVStoreMesh remote cluster {{ $labels.target_cluster }} from {{ $labels.source_cluster }} is experiencing failures." - query: "sum(rate(cilium_kvstoremesh_remote_cluster_failures[5m])) by (source_cluster, target_cluster) > 0" + description: "Cilium KVStoreMesh remote cluster {{ $labels.target_cluster }} from {{ $labels.source_cluster }} is experiencing failures ({{ $value }} failures)." + query: "sum(cilium_kvstoremesh_remote_cluster_failures) by (source_cluster, target_cluster) > 0" severity: critical for: 5m - name: Cilium KVStoreMesh sync errors description: "Cilium KVStoreMesh from {{ $labels.source_cluster }} is experiencing kvstore sync errors." 
- query: "sum(rate(cilium_kvstoremesh_kvstore_sync_errors_total[5m])) by (source_cluster) > 0" + query: "sum(rate(cilium_kvstoremesh_kvstore_sync_errors_total[5m])) by (source_cluster) > 0.05" severity: critical for: 5m # Hubble - name: Cilium Hubble lost events description: "Cilium Hubble on {{ $labels.pod }} is losing flow events. Observability data may be incomplete." - query: "sum(rate(hubble_lost_events_total[5m])) by (pod) > 0" + query: "sum(rate(hubble_lost_events_total[5m])) by (pod) > 0.05" severity: warning for: 5m - name: Cilium Hubble high DNS error rate @@ -4467,6 +4548,10 @@ groups: description: Ceph instance unhealthy query: "ceph_health_status != 0" severity: critical + for: 1m + comments: | + ceph_health_status: 0=HEALTH_OK, 1=HEALTH_WARN, 2=HEALTH_ERR. + This rule fires on any non-OK state. Split into separate warning/critical rules by using ==1 and ==2 thresholds if needed. - name: Ceph monitor clock skew description: Ceph monitor clock skew detected. Please check ntp and hardware clock settings query: "abs(ceph_monitor_clock_skew_seconds) > 0.2" @@ -4481,16 +4566,22 @@ groups: description: Ceph Object Storage Daemon Down query: "ceph_osd_up == 0" severity: critical + for: 1m - name: Ceph high OSD latency description: "Ceph Object Storage Daemon latency is high. Please check if it doesn't stuck in weird state." - query: "ceph_osd_perf_apply_latency_seconds > 5" + query: "ceph_osd_apply_latency_ms > 5000" severity: warning for: 1m - - name: Ceph OSD low space - description: Ceph Object Storage Daemon is going out of space. Please add more disks. - query: ceph_osd_utilization > 90 + comments: | + Threshold of 5000ms (5 seconds). Adjust based on your expected OSD performance. + - name: Ceph OSD near full + description: A Ceph OSD is dangerously full. Please add more disks. 
+ query: 'ceph_health_detail{name="OSD_NEARFULL"} == 1' severity: warning - for: 2m + for: 5m + comments: | + Ceph internally triggers OSD_NEARFULL based on the nearfull_ratio (default 85%). + ceph_health_detail can also be used for more granular OSD space alerts. - name: Ceph OSD reweighted description: Ceph Object Storage Daemon takes too much time to resize. query: "ceph_osd_weight < 1" @@ -4522,6 +4613,7 @@ groups: description: Some Ceph placement groups are unavailable. query: "ceph_pg_total - ceph_pg_active > 0" severity: critical + for: 1m - name: ZFS exporters: @@ -4539,8 +4631,8 @@ groups: doc_url: https://github.com/pdf/zfs_exporter rules: - name: ZFS pool out of space - description: Disk is almost full (< 10% left) - query: "zfs_pool_free_bytes * 100 / zfs_pool_size_bytes < 10 and ON (instance, device, mountpoint) zfs_pool_readonly == 0 and zfs_pool_size_bytes > 0" + description: "ZFS pool {{ $labels.pool }} is almost full (< 10% left)." + query: "zfs_pool_free_bytes * 100 / zfs_pool_size_bytes < 10 and zfs_pool_readonly == 0 and zfs_pool_size_bytes > 0" severity: warning - name: ZFS pool unhealthy description: ZFS pool state is {{ $value }}. See comments for more information. @@ -4761,7 +4853,7 @@ groups: severity: warning - name: DigitalOcean exporter collection errors description: "DigitalOcean exporter {{ $labels.collector }} collector has {{ $value }} errors." - query: "increase(digitalocean_errors_total[5m]) > 0" + query: "increase(digitalocean_errors_total[5m]) > 3" severity: warning for: 5m - name: DigitalOcean droplet limit approaching @@ -4822,12 +4914,12 @@ groups: for: 5m - name: Thanos Compactor Halted description: "Thanos Compact {{$labels.job}} has failed to run and now is halted." 
- query: 'thanos_compact_halted{job=~".*thanos-compact.*"} == 1' + query: 'thanos_compact_halted == 1' severity: warning for: 5m - name: Thanos Compactor High Compaction Failures description: "Thanos Compact {{$labels.job}} is failing to execute {{$value | humanize}}% of compactions." - query: '(sum by (job) (rate(thanos_compact_group_compactions_failures_total{job=~".*thanos-compact.*"}[5m])) / sum by (job) (rate(thanos_compact_group_compactions_total{job=~".*thanos-compact.*"}[5m])) * 100 > 5) and sum by (job) (rate(thanos_compact_group_compactions_total{job=~".*thanos-compact.*"}[5m])) > 0' + query: '(sum by (job) (rate(thanos_compact_group_compactions_failures_total[5m])) / sum by (job) (rate(thanos_compact_group_compactions_total[5m])) * 100 > 5) and sum by (job) (rate(thanos_compact_group_compactions_total[5m])) > 0' severity: warning for: 15m - name: Thanos Compact Bucket High Operation Failures @@ -4860,17 +4952,19 @@ groups: for: 5m - name: Thanos Query Grpc Client Error Rate description: "Thanos Query {{$labels.job}} is failing to send {{$value | humanize}}% of requests." - query: '(sum by (job) (rate(grpc_client_handled_total{grpc_code!="OK", job=~".*thanos-query.*"}[5m])) / sum by (job) (rate(grpc_client_started_total{job=~".*thanos-query.*"}[5m]))) * 100 > 5 and sum by (job) (rate(grpc_client_started_total{job=~".*thanos-query.*"}[5m])) > 0' + query: '(sum by (job) (rate(grpc_client_handled_total{grpc_code=~"Unknown|Internal|Unavailable|DataLoss|DeadlineExceeded|ResourceExhausted", job=~".*thanos-query.*"}[5m])) / sum by (job) (rate(grpc_client_started_total{job=~".*thanos-query.*"}[5m]))) * 100 > 5 and sum by (job) (rate(grpc_client_started_total{job=~".*thanos-query.*"}[5m])) > 0' severity: warning for: 5m + comments: | + Filters to actual error codes only. grpc_code!="OK" would include benign codes like NotFound, AlreadyExists, and Cancelled. 
- name: Thanos Query High D N S Failures description: "Thanos Query {{$labels.job}} have {{$value | humanize}}% of failing DNS queries for store endpoints." - query: '(sum by (job) (rate(thanos_query_store_apis_dns_failures_total{job=~".*thanos-query.*"}[5m])) / sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~".*thanos-query.*"}[5m]))) * 100 > 1 and sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~".*thanos-query.*"}[5m])) > 0' + query: '(sum by (job) (rate(thanos_query_store_apis_dns_failures_total[5m])) / sum by (job) (rate(thanos_query_store_apis_dns_lookups_total[5m]))) * 100 > 1 and sum by (job) (rate(thanos_query_store_apis_dns_lookups_total[5m])) > 0' severity: warning for: 15m - name: Thanos Query Instant Latency High description: "Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for instant queries." - query: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m]))) > 40 and sum by (job) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m])) > 0)' + query: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m]))) > 40 and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-query.*", handler="query"}[5m])) > 0)' severity: critical for: 10m - name: Thanos Query Range Latency High @@ -4898,22 +4992,22 @@ groups: for: 10m - name: Thanos Receive High Replication Failures description: "Thanos Receive {{$labels.job}} is failing to replicate {{$value | humanize}}% of requests." 
- query: 'thanos_receive_replication_factor > 1 and ((sum by (job) (rate(thanos_receive_replications_total{result="error", job=~".*thanos-receive.*"}[5m])) / sum by (job) (rate(thanos_receive_replications_total{job=~".*thanos-receive.*"}[5m]))) > (max by (job) (floor((thanos_receive_replication_factor{job=~".*thanos-receive.*"}+1)/ 2)) / max by (job) (thanos_receive_hashring_nodes{job=~".*thanos-receive.*"}))) * 100' + query: 'thanos_receive_replication_factor > 1 and ((sum by (job) (rate(thanos_receive_replications_total{result="error"}[5m])) / sum by (job) (rate(thanos_receive_replications_total[5m]))) > (max by (job) (floor((thanos_receive_replication_factor+1)/ 2)) / max by (job) (thanos_receive_hashring_nodes))) * 100' severity: warning for: 5m - name: Thanos Receive High Forward Request Failures description: "Thanos Receive {{$labels.job}} is failing to forward {{$value | humanize}}% of requests." - query: '(sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~".*thanos-receive.*"}[5m]))/ sum by (job) (rate(thanos_receive_forward_requests_total{job=~".*thanos-receive.*"}[5m]))) * 100 > 20 and sum by (job) (rate(thanos_receive_forward_requests_total{job=~".*thanos-receive.*"}[5m])) > 0' + query: '(sum by (job) (rate(thanos_receive_forward_requests_total{result="error"}[5m]))/ sum by (job) (rate(thanos_receive_forward_requests_total[5m]))) * 100 > 20 and sum by (job) (rate(thanos_receive_forward_requests_total[5m])) > 0' severity: info for: 5m - name: Thanos Receive High Hashring File Refresh Failures description: "Thanos Receive {{$labels.job}} is failing to refresh hashring file, {{$value | humanize}} of attempts failed." 
- query: '(sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~".*thanos-receive.*"}[5m])) / sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total{job=~".*thanos-receive.*"}[5m])) > 0) and sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total{job=~".*thanos-receive.*"}[5m])) > 0' + query: '(sum by (job) (rate(thanos_receive_hashrings_file_errors_total[5m])) / sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total[5m])) > 0) and sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total[5m])) > 0' severity: warning for: 15m - name: Thanos Receive Config Reload Failure description: "Thanos Receive {{$labels.job}} has not been able to reload hashring configurations." - query: 'avg by (job) (thanos_receive_config_last_reload_successful{job=~".*thanos-receive.*"}) != 1' + query: 'avg by (job) (thanos_receive_config_last_reload_successful) != 1' severity: warning for: 5m - name: Thanos Receive No Upload @@ -4933,7 +5027,7 @@ groups: for: 5m - name: Thanos Sidecar No Connection To Started Prometheus description: "Thanos Sidecar {{$labels.instance}} is unhealthy." - query: 'thanos_sidecar_prometheus_up{job=~".*thanos-sidecar.*"} == 0 and on (namespace, pod)prometheus_tsdb_data_replay_duration_seconds != 0' + query: 'thanos_sidecar_prometheus_up == 0 and on (namespace, pod) prometheus_tsdb_data_replay_duration_seconds != 0' severity: critical for: 5m - name: Thanos Store @@ -4946,7 +5040,7 @@ groups: for: 5m - name: Thanos Store Series Gate Latency High description: "Thanos Store {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for store series gate requests." 
- query: '(histogram_quantile(0.99, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{job=~".*thanos-store.*"}[5m]))) > 2 and sum by (job) (rate(thanos_bucket_store_series_gate_duration_seconds_count{job=~".*thanos-store.*"}[5m])) > 0)' + query: '(histogram_quantile(0.99, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket[5m]))) > 2 and sum by (job) (rate(thanos_bucket_store_series_gate_duration_seconds_count[5m])) > 0)' severity: warning for: 10m - name: Thanos Store Bucket High Operation Failures @@ -4964,12 +5058,12 @@ groups: rules: - name: Thanos Rule Queue Is Dropping Alerts description: "Thanos Rule {{$labels.instance}} is failing to queue alerts ({{ $value | humanize }}/s)." - query: 'sum by (job, instance) (rate(thanos_alert_queue_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0' + query: 'sum by (job, instance) (rate(thanos_alert_queue_alerts_dropped_total[5m])) > 0' severity: critical for: 5m - name: Thanos Rule Sender Is Failing Alerts description: "Thanos Rule {{$labels.instance}} is failing to send alerts to alertmanager ({{ $value | humanize }}/s)." - query: 'sum by (job, instance) (rate(thanos_alert_sender_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0' + query: 'sum by (job, instance) (rate(thanos_alert_sender_alerts_dropped_total[5m])) > 0' severity: critical for: 5m - name: Thanos Rule High Rule Evaluation Failures @@ -4979,7 +5073,7 @@ groups: for: 5m - name: Thanos Rule High Rule Evaluation Warnings description: "Thanos Rule {{$labels.instance}} has high number of evaluation warnings ({{ $value | humanize }}/s)." - query: 'sum by (job, instance) (rate(thanos_rule_evaluation_with_warnings_total{job=~".*thanos-rule.*"}[5m])) > 0.05' + query: 'sum by (job, instance) (rate(thanos_rule_evaluation_with_warnings_total[5m])) > 0.05' comments: | Threshold of 0.05/s avoids firing on transient single-event spikes. 
severity: info @@ -4996,17 +5090,17 @@ groups: for: 5m - name: Thanos Rule Config Reload Failure description: "Thanos Rule {{$labels.job}} has not been able to reload its configuration." - query: 'avg by (job, instance) (thanos_rule_config_last_reload_successful{job=~".*thanos-rule.*"}) != 1' + query: 'avg by (job, instance) (thanos_rule_config_last_reload_successful) != 1' severity: info for: 5m - name: Thanos Rule Query High D N S Failures description: "Thanos Rule {{$labels.job}} has {{$value | humanize}}% of failing DNS queries for query endpoints." - query: '(sum by (job, instance) (rate(thanos_rule_query_apis_dns_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) * 100 > 1) and sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) > 0' + query: '(sum by (job, instance) (rate(thanos_rule_query_apis_dns_failures_total[5m])) / sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total[5m])) * 100 > 1) and sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total[5m])) > 0' severity: warning for: 15m - name: Thanos Rule Alertmanager High D N S Failures description: "Thanos Rule {{$labels.instance}} has {{$value | humanize}}% of failing DNS queries for Alertmanager endpoints." 
- query: '(sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) * 100 > 1) and sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) > 0' + query: '(sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_failures_total[5m])) / sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total[5m])) * 100 > 1) and sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total[5m])) > 0' severity: warning for: 15m - name: Thanos Rule No Evaluation For10 Intervals @@ -5016,7 +5110,7 @@ groups: for: 5m - name: Thanos No Rule Evaluations description: "Thanos Rule {{$labels.instance}} did not perform any rule evaluations in the past 10 minutes." - query: 'sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) <= 0 and sum by (job, instance) (thanos_rule_loaded_rules{job=~".*thanos-rule.*"}) > 0' + query: 'sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) <= 0 and sum by (job, instance) (thanos_rule_loaded_rules) > 0' severity: critical for: 5m - name: Thanos Bucket Replicate @@ -5024,12 +5118,12 @@ groups: rules: - name: Thanos Bucket Replicate Error Rate description: "Thanos Replicate is failing to run, {{$value | humanize}}% of attempts failed." 
- query: '(sum by (job) (rate(thanos_replicate_replication_runs_total{result="error", job=~".*thanos-bucket-replicate.*"}[5m])) / on (job) group_left sum by (job) (rate(thanos_replicate_replication_runs_total{job=~".*thanos-bucket-replicate.*"}[5m]))) * 100 >= 10 and sum by (job) (rate(thanos_replicate_replication_runs_total{job=~".*thanos-bucket-replicate.*"}[5m])) > 0' + query: '(sum by (job) (rate(thanos_replicate_replication_runs_total{result="error"}[5m])) / on (job) group_left sum by (job) (rate(thanos_replicate_replication_runs_total[5m]))) * 100 >= 10 and sum by (job) (rate(thanos_replicate_replication_runs_total[5m])) > 0' severity: critical for: 5m - name: Thanos Bucket Replicate Run Latency description: "Thanos Replicate {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for the replicate operations." - query: '(histogram_quantile(0.99, sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m]))) > 20 and sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m])) > 0)' + query: '(histogram_quantile(0.99, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket[5m]))) > 20 and sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_count[5m])) > 0)' severity: critical for: 5m - name: Thanos Component Absent @@ -5081,13 +5175,12 @@ groups: severity: critical for: 15m - name: Loki request panic - description: The {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics - query: sum(increase(loki_panic_total[10m])) by (namespace, job) > 0 + description: "{{ $labels.job }} is experiencing {{ $value | humanize }} panic(s) in the last 5 minutes." 
+ query: sum(increase(loki_panic_total[5m])) by (namespace, job) > 0 severity: critical - for: 5m - name: Loki request latency - description: The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency - query: (histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (le))) > 1 + description: The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. + query: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (namespace, job, route, le)) > 1 severity: critical for: 5m - name: Promtail @@ -5102,7 +5195,7 @@ groups: for: 5m - name: Promtail request latency description: The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. - query: histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[5m])) by (le)) > 1 + query: histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[5m])) by (namespace, job, route, le)) > 1 severity: critical for: 5m - name: Cortex @@ -5118,13 +5211,13 @@ groups: description: Cortex not connected to Alertmanager (instance {{ $labels.instance }}) query: cortex_prometheus_notifications_alertmanagers_discovered < 1 severity: critical - - name: Cortex notification are being dropped - description: "Cortex notification are being dropped due to errors (instance {{ $labels.instance }}, {{ $value | humanize }}/s)." + - name: Cortex notifications are being dropped + description: "Cortex notifications are being dropped due to errors (instance {{ $labels.instance }}, {{ $value | humanize }}/s)." query: rate(cortex_prometheus_notifications_dropped_total[5m]) > 0.05 comments: | Threshold of 0.05/s avoids firing on transient single-event spikes. 
severity: critical - - name: Cortex notification error + - name: Cortex notification errors description: "Cortex is failing when sending alert notifications (instance {{ $labels.instance }}, {{ $value | humanize }}/s)." query: rate(cortex_prometheus_notifications_errors_total[5m]) > 0.05 comments: | @@ -5217,20 +5310,22 @@ groups: severity: critical for: 24h comments: | - Official Tempo mixin normalizes by backend-worker count. Adjust threshold based on your compactor configuration. + Threshold of 100 blocks per compactor instance. Normalize by backend-worker count if needed. Adjust based on your environment. - name: Tempo distributor usage tracker errors description: "Tempo distributor usage tracker errors for {{ $labels.job }} at {{ $value | humanize }}/s (reason {{ $labels.reason }})." - query: sum by (job, reason) (rate(tempo_distributor_usage_tracker_errors_total[5m])) > 0 + query: sum by (job, reason) (rate(tempo_distributor_usage_tracker_errors_total[5m])) > 0.05 severity: critical for: 30m + comments: | + Threshold of 0.05/s avoids firing on transient single-event spikes. - name: Tempo metrics generator processor updates failing description: "Tempo metrics generator processor updates are failing for {{ $labels.job }} ({{ $value }} failures in 5m)." - query: sum by (job) (increase(tempo_metrics_generator_active_processors_update_failed_total[5m])) > 0 + query: sum by (job) (increase(tempo_metrics_generator_active_processors_update_failed_total[5m])) > 2 severity: critical for: 15m - name: Tempo metrics generator service graphs dropping spans description: Tempo metrics generator is dropping {{ printf "%.2f" $value }}% of spans in service graphs for {{ $labels.job }}. 
- query: '100 * sum by (job) (rate(tempo_metrics_generator_processor_service_graphs_dropped_spans[5m])) / sum by (job) (rate(tempo_metrics_generator_spans_received_total[5m])) > 0.5 and sum by (job) (rate(tempo_metrics_generator_spans_received_total[5m])) > 0' + query: '100 * sum by (job) (rate(tempo_metrics_generator_processor_service_graphs_dropped_spans_total[5m])) / sum by (job) (rate(tempo_metrics_generator_spans_received_total[5m])) > 0.5 and sum by (job) (rate(tempo_metrics_generator_spans_received_total[5m])) > 0' severity: warning for: 15m - name: Tempo metrics generator collections failing @@ -5356,35 +5451,49 @@ groups: # Blocks and TSDB - name: Mimir ingester TSDB head compaction failed description: "Mimir ingester {{ $labels.instance }} is failing to compact TSDB head ({{ $value | humanize }}/s)." - query: rate(cortex_ingester_tsdb_compactions_failed_total[5m]) > 0 + query: rate(cortex_ingester_tsdb_compactions_failed_total[5m]) > 0.05 severity: critical for: 15m + comments: | + Threshold of 0.05/s avoids firing on transient single-event spikes. - name: Mimir ingester TSDB head truncation failed description: "Mimir ingester {{ $labels.instance }} is failing to truncate TSDB head ({{ $value | humanize }}/s)." - query: rate(cortex_ingester_tsdb_head_truncations_failed_total[5m]) > 0 + query: rate(cortex_ingester_tsdb_head_truncations_failed_total[5m]) > 0.05 severity: critical + for: 15m + comments: | + Threshold of 0.05/s avoids firing on transient single-event spikes. - name: Mimir ingester TSDB checkpoint creation failed description: "Mimir ingester {{ $labels.instance }} is failing to create TSDB checkpoints ({{ $value | humanize }}/s)." - query: rate(cortex_ingester_tsdb_checkpoint_creations_failed_total[5m]) > 0 + query: rate(cortex_ingester_tsdb_checkpoint_creations_failed_total[5m]) > 0.05 severity: critical + for: 15m + comments: | + Threshold of 0.05/s avoids firing on transient single-event spikes. 
- name: Mimir ingester TSDB checkpoint deletion failed description: "Mimir ingester {{ $labels.instance }} is failing to delete TSDB checkpoints ({{ $value | humanize }}/s)." - query: rate(cortex_ingester_tsdb_checkpoint_deletions_failed_total[5m]) > 0 + query: rate(cortex_ingester_tsdb_checkpoint_deletions_failed_total[5m]) > 0.05 severity: critical + comments: | + Threshold of 0.05/s avoids firing on transient single-event spikes. - name: Mimir ingester TSDB WAL truncation failed description: "Mimir ingester {{ $labels.instance }} is failing to truncate TSDB WAL ({{ $value | humanize }}/s)." - query: rate(cortex_ingester_tsdb_wal_truncations_failed_total[5m]) > 0 + query: rate(cortex_ingester_tsdb_wal_truncations_failed_total[5m]) > 0.05 severity: warning + comments: | + Threshold of 0.05/s avoids firing on transient single-event spikes. - name: Mimir ingester TSDB WAL writes failed description: "Mimir ingester {{ $labels.instance }} is failing to write to TSDB WAL ({{ $value | humanize }}/s)." - query: rate(cortex_ingester_tsdb_wal_writes_failed_total[1m]) > 0 + query: rate(cortex_ingester_tsdb_wal_writes_failed_total[1m]) > 0.05 severity: critical for: 3m + comments: | + Threshold of 0.05/s avoids firing on transient single-event spikes. - name: Mimir store gateway has not synced bucket - description: Mimir store-gateway {{ $labels.instance }} has not synced the bucket for more than 10 minutes. + description: Mimir store-gateway {{ $labels.instance }} has not synced the bucket for more than 30 minutes. query: (time() - cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 1800) and cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 0 comments: | - Threshold aligned with official Mimir mixin (30 minutes). + Threshold of 30 minutes. Adjust based on your sync interval. 
severity: critical for: 5m - name: Mimir store gateway no synced tenants @@ -5413,7 +5522,9 @@ groups: severity: critical - name: Mimir compactor has run out of disk space description: Mimir compactor {{ $labels.instance }} has run out of disk space. - query: increase(cortex_compactor_disk_out_of_space_errors_total[24h]) >= 1 + query: delta(cortex_compactor_disk_out_of_space_errors_total[24h]) >= 1 + comments: | + cortex_compactor_disk_out_of_space_errors_total is declared as gauge by Mimir despite the _total suffix, so delta() is used instead of increase(). severity: critical - name: Mimir compactor has not uploaded blocks description: Mimir compactor {{ $labels.instance }} has not uploaded any block in the last 24 hours. @@ -5424,7 +5535,7 @@ groups: description: "Mimir compactor has found {{ $value }} blocks that cannot be compacted (reason {{ $labels.reason }})." query: increase(cortex_compactor_blocks_marked_for_no_compaction_total[24h]) > 0 comments: | - Using 24h window per official mixin — compaction skips are rare events. + Using a 24h window as compaction skips are rare events. severity: warning for: 5m # Ruler @@ -5453,29 +5564,39 @@ groups: # Alertmanager - name: Mimir alertmanager sync configs failing description: "Mimir alertmanager {{ $labels.job }} is failing to sync configs ({{ $value | humanize }}/s)." - query: rate(cortex_alertmanager_sync_configs_failed_total[5m]) > 0 + query: rate(cortex_alertmanager_sync_configs_failed_total[5m]) > 0.05 severity: critical for: 30m + comments: | + Threshold of 0.05/s avoids firing on transient single-event spikes. - name: Mimir alertmanager ring check failing description: "Mimir alertmanager {{ $labels.job }} is failing ring checks ({{ $value | humanize }}/s)." - query: rate(cortex_alertmanager_ring_check_errors_total[5m]) > 0 + query: rate(cortex_alertmanager_ring_check_errors_total[5m]) > 0.05 severity: critical for: 10m + comments: | + Threshold of 0.05/s avoids firing on transient single-event spikes. 
- name: Mimir alertmanager state merge failing description: "Mimir alertmanager {{ $labels.job }} is failing to merge state updates ({{ $value | humanize }}/s)." - query: rate(cortex_alertmanager_partial_state_merges_failed_total[5m]) > 0 + query: rate(cortex_alertmanager_partial_state_merges_failed_total[5m]) > 0.05 severity: critical for: 10m + comments: | + Threshold of 0.05/s avoids firing on transient single-event spikes. - name: Mimir alertmanager replication failing description: "Mimir alertmanager {{ $labels.job }} is failing to replicate state ({{ $value | humanize }}/s)." - query: rate(cortex_alertmanager_state_replication_failed_total[5m]) > 0 + query: rate(cortex_alertmanager_state_replication_failed_total[5m]) > 0.05 severity: critical for: 10m + comments: | + Threshold of 0.05/s avoids firing on transient single-event spikes. - name: Mimir alertmanager persist state failing description: "Mimir alertmanager {{ $labels.job }} is failing to persist state ({{ $value | humanize }}/s)." query: rate(cortex_alertmanager_state_persist_failed_total[15m]) > 0 severity: critical for: 1h + comments: | + State is persisted only once per persist interval (15m by default), so a 0.05/s threshold would never be reached; the 1h "for" clause already filters out transient failures. - name: Mimir alertmanager initial sync failed description: Mimir alertmanager {{ $labels.job }} failed initial state sync. query: increase(cortex_alertmanager_state_initial_sync_completed_total{outcome="failed"}[1m]) > 0 @@ -5512,7 +5633,8 @@ groups: - name: Grafana Alloy exporters: - - slug: embedded-exporter + - name: Embedded exporter + slug: embedded-exporter rules: - name: Grafana Alloy service down description: "Alloy on instance {{ $labels.instance }} is not responding or has stopped running." @@ -5534,19 +5656,27 @@ groups: query: 'up{job=~".*otel.*collector.*"} == 0' severity: critical for: 1m + comments: | + Adjust the job label regex to match the actual job name in your Prometheus scrape config.
- name: OpenTelemetry Collector receiver refused spans description: "OpenTelemetry Collector is refusing {{ $value | humanize }}/s spans on {{ $labels.receiver }}." - query: 'rate(otelcol_receiver_refused_spans[5m]) > 0' + query: 'rate(otelcol_receiver_refused_spans[5m]) > 0.05' + comments: | + Threshold of 0.05/s avoids firing on transient single-event spikes. severity: critical for: 5m - name: OpenTelemetry Collector receiver refused metric points description: "OpenTelemetry Collector is refusing {{ $value | humanize }}/s metric points on {{ $labels.receiver }}." - query: 'rate(otelcol_receiver_refused_metric_points[5m]) > 0' + query: 'rate(otelcol_receiver_refused_metric_points[5m]) > 0.05' + comments: | + Threshold of 0.05/s avoids firing on transient single-event spikes. severity: critical for: 5m - name: OpenTelemetry Collector receiver refused log records description: "OpenTelemetry Collector is refusing {{ $value | humanize }}/s log records on {{ $labels.receiver }}." - query: 'rate(otelcol_receiver_refused_log_records[5m]) > 0' + query: 'rate(otelcol_receiver_refused_log_records[5m]) > 0.05' + comments: | + Threshold of 0.05/s avoids firing on transient single-event spikes. severity: critical for: 5m - name: OpenTelemetry Collector exporter failed spans @@ -5579,6 +5709,7 @@ groups: query: 'rate(otelcol_processor_refused_spans[5m]) > 0.05' comments: | Threshold of 0.05/s avoids firing on transient single-event spikes. + These processor metrics are deprecated since collector v0.110.0. severity: warning for: 5m - name: OpenTelemetry Collector processor refused metric points @@ -5586,11 +5717,12 @@ groups: query: 'rate(otelcol_processor_refused_metric_points[5m]) > 0.05' comments: | Threshold of 0.05/s avoids firing on transient single-event spikes. + These processor metrics are deprecated since collector v0.110.0. 
severity: warning for: 5m - name: OpenTelemetry Collector high memory usage description: "OpenTelemetry Collector memory usage is above 90%" - query: '(otelcol_process_runtime_heap_alloc_bytes{job=~".*otel.*collector.*"} / on(instance, job) otelcol_process_runtime_total_sys_memory_bytes{job=~".*otel.*collector.*"}) > 0.9' + query: '(otelcol_process_runtime_heap_alloc_bytes / on(instance, job) otelcol_process_runtime_total_sys_memory_bytes) > 0.9' severity: warning for: 5m - name: OpenTelemetry Collector OTLP receiver errors @@ -5668,7 +5800,7 @@ groups: severity: warning - name: APC UPS low battery voltage description: Battery voltage is lower than nominal (< 95%) - query: "(apcupsd_battery_volts / apcupsd_battery_nominal_volts) < 0.95" + query: "(apcupsd_battery_volts / apcupsd_battery_nominal_volts) < 0.95 and apcupsd_battery_nominal_volts > 0" severity: warning - name: APC UPS high temperature description: Internal temperature is high ({{$value}}°C) @@ -5705,7 +5837,11 @@ groups: description: "Store connection is too slow to `{{$labels.pool}}` pool, `{{$labels.shard}}` shard in Graph node `{{$labels.instance}}`" query: "store_connection_wait_time_ms > 10" severity: warning + comments: | + Threshold of 10ms. Adjust based on your expected database latency. - name: Store connection very slow description: "Store connection is very slow to `{{$labels.pool}}` pool, `{{$labels.shard}}` shard in Graph node `{{$labels.instance}}`" query: "store_connection_wait_time_ms > 20" severity: critical + comments: | + Threshold of 20ms. Adjust based on your expected database latency.