From c37ef8f50c879fc3df5b7ac4fd078835043cf8b5 Mon Sep 17 00:00:00 2001 From: Samuel Berthe Date: Mon, 16 Mar 2026 01:27:18 +0100 Subject: [PATCH] fix: review and fix 74 database & broker alert rules (#504) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix: review and fix 74 database & broker alert rules Comprehensive review of all database and broker alerts covering 16 services. Typos & descriptions (8 fixes): - PGBouncer: "a a server" → "a server" - RabbitMQ: "instace" → "instance", "RabbmitMQ" → "RabbitMQ", "unactive" → "inactive" - Cassandra: write failure said "Read failures", "bad hacker" → "authentication failures" - Solr: replication errors said "failed updates" - Meilisearch: "index is empty" said "instance is down" Duplicates removed (5 fixes): - PostgreSQL: 2 rules using wrong exporter metric (postgresql_errors_total) - ClickHouse: "High Network Traffic" (thread counts) duplicated byte-rate rule - NATS: 2 rules with low thresholds duplicated better rules Broken queries (20 fixes): - Patroni: patroni_master → patroni_primary (renamed in v3) - MongoDB: rate() on gauge → direct ratio for connection queries - MongoDB: removed WiredTiger-incompatible virtual memory rule - Cassandra instaclustr: avg() on counter → rate()[5m] - Cassandra criteo: increase() on JMX rate metric → direct threshold - ClickHouse: increase() on gauge → direct threshold - NATS: rate() on gauge → direct comparison, removed 4 config-value rules - SQL Server: increase() on gauge → direct threshold - Pulsar: moved comparison outside sum() (4 rules) - Hadoop: inverted comparison < 0.2 → > 0.8, counters → increase()[1h] Severity adjustments (7 fixes): - Redis: backup threshold 24h → 48h, rejected connections → warning > 5 - RabbitMQ: no consumer for: 5m with comment - Elasticsearch: unassigned shards added for: 2m - CouchDB: process restarted critical → info - Kafka: consumer group lag → warning, threshold 10000, better description - Hadoop: HBase 
heap low critical → warning Missing for duration (18 fixes): - Added for: 1m to service-down alerts across MySQL, PostgreSQL, SQL Server, Patroni, Redis, MongoDB, RabbitMQ, Elasticsearch, Cassandra, Zookeeper with restart-tolerance comments Division by zero guards (9 fixes): - Added denominator > 0 guards to ratio queries in PostgreSQL, RabbitMQ, Elasticsearch, ClickHouse, CouchDB, NATS Query design improvements (5 fixes): - Cassandra: removed unnecessary sum() and redundant avg_over_time() - ClickHouse: ZooKeeper avg() → per-instance check - PostgreSQL: sum() → sum by (instance) for SSL and locks - PGBouncer: 30s range window → 2m Hardcoded labels (2 fixes): - ClickHouse: added comment about job="clickhouse" - Cassandra criteo: removed hardcoded service="cas" * fix: address PR review comments - Cassandra connection timeouts: wrap rate() in sum by() (rate() by() is invalid PromQL) - Elasticsearch query latency: add division-by-zero guard - Redis backup: "backuped" → "backed up" --- _data/rules.yml | 238 ++++++++++++++++++++++++++---------------------- 1 file changed, 128 insertions(+), 110 deletions(-) diff --git a/_data/rules.yml b/_data/rules.yml index 13a1124..1ac85f7 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -579,6 +579,9 @@ groups: description: MySQL instance is down on {{ $labels.instance }} query: "mysql_up == 0" severity: critical + for: 1m + comments: | + 1m delay allows a restart without triggering an alert. 
- name: MySQL too many connections (> 80%) description: "More than 80% of MySQL connections are in use on {{ $labels.instance }}" query: "max_over_time(mysql_global_status_threads_connected[1m]) / mysql_global_variables_max_connections * 100 > 80" @@ -598,10 +601,16 @@ groups: description: "MySQL Slave IO thread not running on {{ $labels.instance }}" query: "( mysql_slave_status_slave_io_running and ON (instance) mysql_slave_status_master_server_id > 0 ) == 0" severity: critical + for: 1m + comments: | + 1m delay allows a restart without triggering an alert. - name: MySQL Slave SQL thread not running description: "MySQL Slave SQL thread not running on {{ $labels.instance }}" query: "( mysql_slave_status_slave_sql_running and ON (instance) mysql_slave_status_master_server_id > 0) == 0" severity: critical + for: 1m + comments: | + 1m delay allows a restart without triggering an alert. - name: MySQL Slave replication lag description: "MySQL replication lag on {{ $labels.instance }}" query: "( (mysql_slave_status_seconds_behind_master - mysql_slave_status_sql_delay) and ON (instance) mysql_slave_status_master_server_id > 0 ) > 30" @@ -651,6 +660,9 @@ groups: description: Postgresql instance is down query: "pg_up == 0" severity: critical + for: 1m + comments: | + 1m delay allows a restart without triggering an alert. 
- name: Postgresql restarted description: Postgresql restarted query: "time() - pg_postmaster_start_time_seconds < 60" @@ -695,14 +707,6 @@ groups: query: "rate(pg_txid_current[1m]) < 5" severity: warning for: 2m - - name: Postgresql high rate statement timeout - description: Postgres transactions showing high rate of statement timeouts - query: 'rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3' - severity: critical - - name: Postgresql high rate deadlock - description: Postgres detected deadlocks - query: 'increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1' - severity: critical - name: Postgresql unused replication slot description: Unused Replication Slots query: "(pg_replication_slots_active == 0) and (pg_replication_is_replica == 0)" @@ -719,11 +723,11 @@ groups: severity: info - name: Postgresql SSL compression active description: Database allows connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`. - query: "sum(pg_stat_ssl_compression) > 0" + query: "sum by (instance) (pg_stat_ssl_compression) > 0" severity: warning - name: Postgresql too many locks acquired description: Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction. - query: "((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20" + query: "((sum by (instance) (pg_locks_count)) / on (instance) (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20" severity: critical for: 2m - name: Postgresql bloat index high (> 80%) @@ -763,10 +767,14 @@ groups: description: SQL server instance is down query: mssql_up == 0 severity: critical + for: 1m + comments: | + 1m delay allows a restart without triggering an alert. - name: SQL Server deadlock - description: SQL Server is having some deadlock.
- query: increase(mssql_deadlocks[1m]) > 5 + description: SQL Server {{ $labels.instance }} is experiencing deadlocks ({{ $value }}/s) + query: mssql_deadlocks > 5 severity: warning + for: 1m - name: Patroni exporters: @@ -776,8 +784,11 @@ groups: rules: - name: Patroni has no Leader description: A leader node (neither primary nor standby) cannot be found inside the cluster {{ $labels.scope }} - query: (max by (scope) (patroni_master) < 1) and (max by (scope) (patroni_standby_leader) < 1) + query: (max by (scope) (patroni_primary) < 1) and (max by (scope) (patroni_standby_leader) < 1) severity: critical + for: 1m + comments: | + 1m delay allows a restart without triggering an alert. - name: PGBouncer exporters: @@ -791,12 +802,12 @@ groups: severity: warning for: 2m - name: PGBouncer errors - description: PGBouncer is logging errors. This may be due to a a server restart or an admin typing commands at the pgbouncer console. + description: PGBouncer is logging errors. This may be due to a server restart or an admin typing commands at the pgbouncer console. query: 'increase(pgbouncer_errors_count{errmsg!="server conn crashed?"}[1m]) > 10' severity: warning - name: PGBouncer max connections description: The number of PGBouncer client connections has reached max_client_conn. - query: 'increase(pgbouncer_errors_count{errmsg="no more connections allowed (max_client_conn)"}[30s]) > 0' + query: 'increase(pgbouncer_errors_count{errmsg="no more connections allowed (max_client_conn)"}[2m]) > 0' severity: critical - name: Redis @@ -809,6 +820,9 @@ groups: description: Redis instance is down query: "redis_up == 0" severity: critical + for: 1m + comments: | + 1m delay allows a restart without triggering an alert. - name: Redis missing master description: Redis cluster has no node marked as master. query: '(count(redis_instance_info{role="master"}) or vector(0)) < 1' @@ -817,6 +831,9 @@ groups: description: Redis cluster has too many nodes marked as master. 
query: 'count(redis_instance_info{role="master"}) > 1' severity: critical + for: 1m + comments: | + 1m delay allows a restart without triggering an alert. - name: Redis disconnected slaves description: Redis not replicating for all slaves. Consider reviewing the redis replication status. query: "count without (instance, job) (redis_connected_slaves) - sum without (instance, job) (redis_connected_slaves) - 1 > 0" @@ -831,8 +848,8 @@ groups: severity: critical for: 2m - name: Redis missing backup - description: Redis has not been backuped for 24 hours - query: "time() - redis_rdb_last_save_timestamp_seconds > 60 * 60 * 24" + description: Redis has not been backed up for 48 hours + query: "time() - redis_rdb_last_save_timestamp_seconds > 60 * 60 * 48" severity: critical - name: Redis out of system memory description: Redis is running out of system memory (> 90%) @@ -858,8 +875,8 @@ groups: for: 2m - name: Redis rejected connections description: Some connections to Redis has been rejected - query: "increase(redis_rejected_connections_total[1m]) > 0" - severity: critical + query: "increase(redis_rejected_connections_total[1m]) > 5" + severity: warning - name: MongoDB exporters: @@ -871,10 +888,16 @@ groups: description: MongoDB instance is down query: "mongodb_up == 0" severity: critical + for: 1m + comments: | + 1m delay allows a restart without triggering an alert. - name: Mongodb replica member unhealthy description: MongoDB replica member is not healthy query: "mongodb_rs_members_health == 0" severity: critical + for: 1m + comments: | + 1m delay allows a restart without triggering an alert. 
- name: MongoDB replication lag description: Mongodb replication lag is more than 10s query: '(mongodb_rs_members_optimeDate{member_state="PRIMARY"} - on (set) group_right mongodb_rs_members_optimeDate{member_state="SECONDARY"}) / 1000 > 10' @@ -883,6 +906,8 @@ groups: description: MongoDB replication headroom is <= 0 query: 'sum(avg(mongodb_mongod_replset_oplog_head_timestamp - mongodb_mongod_replset_oplog_tail_timestamp)) - sum(avg(mongodb_rs_members_optimeDate{member_state="PRIMARY"} - on (set) group_right mongodb_rs_members_optimeDate{member_state="SECONDARY"})) <= 0' severity: critical + comments: | + This query mixes old (mongodb_mongod_*) and new (mongodb_rs_*) metric names. It requires the Percona exporter to run with --compatible-mode to expose both. - name: MongoDB number cursors open description: Too many cursors opened by MongoDB for clients (> 10k) query: 'mongodb_ss_metrics_cursor_open{csr_type="total"} > 10 * 1000' @@ -895,7 +920,7 @@ groups: for: 2m - name: MongoDB too many connections description: Too many connections (> 80%) - query: 'avg by(instance) (rate(mongodb_ss_connections{conn_type="current"}[1m])) / avg by(instance) (sum (mongodb_ss_connections) by (instance)) * 100 > 80' + query: 'mongodb_ss_connections{conn_type="current"} / ignoring (conn_type) (mongodb_ss_connections{conn_type="current"} + ignoring (conn_type) mongodb_ss_connections{conn_type="available"}) * 100 > 80' severity: warning for: 2m @@ -939,15 +964,9 @@ groups: for: 2m - name: MongoDB too many connections description: Too many connections (> 80%) - query: 'avg by(instance) (rate(mongodb_connections{state="current"}[1m])) / avg by(instance) (sum (mongodb_connections) by (instance)) * 100 > 80' + query: 'mongodb_connections{state="current"} / ignoring (state) (mongodb_connections{state="current"} + ignoring (state) mongodb_connections{state="available"}) * 100 > 80' severity: warning for: 2m - - name: MongoDB virtual memory usage - description: High memory usage - query: '(sum(mongodb_memory{type="virtual"}) BY (instance) /
sum(mongodb_memory{type="mapped"}) BY (instance)) > 3' - severity: warning - for: 2m - - name: stefanprodan/mgob slug: stefanprodan-mgob-exporter doc_url: https://github.com/stefanprodan/mgob @@ -967,10 +986,16 @@ groups: description: Less than 3 nodes running in RabbitMQ cluster query: "sum(rabbitmq_build_info) < 3" severity: critical + for: 1m + comments: | + 1m delay allows a restart without triggering an alert. - name: RabbitMQ node not distributed description: Distribution link state is not 'up' query: "erlang_vm_dist_node_state < 3" severity: critical + for: 1m + comments: | + 1m delay allows a restart without triggering an alert. - name: RabbitMQ instances different versions description: Running different version of RabbitMQ in the same cluster, can lead to failure. query: "count(count(rabbitmq_build_info) by (rabbitmq_version)) > 1" @@ -978,16 +1003,16 @@ groups: for: 1h - name: RabbitMQ memory high description: A node use more than 90% of allocated RAM - query: "rabbitmq_process_resident_memory_bytes / rabbitmq_resident_memory_limit_bytes * 100 > 90" + query: "rabbitmq_process_resident_memory_bytes / rabbitmq_resident_memory_limit_bytes * 100 > 90 and rabbitmq_resident_memory_limit_bytes > 0" severity: warning for: 2m - name: RabbitMQ file descriptors usage description: A node use more than 90% of file descriptors - query: "rabbitmq_process_open_fds / rabbitmq_process_max_fds * 100 > 90" + query: "rabbitmq_process_open_fds / rabbitmq_process_max_fds * 100 > 90 and rabbitmq_process_max_fds > 0" severity: warning for: 2m - name: RabbitMQ too many ready messages - description: RabbitMQ too many ready messages on {{ $labels.instace }} + description: RabbitMQ too many ready messages on {{ $labels.instance }} query: "sum(rabbitmq_queue_messages_ready) BY (queue) > 1000" severity: warning for: 1m @@ -1020,17 +1045,23 @@ groups: description: RabbitMQ node down query: "rabbitmq_up == 0" severity: critical + for: 1m + comments: | + 1m delay allows a restart without 
triggering an alert. - name: RabbitMQ cluster down description: Less than 3 nodes running in RabbitMQ cluster query: "sum(rabbitmq_running) < 3" severity: critical + for: 1m + comments: | + 1m delay allows a restart without triggering an alert. - name: RabbitMQ cluster partition description: Cluster partition query: "rabbitmq_partitions > 0" severity: critical - name: RabbitMQ out of memory - description: Memory available for RabbmitMQ is low (< 10%) - query: "rabbitmq_node_mem_used / rabbitmq_node_mem_limit * 100 > 90" + description: Memory available for RabbitMQ is low (< 10%) + query: "rabbitmq_node_mem_used / rabbitmq_node_mem_limit * 100 > 90 and rabbitmq_node_mem_limit > 0" severity: warning for: 2m - name: RabbitMQ too many connections @@ -1063,14 +1094,16 @@ groups: description: Queue has no consumer query: "rabbitmq_queue_consumers == 0" severity: critical - for: 1m # allows a short service restart + for: 5m + comments: | + Allows a short service restart. - name: RabbitMQ too many consumers description: Queue should have only 1 consumer query: 'rabbitmq_queue_consumers{queue="my-queue"} > 1' severity: critical comments: | Indicate the queue name in dedicated label. - - name: RabbitMQ unactive exchange + - name: RabbitMQ inactive exchange description: Exchange receive less than 5 msgs per second query: 'rate(rabbitmq_exchange_messages_published_in_total{exchange="my-exchange"}[1m]) < 5' severity: warning @@ -1115,10 +1148,16 @@ groups: description: "Missing node in Elasticsearch cluster" query: "elasticsearch_cluster_health_number_of_nodes < 3" severity: critical + for: 1m + comments: | + 1m delay allows a restart without triggering an alert. - name: Elasticsearch Healthy Data Nodes description: "Missing data node in Elasticsearch cluster" query: "elasticsearch_cluster_health_number_of_data_nodes < 3" severity: critical + for: 1m + comments: | + 1m delay allows a restart without triggering an alert. 
- name: Elasticsearch relocating shards description: "Elasticsearch is relocating shards" query: "elasticsearch_cluster_health_relocating_shards > 0" @@ -1141,6 +1180,7 @@ groups: description: "Elasticsearch has unassigned shards" query: "elasticsearch_cluster_health_unassigned_shards > 0" severity: critical + for: 2m - name: Elasticsearch pending tasks description: "Elasticsearch has pending tasks. Cluster works slowly." query: "elasticsearch_cluster_health_number_of_pending_tasks > 0" @@ -1152,7 +1192,7 @@ groups: severity: warning - name: Elasticsearch High Indexing Latency description: "The indexing latency on Elasticsearch cluster is higher than the threshold." - query: "increase(elasticsearch_indices_indexing_index_time_seconds_total[1m]) / increase(elasticsearch_indices_indexing_index_total[1m]) > 0.0005" + query: "increase(elasticsearch_indices_indexing_index_time_seconds_total[1m]) / increase(elasticsearch_indices_indexing_index_total[1m]) > 0.0005 and increase(elasticsearch_indices_indexing_index_total[1m]) > 0" severity: warning for: 10m - name: Elasticsearch High Indexing Rate @@ -1167,7 +1207,7 @@ groups: for: 5m - name: Elasticsearch High Query Latency description: "The query latency on Elasticsearch cluster is higher than the threshold." 
- query: "increase(elasticsearch_indices_search_fetch_time_seconds[1m]) / increase(elasticsearch_indices_search_fetch_total[1m]) > 1" + query: "increase(elasticsearch_indices_search_query_time_seconds[1m]) / increase(elasticsearch_indices_search_query_total[1m]) > 1 and increase(elasticsearch_indices_search_query_total[1m]) > 0" severity: warning for: 5m @@ -1178,7 +1218,7 @@ groups: doc_url: https://github.com/orgs/meilisearch/discussions/625 rules: - name: Meilisearch index is empty - description: Meilisearch instance is down + description: Meilisearch index {{ $labels.index }} has zero documents query: "meilisearch_index_docs_count == 0" severity: warning - name: Meilisearch http response time @@ -1194,8 +1234,11 @@ groups: rules: - name: "Cassandra Node is unavailable" description: "Cassandra Node is unavailable - {{ $labels.cassandra_cluster }} {{ $labels.exported_endpoint }}" - query: "sum(cassandra_endpoint_active) by (cassandra_cluster,instance,exported_endpoint) < 1" + query: "cassandra_endpoint_active < 1" severity: critical + for: 1m + comments: | + 1m delay allows a restart without triggering an alert. 
- name: "Cassandra many compaction tasks are pending" description: "Many Cassandra compaction tasks are pending - {{ $labels.cassandra_cluster }}" query: "cassandra_table_estimated_pending_compactions > 100" @@ -1217,7 +1260,7 @@ groups: severity: warning - name: "Cassandra connection timeouts total" description: "Some connection between nodes are ending in timeout - {{ $labels.cassandra_cluster }}" - query: "avg(cassandra_client_request_timeouts_total) by (cassandra_cluster,instance) > 5" + query: "sum by (cassandra_cluster,instance) (rate(cassandra_client_request_timeouts_total[5m])) > 5" for: 2m severity: critical - name: "Cassandra storage exceptions" @@ -1240,7 +1283,7 @@ groups: for: 2m severity: critical - name: "Cassandra client request write failure" - description: "Read failures have occurred, ensure there are not too many unavailable nodes - {{ $labels.cassandra_cluster }}" + description: "Write failures have occurred, ensure there are not too many unavailable nodes - {{ $labels.cassandra_cluster }}" query: 'increase(cassandra_client_request_failures_total{operation="write"}[1m]) > 0' for: 2m severity: critical @@ -1260,15 +1303,15 @@ groups: severity: critical - name: Cassandra compaction task pending description: Many Cassandra compaction tasks are pending. You might need to increase I/O capacity by adding nodes to the cluster. 
- query: 'avg_over_time(cassandra_stats{name="org:apache:cassandra:metrics:compaction:pendingtasks:value"}[1m]) > 100' + query: 'cassandra_stats{name="org:apache:cassandra:metrics:compaction:pendingtasks:value"} > 100' severity: warning for: 2m - name: Cassandra viewwrite latency description: High viewwrite latency on {{ $labels.instance }} cassandra node - query: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:viewwrite:viewwritelatency:99thpercentile",service="cas"} > 100000' + query: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:viewwrite:viewwritelatency:99thpercentile"} > 100000' severity: warning for: 2m - - name: Cassandra bad hacker + - name: Cassandra authentication failures description: Increase of Cassandra authentication failures query: 'rate(cassandra_stats{name="org:apache:cassandra:metrics:client:authfailure:count"}[1m]) > 5' severity: warning @@ -1277,6 +1320,9 @@ groups: description: Cassandra node down query: 'sum(cassandra_stats{name="org:apache:cassandra:net:failuredetector:downendpointcount"}) by (service,group,cluster,env) > 0' severity: critical + for: 1m + comments: | + 1m delay allows a restart without triggering an alert. - name: Cassandra commitlog pending tasks description: Unexpected number of Cassandra commitlog pending tasks query: 'cassandra_stats{name="org:apache:cassandra:metrics:commitlog:pendingtasks:value"} > 15' @@ -1325,11 +1371,11 @@ groups: severity: critical - name: Cassandra client request write failure description: A lot of write failures encountered. A write failure is a non-timeout exception encountered during a write request. Examine the reason map to find to the root cause. The most common cause for this type of error is when batch sizes are too large. 
- query: 'increase(cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:write:failures:oneminuterate"}[1m]) > 0' + query: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:write:failures:oneminuterate"} > 0' severity: critical - name: Cassandra client request read failure description: A lot of read failures encountered. A read failure is a non-timeout exception encountered during a read request. Examine the reason map to find to the root cause. The most common cause for this type of error is when batch sizes are too large. - query: 'increase(cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:read:failures:oneminuterate"}[1m]) > 0' + query: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:read:failures:oneminuterate"} > 0' severity: critical - name: Cassandra cache hit rate key cache description: Key cache hit rate is below 85% @@ -1348,14 +1394,16 @@ groups: query: 'up{job="clickhouse"} == 0' severity: critical for: 2m + comments: | + Adjust the job label to match your Prometheus configuration. - name: ClickHouse Memory Usage Critical description: "Memory usage is critically high, over 90%." - query: "ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 90" + query: "ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 90 and ClickHouseAsyncMetrics_CGroupMemoryTotal > 0" severity: critical for: 5m - name: ClickHouse Memory Usage Warning description: "Memory usage is over 80%." 
- query: "ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 80" + query: "ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 80 and ClickHouseAsyncMetrics_CGroupMemoryTotal > 0" severity: warning for: 5m - name: ClickHouse Disk Space Low on Default @@ -1388,13 +1436,6 @@ groups: query: "ClickHouseErrorMetric_TOO_FEW_LIVE_REPLICAS == 1" severity: critical - - name: ClickHouse High Network Traffic - description: "Network traffic is unusually high, may affect cluster performance." - query: "ClickHouseMetrics_NetworkSend > 250 or ClickHouseMetrics_NetworkReceive > 250" - severity: warning - for: 5m - comments: | - Please replace the threshold with an appropriate value - name: ClickHouse High TCP Connections description: "High number of TCP connections, indicating heavy client or inter-cluster communication." query: "ClickHouseMetrics_TCPConnection > 400" @@ -1403,13 +1444,15 @@ groups: comments: | Please replace the threshold with an appropriate value - name: ClickHouse Interserver Connection Issues - description: "An increase in interserver connections may indicate replication or distributed query handling issues." - query: "increase(ClickHouseMetrics_InterserverConnection[5m]) > 0" + description: "High number of interserver connections may indicate replication or distributed query handling issues." + query: "ClickHouseMetrics_InterserverConnection > 50" severity: warning - for: 1m + for: 5m + comments: | + Adjust the threshold based on your cluster size and expected replication traffic. - name: ClickHouse ZooKeeper Connection Issues description: "ClickHouse is experiencing issues with ZooKeeper connections, which may affect cluster state and coordination." 
- query: "avg(ClickHouseMetrics_ZooKeeperSession) != 1" + query: "ClickHouseMetrics_ZooKeeperSession != 1" severity: warning for: 3m - name: ClickHouse Authentication Failures @@ -1478,7 +1521,7 @@ groups: for: 5m - name: CouchDB 5xx error ratio high description: More than 5% of HTTP requests are returning 5xx errors - query: "rate(couchdb_httpd_status_codes{code=~\"5..\"}[5m]) / rate(couchdb_httpd_requests[5m]) > 0.05" + query: "rate(couchdb_httpd_status_codes{code=~\"5..\"}[5m]) / rate(couchdb_httpd_requests[5m]) > 0.05 and rate(couchdb_httpd_requests[5m]) > 0" severity: critical for: 5m - name: CouchDB temporary view read rate critical @@ -1539,7 +1582,7 @@ groups: - name: CouchDB process restarted description: CouchDB process has restarted recently query: "changes(process_start_time_seconds[1h]) > 0" - severity: critical + severity: info for: 1m - name: CouchDB critical log entries description: Critical or error log entries detected in the last 5 minutes @@ -1561,6 +1604,9 @@ groups: description: "Zookeeper down on instance {{ $labels.instance }}" query: "zk_up == 0" severity: critical + for: 1m + comments: | + 1m delay allows a restart without triggering an alert. 
- name: Zookeeper missing leader description: "Zookeeper cluster has no node marked as leader" query: "sum(zk_server_leader) == 0" @@ -1585,10 +1631,10 @@ groups: description: Kafka topic in-sync partition query: "min(kafka_topic_partition_in_sync_replica) by (topic) < 3" severity: critical - - name: Kafka consumers group - description: Kafka consumers group - query: "sum(kafka_consumergroup_lag) by (consumergroup) > 50" - severity: critical + - name: Kafka consumer group lag + description: Kafka consumer group {{ $labels.consumergroup }} is lagging behind ({{ $value }} messages) + query: "sum(kafka_consumergroup_lag) by (consumergroup) > 10000" + severity: warning for: 1m - name: linkedin/Burrow slug: linkedin-kafka-exporter @@ -1622,12 +1668,12 @@ groups: severity: critical - name: Pulsar topic large backlog storage size description: "The topic backlog storage size is over 5 GB" - query: sum(pulsar_storage_size > 5*1024*1024*1024) by (topic) + query: sum(pulsar_storage_size) by (topic) > 5*1024*1024*1024 for: 1h severity: warning - name: Pulsar topic very large backlog storage size description: "The topic backlog storage size is over 20 GB" - query: sum(pulsar_storage_size > 20*1024*1024*1024) by (topic) + query: sum(pulsar_storage_size) by (topic) > 20*1024*1024*1024 for: 1h severity: critical - name: Pulsar high write latency @@ -1652,12 +1698,12 @@ groups: severity: critical - name: Pulsar high number of function errors description: "Observing more than 10 Function errors per minute" - query: sum((rate(pulsar_function_user_exceptions_total{}[1m]) + rate(pulsar_function_system_exceptions_total{}[1m])) > 10) by (name) + query: sum(rate(pulsar_function_user_exceptions_total[1m]) + rate(pulsar_function_system_exceptions_total[1m])) by (name) > 10 for: 1m severity: critical - name: Pulsar high number of sink errors description: "Observing more than 10 Sink errors per minute" - query: sum(rate(pulsar_sink_sink_exceptions_total{}[1m]) > 10) by (name) + query: 
sum(rate(pulsar_sink_sink_exceptions_total[1m])) by (name) > 10 for: 1m severity: critical @@ -1667,16 +1713,6 @@ groups: slug: nats-exporter doc_url: https://github.com/nats-io/prometheus-nats-exporter rules: - - name: Nats high connection count - description: High number of NATS connections ({{ $value }}) for {{ $labels.instance }} - query: "gnatsd_varz_connections > 100" - severity: warning - for: 3m - - name: Nats high subscriptions count - description: High number of NATS subscriptions ({{ $value }}) for {{ $labels.instance }} - query: "gnatsd_connz_subscriptions > 50" - severity: warning - for: 3m - name: Nats high routes count description: High number of NATS routes ({{ $value }}) for {{ $labels.instance }} query: "gnatsd_varz_routes > 10" @@ -1699,9 +1735,11 @@ groups: for: 5m - name: Nats high CPU usage description: NATS server is using more than 80% CPU for the last 5 minutes - query: "rate(gnatsd_varz_cpu[5m]) > 0.8" + query: "gnatsd_varz_cpu > 80" severity: warning for: 5m + comments: | + gnatsd_varz_cpu is a gauge reporting CPU percentage (0-100 scale). 
- name: Nats high number of connections description: NATS server has more than 1000 active connections query: "gnatsd_connz_num_connections > 1000" @@ -1709,12 +1747,12 @@ groups: for: 5m - name: Nats high JetStream store usage description: JetStream store usage is over 80% - query: "gnatsd_varz_jetstream_stats_storage / gnatsd_varz_jetstream_config_max_storage > 0.8" + query: "gnatsd_varz_jetstream_stats_storage / gnatsd_varz_jetstream_config_max_storage > 0.8 and gnatsd_varz_jetstream_config_max_storage > 0" severity: warning for: 5m - name: Nats high JetStream memory usage description: JetStream memory usage is over 80% - query: "gnatsd_varz_jetstream_stats_memory / gnatsd_varz_jetstream_config_max_memory > 0.8" + query: "gnatsd_varz_jetstream_stats_memory / gnatsd_varz_jetstream_config_max_memory > 0.8 and gnatsd_varz_jetstream_config_max_memory > 0" severity: warning for: 5m - name: Nats high number of subscriptions @@ -1732,36 +1770,16 @@ groups: query: "increase(gnatsd_varz_jetstream_stats_api_errors[5m]) > 0" severity: warning for: 5m - - name: Nats JetStream consumers exceeded - description: JetStream has more than 100 active consumers + - name: Nats JetStream accounts exceeded + description: JetStream has more than 100 active accounts query: "sum(gnatsd_varz_jetstream_stats_accounts) > 100" severity: warning for: 5m - - name: Nats frequent authentication timeouts - description: There have been more than 5 authentication timeouts in the last 5 minutes - query: "increase(gnatsd_varz_auth_timeout[5m]) > 5" - severity: warning - for: 5m - - name: Nats max payload size exceeded - description: The max payload size allowed by NATS has been exceeded (1MB) - query: "max(gnatsd_varz_max_payload) > 1024 * 1024" - severity: critical - for: 5m - name: Nats leaf node connection issue - description: No leaf node connections have been established in the last 5 minutes - query: "increase(gnatsd_varz_leafnodes[5m]) == 0" - severity: critical - for: 5m - - name: Nats max 
ping operations exceeded - description: The maximum number of ping operations in NATS has exceeded 50 - query: "gnatsd_varz_ping_max > 50" + description: No leaf node connections on {{ $labels.instance }} + query: "gnatsd_varz_leafnodes == 0" severity: warning for: 5m - - name: Nats write deadline exceeded - description: The write deadline has been exceeded in NATS, indicating potential message delivery issues - query: "gnatsd_varz_write_deadline > 10" - severity: critical - for: 5m - name: Solr exporters: @@ -1779,7 +1797,7 @@ groups: severity: warning for: 5m - name: Solr replication errors - description: Solr collection {{ $labels.collection }} has failed updates for replica {{ $labels.replica }} on {{ $labels.base_url }}. + description: Solr collection {{ $labels.collection }} has replication errors for replica {{ $labels.replica }} on {{ $labels.base_url }}. query: 'increase(solr_metrics_core_errors_total{category="REPLICATION"}[1m]) > 1' severity: critical - name: Solr low live node count @@ -1823,7 +1841,7 @@ groups: # Alert rule for excessive MapReduce task failures - name: Hadoop Map Reduce Task Failures - query: hadoop_mapreduce_task_failures_total > 100 + query: increase(hadoop_mapreduce_task_failures_total[1h]) > 100 for: 10m severity: critical description: "There is an unusually high number of MapReduce task failures." @@ -1837,7 +1855,7 @@ groups: # Alert rule for high YARN container allocation failures - name: Hadoop YARN Container Allocation Failures - query: hadoop_yarn_container_allocation_failures_total > 10 + query: increase(hadoop_yarn_container_allocation_failures_total[1h]) > 10 for: 10m severity: warning description: "There is a significant number of YARN container allocation failures." 
@@ -1851,9 +1869,9 @@ groups: # Alert rule for low HBase region server heap space - name: Hadoop HBase Region Server Heap Low - query: hadoop_hbase_region_server_heap_bytes / hadoop_hbase_region_server_max_heap_bytes < 0.2 + query: hadoop_hbase_region_server_heap_bytes / hadoop_hbase_region_server_max_heap_bytes > 0.8 for: 10m - severity: critical + severity: warning description: "HBase Region Servers are running low on heap space." # Alert rule for high HBase Write Requests latency