Publish

2026-06-21 00:47:18 +08:00 · 2026-03-16 00:27:40 +00:00 · 2026-03-16 00:27:40 +00:00 · e2af1325c6
commit e2af1325c6
parent c37ef8f50c
22 changed files with 109 additions and 176 deletions
--- a/dist/rules/cassandra/criteo-cassandra-exporter.yml
+++ b/dist/rules/cassandra/criteo-cassandra-exporter.yml
@ -15,7 +15,7 @@ groups:
        description: "Cassandra hints count has changed on {{ $labels.instance }} some nodes may go down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CassandraCompactionTaskPending
-      expr: 'avg_over_time(cassandra_stats{name="org:apache:cassandra:metrics:compaction:pendingtasks:value"}[1m]) > 100'
+      expr: 'cassandra_stats{name="org:apache:cassandra:metrics:compaction:pendingtasks:value"} > 100'
      for: 2m
      labels:
        severity: warning
@ -24,7 +24,7 @@ groups:
        description: "Many Cassandra compaction tasks are pending. You might need to increase I/O capacity by adding nodes to the cluster.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CassandraViewwriteLatency
-      expr: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:viewwrite:viewwritelatency:99thpercentile",service="cas"} > 100000'
+      expr: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:viewwrite:viewwritelatency:99thpercentile"} > 100000'
      for: 2m
      labels:
        severity: warning
@ -32,18 +32,19 @@ groups:
        summary: Cassandra viewwrite latency (instance {{ $labels.instance }})
        description: "High viewwrite latency on {{ $labels.instance }} cassandra node\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: CassandraBadHacker
+    - alert: CassandraAuthenticationFailures
      expr: 'rate(cassandra_stats{name="org:apache:cassandra:metrics:client:authfailure:count"}[1m]) > 5'
      for: 2m
      labels:
        severity: warning
      annotations:
-        summary: Cassandra bad hacker (instance {{ $labels.instance }})
+        summary: Cassandra authentication failures (instance {{ $labels.instance }})
        description: "Increase of Cassandra authentication failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # 1m delay allows a restart without triggering an alert.
    - alert: CassandraNodeDown
      expr: 'sum(cassandra_stats{name="org:apache:cassandra:net:failuredetector:downendpointcount"}) by (service,group,cluster,env) > 0'
-      for: 0m
+      for: 1m
      labels:
        severity: critical
      annotations:
@ -141,7 +142,7 @@ groups:
        description: "Read failures have occurred because too many nodes are unavailable\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CassandraClientRequestWriteFailure
-      expr: 'increase(cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:write:failures:oneminuterate"}[1m]) > 0'
+      expr: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:write:failures:oneminuterate"} > 0'
      for: 0m
      labels:
        severity: critical
@ -150,7 +151,7 @@ groups:
        description: "A lot of write failures encountered. A write failure is a non-timeout exception encountered during a write request. Examine the reason map to find to the root cause. The most common cause for this type of error is when batch sizes are too large.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CassandraClientRequestReadFailure
-      expr: 'increase(cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:read:failures:oneminuterate"}[1m]) > 0'
+      expr: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:read:failures:oneminuterate"} > 0'
      for: 0m
      labels:
        severity: critical
--- a/dist/rules/cassandra/instaclustr-cassandra-exporter.yml
+++ b/dist/rules/cassandra/instaclustr-cassandra-exporter.yml
@ -5,9 +5,10 @@ groups:
  
  rules:

+    # 1m delay allows a restart without triggering an alert.
    - alert: CassandraNodeIsUnavailable
-      expr: 'sum(cassandra_endpoint_active) by (cassandra_cluster,instance,exported_endpoint) < 1'
-      for: 0m
+      expr: 'cassandra_endpoint_active < 1'
+      for: 1m
      labels:
        severity: critical
      annotations:
@ -51,7 +52,7 @@ groups:
        description: "Some Cassandra flush writer tasks are blocked - {{ $labels.cassandra_cluster }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CassandraConnectionTimeoutsTotal
-      expr: 'avg(cassandra_client_request_timeouts_total) by (cassandra_cluster,instance) > 5'
+      expr: 'sum by (cassandra_cluster,instance) (rate(cassandra_client_request_timeouts_total[5m])) > 5'
      for: 2m
      labels:
        severity: critical
@ -102,7 +103,7 @@ groups:
        severity: critical
      annotations:
        summary: Cassandra client request write failure (instance {{ $labels.instance }})
-        description: "Read failures have occurred, ensure there are not too many unavailable nodes - {{ $labels.cassandra_cluster }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Write failures have occurred, ensure there are not too many unavailable nodes - {{ $labels.cassandra_cluster }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CassandraClientRequestReadFailure
      expr: 'increase(cassandra_client_request_failures_total{operation="read"}[1m]) > 0'
--- a/dist/rules/clickhouse/embedded-exporter.yml
+++ b/dist/rules/clickhouse/embedded-exporter.yml
@ -5,6 +5,7 @@ groups:
  
  rules:

+    # Adjust the job label to match your Prometheus configuration.
    - alert: ClickhouseNodeDown
      expr: 'up{job="clickhouse"} == 0'
      for: 2m
@ -15,7 +16,7 @@ groups:
        description: "No metrics received from ClickHouse exporter for over 2 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ClickhouseMemoryUsageCritical
-      expr: 'ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 90'
+      expr: 'ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 90 and ClickHouseAsyncMetrics_CGroupMemoryTotal > 0'
      for: 5m
      labels:
        severity: critical
@ -24,7 +25,7 @@ groups:
        description: "Memory usage is critically high, over 90%.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ClickhouseMemoryUsageWarning
-      expr: 'ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 80'
+      expr: 'ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 80 and ClickHouseAsyncMetrics_CGroupMemoryTotal > 0'
      for: 5m
      labels:
        severity: warning
@ -86,16 +87,6 @@ groups:
        summary: ClickHouse No Live Replicas (instance {{ $labels.instance }})
        description: "There are too few live replicas available, risking data loss and service disruption.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    # Please replace the threshold with an appropriate value
-    - alert: ClickhouseHighNetworkTraffic
-      expr: 'ClickHouseMetrics_NetworkSend > 250 or ClickHouseMetrics_NetworkReceive > 250'
-      for: 5m
-      labels:
-        severity: warning
-      annotations:
-        summary: ClickHouse High Network Traffic (instance {{ $labels.instance }})
-        description: "Network traffic is unusually high, may affect cluster performance.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
    # Please replace the threshold with an appropriate value
    - alert: ClickhouseHighTcpConnections
      expr: 'ClickHouseMetrics_TCPConnection > 400'
@ -106,17 +97,18 @@ groups:
        summary: ClickHouse High TCP Connections (instance {{ $labels.instance }})
        description: "High number of TCP connections, indicating heavy client or inter-cluster communication.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # Adjust the threshold based on your cluster size and expected replication traffic.
    - alert: ClickhouseInterserverConnectionIssues
-      expr: 'increase(ClickHouseMetrics_InterserverConnection[5m]) > 0'
-      for: 1m
+      expr: 'ClickHouseMetrics_InterserverConnection > 50'
+      for: 5m
      labels:
        severity: warning
      annotations:
        summary: ClickHouse Interserver Connection Issues (instance {{ $labels.instance }})
-        description: "An increase in interserver connections may indicate replication or distributed query handling issues.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "High number of interserver connections may indicate replication or distributed query handling issues.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ClickhouseZookeeperConnectionIssues
-      expr: 'avg(ClickHouseMetrics_ZooKeeperSession) != 1'
+      expr: 'ClickHouseMetrics_ZooKeeperSession != 1'
      for: 3m
      labels:
        severity: warning
--- a/dist/rules/couchdb/gesellix-couchdb-prometheus-exporter.yml
+++ b/dist/rules/couchdb/gesellix-couchdb-prometheus-exporter.yml
@ -42,7 +42,8 @@ groups:
        description: "CouchDB is using more than 90% of allowed OS file descriptors, may fail to open new files\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # The 5xx rate is summed without its code label so it can match the request rate (assumes couchdb_httpd_requests has no code label - verify against the exporter).
    - alert: Couchdb5xxErrorRatioHigh
-      expr: 'rate(couchdb_httpd_status_codes{code=~"5.."}[5m]) / rate(couchdb_httpd_requests[5m]) > 0.05'
+      expr: 'sum without (code) (rate(couchdb_httpd_status_codes{code=~"5.."}[5m])) / rate(couchdb_httpd_requests[5m]) > 0.05 and rate(couchdb_httpd_requests[5m]) > 0'
      for: 5m
      labels:
        severity: critical
@ -153,7 +154,7 @@ groups:
      expr: 'changes(process_start_time_seconds[1h]) > 0'
      for: 1m
      labels:
-        severity: critical
+        severity: info
      annotations:
        summary: CouchDB process restarted (instance {{ $labels.instance }})
        description: "CouchDB process has restarted recently\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
--- a/dist/rules/elasticsearch/prometheus-community-elasticsearch-exporter.yml
+++ b/dist/rules/elasticsearch/prometheus-community-elasticsearch-exporter.yml
@ -59,18 +59,20 @@ groups:
        summary: Elasticsearch Cluster Yellow (instance {{ $labels.instance }})
        description: "Elastic Cluster Yellow status\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # 1m delay allows a restart without triggering an alert.
    - alert: ElasticsearchHealthyNodes
      expr: 'elasticsearch_cluster_health_number_of_nodes < 3'
-      for: 0m
+      for: 1m
      labels:
        severity: critical
      annotations:
        summary: Elasticsearch Healthy Nodes (instance {{ $labels.instance }})
        description: "Missing node in Elasticsearch cluster\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # 1m delay allows a restart without triggering an alert.
    - alert: ElasticsearchHealthyDataNodes
      expr: 'elasticsearch_cluster_health_number_of_data_nodes < 3'
-      for: 0m
+      for: 1m
      labels:
        severity: critical
      annotations:
@ -115,7 +117,7 @@ groups:

    - alert: ElasticsearchUnassignedShards
      expr: 'elasticsearch_cluster_health_unassigned_shards > 0'
-      for: 0m
+      for: 2m
      labels:
        severity: critical
      annotations:
@ -141,7 +143,7 @@ groups:
        description: "No new documents for 10 min!\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ElasticsearchHighIndexingLatency
-      expr: 'increase(elasticsearch_indices_indexing_index_time_seconds_total[1m]) / increase(elasticsearch_indices_indexing_index_total[1m]) > 0.0005'
+      expr: 'increase(elasticsearch_indices_indexing_index_time_seconds_total[1m]) / increase(elasticsearch_indices_indexing_index_total[1m]) > 0.0005 and increase(elasticsearch_indices_indexing_index_total[1m]) > 0'
      for: 10m
      labels:
        severity: warning
@ -168,7 +170,7 @@ groups:
        description: "The query rate on Elasticsearch cluster is higher than the threshold.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ElasticsearchHighQueryLatency
-      expr: 'increase(elasticsearch_indices_search_fetch_time_seconds[1m]) / increase(elasticsearch_indices_search_fetch_total[1m]) > 1'
+      expr: 'increase(elasticsearch_indices_search_query_time_seconds[1m]) / increase(elasticsearch_indices_search_query_total[1m]) > 1 and increase(elasticsearch_indices_search_query_total[1m]) > 0'
      for: 5m
      labels:
        severity: warning
--- a/dist/rules/hadoop/jmx_exporter.yml
+++ b/dist/rules/hadoop/jmx_exporter.yml
@ -42,7 +42,7 @@ groups:
        description: "Available HDFS disk space is running low.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HadoopMapReduceTaskFailures
-      expr: 'hadoop_mapreduce_task_failures_total > 100'
+      expr: 'increase(hadoop_mapreduce_task_failures_total[1h]) > 100'
      for: 10m
      labels:
        severity: critical
@ -60,7 +60,7 @@ groups:
        description: "The Hadoop ResourceManager is approaching its memory limit.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HadoopYarnContainerAllocationFailures
-      expr: 'hadoop_yarn_container_allocation_failures_total > 10'
+      expr: 'increase(hadoop_yarn_container_allocation_failures_total[1h]) > 10'
      for: 10m
      labels:
        severity: warning
@ -78,10 +78,10 @@ groups:
        description: "The HBase cluster has an unusually high number of regions.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HadoopHbaseRegionServerHeapLow
-      expr: 'hadoop_hbase_region_server_heap_bytes / hadoop_hbase_region_server_max_heap_bytes < 0.2'
+      expr: 'hadoop_hbase_region_server_heap_bytes / hadoop_hbase_region_server_max_heap_bytes > 0.8'
      for: 10m
      labels:
-        severity: critical
+        severity: warning
      annotations:
        summary: Hadoop HBase Region Server Heap Low (instance {{ $labels.instance }})
        description: "HBase Region Servers are running low on heap space.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
--- a/dist/rules/kafka/danielqsj-kafka-exporter.yml
+++ b/dist/rules/kafka/danielqsj-kafka-exporter.yml
@ -14,11 +14,11 @@ groups:
        summary: Kafka topics replicas (instance {{ $labels.instance }})
        description: "Kafka topic in-sync partition\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: KafkaConsumersGroup
-      expr: 'sum(kafka_consumergroup_lag) by (consumergroup) > 50'
+    - alert: KafkaConsumerGroupLag
+      expr: 'sum(kafka_consumergroup_lag) by (consumergroup) > 10000'
      for: 1m
      labels:
-        severity: critical
+        severity: warning
      annotations:
-        summary: Kafka consumers group (instance {{ $labels.instance }})
-        description: "Kafka consumers group\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        summary: Kafka consumer group lag (instance {{ $labels.instance }})
+        description: "Kafka consumer group {{ $labels.consumergroup }} is lagging behind ({{ $value }} messages)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
--- a/dist/rules/meilisearch/embedded-exporter.yml
+++ b/dist/rules/meilisearch/embedded-exporter.yml
@ -12,7 +12,7 @@ groups:
        severity: warning
      annotations:
        summary: Meilisearch index is empty (instance {{ $labels.instance }})
-        description: "Meilisearch instance is down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Meilisearch index {{ $labels.index }} has zero documents\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: MeilisearchHttpResponseTime
      expr: 'meilisearch_http_response_time_seconds > 0.5'
--- a/dist/rules/mongodb/dcu-mongodb-exporter.yml
+++ b/dist/rules/mongodb/dcu-mongodb-exporter.yml
@ -78,19 +78,11 @@ groups:
        description: "Too many cursors are timing out\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # The operands differ in their state label, so ignoring(state) is needed for the series to be matched with each other.
    - alert: MongodbTooManyConnections
-      expr: 'avg by(instance) (rate(mongodb_connections{state="current"}[1m])) / avg by(instance) (sum (mongodb_connections) by (instance)) * 100 > 80'
+      expr: 'mongodb_connections{state="current"} / ignoring(state) (mongodb_connections{state="current"} + ignoring(state) mongodb_connections{state="available"}) * 100 > 80'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: MongoDB too many connections (instance {{ $labels.instance }})
        description: "Too many connections (> 80%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: MongodbVirtualMemoryUsage
-      expr: '(sum(mongodb_memory{type="virtual"}) BY (instance) / sum(mongodb_memory{type="mapped"}) BY (instance)) > 3'
-      for: 2m
-      labels:
-        severity: warning
-      annotations:
-        summary: MongoDB virtual memory usage (instance {{ $labels.instance }})
-        description: "High memory usage\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
--- a/dist/rules/mongodb/percona-mongodb-exporter.yml
+++ b/dist/rules/mongodb/percona-mongodb-exporter.yml
@ -5,18 +5,20 @@ groups:
  
  rules:

+    # 1m delay allows a restart without triggering an alert.
    - alert: MongodbDown
      expr: 'mongodb_up == 0'
-      for: 0m
+      for: 1m
      labels:
        severity: critical
      annotations:
        summary: MongoDB Down (instance {{ $labels.instance }})
        description: "MongoDB instance is down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # 1m delay allows a restart without triggering an alert.
    - alert: MongodbReplicaMemberUnhealthy
      expr: 'mongodb_rs_members_health == 0'
-      for: 0m
+      for: 1m
      labels:
        severity: critical
      annotations:
@ -32,6 +34,7 @@ groups:
        summary: MongoDB replication lag (instance {{ $labels.instance }})
        description: "Mongodb replication lag is more than 10s\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # This query mixes old (mongodb_mongod_*) and new (mongodb_rs_*) metric names. It requires the Percona exporter to run with --compatible-mode to expose both.
    - alert: MongodbReplicationHeadroom
      expr: 'sum(avg(mongodb_mongod_replset_oplog_head_timestamp - mongodb_mongod_replset_oplog_tail_timestamp)) - sum(avg(mongodb_rs_members_optimeDate{member_state="PRIMARY"} - on (set) group_right mongodb_rs_members_optimeDate{member_state="SECONDARY"})) <= 0'
      for: 0m
@ -60,7 +63,8 @@ groups:
        description: "Too many cursors are timing out\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # The operands differ in their conn_type label, so ignoring(conn_type) is needed for the series to be matched with each other.
    - alert: MongodbTooManyConnections
-      expr: 'avg by(instance) (rate(mongodb_ss_connections{conn_type="current"}[1m])) / avg by(instance) (sum (mongodb_ss_connections) by (instance)) * 100 > 80'
+      expr: 'mongodb_ss_connections{conn_type="current"} / ignoring(conn_type) (mongodb_ss_connections{conn_type="current"} + ignoring(conn_type) mongodb_ss_connections{conn_type="available"}) * 100 > 80'
      for: 2m
      labels:
        severity: warning
--- a/dist/rules/mysql/mysqld-exporter.yml
+++ b/dist/rules/mysql/mysqld-exporter.yml
@ -5,9 +5,10 @@ groups:
  
  rules:

+    # 1m delay allows a restart without triggering an alert.
    - alert: MysqlDown
      expr: 'mysql_up == 0'
-      for: 0m
+      for: 1m
      labels:
        severity: critical
      annotations:
@ -41,18 +42,20 @@ groups:
        summary: MySQL high threads running (instance {{ $labels.instance }})
        description: "More than 60% of MySQL connections are in running state on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # 1m delay allows a restart without triggering an alert.
    - alert: MysqlSlaveIoThreadNotRunning
      expr: '( mysql_slave_status_slave_io_running and ON (instance) mysql_slave_status_master_server_id > 0 ) == 0'
-      for: 0m
+      for: 1m
      labels:
        severity: critical
      annotations:
        summary: MySQL Slave IO thread not running (instance {{ $labels.instance }})
        description: "MySQL Slave IO thread not running on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # 1m delay allows a restart without triggering an alert.
    - alert: MysqlSlaveSqlThreadNotRunning
      expr: '( mysql_slave_status_slave_sql_running and ON (instance) mysql_slave_status_master_server_id > 0) == 0'
-      for: 0m
+      for: 1m
      labels:
        severity: critical
      annotations:
--- a/dist/rules/nats/nats-exporter.yml
+++ b/dist/rules/nats/nats-exporter.yml
@ -5,24 +5,6 @@ groups:
  
  rules:

-    - alert: NatsHighConnectionCount
-      expr: 'gnatsd_varz_connections > 100'
-      for: 3m
-      labels:
-        severity: warning
-      annotations:
-        summary: Nats high connection count (instance {{ $labels.instance }})
-        description: "High number of NATS connections ({{ $value }}) for {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: NatsHighSubscriptionsCount
-      expr: 'gnatsd_connz_subscriptions > 50'
-      for: 3m
-      labels:
-        severity: warning
-      annotations:
-        summary: Nats high subscriptions count (instance {{ $labels.instance }})
-        description: "High number of NATS subscriptions ({{ $value }}) for {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
    - alert: NatsHighRoutesCount
      expr: 'gnatsd_varz_routes > 10'
      for: 3m
@ -59,8 +41,9 @@ groups:
        summary: Nats server down (instance {{ $labels.instance }})
        description: "NATS server has been down for more than 5 minutes\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # gnatsd_varz_cpu is a gauge reporting CPU percentage (0-100 scale).
    - alert: NatsHighCpuUsage
-      expr: 'rate(gnatsd_varz_cpu[5m]) > 0.8'
+      expr: 'gnatsd_varz_cpu > 80'
      for: 5m
      labels:
        severity: warning
@ -78,7 +61,7 @@ groups:
        description: "NATS server has more than 1000 active connections\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: NatsHighJetstreamStoreUsage
-      expr: 'gnatsd_varz_jetstream_stats_storage / gnatsd_varz_jetstream_config_max_storage > 0.8'
+      expr: 'gnatsd_varz_jetstream_stats_storage / gnatsd_varz_jetstream_config_max_storage > 0.8 and gnatsd_varz_jetstream_config_max_storage > 0'
      for: 5m
      labels:
        severity: warning
@ -87,7 +70,7 @@ groups:
        description: "JetStream store usage is over 80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: NatsHighJetstreamMemoryUsage
-      expr: 'gnatsd_varz_jetstream_stats_memory / gnatsd_varz_jetstream_config_max_memory > 0.8'
+      expr: 'gnatsd_varz_jetstream_stats_memory / gnatsd_varz_jetstream_config_max_memory > 0.8 and gnatsd_varz_jetstream_config_max_memory > 0'
      for: 5m
      labels:
        severity: warning
@ -122,56 +105,20 @@ groups:
        summary: Nats too many errors (instance {{ $labels.instance }})
        description: "NATS server has encountered errors in the last 5 minutes\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: NatsJetstreamConsumersExceeded
+    - alert: NatsJetstreamAccountsExceeded
      expr: 'sum(gnatsd_varz_jetstream_stats_accounts) > 100'
      for: 5m
      labels:
        severity: warning
      annotations:
-        summary: Nats JetStream consumers exceeded (instance {{ $labels.instance }})
-        description: "JetStream has more than 100 active consumers\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: NatsFrequentAuthenticationTimeouts
-      expr: 'increase(gnatsd_varz_auth_timeout[5m]) > 5'
-      for: 5m
-      labels:
-        severity: warning
-      annotations:
-        summary: Nats frequent authentication timeouts (instance {{ $labels.instance }})
-        description: "There have been more than 5 authentication timeouts in the last 5 minutes\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: NatsMaxPayloadSizeExceeded
-      expr: 'max(gnatsd_varz_max_payload) > 1024 * 1024'
-      for: 5m
-      labels:
-        severity: critical
-      annotations:
-        summary: Nats max payload size exceeded (instance {{ $labels.instance }})
-        description: "The max payload size allowed by NATS has been exceeded (1MB)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        summary: Nats JetStream accounts exceeded (instance {{ $labels.instance }})
+        description: "JetStream has more than 100 active accounts\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: NatsLeafNodeConnectionIssue
-      expr: 'increase(gnatsd_varz_leafnodes[5m]) == 0'
-      for: 5m
-      labels:
-        severity: critical
-      annotations:
-        summary: Nats leaf node connection issue (instance {{ $labels.instance }})
-        description: "No leaf node connections have been established in the last 5 minutes\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: NatsMaxPingOperationsExceeded
-      expr: 'gnatsd_varz_ping_max > 50'
+      expr: 'gnatsd_varz_leafnodes == 0'
      for: 5m
      labels:
        severity: warning
      annotations:
-        summary: Nats max ping operations exceeded (instance {{ $labels.instance }})
-        description: "The maximum number of ping operations in NATS has exceeded 50\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: NatsWriteDeadlineExceeded
-      expr: 'gnatsd_varz_write_deadline > 10'
-      for: 5m
-      labels:
-        severity: critical
-      annotations:
-        summary: Nats write deadline exceeded (instance {{ $labels.instance }})
-        description: "The write deadline has been exceeded in NATS, indicating potential message delivery issues\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        summary: Nats leaf node connection issue (instance {{ $labels.instance }})
+        description: "No leaf node connections on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
--- a/dist/rules/patroni/embedded-exporter-patroni.yml
+++ b/dist/rules/patroni/embedded-exporter-patroni.yml
@ -5,9 +5,10 @@ groups:
  
  rules:

+    # 1m delay allows a restart without triggering an alert.
    - alert: PatroniHasNoLeader
-      expr: '(max by (scope) (patroni_master) < 1) and (max by (scope) (patroni_standby_leader) < 1)'
-      for: 0m
+      expr: '(max by (scope) (patroni_primary) < 1) and (max by (scope) (patroni_standby_leader) < 1)'
+      for: 1m
      labels:
        severity: critical
      annotations:
--- a/dist/rules/pgbouncer/spreaker-pgbouncer-exporter.yml
+++ b/dist/rules/pgbouncer/spreaker-pgbouncer-exporter.yml
@ -21,10 +21,10 @@ groups:
        severity: warning
      annotations:
        summary: PGBouncer errors (instance {{ $labels.instance }})
-        description: "PGBouncer is logging errors. This may be due to a a server restart or an admin typing commands at the pgbouncer console.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "PGBouncer is logging errors. This may be due to a server restart or an admin typing commands at the pgbouncer console.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: PgbouncerMaxConnections
-      expr: 'increase(pgbouncer_errors_count{errmsg="no more connections allowed (max_client_conn)"}[30s]) > 0'
+      expr: 'increase(pgbouncer_errors_count{errmsg="no more connections allowed (max_client_conn)"}[2m]) > 0'
      for: 0m
      labels:
        severity: critical
--- a/dist/rules/postgresql/postgres-exporter.yml
+++ b/dist/rules/postgresql/postgres-exporter.yml
@ -5,9 +5,10 @@ groups:
  
  rules:

+    # 1m delay allows a restart without triggering an alert.
    - alert: PostgresqlDown
      expr: 'pg_up == 0'
-      for: 0m
+      for: 1m
      labels:
        severity: critical
      annotations:
@ -104,24 +105,6 @@ groups:
        summary: Postgresql low XID consumption (instance {{ $labels.instance }})
        description: "Postgresql seems to be consuming transaction IDs very slowly\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: PostgresqlHighRateStatementTimeout
-      expr: 'rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3'
-      for: 0m
-      labels:
-        severity: critical
-      annotations:
-        summary: Postgresql high rate statement timeout (instance {{ $labels.instance }})
-        description: "Postgres transactions showing high rate of statement timeouts\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: PostgresqlHighRateDeadlock
-      expr: 'increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1'
-      for: 0m
-      labels:
-        severity: critical
-      annotations:
-        summary: Postgresql high rate deadlock (instance {{ $labels.instance }})
-        description: "Postgres detected deadlocks\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
    - alert: PostgresqlUnusedReplicationSlot
      expr: '(pg_replication_slots_active == 0) and (pg_replication_is_replica == 0)'
      for: 1m
@ -150,7 +133,7 @@ groups:
        description: "Postgres Database configuration change has occurred\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: PostgresqlSslCompressionActive
-      expr: 'sum(pg_stat_ssl_compression) > 0'
+      expr: 'sum by (instance) (pg_stat_ssl_compression) > 0'
      for: 0m
      labels:
        severity: warning
@ -159,7 +142,8 @@ groups:
        description: "Database allows connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # The lock count is aggregated down to the instance label, so the division is matched on (instance) only (assumes one pg_settings_* series per instance - verify for multi-target setups).
    - alert: PostgresqlTooManyLocksAcquired
-      expr: '((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20'
+      expr: '((sum by (instance) (pg_locks_count)) / on (instance) (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20'
      for: 2m
      labels:
        severity: critical
--- a/dist/rules/pulsar/embedded-exporter.yml
+++ b/dist/rules/pulsar/embedded-exporter.yml
@ -24,7 +24,7 @@ groups:
        description: "The number of subscription backlog entries is over 100k\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: PulsarTopicLargeBacklogStorageSize
-      expr: 'sum(pulsar_storage_size > 5*1024*1024*1024) by (topic)'
+      expr: 'sum(pulsar_storage_size) by (topic) > 5*1024*1024*1024'
      for: 1h
      labels:
        severity: warning
@ -33,7 +33,7 @@ groups:
        description: "The topic backlog storage size is over 5 GB\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: PulsarTopicVeryLargeBacklogStorageSize
-      expr: 'sum(pulsar_storage_size > 20*1024*1024*1024) by (topic)'
+      expr: 'sum(pulsar_storage_size) by (topic) > 20*1024*1024*1024'
      for: 1h
      labels:
        severity: critical
@ -78,7 +78,7 @@ groups:
        description: "Observing Readonly Bookies\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: PulsarHighNumberOfFunctionErrors
-      expr: 'sum((rate(pulsar_function_user_exceptions_total{}[1m]) + rate(pulsar_function_system_exceptions_total{}[1m])) > 10) by (name)'
+      expr: 'sum(rate(pulsar_function_user_exceptions_total[1m]) + rate(pulsar_function_system_exceptions_total[1m])) by (name) > 10'
      for: 1m
      labels:
        severity: critical
@ -87,7 +87,7 @@ groups:
        description: "Observing more than 10 Function errors per minute\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: PulsarHighNumberOfSinkErrors
-      expr: 'sum(rate(pulsar_sink_sink_exceptions_total{}[1m]) > 10) by (name)'
+      expr: 'sum(rate(pulsar_sink_sink_exceptions_total[1m])) by (name) > 10'
      for: 1m
      labels:
        severity: critical
--- a/dist/rules/rabbitmq/kbudde-rabbitmq-exporter.yml
+++ b/dist/rules/rabbitmq/kbudde-rabbitmq-exporter.yml
@ -5,18 +5,20 @@ groups:
  
  rules:

+    # 1m delay allows a restart without triggering an alert.
    - alert: RabbitmqDown
      expr: 'rabbitmq_up == 0'
-      for: 0m
+      for: 1m
      labels:
        severity: critical
      annotations:
        summary: RabbitMQ down (instance {{ $labels.instance }})
        description: "RabbitMQ node down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # 1m delay allows a restart without triggering an alert.
    - alert: RabbitmqClusterDown
      expr: 'sum(rabbitmq_running) < 3'
-      for: 0m
+      for: 1m
      labels:
        severity: critical
      annotations:
@ -33,13 +35,13 @@ groups:
        description: "Cluster partition\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: RabbitmqOutOfMemory
-      expr: 'rabbitmq_node_mem_used / rabbitmq_node_mem_limit * 100 > 90'
+      expr: 'rabbitmq_node_mem_used / rabbitmq_node_mem_limit * 100 > 90 and rabbitmq_node_mem_limit > 0'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: RabbitMQ out of memory (instance {{ $labels.instance }})
-        description: "Memory available for RabbmitMQ is low (< 10%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Memory available for RabbitMQ is low (< 10%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: RabbitmqTooManyConnections
      expr: 'rabbitmq_connectionsTotal > 1000'
@ -80,9 +82,10 @@ groups:
        summary: RabbitMQ slow queue consuming (instance {{ $labels.instance }})
        description: "Queue messages are consumed slowly (> 60s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # 5m delay allows a short service restart without triggering an alert.
    - alert: RabbitmqNoConsumer
      expr: 'rabbitmq_queue_consumers == 0'
-      for: 1m
+      for: 5m
      labels:
        severity: critical
      annotations:
@ -100,11 +103,11 @@ groups:
        description: "Queue should have only 1 consumer\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Indicate the exchange name in dedicated label.
-    - alert: RabbitmqUnactiveExchange
+    - alert: RabbitmqInactiveExchange
      expr: 'rate(rabbitmq_exchange_messages_published_in_total{exchange="my-exchange"}[1m]) < 5'
      for: 2m
      labels:
        severity: warning
      annotations:
-        summary: RabbitMQ unactive exchange (instance {{ $labels.instance }})
+        summary: RabbitMQ inactive exchange (instance {{ $labels.instance }})
        description: "Exchange receive less than 5 msgs per second\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
--- a/dist/rules/rabbitmq/rabbitmq-exporter.yml
+++ b/dist/rules/rabbitmq/rabbitmq-exporter.yml
@ -5,18 +5,20 @@ groups:
  
  rules:

+    # 1m delay allows a restart without triggering an alert.
    - alert: RabbitmqNodeDown
      expr: 'sum(rabbitmq_build_info) < 3'
-      for: 0m
+      for: 1m
      labels:
        severity: critical
      annotations:
        summary: RabbitMQ node down (instance {{ $labels.instance }})
        description: "Less than 3 nodes running in RabbitMQ cluster\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # 1m delay allows a restart without triggering an alert.
    - alert: RabbitmqNodeNotDistributed
      expr: 'erlang_vm_dist_node_state < 3'
-      for: 0m
+      for: 1m
      labels:
        severity: critical
      annotations:
@ -33,7 +35,7 @@ groups:
        description: "Running different version of RabbitMQ in the same cluster, can lead to failure.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: RabbitmqMemoryHigh
-      expr: 'rabbitmq_process_resident_memory_bytes / rabbitmq_resident_memory_limit_bytes * 100 > 90'
+      expr: 'rabbitmq_process_resident_memory_bytes / rabbitmq_resident_memory_limit_bytes * 100 > 90 and rabbitmq_resident_memory_limit_bytes > 0'
      for: 2m
      labels:
        severity: warning
@ -42,7 +44,7 @@ groups:
        description: "A node use more than 90% of allocated RAM\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: RabbitmqFileDescriptorsUsage
-      expr: 'rabbitmq_process_open_fds / rabbitmq_process_max_fds * 100 > 90'
+      expr: 'rabbitmq_process_open_fds / rabbitmq_process_max_fds * 100 > 90 and rabbitmq_process_max_fds > 0'
      for: 2m
      labels:
        severity: warning
@ -57,7 +59,7 @@ groups:
        severity: warning
      annotations:
        summary: RabbitMQ too many ready messages (instance {{ $labels.instance }})
-        description: "RabbitMQ too many ready messages on {{ $labels.instace }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "RabbitMQ too many ready messages on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: RabbitmqTooManyUnackMessages
      expr: 'sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000'
--- a/dist/rules/redis/oliver006-redis-exporter.yml
+++ b/dist/rules/redis/oliver006-redis-exporter.yml
@ -5,9 +5,10 @@ groups:
  
  rules:

+    # 1m delay allows a restart without triggering an alert.
    - alert: RedisDown
      expr: 'redis_up == 0'
-      for: 0m
+      for: 1m
      labels:
        severity: critical
      annotations:
@ -23,9 +24,10 @@ groups:
        summary: Redis missing master (instance {{ $labels.instance }})
        description: "Redis cluster has no node marked as master.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # 1m delay allows a restart without triggering an alert.
    - alert: RedisTooManyMasters
      expr: 'count(redis_instance_info{role="master"}) > 1'
-      for: 0m
+      for: 1m
      labels:
        severity: critical
      annotations:
@ -60,13 +62,13 @@ groups:
        description: "Changes have been detected in Redis replica connection. This can occur when replica nodes lose connection to the master and reconnect (a.k.a flapping).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: RedisMissingBackup
-      expr: 'time() - redis_rdb_last_save_timestamp_seconds > 60 * 60 * 24'
+      expr: 'time() - redis_rdb_last_save_timestamp_seconds > 60 * 60 * 48'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Redis missing backup (instance {{ $labels.instance }})
-        description: "Redis has not been backuped for 24 hours\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Redis has not been backed up for 48 hours\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # The exporter must be started with --include-system-metrics flag or REDIS_EXPORTER_INCL_SYSTEM_METRICS=true environment variable.
    - alert: RedisOutOfSystemMemory
@ -106,10 +108,10 @@ groups:
        description: "Redis instance should have more connections (> 5)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: RedisRejectedConnections
-      expr: 'increase(redis_rejected_connections_total[1m]) > 0'
+      expr: 'increase(redis_rejected_connections_total[1m]) > 5'
      for: 0m
      labels:
-        severity: critical
+        severity: warning
      annotations:
        summary: Redis rejected connections (instance {{ $labels.instance }})
        description: "Some connections to Redis has been rejected\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
--- a/dist/rules/solr/embedded-exporter.yml
+++ b/dist/rules/solr/embedded-exporter.yml
@ -30,7 +30,7 @@ groups:
        severity: critical
      annotations:
        summary: Solr replication errors (instance {{ $labels.instance }})
-        description: "Solr collection {{ $labels.collection }} has failed updates for replica {{ $labels.replica }} on {{ $labels.base_url }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Solr collection {{ $labels.collection }} has replication errors for replica {{ $labels.replica }} on {{ $labels.base_url }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: SolrLowLiveNodeCount
      expr: 'solr_collections_live_nodes < 2'
--- a/dist/rules/sql-server/ozarklake-mssql-exporter.yml
+++ b/dist/rules/sql-server/ozarklake-mssql-exporter.yml
@ -5,9 +5,10 @@ groups:
  
  rules:

+    # 1m delay allows a restart without triggering an alert.
    - alert: SqlServerDown
      expr: 'mssql_up == 0'
-      for: 0m
+      for: 1m
      labels:
        severity: critical
      annotations:
@ -15,10 +16,10 @@ groups:
        description: "SQL server instance is down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: SqlServerDeadlock
-      expr: 'increase(mssql_deadlocks[1m]) > 5'
-      for: 0m
+      expr: 'mssql_deadlocks > 5'
+      for: 1m
      labels:
        severity: warning
      annotations:
        summary: SQL Server deadlock (instance {{ $labels.instance }})
-        description: "SQL Server is having some deadlock.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "SQL Server {{ $labels.instance }} is experiencing deadlocks ({{ $value }}/s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
--- a/dist/rules/zookeeper/dabealu-zookeeper-exporter.yml
+++ b/dist/rules/zookeeper/dabealu-zookeeper-exporter.yml
@ -5,9 +5,10 @@ groups:
  
  rules:

+    # 1m delay allows a restart without triggering an alert.
    - alert: ZookeeperDown
      expr: 'zk_up == 0'
-      for: 0m
+      for: 1m
      labels:
        severity: critical
      annotations: