Publish

2026-06-21 00:47:18 +08:00 · 2026-04-06 18:38:45 +00:00 · 2026-04-06 18:38:45 +00:00 · ed1515015a
commit ed1515015a
parent 2258835c30
65 changed files with 311 additions and 241 deletions
--- a/dist/rules/apache-flink/flink-prometheus-reporter.yml
+++ b/dist/rules/apache-flink/flink-prometheus-reporter.yml
@ -35,7 +35,7 @@ groups:

    # A single restart may be normal during deployments. Adjust threshold based on restart tolerance.
    - alert: FlinkJobRestartIncreasing
-      expr: 'increase(flink_jobmanager_job_numRestarts[5m]) > 1'
+      expr: 'delta(flink_jobmanager_job_numRestarts[5m]) > 1'
      for: 5m
      labels:
        severity: warning
@ -44,7 +44,7 @@ groups:
        description: "Flink job {{ $labels.job_name }} has restarted {{ $value }} times in the last 5 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: FlinkCheckpointFailures
-      expr: 'increase(flink_jobmanager_job_numberOfFailedCheckpoints[10m]) > 1'
+      expr: 'delta(flink_jobmanager_job_numberOfFailedCheckpoints[10m]) > 1'
      for: 5m
      labels:
        severity: warning
@ -82,8 +82,9 @@ groups:
        summary: Flink task high backpressure time (instance {{ $labels.instance }})
        description: "Flink task {{ $labels.task_name }} is spending {{ $value | humanize }}ms/sec in backpressure.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # Flink TaskManagers manage their own memory pool. High JVM heap usage (outside managed memory) may indicate memory leaks or misconfiguration.
    - alert: FlinkTaskmanagerHeapMemoryHigh
-      expr: 'flink_taskmanager_Status_JVM_Memory_Heap_Used / flink_taskmanager_Status_JVM_Memory_Heap_Max > 0.9'
+      expr: 'flink_taskmanager_Status_JVM_Memory_Heap_Used / flink_taskmanager_Status_JVM_Memory_Heap_Max > 0.9 and flink_taskmanager_Status_JVM_Memory_Heap_Max > 0'
      for: 5m
      labels:
        severity: warning
@ -92,7 +93,7 @@ groups:
        description: "Flink TaskManager {{ $labels.instance }} heap memory usage is above 90%.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: FlinkJobmanagerHeapMemoryHigh
-      expr: 'flink_jobmanager_Status_JVM_Memory_Heap_Used / flink_jobmanager_Status_JVM_Memory_Heap_Max > 0.9'
+      expr: 'flink_jobmanager_Status_JVM_Memory_Heap_Used / flink_jobmanager_Status_JVM_Memory_Heap_Max > 0.9 and flink_jobmanager_Status_JVM_Memory_Heap_Max > 0'
      for: 5m
      labels:
        severity: warning
@ -100,9 +101,10 @@ groups:
        summary: Flink JobManager heap memory high (instance {{ $labels.instance }})
        description: "Flink JobManager {{ $labels.instance }} heap memory usage is above 90%.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # Flink exposes GC time as a gauge (cumulative milliseconds), so deriv() is used instead of rate().
    # Threshold: more than 100ms/sec of GC time (10% of wall clock). Adjust based on your workload.
    - alert: FlinkTaskmanagerGcTimeHigh
-      expr: 'rate(flink_taskmanager_Status_JVM_GarbageCollector_All_Time[5m]) > 100'
+      expr: 'deriv(flink_taskmanager_Status_JVM_GarbageCollector_All_Time[5m]) > 100'
      for: 5m
      labels:
        severity: warning
@ -112,7 +114,7 @@ groups:

    # Only fires for tasks that have previously received records, to avoid false positives during startup.
    - alert: FlinkNoRecordsProcessed
-      expr: 'rate(flink_taskmanager_job_task_numRecordsIn[5m]) == 0 and flink_taskmanager_job_task_numRecordsIn > 0'
+      expr: 'delta(flink_taskmanager_job_task_numRecordsIn[5m]) == 0 and flink_taskmanager_job_task_numRecordsIn > 0'
      for: 5m
      labels:
        severity: warning
--- a/dist/rules/apache/lusitaniae-apache-exporter.yml
+++ b/dist/rules/apache/lusitaniae-apache-exporter.yml
@ -27,7 +27,7 @@ groups:
      expr: 'apache_uptime_seconds_total / 60 < 1'
      for: 0m
      labels:
-        severity: warning
+        severity: info
      annotations:
        summary: Apache restart (instance {{ $labels.instance }})
        description: "Apache has just been restarted.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
--- a/dist/rules/apc-ups/apcupsd_exporter.yml
+++ b/dist/rules/apc-ups/apcupsd_exporter.yml
@ -33,7 +33,7 @@ groups:
        description: "UPS now running on battery (since {{$value | humanizeDuration}})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ApcUpsLowBatteryVoltage
-      expr: '(apcupsd_battery_volts / apcupsd_battery_nominal_volts) < 0.95'
+      expr: '(apcupsd_battery_volts / apcupsd_battery_nominal_volts) < 0.95 and apcupsd_battery_nominal_volts > 0'
      for: 0m
      labels:
        severity: warning
--- a/dist/rules/blackbox/blackbox-exporter.yml
+++ b/dist/rules/blackbox/blackbox-exporter.yml
@ -7,7 +7,7 @@ groups:

    - alert: BlackboxProbeFailed
      expr: 'probe_success == 0'
-      for: 0m
+      for: 1m
      labels:
        severity: critical
      annotations:
@ -34,7 +34,7 @@ groups:

    - alert: BlackboxProbeHttpFailure
      expr: 'probe_http_status_code <= 199 OR probe_http_status_code >= 400'
-      for: 0m
+      for: 1m
      labels:
        severity: critical
      annotations:
--- a/dist/rules/caddy/embedded-exporter.yml
+++ b/dist/rules/caddy/embedded-exporter.yml
@ -6,13 +6,13 @@ groups:
  rules:

    - alert: CaddyReverseProxyDown
-      expr: 'count(caddy_reverse_proxy_upstreams_healthy) by (upstream) == 0'
+      expr: 'caddy_reverse_proxy_upstreams_healthy == 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Caddy Reverse Proxy Down (instance {{ $labels.instance }})
-        description: "All Caddy reverse proxies are down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Caddy reverse proxy upstream {{ $labels.upstream }} is unhealthy\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CaddyHighHttp4xxErrorRateService
      expr: 'sum(rate(caddy_http_request_duration_seconds_count{code=~"4.."}[3m])) by (instance) / sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) * 100 > 5 and sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) > 0'
--- a/dist/rules/cassandra/criteo-cassandra-exporter.yml
+++ b/dist/rules/cassandra/criteo-cassandra-exporter.yml
@ -33,7 +33,7 @@ groups:
        description: "High viewwrite latency on {{ $labels.instance }} cassandra node\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CassandraAuthenticationFailures
-      expr: 'rate(cassandra_stats{name="org:apache:cassandra:metrics:client:authfailure:count"}[1m]) > 5'
+      expr: 'delta(cassandra_stats{name="org:apache:cassandra:metrics:client:authfailure:count"}[1m]) > 5'
      for: 2m
      labels:
        severity: warning
@ -97,7 +97,7 @@ groups:
        description: "Some Cassandra repair tasks are blocked\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CassandraConnectionTimeoutsTotal(criteo)
-      expr: 'rate(cassandra_stats{name="org:apache:cassandra:metrics:connection:totaltimeouts:count"}[1m]) > 5'
+      expr: 'delta(cassandra_stats{name="org:apache:cassandra:metrics:connection:totaltimeouts:count"}[1m]) > 5'
      for: 2m
      labels:
        severity: critical
@ -142,7 +142,7 @@ groups:
        description: "Read failures have occurred because too many nodes are unavailable\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CassandraClientRequestWriteFailure(criteo)
-      expr: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:write:failures:oneminuterate"} > 0'
+      expr: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:write:failures:oneminuterate"} > 0.05'
      for: 0m
      labels:
        severity: critical
@ -151,7 +151,7 @@ groups:
        description: "A lot of write failures encountered. A write failure is a non-timeout exception encountered during a write request. Examine the reason map to find to the root cause. The most common cause for this type of error is when batch sizes are too large.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CassandraClientRequestReadFailure(criteo)
-      expr: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:read:failures:oneminuterate"} > 0'
+      expr: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:read:failures:oneminuterate"} > 0.05'
      for: 0m
      labels:
        severity: critical
@ -159,11 +159,12 @@ groups:
        summary: Cassandra client request read failure (Criteo) (instance {{ $labels.instance }})
        description: "A lot of read failures encountered. A read failure is a non-timeout exception encountered during a read request. Examine the reason map to find to the root cause. The most common cause for this type of error is when batch sizes are too large.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # A low key cache hit rate increases disk I/O. Threshold is workload-dependent — adjust based on your data access patterns.
    - alert: CassandraCacheHitRateKeyCache
      expr: 'cassandra_stats{name="org:apache:cassandra:metrics:cache:keycache:hitrate:value"} < .85'
      for: 2m
      labels:
-        severity: critical
+        severity: warning
      annotations:
        summary: Cassandra cache hit rate key cache (instance {{ $labels.instance }})
        description: "Key cache hit rate is below 85%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
--- a/dist/rules/cassandra/instaclustr-cassandra-exporter.yml
+++ b/dist/rules/cassandra/instaclustr-cassandra-exporter.yml
@ -97,7 +97,7 @@ groups:
        description: "Some Cassandra client requests are unavailable to read - {{ $labels.cassandra_cluster }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CassandraClientRequestWriteFailure(instaclustr)
-      expr: 'increase(cassandra_client_request_failures_total{operation="write"}[1m]) > 0'
+      expr: 'increase(cassandra_client_request_failures_total{operation="write"}[1m]) > 5'
      for: 2m
      labels:
        severity: critical
@ -106,7 +106,7 @@ groups:
        description: "Write failures have occurred, ensure there are not too many unavailable nodes - {{ $labels.cassandra_cluster }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CassandraClientRequestReadFailure(instaclustr)
-      expr: 'increase(cassandra_client_request_failures_total{operation="read"}[1m]) > 0'
+      expr: 'increase(cassandra_client_request_failures_total{operation="read"}[1m]) > 5'
      for: 2m
      labels:
        severity: critical
--- a/dist/rules/ceph/embedded-exporter.yml
+++ b/dist/rules/ceph/embedded-exporter.yml
@ -5,9 +5,11 @@ groups:
  
  rules:

+    # ceph_health_status: 0=HEALTH_OK, 1=HEALTH_WARN, 2=HEALTH_ERR.
+    # This rule fires on any non-OK state. If needed, split it into separate rules: == 1 for a warning alert and == 2 for a critical alert.
    - alert: CephState
      expr: 'ceph_health_status != 0'
-      for: 0m
+      for: 1m
      labels:
        severity: critical
      annotations:
@ -34,15 +36,16 @@ groups:

    - alert: CephOsdDown
      expr: 'ceph_osd_up == 0'
-      for: 0m
+      for: 1m
      labels:
        severity: critical
      annotations:
        summary: Ceph OSD Down (instance {{ $labels.instance }})
        description: "Ceph Object Storage Daemon Down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # Threshold of 5000ms (5 seconds). Adjust based on your expected OSD performance.
    - alert: CephHighOsdLatency
-      expr: 'ceph_osd_perf_apply_latency_seconds > 5'
+      expr: 'ceph_osd_apply_latency_ms > 5000'
      for: 1m
      labels:
        severity: warning
@ -50,14 +53,16 @@ groups:
        summary: Ceph high OSD latency (instance {{ $labels.instance }})
        description: "Ceph Object Storage Daemon latency is high. Please check if it doesn't stuck in weird state.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: CephOsdLowSpace
-      expr: 'ceph_osd_utilization > 90'
-      for: 2m
+    # Ceph internally triggers OSD_NEARFULL based on the nearfull_ratio (default 85%).
+    # Other ceph_health_detail checks (e.g. OSD_BACKFILLFULL, OSD_FULL) can be used for more granular OSD space alerts.
+    - alert: CephOsdNearFull
+      expr: 'ceph_health_detail{name="OSD_NEARFULL"} == 1'
+      for: 5m
      labels:
        severity: warning
      annotations:
-        summary: Ceph OSD low space (instance {{ $labels.instance }})
-        description: "Ceph Object Storage Daemon is going out of space. Please add more disks.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        summary: Ceph OSD near full (instance {{ $labels.instance }})
+        description: "A Ceph OSD is dangerously full. Please add more disks.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CephOsdReweighted
      expr: 'ceph_osd_weight < 1'
@ -115,7 +120,7 @@ groups:

    - alert: CephPgUnavailable
      expr: 'ceph_pg_total - ceph_pg_active > 0'
-      for: 0m
+      for: 1m
      labels:
        severity: critical
      annotations:
--- a/dist/rules/cert-manager/embedded-exporter.yml
+++ b/dist/rules/cert-manager/embedded-exporter.yml
@ -33,9 +33,10 @@ groups:
        summary: Cert-Manager certificate not ready (instance {{ $labels.instance }})
        description: "The certificate {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is not ready to serve traffic.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    # In cert-manager 1.19+, the metric was renamed (dropped http_ prefix). Verify metric name against your version.
+    # Metric renamed in cert-manager v1.19+ (dropped the http_ prefix): certmanager_acme_client_request_count.
+    # For cert-manager < v1.19, use: certmanager_http_acme_client_request_count.
    - alert: Cert-managerHittingAcmeRateLimits
-      expr: 'sum by (host) (rate(certmanager_http_acme_client_request_count{status="429"}[5m])) > 0'
+      expr: 'sum by (host) (rate(certmanager_acme_client_request_count{status="429"}[5m])) > 0'
      for: 5m
      labels:
        severity: critical
--- a/dist/rules/cilium/embedded-exporter.yml
+++ b/dist/rules/cilium/embedded-exporter.yml
@ -45,7 +45,7 @@ groups:
        description: "Cilium agent {{ $labels.pod }} has {{ $value }} endpoint(s) in invalid state.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CiliumAgentEndpointRegenerationFailures
-      expr: 'sum(rate(cilium_endpoint_regenerations_total{outcome="fail"}[5m])) by (pod) > 0'
+      expr: 'sum(rate(cilium_endpoint_regenerations_total{outcome="fail"}[5m])) by (pod) > 0.05'
      for: 5m
      labels:
        severity: warning
@ -54,7 +54,7 @@ groups:
        description: "Cilium agent {{ $labels.pod }} is failing to regenerate endpoints. Network policy enforcement may be stale.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CiliumAgentEndpointUpdateFailure
-      expr: 'sum(rate(cilium_k8s_client_api_calls_total{method=~"(PUT|POST|PATCH)", endpoint="endpoint", return_code!~"2[0-9][0-9]"}[5m])) by (pod, method, return_code) > 0'
+      expr: 'sum(rate(cilium_k8s_client_api_calls_total{method=~"(PUT|POST|PATCH)", endpoint="endpoint", return_code!~"2[0-9][0-9]"}[5m])) by (pod, method, return_code) > 0.05'
      for: 5m
      labels:
        severity: warning
@ -63,7 +63,7 @@ groups:
        description: "Cilium agent {{ $labels.pod }} is failing K8s endpoint update API calls ({{ $labels.method }} {{ $labels.return_code }}).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CiliumAgentEndpointCreateFailure
-      expr: 'sum(rate(cilium_api_limiter_processed_requests_total{api_call=~"endpoint-create", outcome="fail"}[1m])) by (pod, api_call) > 0'
+      expr: 'sum(rate(cilium_api_limiter_processed_requests_total{api_call=~"endpoint-create", outcome="fail"}[1m])) by (pod, api_call) > 0.05'
      for: 5m
      labels:
        severity: info
@ -72,7 +72,7 @@ groups:
        description: "Cilium agent {{ $labels.pod }} is failing CNI endpoint-create calls. New pods may fail to get networking.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CiliumAgentMapOperationFailures
-      expr: 'sum(rate(cilium_bpf_map_ops_total{outcome="fail"}[5m])) by (map_name, pod) > 0'
+      expr: 'sum(rate(cilium_bpf_map_ops_total{outcome="fail"}[5m])) by (map_name, pod) > 0.05'
      for: 5m
      labels:
        severity: warning
@ -100,7 +100,7 @@ groups:
        description: "Cilium agent {{ $labels.pod }} conntrack table is full, causing packet drops. Increase CT map size or investigate connection leaks.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CiliumAgentConntrackFailedGarbageCollection
-      expr: 'sum(rate(cilium_datapath_conntrack_gc_runs_total{status="uncompleted"}[5m])) by (pod) > 0'
+      expr: 'sum(rate(cilium_datapath_conntrack_gc_runs_total{status="uncompleted"}[5m])) by (pod) > 0.05'
      for: 5m
      labels:
        severity: warning
@ -128,7 +128,7 @@ groups:
        description: "Cilium agent {{ $labels.pod }} is dropping packets due to policy denial. Verify network policies are correct.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CiliumAgentHighDropRate
-      expr: 'sum(rate(cilium_drop_count_total{reason!~"Policy denied"}[5m])) by (pod, reason) > 0'
+      expr: 'sum(rate(cilium_drop_count_total{reason!~"Policy denied"}[5m])) by (pod, reason) > 0.05'
      for: 5m
      labels:
        severity: warning
@ -146,7 +146,7 @@ groups:
        description: "Cilium agent {{ $labels.pod }} policy BPF map is above 90% utilization. New policies may fail to apply.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CiliumAgentPolicyImportErrors
-      expr: 'sum(rate(cilium_policy_change_total{outcome="fail"}[5m])) by (pod) > 0'
+      expr: 'sum(rate(cilium_policy_change_total{outcome="fail"}[5m])) by (pod) > 0.05'
      for: 5m
      labels:
        severity: warning
@ -156,7 +156,7 @@ groups:

    # Threshold of 60s is a rough default. Adjust based on cluster size and policy complexity.
    - alert: CiliumAgentPolicyImplementationDelay
-      expr: 'histogram_quantile(0.99, sum(rate(cilium_policy_implementation_delay[5m])) by (le, pod)) > 60'
+      expr: 'histogram_quantile(0.99, sum(rate(cilium_policy_implementation_delay_bucket[5m])) by (le, pod)) > 60'
      for: 5m
      labels:
        severity: warning
@ -203,7 +203,7 @@ groups:

    # Some Cilium versions may not have a status label on this metric. Verify against your Cilium version.
    - alert: CiliumOperatorIpamInterfaceCreationFailures
-      expr: 'sum(rate(cilium_operator_ipam_interface_creation_ops{status!="success"}[5m])) by () > 0'
+      expr: 'sum(rate(cilium_operator_ipam_interface_creation_ops{status!="success"}[5m])) by () > 0.05'
      for: 10m
      labels:
        severity: warning
@ -212,7 +212,7 @@ groups:
        description: "Cilium operator is failing to create IPAM network interfaces. IP allocation may be impacted.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CiliumAgentApiErrors
-      expr: 'sum(rate(cilium_agent_api_process_time_seconds_count{return_code=~"5[0-9][0-9]"}[5m])) by (pod, return_code) > 0'
+      expr: 'sum(rate(cilium_agent_api_process_time_seconds_count{return_code=~"5[0-9][0-9]"}[5m])) by (pod, return_code) > 0.05'
      for: 5m
      labels:
        severity: warning
@ -221,7 +221,7 @@ groups:
        description: "Cilium agent {{ $labels.pod }} API is returning 5xx errors ({{ $labels.return_code }}). Agent may be unhealthy.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CiliumAgentKubernetesClientErrors
-      expr: 'sum(rate(cilium_k8s_client_api_calls_total{endpoint!="metrics", return_code!~"2[0-9][0-9]"}[5m])) by (pod, endpoint, return_code) > 0'
+      expr: 'sum(rate(cilium_k8s_client_api_calls_total{endpoint!="metrics", return_code!~"2[0-9][0-9]"}[5m])) by (pod, endpoint, return_code) > 0.05'
      for: 5m
      labels:
        severity: info
@ -239,13 +239,13 @@ groups:
        description: "Cilium ClusterMesh remote cluster {{ $labels.target_cluster }} is not ready from {{ $labels.source_cluster }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CiliumClustermeshRemoteClusterFailing
-      expr: 'sum(rate(cilium_clustermesh_remote_cluster_failures[5m])) by (source_cluster, target_cluster) > 0'
+      expr: 'sum(cilium_clustermesh_remote_cluster_failures) by (source_cluster, target_cluster) > 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Cilium ClusterMesh remote cluster failing (instance {{ $labels.instance }})
-        description: "Cilium ClusterMesh connectivity to remote cluster {{ $labels.target_cluster }} from {{ $labels.source_cluster }} is failing.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Cilium ClusterMesh connectivity to remote cluster {{ $labels.target_cluster }} from {{ $labels.source_cluster }} is failing ({{ $value }} failures).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CiliumKvstoremeshRemoteClusterNotReady
      expr: 'count(cilium_kvstoremesh_remote_cluster_readiness_status < 1) by (source_cluster, target_cluster) > 0'
@ -257,16 +257,16 @@ groups:
        description: "Cilium KVStoreMesh remote cluster {{ $labels.target_cluster }} is not ready from {{ $labels.source_cluster }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CiliumKvstoremeshRemoteClusterFailing
-      expr: 'sum(rate(cilium_kvstoremesh_remote_cluster_failures[5m])) by (source_cluster, target_cluster) > 0'
+      expr: 'sum(cilium_kvstoremesh_remote_cluster_failures) by (source_cluster, target_cluster) > 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Cilium KVStoreMesh remote cluster failing (instance {{ $labels.instance }})
-        description: "Cilium KVStoreMesh remote cluster {{ $labels.target_cluster }} from {{ $labels.source_cluster }} is experiencing failures.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Cilium KVStoreMesh remote cluster {{ $labels.target_cluster }} from {{ $labels.source_cluster }} is experiencing failures ({{ $value }} failures).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CiliumKvstoremeshSyncErrors
-      expr: 'sum(rate(cilium_kvstoremesh_kvstore_sync_errors_total[5m])) by (source_cluster) > 0'
+      expr: 'sum(rate(cilium_kvstoremesh_kvstore_sync_errors_total[5m])) by (source_cluster) > 0.05'
      for: 5m
      labels:
        severity: critical
@ -275,7 +275,7 @@ groups:
        description: "Cilium KVStoreMesh from {{ $labels.source_cluster }} is experiencing kvstore sync errors.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CiliumHubbleLostEvents
-      expr: 'sum(rate(hubble_lost_events_total[5m])) by (pod) > 0'
+      expr: 'sum(rate(hubble_lost_events_total[5m])) by (pod) > 0.05'
      for: 5m
      labels:
        severity: warning
--- a/dist/rules/clickhouse/embedded-exporter.yml
+++ b/dist/rules/clickhouse/embedded-exporter.yml
@ -135,7 +135,7 @@ groups:
        description: "Access denied errors have been logged, which could indicate permission issues or unauthorized access attempts.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ClickhouseRejectedInsertQueries
-      expr: 'increase(ClickHouseProfileEvents_RejectedInserts[1m]) > 0'
+      expr: 'increase(ClickHouseProfileEvents_RejectedInserts[1m]) > 2'
      for: 1m
      labels:
        severity: warning
@ -144,7 +144,7 @@ groups:
        description: "INSERTs rejected due to too many active data parts. Reduce insert frequency.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ClickhouseDelayedInsertQueries
-      expr: 'increase(ClickHouseProfileEvents_DelayedInserts[5m]) > 0'
+      expr: 'increase(ClickHouseProfileEvents_DelayedInserts[5m]) > 10'
      for: 2m
      labels:
        severity: warning
@ -172,7 +172,7 @@ groups:
        description: "High network usage. ClickHouse network usage exceeds 100MB/s.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ClickhouseDistributedRejectedInserts
-      expr: 'increase(ClickHouseProfileEvents_DistributedRejectedInserts[5m]) > 0'
+      expr: 'increase(ClickHouseProfileEvents_DistributedRejectedInserts[5m]) > 3'
      for: 2m
      labels:
        severity: critical
--- a/dist/rules/cortex/embedded-exporter.yml
+++ b/dist/rules/cortex/embedded-exporter.yml
@ -24,23 +24,23 @@ groups:
        description: "Cortex not connected to Alertmanager (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Threshold of 0.05/s avoids firing on transient single-event spikes.
-    - alert: CortexNotificationAreBeingDropped
+    - alert: CortexNotificationsAreBeingDropped
      expr: 'rate(cortex_prometheus_notifications_dropped_total[5m]) > 0.05'
      for: 0m
      labels:
        severity: critical
      annotations:
-        summary: Cortex notification are being dropped (instance {{ $labels.instance }})
-        description: "Cortex notification are being dropped due to errors (instance {{ $labels.instance }}, {{ $value | humanize }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        summary: Cortex notifications are being dropped (instance {{ $labels.instance }})
+        description: "Cortex notifications are being dropped due to errors (instance {{ $labels.instance }}, {{ $value | humanize }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Threshold of 0.05/s avoids firing on transient single-event spikes.
-    - alert: CortexNotificationError
+    - alert: CortexNotificationErrors
      expr: 'rate(cortex_prometheus_notifications_errors_total[5m]) > 0.05'
      for: 0m
      labels:
        severity: critical
      annotations:
-        summary: Cortex notification error (instance {{ $labels.instance }})
+        summary: Cortex notification errors (instance {{ $labels.instance }})
        description: "Cortex is failing when sending alert notifications (instance {{ $labels.instance }}, {{ $value | humanize }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CortexIngesterUnhealthy
--- a/dist/rules/couchdb/gesellix-couchdb-prometheus-exporter.yml
+++ b/dist/rules/couchdb/gesellix-couchdb-prometheus-exporter.yml
@ -23,6 +23,7 @@ groups:
        summary: CouchDB atom memory usage critical (instance {{ $labels.instance }})
        description: "Atom memory usage is above 90% of limit\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # CouchDB's default max_dbs_open is 500, but this rule assumes a limit of 1000 (threshold 0.9 * 1000). Adjust the threshold to match your max_dbs_open setting.
    - alert: CouchdbOpenDatabasesCritical
      expr: 'couchdb_httpd_open_databases > 0.9 * 1000'
      for: 5m
@ -32,6 +33,7 @@ groups:
        summary: CouchDB open databases critical (instance {{ $labels.instance }})
        description: "Number of open databases exceeds 90% of node capacity\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # Adjust 65535 to match your system's file descriptor limit (ulimit -n).
    - alert: CouchdbOpenOsFilesCritical
      expr: 'couchdb_httpd_open_os_files > 0.9 * 65535'
      for: 5m
@ -159,7 +161,7 @@ groups:
        description: "CouchDB process has restarted recently\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CouchdbCriticalLogEntries
-      expr: 'increase(couchdb_server_couch_log{level=~"error|critical"}[5m]) > 0'
+      expr: 'increase(couchdb_server_couch_log{level=~"error|critical"}[5m]) > 5'
      for: 1m
      labels:
        severity: critical
--- a/dist/rules/digitalocean/digitalocean-exporter.yml
+++ b/dist/rules/digitalocean/digitalocean-exporter.yml
@ -78,7 +78,7 @@ groups:
        description: "DigitalOcean platform has {{ $value }} active incident(s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: DigitaloceanExporterCollectionErrors
-      expr: 'increase(digitalocean_errors_total[5m]) > 0'
+      expr: 'increase(digitalocean_errors_total[5m]) > 3'
      for: 5m
      labels:
        severity: warning
--- a/dist/rules/docker-containers/google-cadvisor.yml
+++ b/dist/rules/docker-containers/google-cadvisor.yml
@ -73,7 +73,7 @@ groups:
        description: "This alert rule monitors the absolute change in CPU usage within a time window and triggers an alert when the change exceeds 25%.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ContainerLowCpuUtilization
-      expr: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) < 20'
+      expr: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) < 20 and sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) > 0'
      for: 7d
      labels:
        severity: info
--- a/dist/rules/ebpf/ebpf-exporter.yml
+++ b/dist/rules/ebpf/ebpf-exporter.yml
@ -16,7 +16,7 @@ groups:
        description: "eBPF program {{ $labels.id }} failed to attach. The program is not collecting data. (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: EbpfExporterDecoderErrors
-      expr: 'rate(ebpf_exporter_decoder_errors_total[5m]) > 0'
+      expr: 'rate(ebpf_exporter_decoder_errors_total[5m]) > 0.05'
      for: 5m
      labels:
        severity: warning
--- a/dist/rules/elasticsearch/prometheus-community-elasticsearch-exporter.yml
+++ b/dist/rules/elasticsearch/prometheus-community-elasticsearch-exporter.yml
@ -142,8 +142,9 @@ groups:
        summary: Elasticsearch no new documents (instance {{ $labels.instance }})
        description: "No new documents for 10 min!\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # Threshold of 10ms (0.01s) per indexing operation is a rough default. Adjust based on your document size and cluster performance.
    - alert: ElasticsearchHighIndexingLatency
-      expr: 'rate(elasticsearch_indices_indexing_index_time_seconds_total[1m]) / rate(elasticsearch_indices_indexing_index_total[1m]) > 0.0005 and rate(elasticsearch_indices_indexing_index_total[1m]) > 0'
+      expr: 'rate(elasticsearch_indices_indexing_index_time_seconds_total[5m]) / rate(elasticsearch_indices_indexing_index_total[5m]) > 0.01 and rate(elasticsearch_indices_indexing_index_total[5m]) > 0'
      for: 10m
      labels:
        severity: warning
@ -151,6 +152,7 @@ groups:
        summary: Elasticsearch High Indexing Latency (instance {{ $labels.instance }})
        description: "The indexing latency on Elasticsearch cluster is higher than the threshold (current value: {{ $value }}s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # Threshold of 10000 ops/s is a rough default. Adjust based on your cluster capacity and expected workload.
    - alert: ElasticsearchHighIndexingRate
      expr: 'sum(rate(elasticsearch_indices_indexing_index_total[1m]))> 10000'
      for: 5m
@ -160,6 +162,7 @@ groups:
        summary: Elasticsearch High Indexing Rate (instance {{ $labels.instance }})
        description: "The indexing rate on Elasticsearch cluster is higher than the threshold.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # Threshold of 100 queries/s is very low for most production clusters. Adjust based on your expected query volume.
    - alert: ElasticsearchHighQueryRate
      expr: 'sum(rate(elasticsearch_indices_search_query_total[1m])) > 100'
      for: 5m
--- a/dist/rules/envoy/embedded-exporter.yml
+++ b/dist/rules/envoy/embedded-exporter.yml
@ -66,7 +66,7 @@ groups:
        severity: warning
      annotations:
        summary: Envoy cluster membership degraded (instance {{ $labels.instance }})
-        description: "More than 25% of members in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} are unhealthy\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Only {{ $value | printf \"%.1f\" }}% of members in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} are healthy (threshold: 75%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: EnvoyHighClusterUpstreamConnectionFailures
      expr: 'increase(envoy_cluster_upstream_cx_connect_fail[5m]) > 10'
@ -159,7 +159,7 @@ groups:
        description: "Circuit breaker is open for cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: EnvoyNoHealthyUpstream
-      expr: 'increase(envoy_cluster_upstream_cx_none_healthy[5m]) > 0'
+      expr: 'increase(envoy_cluster_upstream_cx_none_healthy[5m]) > 3'
      for: 0m
      labels:
        severity: critical
--- a/dist/rules/etcd/embedded-exporter.yml
+++ b/dist/rules/etcd/embedded-exporter.yml
@ -61,6 +61,7 @@ groups:
        summary: Etcd GRPC requests slow (instance {{ $labels.instance }})
        description: "GRPC requests slowing down, 99th percentile is over 0.15s\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # These etcd_http_* metrics are from the etcd v2 API and do not exist in etcd 3.x. Remove these rules if running etcd 3.x.
    - alert: EtcdHighNumberOfFailedHttpRequestsWarning
      expr: 'sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.01 and sum(rate(etcd_http_received_total[1m])) BY (method) > 0'
      for: 2m
@ -70,6 +71,7 @@ groups:
        summary: Etcd high number of failed HTTP requests warning (instance {{ $labels.instance }})
        description: "More than 1% HTTP failure detected in Etcd\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # These etcd_http_* metrics are from the etcd v2 API and do not exist in etcd 3.x. Remove these rules if running etcd 3.x.
    - alert: EtcdHighNumberOfFailedHttpRequestsCritical
      expr: 'sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.05 and sum(rate(etcd_http_received_total[1m])) BY (method) > 0'
      for: 2m
@ -79,6 +81,7 @@ groups:
        summary: Etcd high number of failed HTTP requests critical (instance {{ $labels.instance }})
        description: "More than 5% HTTP failure detected in Etcd\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # This etcd_http_* metric is from the etcd v2 API and does not exist in etcd 3.x. Remove this rule if running etcd 3.x.
    - alert: EtcdHttpRequestsSlow
      expr: 'histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[1m])) > 0.15'
      for: 2m
@ -89,7 +92,7 @@ groups:
        description: "HTTP requests slowing down, 99th percentile is over 0.15s\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: EtcdMemberCommunicationSlow
-      expr: 'histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) > 0.15'
+      expr: 'histogram_quantile(0.99, sum(rate(etcd_network_peer_round_trip_time_seconds_bucket[5m])) by (instance, le)) > 0.15'
      for: 2m
      labels:
        severity: warning
@ -107,7 +110,7 @@ groups:
        description: "Etcd server got {{ $value }} failed proposals in the past hour\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: EtcdHighFsyncDurations
-      expr: 'histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) > 0.5'
+      expr: 'histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) by (instance, le)) > 0.5'
      for: 2m
      labels:
        severity: warning
@ -116,7 +119,7 @@ groups:
        description: "Etcd WAL fsync duration increasing, 99th percentile is over 0.5s\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: EtcdHighCommitDurations
-      expr: 'histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[1m])) > 0.25'
+      expr: 'histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) by (instance, le)) > 0.25'
      for: 2m
      labels:
        severity: warning
--- a/dist/rules/freeswitch/znerol-freeswitch-exporter.yml
+++ b/dist/rules/freeswitch/znerol-freeswitch-exporter.yml
@ -7,12 +7,12 @@ groups:

    - alert: FreeswitchDown
      expr: 'freeswitch_up == 0'
-      for: 0m
+      for: 1m
      labels:
        severity: critical
      annotations:
        summary: Freeswitch down (instance {{ $labels.instance }})
-        description: "Freeswitch is unresponsive\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Freeswitch {{ $labels.instance }} is unresponsive.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: FreeswitchSessionsWarning
      expr: '(freeswitch_session_active * 100 / freeswitch_session_limit) > 80 and freeswitch_session_limit > 0'
--- a/dist/rules/gitlab-ci/gitaly.yml
+++ b/dist/rules/gitlab-ci/gitaly.yml
@ -5,9 +5,9 @@ groups:
  
  rules:

-    # grpc_code!="OK" includes non-error codes like NotFound, AlreadyExists. Consider filtering to specific error codes for less noise.
+    # Matches only genuine server-side error codes. A plain grpc_code!="OK" filter would also count benign codes like NotFound, AlreadyExists, and Cancelled.
    - alert: GitlabGitalyHighGrpcErrorRate
-      expr: 'sum(rate(grpc_server_handled_total{job="gitaly",grpc_code!="OK"}[5m])) / sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) * 100 > 5 and sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) > 0'
+      expr: 'sum(rate(grpc_server_handled_total{job="gitaly",grpc_code=~"Internal|Unavailable|DeadlineExceeded|ResourceExhausted|Aborted|Unknown|DataLoss"}[5m])) / sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) * 100 > 5 and sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) > 0'
      for: 5m
      labels:
        severity: warning
@ -17,7 +17,6 @@ groups:

    # ResourceExhausted errors from Gitaly mean Git operations are being rejected due to
    # concurrency limits. This directly impacts users trying to push, pull, or clone.
-    # This alert is derived from the GitLab Omnibus default rules.
    - alert: GitlabGitalyResourceExhausted
      expr: 'sum(rate(grpc_server_handled_total{job="gitaly",grpc_code="ResourceExhausted"}[5m])) / sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) * 100 > 1 and sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) > 0'
      for: 5m
@ -36,8 +35,9 @@ groups:
        summary: GitLab Gitaly high RPC latency (instance {{ $labels.instance }})
        description: "Gitaly on {{ $labels.instance }} p95 unary RPC latency exceeds 1 second ({{ $value }}s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # Brief throttling spikes are normal. Threshold of 0.1s/s (10% of CPU time throttled) filters out transient noise.
    - alert: GitlabGitalyCpuThrottled
-      expr: 'rate(gitaly_cgroup_cpu_cfs_throttled_seconds_total[5m]) > 0'
+      expr: 'rate(gitaly_cgroup_cpu_cfs_throttled_seconds_total[5m]) > 0.1'
      for: 5m
      labels:
        severity: warning
@ -46,7 +46,7 @@ groups:
        description: "Gitaly processes on {{ $labels.instance }} are being CPU throttled by cgroups.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: GitlabGitalyAuthenticationFailures
-      expr: 'increase(gitaly_authentications_total{status="failed"}[5m]) > 0'
+      expr: 'increase(gitaly_authentications_total{status="failed"}[5m]) > 3'
      for: 0m
      labels:
        severity: warning
--- a/dist/rules/gitlab-ci/gitlab-built-in-exporter.yml
+++ b/dist/rules/gitlab-ci/gitlab-built-in-exporter.yml
@ -138,7 +138,7 @@ groups:

    # This metric may not exist in all GitLab versions. Verify against your GitLab installation.
    - alert: GitlabCiPipelineFailuresIncreasing
-      expr: 'rate(gitlab_ci_pipeline_failure_reasons[5m]) > 0'
+      expr: 'deriv(gitlab_ci_pipeline_failure_reasons[5m]) > 0.05'
      for: 10m
      labels:
        severity: warning
@ -179,7 +179,7 @@ groups:
        description: "GitLab Ruby heap fragmentation on {{ $labels.instance }} is {{ $value }}. High fragmentation wastes memory.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: GitlabRackUncaughtErrors
-      expr: 'rate(rack_uncaught_errors_total[5m]) > 0'
+      expr: 'rate(rack_uncaught_errors_total[5m]) > 0.05'
      for: 5m
      labels:
        severity: warning
--- a/dist/rules/golang/golang-exporter.yml
+++ b/dist/rules/golang/golang-exporter.yml
@ -57,10 +57,10 @@ groups:
        summary: Go heap objects count high (instance {{ $labels.instance }})
        description: "Go heap has too many live objects (> 10M), high GC pressure\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    # go_memstats_gc_cpu_fraction is deprecated since Go 1.20 and may return 0 in newer versions.
-    # Consider using runtime/metrics-based alternatives if running Go >= 1.20.
+    # rate(go_gc_duration_seconds_sum) approximates the fraction of wall-clock time spent in GC stop-the-world pauses.
+    # This replaces go_memstats_gc_cpu_fraction which was removed in client_golang v1.12+.
    - alert: GoGcCpuFractionHigh
-      expr: 'go_memstats_gc_cpu_fraction > 0.05'
+      expr: 'rate(go_gc_duration_seconds_sum[5m]) > 0.05'
      for: 5m
      labels:
        severity: warning
@ -68,23 +68,27 @@ groups:
        summary: Go GC CPU fraction high (instance {{ $labels.instance }})
        description: "Go GC is consuming too much CPU (> 5%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # A threshold of 100/s only catches catastrophic leaks (30k goroutines in 5m). 10/s catches gradual leaks (~3k in 5m).
+    # Adjust based on your application's expected concurrency patterns.
    - alert: GoGoroutineSpike
-      expr: 'deriv(go_goroutines[5m]) > 100'
+      expr: 'deriv(go_goroutines[5m]) > 10'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Go goroutine spike (instance {{ $labels.instance }})
-        description: "Go goroutine count is growing rapidly\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Go goroutine count is growing rapidly ({{ $value | printf \"%.0f\" }} goroutines/s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: GoHeapFragmentation
-      expr: 'go_memstats_heap_idle_bytes / go_memstats_heap_sys_bytes > 0.9'
-      for: 5m
+    # Alerts when heap in-use grows by more than 10MB/s, measured as the slope (deriv) over a 10-minute window.
+    # Adjust threshold based on your workload.
+    - alert: GoHeapIn-useGrowing
+      expr: 'deriv(go_memstats_heap_inuse_bytes[10m]) > 1e7'
+      for: 0m
      labels:
        severity: warning
      annotations:
-        summary: Go heap fragmentation (instance {{ $labels.instance }})
-        description: "Go heap has high idle ratio (> 90%), indicating memory fragmentation\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        summary: Go heap in-use growing (instance {{ $labels.instance }})
+        description: "Go heap in-use memory is growing steadily, potential memory leak or under-sized heap\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: GoMemoryLeak
      expr: 'rate(go_memstats_alloc_bytes_total[5m]) > 1e9'
--- a/dist/rules/grafana-mimir/embedded-exporter.yml
+++ b/dist/rules/grafana-mimir/embedded-exporter.yml
@ -178,8 +178,9 @@ groups:
        summary: Mimir distributor inflight requests high (instance {{ $labels.instance }})
        description: "Mimir distributor {{ $labels.instance }} is using {{ printf \"%.0f\" $value }}% of its inflight push requests limit.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # Threshold of 0.05/s avoids firing on transient single-event spikes.
    - alert: MimirIngesterTsdbHeadCompactionFailed
-      expr: 'rate(cortex_ingester_tsdb_compactions_failed_total[5m]) > 0'
+      expr: 'rate(cortex_ingester_tsdb_compactions_failed_total[5m]) > 0.05'
      for: 15m
      labels:
        severity: critical
@ -187,26 +188,29 @@ groups:
        summary: Mimir ingester TSDB head compaction failed (instance {{ $labels.instance }})
        description: "Mimir ingester {{ $labels.instance }} is failing to compact TSDB head ({{ $value | humanize }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # Threshold of 0.05/s avoids firing on transient single-event spikes.
    - alert: MimirIngesterTsdbHeadTruncationFailed
-      expr: 'rate(cortex_ingester_tsdb_head_truncations_failed_total[5m]) > 0'
-      for: 0m
+      expr: 'rate(cortex_ingester_tsdb_head_truncations_failed_total[5m]) > 0.05'
+      for: 15m
      labels:
        severity: critical
      annotations:
        summary: Mimir ingester TSDB head truncation failed (instance {{ $labels.instance }})
        description: "Mimir ingester {{ $labels.instance }} is failing to truncate TSDB head ({{ $value | humanize }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # Threshold of 0.05/s avoids firing on transient single-event spikes.
    - alert: MimirIngesterTsdbCheckpointCreationFailed
-      expr: 'rate(cortex_ingester_tsdb_checkpoint_creations_failed_total[5m]) > 0'
-      for: 0m
+      expr: 'rate(cortex_ingester_tsdb_checkpoint_creations_failed_total[5m]) > 0.05'
+      for: 15m
      labels:
        severity: critical
      annotations:
        summary: Mimir ingester TSDB checkpoint creation failed (instance {{ $labels.instance }})
        description: "Mimir ingester {{ $labels.instance }} is failing to create TSDB checkpoints ({{ $value | humanize }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # Threshold of 0.05/s avoids firing on transient single-event spikes.
    - alert: MimirIngesterTsdbCheckpointDeletionFailed
-      expr: 'rate(cortex_ingester_tsdb_checkpoint_deletions_failed_total[5m]) > 0'
+      expr: 'rate(cortex_ingester_tsdb_checkpoint_deletions_failed_total[5m]) > 0.05'
      for: 0m
      labels:
        severity: critical
@ -214,8 +218,9 @@ groups:
        summary: Mimir ingester TSDB checkpoint deletion failed (instance {{ $labels.instance }})
        description: "Mimir ingester {{ $labels.instance }} is failing to delete TSDB checkpoints ({{ $value | humanize }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # Threshold of 0.05/s avoids firing on transient single-event spikes.
    - alert: MimirIngesterTsdbWalTruncationFailed
-      expr: 'rate(cortex_ingester_tsdb_wal_truncations_failed_total[5m]) > 0'
+      expr: 'rate(cortex_ingester_tsdb_wal_truncations_failed_total[5m]) > 0.05'
      for: 0m
      labels:
        severity: warning
@ -223,8 +228,9 @@ groups:
        summary: Mimir ingester TSDB WAL truncation failed (instance {{ $labels.instance }})
        description: "Mimir ingester {{ $labels.instance }} is failing to truncate TSDB WAL ({{ $value | humanize }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # Threshold of 0.05/s avoids firing on transient single-event spikes.
    - alert: MimirIngesterTsdbWalWritesFailed
-      expr: 'rate(cortex_ingester_tsdb_wal_writes_failed_total[1m]) > 0'
+      expr: 'rate(cortex_ingester_tsdb_wal_writes_failed_total[1m]) > 0.05'
      for: 3m
      labels:
        severity: critical
@ -232,7 +238,7 @@ groups:
        summary: Mimir ingester TSDB WAL writes failed (instance {{ $labels.instance }})
        description: "Mimir ingester {{ $labels.instance }} is failing to write to TSDB WAL ({{ $value | humanize }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    # Threshold aligned with official Mimir mixin (30 minutes).
+    # Threshold of 30 minutes. Adjust based on your sync interval.
    - alert: MimirStoreGatewayHasNotSyncedBucket
      expr: '(time() - cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 1800) and cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 0'
      for: 5m
@ -240,7 +246,7 @@ groups:
        severity: critical
      annotations:
        summary: Mimir store gateway has not synced bucket (instance {{ $labels.instance }})
-        description: "Mimir store-gateway {{ $labels.instance }} has not synced the bucket for more than 10 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Mimir store-gateway {{ $labels.instance }} has not synced the bucket for more than 30 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: MimirStoreGatewayNoSyncedTenants
      expr: '(min by (instance, job) (cortex_bucket_stores_tenants_synced{component="store-gateway"}) == 0) and on (instance) (cortex_bucket_stores_tenants_synced{component="store-gateway"} offset 1h > 0)'
@ -287,8 +293,9 @@ groups:
        summary: Mimir compactor has consecutive failures (instance {{ $labels.instance }})
        description: "Mimir compactor {{ $labels.instance }} has had {{ $value }} compaction failures in the last 2 hours.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # cortex_compactor_disk_out_of_space_errors_total is declared as a gauge by Mimir despite the _total suffix, so delta() is used instead of increase().
    - alert: MimirCompactorHasRunOutOfDiskSpace
-      expr: 'increase(cortex_compactor_disk_out_of_space_errors_total[24h]) >= 1'
+      expr: 'delta(cortex_compactor_disk_out_of_space_errors_total[24h]) >= 1'
      for: 0m
      labels:
        severity: critical
@ -305,7 +312,7 @@ groups:
        summary: Mimir compactor has not uploaded blocks (instance {{ $labels.instance }})
        description: "Mimir compactor {{ $labels.instance }} has not uploaded any block in the last 24 hours.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    # Using 24h window per official mixin — compaction skips are rare events.
+    # Using a 24h window as compaction skips are rare events.
    - alert: MimirCompactorSkippedBlocks
      expr: 'increase(cortex_compactor_blocks_marked_for_no_compaction_total[24h]) > 0'
      for: 5m
@ -352,8 +359,9 @@ groups:
        summary: Mimir ruler failed ring check (instance {{ $labels.instance }})
        description: "Mimir ruler {{ $labels.job }} is failing ring checks ({{ $value | humanize }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # Threshold of 0.05/s avoids firing on transient single-event spikes.
    - alert: MimirAlertmanagerSyncConfigsFailing
-      expr: 'rate(cortex_alertmanager_sync_configs_failed_total[5m]) > 0'
+      expr: 'rate(cortex_alertmanager_sync_configs_failed_total[5m]) > 0.05'
      for: 30m
      labels:
        severity: critical
@ -361,8 +369,9 @@ groups:
        summary: Mimir alertmanager sync configs failing (instance {{ $labels.instance }})
        description: "Mimir alertmanager {{ $labels.job }} is failing to sync configs ({{ $value | humanize }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # Threshold of 0.05/s avoids firing on transient single-event spikes.
    - alert: MimirAlertmanagerRingCheckFailing
-      expr: 'rate(cortex_alertmanager_ring_check_errors_total[5m]) > 0'
+      expr: 'rate(cortex_alertmanager_ring_check_errors_total[5m]) > 0.05'
      for: 10m
      labels:
        severity: critical
@ -370,8 +379,9 @@ groups:
        summary: Mimir alertmanager ring check failing (instance {{ $labels.instance }})
        description: "Mimir alertmanager {{ $labels.job }} is failing ring checks ({{ $value | humanize }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # Threshold of 0.05/s avoids firing on transient single-event spikes.
    - alert: MimirAlertmanagerStateMergeFailing
-      expr: 'rate(cortex_alertmanager_partial_state_merges_failed_total[5m]) > 0'
+      expr: 'rate(cortex_alertmanager_partial_state_merges_failed_total[5m]) > 0.05'
      for: 10m
      labels:
        severity: critical
@ -379,8 +389,9 @@ groups:
        summary: Mimir alertmanager state merge failing (instance {{ $labels.instance }})
        description: "Mimir alertmanager {{ $labels.job }} is failing to merge state updates ({{ $value | humanize }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # Threshold of 0.05/s avoids firing on transient single-event spikes.
    - alert: MimirAlertmanagerReplicationFailing
-      expr: 'rate(cortex_alertmanager_state_replication_failed_total[5m]) > 0'
+      expr: 'rate(cortex_alertmanager_state_replication_failed_total[5m]) > 0.05'
      for: 10m
      labels:
        severity: critical
@ -388,8 +399,9 @@ groups:
        summary: Mimir alertmanager replication failing (instance {{ $labels.instance }})
        description: "Mimir alertmanager {{ $labels.job }} is failing to replicate state ({{ $value | humanize }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # Threshold of 0.05/s avoids firing on transient single-event spikes.
    - alert: MimirAlertmanagerPersistStateFailing
-      expr: 'rate(cortex_alertmanager_state_persist_failed_total[15m]) > 0'
+      expr: 'rate(cortex_alertmanager_state_persist_failed_total[15m]) > 0.05'
      for: 1h
      labels:
        severity: critical
--- a/dist/rules/grafana-tempo/embedded-exporter.yml
+++ b/dist/rules/grafana-tempo/embedded-exporter.yml
@ -117,7 +117,7 @@ groups:
        summary: Tempo compaction too many outstanding blocks warning (instance {{ $labels.instance }})
        description: "There are too many outstanding compaction blocks for {{ $labels.instance }}. Consider increasing compactor resources.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    # Official Tempo mixin normalizes by backend-worker count. Adjust threshold based on your compactor configuration.
+    # Threshold of 250 blocks per compactor instance. Normalize by backend-worker count if needed. Adjust based on your environment.
    - alert: TempoCompactionTooManyOutstandingBlocksCritical
      expr: 'sum by (instance) (tempodb_compaction_outstanding_blocks) > 250'
      for: 24h
@ -127,8 +127,9 @@ groups:
        summary: Tempo compaction too many outstanding blocks critical (instance {{ $labels.instance }})
        description: "There are too many outstanding compaction blocks for {{ $labels.instance }}. Increase compactor resources immediately.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # Threshold of 0.05/s avoids firing on transient single-event spikes.
    - alert: TempoDistributorUsageTrackerErrors
-      expr: 'sum by (job, reason) (rate(tempo_distributor_usage_tracker_errors_total[5m])) > 0'
+      expr: 'sum by (job, reason) (rate(tempo_distributor_usage_tracker_errors_total[5m])) > 0.05'
      for: 30m
      labels:
        severity: critical
@ -137,7 +138,7 @@ groups:
        description: "Tempo distributor usage tracker errors for {{ $labels.job }} at {{ $value | humanize }}/s (reason {{ $labels.reason }}).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: TempoMetricsGeneratorProcessorUpdatesFailing
-      expr: 'sum by (job) (increase(tempo_metrics_generator_active_processors_update_failed_total[5m])) > 0'
+      expr: 'sum by (job) (increase(tempo_metrics_generator_active_processors_update_failed_total[5m])) > 2'
      for: 15m
      labels:
        severity: critical
@ -146,7 +147,7 @@ groups:
        description: "Tempo metrics generator processor updates are failing for {{ $labels.job }} ({{ $value }} failures in 5m).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: TempoMetricsGeneratorServiceGraphsDroppingSpans
-      expr: '100 * sum by (job) (rate(tempo_metrics_generator_processor_service_graphs_dropped_spans[5m])) / sum by (job) (rate(tempo_metrics_generator_spans_received_total[5m])) > 0.5 and sum by (job) (rate(tempo_metrics_generator_spans_received_total[5m])) > 0'
+      expr: '100 * sum by (job) (rate(tempo_metrics_generator_processor_service_graphs_dropped_spans_total[5m])) / sum by (job) (rate(tempo_metrics_generator_spans_received_total[5m])) > 0.5 and sum by (job) (rate(tempo_metrics_generator_spans_received_total[5m])) > 0'
      for: 15m
      labels:
        severity: warning
--- a/dist/rules/graph-node/embedded-exporter.yml
+++ b/dist/rules/graph-node/embedded-exporter.yml
@ -41,6 +41,7 @@ groups:
        summary: Provider failed because get genesis timeout (instance {{ $labels.instance }})
        description: "Timeout to get genesis for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # Threshold of 10ms. Adjust based on your expected database latency.
    - alert: StoreConnectionSlow
      expr: 'store_connection_wait_time_ms > 10'
      for: 0m
@ -50,6 +51,7 @@ groups:
        summary: Store connection slow (instance {{ $labels.instance }})
        description: "Store connection is too slow to `{{$labels.pool}}` pool, `{{$labels.shard}}` shard in Graph node `{{$labels.instance}}`\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # Threshold of 20ms. Adjust based on your expected database latency.
    - alert: StoreConnectionVerySlow
      expr: 'store_connection_wait_time_ms > 20'
      for: 0m
--- a/dist/rules/hadoop/jmx_exporter.yml
+++ b/dist/rules/hadoop/jmx_exporter.yml
@ -5,6 +5,9 @@ groups:
  
  rules:

+    # When targets are managed via service discovery, a disappeared target goes stale rather than reporting up==0,
+    # so this alert may not fire. Prefer application-level availability metrics if available.
+    # Rename job="hadoop-namenode" to match the actual job name in your Prometheus scrape config.
    - alert: HadoopNameNodeDown
      expr: 'up{job="hadoop-namenode"} == 0'
      for: 5m
@ -14,6 +17,9 @@ groups:
        summary: Hadoop Name Node Down (instance {{ $labels.instance }})
        description: "The Hadoop NameNode service is unavailable.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # When targets are managed via service discovery, a disappeared target goes stale rather than reporting up==0,
+    # so this alert may not fire. Prefer application-level availability metrics if available.
+    # Rename job="hadoop-resourcemanager" to match the actual job name in your Prometheus scrape config.
    - alert: HadoopResourceManagerDown
      expr: 'up{job="hadoop-resourcemanager"} == 0'
      for: 5m
@ -51,7 +57,7 @@ groups:
        description: "There is an unusually high number of MapReduce task failures.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HadoopResourceManagerMemoryHigh
-      expr: 'hadoop_resourcemanager_memory_bytes / hadoop_resourcemanager_memory_max_bytes > 0.8'
+      expr: 'hadoop_resourcemanager_memory_bytes / hadoop_resourcemanager_memory_max_bytes > 0.8 and hadoop_resourcemanager_memory_max_bytes > 0'
      for: 15m
      labels:
        severity: warning
@ -78,7 +84,7 @@ groups:
        description: "The HBase cluster has an unusually high number of regions.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HadoopHbaseRegionServerHeapLow
-      expr: 'hadoop_hbase_region_server_heap_bytes / hadoop_hbase_region_server_max_heap_bytes > 0.8'
+      expr: 'hadoop_hbase_region_server_heap_bytes / hadoop_hbase_region_server_max_heap_bytes > 0.8 and hadoop_hbase_region_server_max_heap_bytes > 0'
      for: 10m
      labels:
        severity: warning
--- a/dist/rules/haproxy/embedded-exporter-v2.yml
+++ b/dist/rules/haproxy/embedded-exporter-v2.yml
@ -12,7 +12,7 @@ groups:
        severity: critical
      annotations:
        summary: HAProxy high HTTP 4xx error rate backend (instance {{ $labels.instance }})
-        description: "Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.proxy }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HaproxyHighHttp5xxErrorRateBackend
      expr: '((sum by (proxy) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 and sum by (proxy) (rate(haproxy_server_http_responses_total[1m])) > 0'
@ -21,7 +21,7 @@ groups:
        severity: critical
      annotations:
        summary: HAProxy high HTTP 5xx error rate backend (instance {{ $labels.instance }})
-        description: "Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.proxy }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HaproxyHighHttp4xxErrorRateServer
      expr: '((sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0'
@ -57,7 +57,7 @@ groups:
        severity: critical
      annotations:
        summary: HAProxy backend connection errors (instance {{ $labels.instance }})
-        description: "Too many connection errors to {{ $labels.fqdn }}/{{ $labels.backend }} backend (> 100 req/s). Request throughput may be too high.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Too many connection errors to {{ $labels.proxy }} backend (> 100 req/s). Request throughput may be too high.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HaproxyServerConnectionErrors
      expr: '(sum by (proxy) (rate(haproxy_server_connection_errors_total[1m]))) > 100'
@ -66,10 +66,10 @@ groups:
        severity: critical
      annotations:
        summary: HAProxy server connection errors (instance {{ $labels.instance }})
-        description: "Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Too many connection errors to {{ $labels.proxy }} (> 100 req/s). Request throughput may be too high.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HaproxyBackendMaxActiveSession>80%
-      expr: '((haproxy_backend_current_sessions >0) * 100) / (haproxy_backend_limit_sessions > 0) > 80'
+      expr: '(haproxy_backend_current_sessions / haproxy_backend_limit_sessions * 100) > 80 and haproxy_backend_limit_sessions > 0'
      for: 2m
      labels:
        severity: warning
@ -94,7 +94,7 @@ groups:
        severity: warning
      annotations:
        summary: HAProxy HTTP slowing down (instance {{ $labels.instance }})
-        description: "Average request time is increasing - {{ $value | printf \"%.2f\"}}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "HAProxy backend max total time is above 1s on {{ $labels.proxy }} - {{ $value | printf \"%.2f\"}}s\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HaproxyRetryHigh
      expr: 'sum by (proxy) (rate(haproxy_backend_retry_warnings_total[1m])) > 10'
@ -124,8 +124,8 @@ groups:
        description: "HAProxy is blocking requests for security reason\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HaproxyServerHealthcheckFailure
-      expr: 'increase(haproxy_server_check_failures_total[1m]) > 0'
-      for: 1m
+      expr: 'increase(haproxy_server_check_failures_total[1m]) > 2'
+      for: 0m
      labels:
        severity: warning
      annotations:
--- a/dist/rules/haproxy/haproxy-exporter-v1.yml
+++ b/dist/rules/haproxy/haproxy-exporter-v1.yml
@ -15,22 +15,22 @@ groups:
        description: "HAProxy down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HaproxyHighHttp4xxErrorRateBackend(v1)
-      expr: 'sum by (backend) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 0'
+      expr: 'sum by (backend) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) * 100 > 5 and sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 0'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: HAProxy high HTTP 4xx error rate backend (v1) (instance {{ $labels.instance }})
-        description: "Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.backend }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HaproxyHighHttp5xxErrorRateBackend(v1)
-      expr: 'sum by (backend) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 0'
+      expr: 'sum by (backend) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) * 100 > 5 and sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 0'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: HAProxy high HTTP 5xx error rate backend (v1) (instance {{ $labels.instance }})
-        description: "Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.backend }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HaproxyHighHttp4xxErrorRateServer(v1)
      expr: 'sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0'
@ -66,7 +66,7 @@ groups:
        severity: critical
      annotations:
        summary: HAProxy backend connection errors (v1) (instance {{ $labels.instance }})
-        description: "Too many connection errors to {{ $labels.fqdn }}/{{ $labels.backend }} backend (> 100 req/s). Request throughput may be too high.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Too many connection errors to {{ $labels.backend }} backend (> 100 req/s). Request throughput may be too high.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HaproxyServerConnectionErrors(v1)
      expr: 'sum by (server) (rate(haproxy_server_connection_errors_total[1m])) > 100'
@ -84,7 +84,7 @@ groups:
        severity: warning
      annotations:
        summary: HAProxy backend max active session (instance {{ $labels.instance }})
-        description: "HAproxy backend {{ $labels.fqdn }}/{{ $labels.backend }} is reaching session limit (> 80%).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "HAProxy backend {{ $labels.backend }} is reaching session limit (> 80%).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HaproxyPendingRequests(v1)
      expr: 'sum by (backend) (haproxy_backend_current_queue) > 0'
@ -93,7 +93,7 @@ groups:
        severity: warning
      annotations:
        summary: HAProxy pending requests (v1) (instance {{ $labels.instance }})
-        description: "Some HAProxy requests are pending on {{ $labels.fqdn }}/{{ $labels.backend }} backend\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Some HAProxy requests are pending on {{ $labels.backend }} backend\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HaproxyHttpSlowingDown(v1)
      expr: 'avg by (backend) (haproxy_backend_http_total_time_average_seconds) > 1'
@ -111,7 +111,7 @@ groups:
        severity: warning
      annotations:
        summary: HAProxy retry high (v1) (instance {{ $labels.instance }})
-        description: "High rate of retry on {{ $labels.fqdn }}/{{ $labels.backend }} backend\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "High rate of retry on {{ $labels.backend }} backend\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HaproxyBackendDown
      expr: 'haproxy_backend_up == 0'
@ -141,8 +141,8 @@ groups:
        description: "HAProxy is blocking requests for security reason\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HaproxyServerHealthcheckFailure(v1)
-      expr: 'increase(haproxy_server_check_failures_total[1m]) > 0'
-      for: 1m
+      expr: 'increase(haproxy_server_check_failures_total[1m]) > 2'
+      for: 0m
      labels:
        severity: warning
      annotations:
--- a/dist/rules/hashicorp-vault/embedded-exporter.yml
+++ b/dist/rules/hashicorp-vault/embedded-exporter.yml
@ -7,7 +7,7 @@ groups:

    - alert: VaultSealed
      expr: 'vault_core_unsealed == 0'
-      for: 0m
+      for: 1m
      labels:
        severity: critical
      annotations:
@ -21,7 +21,7 @@ groups:
        severity: warning
      annotations:
        summary: Vault too many pending tokens (instance {{ $labels.instance }})
-        description: "Too many pending tokens {{ $labels.instance }}: {{ $value | printf \"%.2f\"}}%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Too many pending tokens on {{ $labels.instance }}: {{ $value }} tokens created but not yet stored.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: VaultTooManyInfinityTokens
      expr: 'vault_token_count_by_ttl{creation_ttl="+Inf"} > 3'
@ -30,13 +30,13 @@ groups:
        severity: warning
      annotations:
        summary: Vault too many infinity tokens (instance {{ $labels.instance }})
-        description: "Too many infinity tokens {{ $labels.instance }}: {{ $value | printf \"%.2f\"}}%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Too many non-expiring tokens on {{ $labels.instance }}: {{ $value }} tokens with infinite TTL.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: VaultClusterHealth
-      expr: 'sum(vault_core_active) / count(vault_core_active) <= 0.5'
+      expr: 'sum(vault_core_active) / count(vault_core_active) <= 0.5 and count(vault_core_active) > 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Vault cluster health (instance {{ $labels.instance }})
-        description: "Vault cluster is not healthy {{ $labels.instance }}: {{ $value | printf \"%.2f\"}}%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Vault cluster is not healthy: only {{ $value | humanizePercentage }} of nodes are active.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
--- a/dist/rules/host-and-hardware/node-exporter.yml
+++ b/dist/rules/host-and-hardware/node-exporter.yml
@ -14,8 +14,9 @@ groups:
        summary: Host out of memory (instance {{ $labels.instance }})
        description: "Node memory is filling up (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # node_vmstat_pgmajfault is exposed as untyped/gauge by node_exporter (from /proc/vmstat), so deriv() is used instead of rate().
    - alert: HostMemoryUnderMemoryPressure
-      expr: '(rate(node_vmstat_pgmajfault[5m]) > 1000)'
+      expr: '(deriv(node_vmstat_pgmajfault[5m]) > 1000)'
      for: 0m
      labels:
        severity: warning
@ -173,13 +174,13 @@ groups:
        severity: warning
      annotations:
        summary: Host unusual disk IO (instance {{ $labels.instance }})
-        description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities. Check storage for issues.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # x2 context switches is an arbitrary number.
    # The alert threshold depends on the nature of the application.
    # Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
    - alert: HostContextSwitchingHigh
-      expr: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2'
+      expr: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2 and rate(node_context_switches_total[1d]) > 0'
      for: 0m
      labels:
        severity: warning
@ -223,7 +224,7 @@ groups:
        summary: Host node overtemperature alarm (instance {{ $labels.instance }})
        description: "Physical node temperature alarm triggered\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    # Uses ignoring(state) to handle additional labels on node_md_disks. Matches the official node-exporter mixin.
+    # Uses ignoring(state) to handle additional labels on node_md_disks.
    - alert: HostSoftwareRaidInsufficientDrives
      expr: '((node_md_disks_required - ignoring(state) node_md_disks{state="active"}) > 0)'
      for: 0m
@ -253,7 +254,7 @@ groups:

    # When a machine runs out of memory, the node exporter can become unresponsive for several minutes. Even if the system takes 15–20 minutes to recover, the alert should still trigger.
    - alert: HostOomKillDetected
-      expr: '(increase(node_vmstat_oom_kill[30m]) > 0)'
+      expr: '(delta(node_vmstat_oom_kill[30m]) > 0)'
      for: 0m
      labels:
        severity: warning
@ -268,7 +269,7 @@ groups:
        severity: info
      annotations:
        summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
-        description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last minute.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HostEdacUncorrectableErrorsDetected
      expr: '(node_edac_uncorrectable_errors_total > 0)'
@ -277,7 +278,7 @@ groups:
        severity: warning
      annotations:
        summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
-        description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HostNetworkReceiveErrors
      expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) and rate(node_network_receive_packets_total[2m]) > 0'
--- a/dist/rules/istio/embedded-exporter.yml
+++ b/dist/rules/istio/embedded-exporter.yml
@ -12,17 +12,18 @@ groups:
        severity: warning
      annotations:
        summary: Istio Kubernetes gateway availability drop (instance {{ $labels.instance }})
-        description: "Gateway pods have dropped. Inbound traffic will likely be affected.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Istio ingress gateway has only {{ $value }} available pod(s). Inbound traffic will likely be affected.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: IstioPilotHighTotalRequestRate
+    - alert: IstioPilotHighPushErrorRate
      expr: 'sum(rate(pilot_xds_push_errors[1m])) / sum(rate(pilot_xds_pushes[1m])) * 100 > 5 and sum(rate(pilot_xds_pushes[1m])) > 0'
      for: 1m
      labels:
        severity: warning
      annotations:
-        summary: Istio Pilot high total request rate (instance {{ $labels.instance }})
+        summary: Istio Pilot high push error rate (instance {{ $labels.instance }})
        description: "Number of Istio Pilot push errors is too high (> 5%). Envoy sidecars might have outdated configuration.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # Mixer was deprecated in Istio 1.5 and removed in Istio 1.8. This alert only applies to Istio < 1.8.
    - alert: IstioMixerPrometheusDispatchesLow
      expr: 'sum(rate(mixer_runtime_dispatches_total{adapter=~"prometheus"}[1m])) < 180'
      for: 1m
@ -32,6 +33,7 @@ groups:
        summary: Istio Mixer Prometheus dispatches low (instance {{ $labels.instance }})
        description: "Number of Mixer dispatches to Prometheus is too low. Istio metrics might not be being exported properly.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # Threshold of 1000 req/s is a rough default. Adjust to your expected peak traffic.
    - alert: IstioHighTotalRequestRate
      expr: 'sum(rate(istio_requests_total{reporter="destination"}[5m])) > 1000'
      for: 2m
@ -39,8 +41,9 @@ groups:
        severity: warning
      annotations:
        summary: Istio high total request rate (instance {{ $labels.instance }})
-        description: "Global request rate in the service mesh is unusually high.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Global request rate in the service mesh is unusually high ({{ $value | printf \"%.2f\" }} req/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # Threshold of 100 req/s is a rough default. Adjust to your expected baseline traffic. This alert may fire on startup or in low-traffic environments.
    - alert: IstioLowTotalRequestRate
      expr: 'sum(rate(istio_requests_total{reporter="destination"}[5m])) < 100'
      for: 2m
@ -48,7 +51,7 @@ groups:
        severity: warning
      annotations:
        summary: Istio low total request rate (instance {{ $labels.instance }})
-        description: "Global request rate in the service mesh is unusually low.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Global request rate in the service mesh is unusually low ({{ $value | printf \"%.2f\" }} req/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: IstioHigh4xxErrorRate
      expr: 'sum(rate(istio_requests_total{reporter="destination", response_code=~"4.*"}[5m])) / sum(rate(istio_requests_total{reporter="destination"}[5m])) * 100 > 5 and sum(rate(istio_requests_total{reporter="destination"}[5m])) > 0'
@ -57,7 +60,7 @@ groups:
        severity: warning
      annotations:
        summary: Istio high 4xx error rate (instance {{ $labels.instance }})
-        description: "High percentage of HTTP 4xx responses in Istio (> 5%).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "High percentage of HTTP 4xx responses in Istio ({{ $value | printf \"%.1f\" }}% > 5%).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: IstioHigh5xxErrorRate
      expr: 'sum(rate(istio_requests_total{reporter="destination", response_code=~"5.*"}[5m])) / sum(rate(istio_requests_total{reporter="destination"}[5m])) * 100 > 5 and sum(rate(istio_requests_total{reporter="destination"}[5m])) > 0'
@ -66,7 +69,7 @@ groups:
        severity: warning
      annotations:
        summary: Istio high 5xx error rate (instance {{ $labels.instance }})
-        description: "High percentage of HTTP 5xx responses in Istio (> 5%).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "High percentage of HTTP 5xx responses in Istio ({{ $value | printf \"%.1f\" }}% > 5%).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: IstioHighRequestLatency
      expr: 'rate(istio_request_duration_milliseconds_sum{reporter="destination"}[1m]) / rate(istio_request_duration_milliseconds_count{reporter="destination"}[1m]) > 100 and rate(istio_request_duration_milliseconds_count{reporter="destination"}[1m]) > 0'
@ -75,22 +78,22 @@ groups:
        severity: warning
      annotations:
        summary: Istio high request latency (instance {{ $labels.instance }})
-        description: "Istio average requests execution is longer than 100ms.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Istio average request duration is {{ $value }}ms (> 100ms).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: IstioLatency99Percentile
-      expr: 'histogram_quantile(0.99, sum(rate(istio_request_duration_milliseconds_bucket[1m])) by (destination_canonical_service, destination_workload_namespace, source_canonical_service, source_workload_namespace, le)) > 1000'
+      expr: 'histogram_quantile(0.99, sum(rate(istio_request_duration_milliseconds_bucket[1m])) by (destination_canonical_service, destination_workload_namespace, le)) > 1000'
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: Istio latency 99 percentile (instance {{ $labels.instance }})
-        description: "Istio 1% slowest requests are longer than 1000ms.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Istio p99 request latency is {{ $value }}ms (threshold: 1000ms).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: IstioPilotDuplicateEntry
-      expr: 'sum(rate(pilot_duplicate_envoy_clusters{}[5m])) > 0'
+      expr: 'sum(pilot_duplicate_envoy_clusters{}) > 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Istio Pilot Duplicate Entry (instance {{ $labels.instance }})
-        description: "Istio pilot duplicate entry error.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Istio Pilot has detected {{ $value }} duplicate Envoy cluster(s), indicating misconfigured DestinationRules or ServiceEntries.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
--- a/dist/rules/jenkins/metric-plugin.yml
+++ b/dist/rules/jenkins/metric-plugin.yml
@ -51,7 +51,7 @@ groups:
        description: "Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: JenkinsRunFailureTotal
-      expr: 'delta(jenkins_runs_failure_total[1h]) > 100'
+      expr: 'increase(jenkins_runs_failure_total[1h]) > 100'
      for: 0m
      labels:
        severity: warning
--- a/dist/rules/kafka/danielqsj-kafka-exporter.yml
+++ b/dist/rules/kafka/danielqsj-kafka-exporter.yml
@ -12,7 +12,7 @@ groups:
        severity: critical
      annotations:
        summary: Kafka topics replicas (instance {{ $labels.instance }})
-        description: "Kafka topic in-sync partition\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Kafka topic {{ $labels.topic }} has fewer than 3 in-sync replicas ({{ $value }}), data durability is at risk.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: KafkaConsumerGroupLag
      expr: 'sum(kafka_consumergroup_lag) by (consumergroup) > 10000'
--- a/dist/rules/kubernetes/kubestate-exporter.yml
+++ b/dist/rules/kubernetes/kubestate-exporter.yml
@ -134,7 +134,7 @@ groups:
        description: "Volume under {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to fill up within four days. Currently {{ $value | humanize }}% is available.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: KubernetesPersistentvolumeError
-      expr: 'kube_persistentvolume_status_phase{phase=~"Failed|Pending", job="kube-state-metrics"} > 0'
+      expr: 'kube_persistentvolume_status_phase{phase=~"Failed|Pending"} > 0'
      for: 0m
      labels:
        severity: critical
--- a/dist/rules/loki/embedded-exporter.yml
+++ b/dist/rules/loki/embedded-exporter.yml
@ -24,19 +24,19 @@ groups:
        description: "The {{ $labels.job }} and {{ $labels.route }} are experiencing {{ printf \"%.2f\" $value }}% errors.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: LokiRequestPanic
-      expr: 'sum(increase(loki_panic_total[10m])) by (namespace, job) > 0'
-      for: 5m
+      expr: 'sum(increase(loki_panic_total[5m])) by (namespace, job) > 0'
+      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Loki request panic (instance {{ $labels.instance }})
-        description: "The {{ $labels.job }} is experiencing {{ printf \"%.2f\" $value }}% increase of panics\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "{{ $labels.job }} is experiencing {{ $value | humanize }} panic(s) in the last 5 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: LokiRequestLatency
-      expr: '(histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (le)))  > 1'
+      expr: 'histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (namespace, job, route, le)) > 1'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Loki request latency (instance {{ $labels.instance }})
-        description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
--- a/dist/rules/memcached/memcached-exporter.yml
+++ b/dist/rules/memcached/memcached-exporter.yml
@ -34,13 +34,13 @@ groups:
        description: "Memcached connection usage is above 95% on {{ $labels.instance }} (current value: {{ $value }}%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: MemcachedOutOfMemoryErrors
-      expr: 'sum without (slab) (rate(memcached_slab_items_outofmemory_total[5m])) > 0'
+      expr: 'sum without (slab) (rate(memcached_slab_items_outofmemory_total[5m])) > 0.05'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Memcached out of memory errors (instance {{ $labels.instance }})
-        description: "Memcached is returning out-of-memory errors on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Memcached is returning out-of-memory errors on {{ $labels.instance }} ({{ $value }} errors/s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # High memory usage is expected if the cache is well-utilized. This alert fires when it approaches the configured limit, which may cause evictions.
    - alert: MemcachedMemoryUsageHigh(>90%)
@ -73,7 +73,7 @@ groups:
        description: "Memcached cache hit rate is below 80% on {{ $labels.instance }} (current value: {{ $value }}%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: MemcachedConnectionsRejected
-      expr: 'increase(memcached_connections_rejected_total[5m]) > 0'
+      expr: 'increase(memcached_connections_rejected_total[5m]) > 3'
      for: 5m
      labels:
        severity: warning
@ -82,7 +82,7 @@ groups:
        description: "Memcached is rejecting connections on {{ $labels.instance }} ({{ $value }} rejections in the last 5m)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: MemcachedItemsTooLarge
-      expr: 'increase(memcached_item_too_large_total[5m]) > 0'
+      expr: 'increase(memcached_item_too_large_total[5m]) > 3'
      for: 5m
      labels:
        severity: info
--- a/dist/rules/mysql/mysqld-exporter.yml
+++ b/dist/rules/mysql/mysqld-exporter.yml
@ -71,17 +71,19 @@ groups:
        summary: MySQL Slave replication lag (instance {{ $labels.instance }})
        description: "MySQL replication lag on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # mysqld_exporter exposes SHOW GLOBAL STATUS variables as untyped/gauge, so delta() is used instead of increase().
    - alert: MysqlSlowQueries
-      expr: 'increase(mysql_global_status_slow_queries[1m]) > 0'
+      expr: 'delta(mysql_global_status_slow_queries[1m]) > 0'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: MySQL slow queries (instance {{ $labels.instance }})
-        description: "MySQL server mysql has some new slow query ({{ $value }} in the last minute).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "MySQL server has some new slow queries ({{ $value }} in the last minute).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # mysqld_exporter exposes SHOW GLOBAL STATUS variables as untyped/gauge, so deriv() is used instead of rate().
    - alert: MysqlInnodbLogWaits
-      expr: 'rate(mysql_global_status_innodb_log_waits[15m]) > 10'
+      expr: 'deriv(mysql_global_status_innodb_log_waits[15m]) > 10'
      for: 0m
      labels:
        severity: warning
@ -98,8 +100,9 @@ groups:
        summary: MySQL restarted (instance {{ $labels.instance }})
        description: "MySQL has just been restarted, less than one minute ago on {{ $labels.instance }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # mysqld_exporter exposes SHOW GLOBAL STATUS variables as untyped/gauge, so deriv() is used instead of irate().
    - alert: MysqlHighQps
-      expr: 'irate(mysql_global_status_questions[1m]) > 10000'
+      expr: 'deriv(mysql_global_status_questions[1m]) > 10000'
      for: 2m
      labels:
        severity: info
--- a/dist/rules/nats/nats-exporter.yml
+++ b/dist/rules/nats/nats-exporter.yml
@ -32,6 +32,7 @@ groups:
        summary: Nats slow consumers (instance {{ $labels.instance }})
        description: "There are slow consumers in NATS for {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # Replace job="nats" with the actual job name in your Prometheus configuration.
    - alert: NatsServerDown
      expr: 'absent(up{job="nats"})'
      for: 5m
@ -79,7 +80,7 @@ groups:
        description: "JetStream memory usage is over 80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: NatsHighNumberOfSubscriptions
-      expr: 'gnatsd_connz_subscriptions > 1000'
+      expr: 'gnatsd_varz_subscriptions > 1000'
      for: 5m
      labels:
        severity: warning
@ -97,7 +98,7 @@ groups:
        description: "NATS server has more than 100,000 pending bytes\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: NatsTooManyErrors
-      expr: 'increase(gnatsd_varz_jetstream_stats_api_errors[5m]) > 0'
+      expr: 'increase(gnatsd_varz_jetstream_stats_api_errors[5m]) > 5'
      for: 5m
      labels:
        severity: warning
@ -114,6 +115,8 @@ groups:
        summary: Nats JetStream accounts exceeded (instance {{ $labels.instance }})
        description: "JetStream has more than 100 active accounts\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # Only enable this alert if your deployment requires leaf node connections.
+    # This will fire spuriously if leaf nodes are not configured.
    - alert: NatsLeafNodeConnectionIssue
      expr: 'gnatsd_varz_leafnodes == 0'
      for: 5m
--- a/dist/rules/nomad/embedded-exporter.yml
+++ b/dist/rules/nomad/embedded-exporter.yml
@ -12,7 +12,7 @@ groups:
        severity: warning
      annotations:
        summary: Nomad job failed (instance {{ $labels.instance }})
-        description: "Nomad job failed\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Nomad job {{ $labels.job }} has {{ $value }} failed allocations.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: NomadJobLost
      expr: 'nomad_nomad_job_summary_lost > 0'
@ -21,7 +21,7 @@ groups:
        severity: warning
      annotations:
        summary: Nomad job lost (instance {{ $labels.instance }})
-        description: "Nomad job lost\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Nomad job {{ $labels.job }} has {{ $value }} lost allocations.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: NomadJobQueued
      expr: 'nomad_nomad_job_summary_queued > 0'
@ -30,7 +30,7 @@ groups:
        severity: warning
      annotations:
        summary: Nomad job queued (instance {{ $labels.instance }})
-        description: "Nomad job queued\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Nomad job {{ $labels.job }} has {{ $value }} queued allocations.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: NomadBlockedEvaluation
      expr: 'nomad_nomad_blocked_evals_total_blocked > 0'
@ -39,4 +39,4 @@ groups:
        severity: warning
      annotations:
        summary: Nomad blocked evaluation (instance {{ $labels.instance }})
-        description: "Nomad blocked evaluation\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Nomad has {{ $value }} blocked evaluations. The cluster may lack resources to place allocations.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
--- a/dist/rules/openstack/openstack-exporter.yml
+++ b/dist/rules/openstack/openstack-exporter.yml
@ -5,6 +5,7 @@ groups:
  
  rules:

+    # Adjust the job label regex to match the actual job name in your Prometheus scrape config.
    - alert: OpenstackExporterDown
      expr: 'up{job=~".*openstack.*"} == 0'
      for: 2m
--- a/dist/rules/opentelemetry-collector/embedded-exporter.yml
+++ b/dist/rules/opentelemetry-collector/embedded-exporter.yml
@ -8,6 +8,7 @@ groups:
  
  rules:

+    # Adjust the job label regex to match the actual job name in your Prometheus scrape config.
    - alert: OpentelemetryCollectorDown
      expr: 'up{job=~".*otel.*collector.*"} == 0'
      for: 1m
@ -17,8 +18,9 @@ groups:
        summary: OpenTelemetry Collector down (instance {{ $labels.instance }})
        description: "OpenTelemetry Collector instance has disappeared or is not being scraped\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # Threshold of 0.05/s avoids firing on transient single-event spikes.
    - alert: OpentelemetryCollectorReceiverRefusedSpans
-      expr: 'rate(otelcol_receiver_refused_spans[5m]) > 0'
+      expr: 'rate(otelcol_receiver_refused_spans[5m]) > 0.05'
      for: 5m
      labels:
        severity: critical
@ -26,8 +28,9 @@ groups:
        summary: OpenTelemetry Collector receiver refused spans (instance {{ $labels.instance }})
        description: "OpenTelemetry Collector is refusing {{ $value | humanize }}/s spans on {{ $labels.receiver }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # Threshold of 0.05/s avoids firing on transient single-event spikes.
    - alert: OpentelemetryCollectorReceiverRefusedMetricPoints
-      expr: 'rate(otelcol_receiver_refused_metric_points[5m]) > 0'
+      expr: 'rate(otelcol_receiver_refused_metric_points[5m]) > 0.05'
      for: 5m
      labels:
        severity: critical
@ -35,8 +38,9 @@ groups:
        summary: OpenTelemetry Collector receiver refused metric points (instance {{ $labels.instance }})
        description: "OpenTelemetry Collector is refusing {{ $value | humanize }}/s metric points on {{ $labels.receiver }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # Threshold of 0.05/s avoids firing on transient single-event spikes.
    - alert: OpentelemetryCollectorReceiverRefusedLogRecords
-      expr: 'rate(otelcol_receiver_refused_log_records[5m]) > 0'
+      expr: 'rate(otelcol_receiver_refused_log_records[5m]) > 0.05'
      for: 5m
      labels:
        severity: critical
@ -84,6 +88,7 @@ groups:
        description: "OpenTelemetry Collector exporter {{ $labels.exporter }} queue is over 80% full\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Threshold of 0.05/s avoids firing on transient single-event spikes.
+    # These processor metrics have been deprecated since collector v0.110.0.
    - alert: OpentelemetryCollectorProcessorRefusedSpans
      expr: 'rate(otelcol_processor_refused_spans[5m]) > 0.05'
      for: 5m
@ -94,6 +99,7 @@ groups:
        description: "OpenTelemetry Collector processor {{ $labels.processor }} is refusing spans ({{ $value | humanize }}/s), likely due to backpressure.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Threshold of 0.05/s avoids firing on transient single-event spikes.
+    # These processor metrics have been deprecated since collector v0.110.0.
    - alert: OpentelemetryCollectorProcessorRefusedMetricPoints
      expr: 'rate(otelcol_processor_refused_metric_points[5m]) > 0.05'
      for: 5m
@ -104,7 +110,7 @@ groups:
        description: "OpenTelemetry Collector processor {{ $labels.processor }} is refusing metric points ({{ $value | humanize }}/s), likely due to backpressure.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: OpentelemetryCollectorHighMemoryUsage
-      expr: '(otelcol_process_runtime_heap_alloc_bytes{job=~".*otel.*collector.*"} / on(instance, job) otelcol_process_runtime_total_sys_memory_bytes{job=~".*otel.*collector.*"}) > 0.9'
+      expr: '(otelcol_process_runtime_heap_alloc_bytes / on(instance, job) otelcol_process_runtime_total_sys_memory_bytes) > 0.9'
      for: 5m
      labels:
        severity: warning
--- a/dist/rules/postgresql/postgres-exporter.yml
+++ b/dist/rules/postgresql/postgres-exporter.yml
@ -70,7 +70,7 @@ groups:
        description: "PostgreSQL instance should have more connections (> 5)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: PostgresqlDeadLocks
-      expr: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5'
+      expr: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres",datid!="0"}[1m]) > 5'
      for: 0m
      labels:
        severity: warning
@ -79,7 +79,7 @@ groups:
        description: "PostgreSQL has dead-locks ({{ $value }} in the last minute)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: PostgresqlHighRollbackRate
-      expr: 'sum by (namespace,datname) ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) / ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) + (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m])))) > 0.02'
+      expr: 'sum by (namespace,datname,instance) (rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) / (sum by (namespace,datname,instance) (rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) + sum by (namespace,datname,instance) (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m]))) > 0.02 and (sum by (namespace,datname,instance) (rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) + sum by (namespace,datname,instance) (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m]))) > 0'
      for: 0m
      labels:
        severity: warning
@ -96,6 +96,7 @@ groups:
        summary: Postgresql commit rate low (instance {{ $labels.instance }})
        description: "Postgresql seems to be processing very few transactions\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # pg_txid_current is not a default postgres_exporter metric. You need to define a custom query. See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
    - alert: PostgresqlLowXidConsumption
      expr: 'rate(pg_txid_current[1m]) < 5'
      for: 2m
@ -132,6 +133,7 @@ groups:
        summary: Postgresql configuration changed (instance {{ $labels.instance }})
        description: "Postgres Database configuration change has occurred\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # pg_stat_ssl_compression is not a default postgres_exporter metric and is only available on PostgreSQL 9.5-13 (removed in PG 14). See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
    - alert: PostgresqlSslCompressionActive
      expr: 'sum by (instance) (pg_stat_ssl_compression) > 0'
      for: 0m
--- a/dist/rules/prometheus-self-monitoring/embedded-exporter.yml
+++ b/dist/rules/prometheus-self-monitoring/embedded-exporter.yml
@ -143,7 +143,7 @@ groups:
        description: "The Prometheus notification queue has not been empty for 10 minutes\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: PrometheusAlertmanagerNotificationFailing
-      expr: 'rate(alertmanager_notifications_failed_total[1m]) > 0'
+      expr: 'rate(alertmanager_notifications_failed_total[3m]) > 0.05'
      for: 0m
      labels:
        severity: critical
--- a/dist/rules/promtail/embedded-exporter.yml
+++ b/dist/rules/promtail/embedded-exporter.yml
@ -15,7 +15,7 @@ groups:
        description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}% errors.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: PromtailRequestLatency
-      expr: 'histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[5m])) by (le)) > 1'
+      expr: 'histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[5m])) by (namespace, job, route, le)) > 1'
      for: 5m
      labels:
        severity: critical
--- a/dist/rules/pulsar/embedded-exporter.yml
+++ b/dist/rules/pulsar/embedded-exporter.yml
@ -41,24 +41,30 @@ groups:
        summary: Pulsar topic very large backlog storage size (instance {{ $labels.instance }})
        description: "The topic backlog storage size is over 20 GB\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # pulsar_storage_write_latency_le_overflow is the overflow bucket of Pulsar's non-standard histogram.
+    # It counts write operations exceeding all defined latency bounds (> 1000ms).
    - alert: PulsarHighWriteLatency
-      expr: 'sum(pulsar_storage_write_latency_overflow > 0) by (topic)'
+      expr: 'sum(pulsar_storage_write_latency_le_overflow > 0) by (topic)'
      for: 1h
      labels:
        severity: critical
      annotations:
        summary: Pulsar high write latency (instance {{ $labels.instance }})
-        description: "Messages cannot be written in a timely fashion\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Pulsar topic {{ $labels.topic }} has {{ $value }} storage write operations exceeding the maximum latency bucket (> 1000ms)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # pulsar_entry_size_le_overflow is the overflow bucket of Pulsar's non-standard histogram.
+    # It counts message entries exceeding all defined size bounds.
    - alert: PulsarLargeMessagePayload
-      expr: 'sum(pulsar_entry_size_overflow > 0) by (topic)'
+      expr: 'sum(pulsar_entry_size_le_overflow > 0) by (topic)'
      for: 1h
      labels:
        severity: warning
      annotations:
        summary: Pulsar large message payload (instance {{ $labels.instance }})
-        description: "Observing large message payload (> 1MB)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Pulsar topic {{ $labels.topic }} has {{ $value }} message entries exceeding the maximum size bucket (> 1MB)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # This metric name is path-dependent and may differ based on your BookKeeper data directory configuration.
+    # Adjust the metric name to match your actual ledger directory path.
    - alert: PulsarHighLedgerDiskUsage
      expr: 'sum(bookie_ledger_dir__pulsar_data_bookkeeper_ledgers_usage) by (kubernetes_pod_name) > 75'
      for: 1h
@ -84,7 +90,7 @@ groups:
        severity: critical
      annotations:
        summary: Pulsar high number of function errors (instance {{ $labels.instance }})
-        description: "Observing more than 10 Function errors per minute\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Pulsar function {{ $labels.name }} has more than 10 errors per second ({{ $value | printf \"%.2f\" }}/s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: PulsarHighNumberOfSinkErrors
      expr: 'sum(rate(pulsar_sink_sink_exceptions_total[1m])) by (name) > 10'
@ -93,4 +99,4 @@ groups:
        severity: critical
      annotations:
        summary: Pulsar high number of sink errors (instance {{ $labels.instance }})
-        description: "Observing more than 10 Sink errors per minute\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Pulsar sink {{ $labels.name }} has more than 10 errors per second ({{ $value | printf \"%.2f\" }}/s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
--- a/dist/rules/python/python-exporter.yml
+++ b/dist/rules/python/python-exporter.yml
@ -6,13 +6,13 @@ groups:
  rules:

    - alert: PythonGcObjectsUncollectable
-      expr: 'increase(python_gc_objects_uncollectable_total[5m]) > 0'
+      expr: 'increase(python_gc_objects_uncollectable_total[5m]) > 1'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Python GC objects uncollectable (instance {{ $labels.instance }})
-        description: "Python has uncollectable objects, potential memory leak via reference cycles\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Python has uncollectable objects ({{ $value }}), potential memory leak via reference cycles\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: PythonGcCollectionsHigh
      expr: 'rate(python_gc_objects_collected_total[5m]) > 10000'
--- a/dist/rules/rabbitmq/kbudde-rabbitmq-exporter.yml
+++ b/dist/rules/rabbitmq/kbudde-rabbitmq-exporter.yml
@ -32,7 +32,7 @@ groups:
        severity: critical
      annotations:
        summary: RabbitMQ cluster partition (instance {{ $labels.instance }})
-        description: "Cluster partition\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "RabbitMQ cluster has a network partition ({{ $value }} partitions detected). Messages may be lost or duplicated.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: RabbitmqOutOfMemory
      expr: 'rabbitmq_node_mem_used / rabbitmq_node_mem_limit * 100 > 90 and rabbitmq_node_mem_limit > 0'
@ -44,7 +44,7 @@ groups:
        description: "Memory available for RabbitMQ is low (< 10%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: RabbitmqInstanceTooManyConnections
-      expr: 'rabbitmq_connectionsTotal > 1000'
+      expr: 'rabbitmq_connections > 1000'
      for: 2m
      labels:
        severity: warning
--- a/dist/rules/rabbitmq/rabbitmq-exporter.yml
+++ b/dist/rules/rabbitmq/rabbitmq-exporter.yml
@ -23,7 +23,7 @@ groups:
        severity: critical
      annotations:
        summary: RabbitMQ node not distributed (instance {{ $labels.instance }})
-        description: "Distribution link state is not 'up'\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Distribution link to peer {{ $labels.peer }} is not 'up' (state {{ $value }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: RabbitmqInstancesDifferentVersions
      expr: 'count(count(rabbitmq_build_info) by (rabbitmq_version)) > 1'
@ -59,7 +59,7 @@ groups:
        severity: warning
      annotations:
        summary: RabbitMQ too many ready messages (instance {{ $labels.instance }})
-        description: "RabbitMQ too many ready messages on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "RabbitMQ too many ready messages on queue {{ $labels.queue }} ({{ $value }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: RabbitmqTooManyUnackMessages
      expr: 'sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000'
@ -68,7 +68,7 @@ groups:
        severity: warning
      annotations:
        summary: RabbitMQ too many unack messages (instance {{ $labels.instance }})
-        description: "Too many unacknowledged messages\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Too many unacknowledged messages on queue {{ $labels.queue }} ({{ $value }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: RabbitmqTooManyConnections
      expr: 'rabbitmq_connections > 1000'
@ -88,11 +88,12 @@ groups:
        summary: RabbitMQ no queue consumer (instance {{ $labels.instance }})
        description: "A queue has less than 1 consumer\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # Requiring more than 3 unroutable messages within a 5m window avoids noise from occasional misroutes. Adjust based on your expected traffic patterns.
    - alert: RabbitmqUnroutableMessages
-      expr: 'increase(rabbitmq_channel_messages_unroutable_returned_total[1m]) > 0 or increase(rabbitmq_channel_messages_unroutable_dropped_total[1m]) > 0'
+      expr: 'increase(rabbitmq_channel_messages_unroutable_returned_total[5m]) > 3 or increase(rabbitmq_channel_messages_unroutable_dropped_total[5m]) > 3'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: RabbitMQ unroutable messages (instance {{ $labels.instance }})
-        description: "A queue has unroutable messages ({{ $value }} in the last 1m)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "A queue has unroutable messages ({{ $value }} in the last 5m)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
--- a/dist/rules/ruby/ruby-exporter.yml
+++ b/dist/rules/ruby/ruby-exporter.yml
@ -24,9 +24,9 @@ groups:
        summary: Ruby heap free slots high (instance {{ $labels.instance }})
        description: "Ruby heap has too many free slots (> 500k), memory fragmentation after large allocations\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    # Major GC rate > 5/s is extremely high. Consider lowering to > 1 or > 2 for earlier detection.
+    # A major GC threshold of 5/s would only fire once the app is essentially non-functional; 2/s provides earlier detection.
    - alert: RubyMajorGcRateHigh
-      expr: 'rate(ruby_major_gc_ops_total[5m]) > 5'
+      expr: 'rate(ruby_major_gc_ops_total[5m]) > 2'
      for: 5m
      labels:
        severity: warning
--- a/dist/rules/s.m.a.r.t-device-monitoring/smartctl-exporter.yml
+++ b/dist/rules/s.m.a.r.t-device-monitoring/smartctl-exporter.yml
@ -30,7 +30,7 @@ groups:
        severity: critical
      annotations:
        summary: SMART device temperature over trip value (instance {{ $labels.instance }})
-        description: "Device temperature over trip value on {{ $labels.instance }} drive {{ $labels.device }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Device temperature over trip value on {{ $labels.instance }} drive {{ $labels.device }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: SmartDeviceTemperatureNearingTripValue
      expr: 'max_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) >= on(device, instance) (smartctl_device_temperature{temperature_type="drive_trip"} * .80)'
@ -39,7 +39,7 @@ groups:
        severity: warning
      annotations:
        summary: SMART device temperature nearing trip value (instance {{ $labels.instance }})
-        description: "Device temperature at 80% of trip value on {{ $labels.instance }} drive {{ $labels.device }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Device temperature at 80% of trip value on {{ $labels.instance }} drive {{ $labels.device }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: SmartStatus
      expr: 'smartctl_device_smart_status != 1'
@ -48,7 +48,7 @@ groups:
        severity: critical
      annotations:
        summary: SMART status (instance {{ $labels.instance }})
-        description: "Device has a SMART status failure on {{ $labels.instance }} drive {{ $labels.device }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Device has a SMART status failure on {{ $labels.instance }} drive {{ $labels.device }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: SmartCriticalWarning
      expr: 'smartctl_device_critical_warning > 0'
@ -57,7 +57,7 @@ groups:
        severity: critical
      annotations:
        summary: SMART critical warning (instance {{ $labels.instance }})
-        description: "Disk controller has critical warning on {{ $labels.instance }} drive {{ $labels.device }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Disk controller has critical warning on {{ $labels.instance }} drive {{ $labels.device }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: SmartMediaErrors
      expr: 'smartctl_device_media_errors > 0'
@ -66,7 +66,7 @@ groups:
        severity: critical
      annotations:
        summary: SMART media errors (instance {{ $labels.instance }})
-        description: "Disk controller detected media errors on {{ $labels.instance }} drive {{ $labels.device }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Disk controller detected media errors on {{ $labels.instance }} drive {{ $labels.device }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: SmartWearoutIndicator
      expr: 'smartctl_device_available_spare < smartctl_device_available_spare_threshold'
@ -75,4 +75,4 @@ groups:
        severity: critical
      annotations:
        summary: SMART Wearout Indicator (instance {{ $labels.instance }})
-        description: "Device is wearing out on {{ $labels.instance }} drive {{ $labels.device }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Device is wearing out on {{ $labels.instance }} drive {{ $labels.device }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
--- a/dist/rules/sidekiq/strech-sidekiq-exporter.yml
+++ b/dist/rules/sidekiq/strech-sidekiq-exporter.yml
@ -6,16 +6,16 @@ groups:
  rules:

    - alert: SidekiqQueueSize
-      expr: 'sidekiq_queue_size > 100'
+      expr: 'sidekiq_queue_enqueued_jobs > 100'
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: Sidekiq queue size (instance {{ $labels.instance }})
-        description: "Sidekiq queue {{ $labels.name }} is growing\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Sidekiq queue {{ $labels.name }} is growing ({{ $value }} enqueued jobs)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: SidekiqSchedulingLatencyTooHigh
-      expr: 'max(sidekiq_queue_latency) > 60'
+      expr: 'max(sidekiq_queue_latency_seconds) > 60'
      for: 0m
      labels:
        severity: critical
--- a/dist/rules/snmp/snmp-exporter.yml
+++ b/dist/rules/snmp/snmp-exporter.yml
@ -7,7 +7,7 @@ groups:
  
  rules:

-    # From the official snmp-mixin.
+    # Adjust the job=~"snmp.*" matcher to match the actual job name in your Prometheus scrape config.
    - alert: SnmpTargetDown
      expr: 'up{job=~"snmp.*"} == 0'
      for: 5m
--- a/dist/rules/spinnaker/embedded-exporter.yml
+++ b/dist/rules/spinnaker/embedded-exporter.yml
@ -36,24 +36,24 @@ groups:
        description: "Orca queue message lag is {{ $value }}s. Pipeline stages are waiting too long before being processed.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: SpinnakerDeadMessages
-      expr: 'rate(queue_dead_messages_total[5m]) > 0'
+      expr: 'rate(queue_dead_messages_total[5m]) > 0.05'
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: Spinnaker dead messages (instance {{ $labels.instance }})
-        description: "Orca is producing dead-lettered messages ({{ $value }} per second). These are tasks that exhausted all retries and will not be executed.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Orca is producing dead-lettered messages ({{ $value | humanize }}/s). These are tasks that exhausted all retries and will not be executed.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Zombies are pipeline executions that are running but have lost their queue entry.
    # See https://spinnaker.io/docs/guides/runbooks/orca-zombie-executions/
    - alert: SpinnakerZombieExecutions
-      expr: 'rate(queue_zombies_total[5m]) > 0'
+      expr: 'rate(queue_zombies_total[5m]) > 0.05'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Spinnaker zombie executions (instance {{ $labels.instance }})
-        description: "{{ $value }} zombie pipeline executions detected. These are executions with no corresponding queue messages.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Zombie pipeline executions rate is {{ $value | humanize }}/s. These are executions with no corresponding queue messages.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: SpinnakerThreadPoolExhaustion
      expr: 'threadpool_blockingQueueSize > 0'
@ -76,7 +76,7 @@ groups:
        description: "Igor polling monitor {{ $labels.monitor }} for {{ $labels.partition }} has exceeded its item threshold, preventing pipeline triggers.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: SpinnakerPollingMonitorFailures
-      expr: 'rate(pollingMonitor_failed_total[5m]) > 0'
+      expr: 'rate(pollingMonitor_failed_total[5m]) > 0.05'
      for: 5m
      labels:
        severity: warning
@ -95,7 +95,7 @@ groups:
        description: "Spinnaker API 5xx error rate is {{ $value | humanizePercentage }} on {{ $labels.instance }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: SpinnakerApiRateLimitThrottling
-      expr: 'rate(rateLimitThrottling_total[5m]) > 0'
+      expr: 'rate(rateLimitThrottling_total[5m]) > 0.05'
      for: 2m
      labels:
        severity: warning
--- a/dist/rules/ssl/tls/ribbybibby-ssl-exporter.yml
+++ b/dist/rules/ssl/tls/ribbybibby-ssl-exporter.yml
@ -7,21 +7,21 @@ groups:

    - alert: SslCertificateProbeFailed
      expr: 'ssl_probe_success == 0'
-      for: 0m
+      for: 1m
      labels:
        severity: critical
      annotations:
        summary: SSL certificate probe failed (instance {{ $labels.instance }})
        description: "Failed to fetch SSL information {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: SslCertificateOscpStatusUnknown
+    - alert: SslCertificateOcspStatusUnknown
      expr: 'ssl_ocsp_response_status == 2'
      for: 0m
      labels:
        severity: warning
      annotations:
-        summary: SSL certificate OSCP status unknown (instance {{ $labels.instance }})
-        description: "Failed to get the OSCP status {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        summary: SSL certificate OCSP status unknown (instance {{ $labels.instance }})
+        description: "Failed to get the OCSP status for {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: SslCertificateRevoked
      expr: 'ssl_ocsp_response_status == 1'
--- a/dist/rules/systemd/systemd-exporter.yml
+++ b/dist/rules/systemd/systemd-exporter.yml
@ -42,8 +42,9 @@ groups:
        summary: Systemd unit tasks near limit (instance {{ $labels.instance }})
        description: "Systemd unit {{ $labels.name }} is using {{ $value | humanizePercentage }} of its task limit. (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # systemd_socket_refused_connections_total is declared as a gauge by the exporter despite the _total suffix, so delta() is used instead of increase().
    - alert: SystemdSocketRefusedConnections
-      expr: 'increase(systemd_socket_refused_connections_total[5m]) > 0'
+      expr: 'delta(systemd_socket_refused_connections_total[5m]) > 3'
      for: 2m
      labels:
        severity: warning
--- a/dist/rules/thanos/thanos-bucket-replicate.yml
+++ b/dist/rules/thanos/thanos-bucket-replicate.yml
@ -6,7 +6,7 @@ groups:
  rules:

    - alert: ThanosBucketReplicateErrorRate
-      expr: '(sum by (job) (rate(thanos_replicate_replication_runs_total{result="error", job=~".*thanos-bucket-replicate.*"}[5m])) / on (job) group_left sum by (job) (rate(thanos_replicate_replication_runs_total{job=~".*thanos-bucket-replicate.*"}[5m]))) * 100 >= 10 and sum by (job) (rate(thanos_replicate_replication_runs_total{job=~".*thanos-bucket-replicate.*"}[5m])) > 0'
+      expr: '(sum by (job) (rate(thanos_replicate_replication_runs_total{result="error"}[5m])) / on (job) group_left sum by (job) (rate(thanos_replicate_replication_runs_total[5m]))) * 100 >= 10 and sum by (job) (rate(thanos_replicate_replication_runs_total[5m])) > 0'
      for: 5m
      labels:
        severity: critical
@ -15,7 +15,7 @@ groups:
        description: "Thanos Replicate is failing to run, {{$value | humanize}}% of attempts failed.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ThanosBucketReplicateRunLatency
-      expr: '(histogram_quantile(0.99, sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m]))) > 20 and  sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m])) > 0)'
+      expr: '(histogram_quantile(0.99, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket[5m]))) > 20 and  sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_count[5m])) > 0)'
      for: 5m
      labels:
        severity: critical
--- a/dist/rules/thanos/thanos-compactor.yml
+++ b/dist/rules/thanos/thanos-compactor.yml
@ -15,7 +15,7 @@ groups:
        description: "No more than one Thanos Compact instance should be running at once. There are {{$value}} instances running.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ThanosCompactorHalted
-      expr: 'thanos_compact_halted{job=~".*thanos-compact.*"} == 1'
+      expr: 'thanos_compact_halted == 1'
      for: 5m
      labels:
        severity: warning
@ -24,7 +24,7 @@ groups:
        description: "Thanos Compact {{$labels.job}} has failed to run and now is halted.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ThanosCompactorHighCompactionFailures
-      expr: '(sum by (job) (rate(thanos_compact_group_compactions_failures_total{job=~".*thanos-compact.*"}[5m])) / sum by (job) (rate(thanos_compact_group_compactions_total{job=~".*thanos-compact.*"}[5m])) * 100 > 5) and sum by (job) (rate(thanos_compact_group_compactions_total{job=~".*thanos-compact.*"}[5m])) > 0'
+      expr: '(sum by (job) (rate(thanos_compact_group_compactions_failures_total[5m])) / sum by (job) (rate(thanos_compact_group_compactions_total[5m])) * 100 > 5) and sum by (job) (rate(thanos_compact_group_compactions_total[5m])) > 0'
      for: 15m
      labels:
        severity: warning
--- a/dist/rules/thanos/thanos-query.yml
+++ b/dist/rules/thanos/thanos-query.yml
@ -32,8 +32,9 @@ groups:
        summary: Thanos Query Grpc Server Error Rate (instance {{ $labels.instance }})
        description: "Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of requests.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # Filters to actual error codes only. grpc_code!="OK" would include benign codes like NotFound, AlreadyExists, and Cancelled.
    - alert: ThanosQueryGrpcClientErrorRate
-      expr: '(sum by (job) (rate(grpc_client_handled_total{grpc_code!="OK", job=~".*thanos-query.*"}[5m])) / sum by (job) (rate(grpc_client_started_total{job=~".*thanos-query.*"}[5m]))) * 100 > 5 and sum by (job) (rate(grpc_client_started_total{job=~".*thanos-query.*"}[5m])) > 0'
+      expr: '(sum by (job) (rate(grpc_client_handled_total{grpc_code=~"Unknown|Internal|Unavailable|DataLoss|DeadlineExceeded|ResourceExhausted", job=~".*thanos-query.*"}[5m])) / sum by (job) (rate(grpc_client_started_total{job=~".*thanos-query.*"}[5m]))) * 100 > 5 and sum by (job) (rate(grpc_client_started_total{job=~".*thanos-query.*"}[5m])) > 0'
      for: 5m
      labels:
        severity: warning
@ -42,7 +43,7 @@ groups:
        description: "Thanos Query {{$labels.job}} is failing to send {{$value | humanize}}% of requests.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ThanosQueryHighDNSFailures
-      expr: '(sum by (job) (rate(thanos_query_store_apis_dns_failures_total{job=~".*thanos-query.*"}[5m])) / sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~".*thanos-query.*"}[5m]))) * 100 > 1 and sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~".*thanos-query.*"}[5m])) > 0'
+      expr: '(sum by (job) (rate(thanos_query_store_apis_dns_failures_total[5m])) / sum by (job) (rate(thanos_query_store_apis_dns_lookups_total[5m]))) * 100 > 1 and sum by (job) (rate(thanos_query_store_apis_dns_lookups_total[5m])) > 0'
      for: 15m
      labels:
        severity: warning
@ -51,7 +52,7 @@ groups:
        description: "Thanos Query {{$labels.job}} have {{$value | humanize}}% of failing DNS queries for store endpoints.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ThanosQueryInstantLatencyHigh
-      expr: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m]))) > 40 and sum by (job) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m])) > 0)'
+      expr: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m]))) > 40 and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-query.*", handler="query"}[5m])) > 0)'
      for: 10m
      labels:
        severity: critical
--- a/dist/rules/thanos/thanos-receiver.yml
+++ b/dist/rules/thanos/thanos-receiver.yml
@ -24,7 +24,7 @@ groups:
        description: "Thanos Receive {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for requests.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ThanosReceiveHighReplicationFailures
-      expr: 'thanos_receive_replication_factor > 1 and ((sum by (job) (rate(thanos_receive_replications_total{result="error", job=~".*thanos-receive.*"}[5m])) / sum by (job) (rate(thanos_receive_replications_total{job=~".*thanos-receive.*"}[5m]))) > (max by (job) (floor((thanos_receive_replication_factor{job=~".*thanos-receive.*"}+1)/ 2)) / max by (job) (thanos_receive_hashring_nodes{job=~".*thanos-receive.*"}))) * 100'
+      expr: 'thanos_receive_replication_factor > 1 and ((sum by (job) (rate(thanos_receive_replications_total{result="error"}[5m])) / sum by (job) (rate(thanos_receive_replications_total[5m]))) > (max by (job) (floor((thanos_receive_replication_factor+1)/ 2)) / max by (job) (thanos_receive_hashring_nodes))) * 100'
      for: 5m
      labels:
        severity: warning
@ -33,7 +33,7 @@ groups:
        description: "Thanos Receive {{$labels.job}} is failing to replicate {{$value | humanize}}% of requests.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ThanosReceiveHighForwardRequestFailures
-      expr: '(sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~".*thanos-receive.*"}[5m]))/  sum by (job) (rate(thanos_receive_forward_requests_total{job=~".*thanos-receive.*"}[5m]))) * 100 > 20 and sum by (job) (rate(thanos_receive_forward_requests_total{job=~".*thanos-receive.*"}[5m])) > 0'
+      expr: '(sum by (job) (rate(thanos_receive_forward_requests_total{result="error"}[5m]))/  sum by (job) (rate(thanos_receive_forward_requests_total[5m]))) * 100 > 20 and sum by (job) (rate(thanos_receive_forward_requests_total[5m])) > 0'
      for: 5m
      labels:
        severity: info
@ -42,7 +42,7 @@ groups:
        description: "Thanos Receive {{$labels.job}} is failing to forward {{$value | humanize}}% of requests.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ThanosReceiveHighHashringFileRefreshFailures
-      expr: '(sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~".*thanos-receive.*"}[5m])) / sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total{job=~".*thanos-receive.*"}[5m])) > 0) and sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total{job=~".*thanos-receive.*"}[5m])) > 0'
+      expr: '(sum by (job) (rate(thanos_receive_hashrings_file_errors_total[5m])) / sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total[5m])) > 0) and sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total[5m])) > 0'
      for: 15m
      labels:
        severity: warning
@ -51,7 +51,7 @@ groups:
        description: "Thanos Receive {{$labels.job}} is failing to refresh hashring file, {{$value | humanize}} of attempts failed.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ThanosReceiveConfigReloadFailure
-      expr: 'avg by (job) (thanos_receive_config_last_reload_successful{job=~".*thanos-receive.*"}) != 1'
+      expr: 'avg by (job) (thanos_receive_config_last_reload_successful) != 1'
      for: 5m
      labels:
        severity: warning
--- a/dist/rules/thanos/thanos-ruler.yml
+++ b/dist/rules/thanos/thanos-ruler.yml
@ -6,7 +6,7 @@ groups:
  rules:

    - alert: ThanosRuleQueueIsDroppingAlerts
-      expr: 'sum by (job, instance) (rate(thanos_alert_queue_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0'
+      expr: 'sum by (job, instance) (rate(thanos_alert_queue_alerts_dropped_total[5m])) > 0'
      for: 5m
      labels:
        severity: critical
@ -15,7 +15,7 @@ groups:
        description: "Thanos Rule {{$labels.instance}} is failing to queue alerts ({{ $value | humanize }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ThanosRuleSenderIsFailingAlerts
-      expr: 'sum by (job, instance) (rate(thanos_alert_sender_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0'
+      expr: 'sum by (job, instance) (rate(thanos_alert_sender_alerts_dropped_total[5m])) > 0'
      for: 5m
      labels:
        severity: critical
@ -34,7 +34,7 @@ groups:

    # Threshold of 0.05/s avoids firing on transient single-event spikes.
    - alert: ThanosRuleHighRuleEvaluationWarnings
-      expr: 'sum by (job, instance) (rate(thanos_rule_evaluation_with_warnings_total{job=~".*thanos-rule.*"}[5m])) > 0.05'
+      expr: 'sum by (job, instance) (rate(thanos_rule_evaluation_with_warnings_total[5m])) > 0.05'
      for: 15m
      labels:
        severity: info
@ -61,7 +61,7 @@ groups:
        description: "Thanos Rule {{$labels.job}} is failing to handle {{$value | humanize}}% of requests.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ThanosRuleConfigReloadFailure
-      expr: 'avg by (job, instance) (thanos_rule_config_last_reload_successful{job=~".*thanos-rule.*"}) != 1'
+      expr: 'avg by (job, instance) (thanos_rule_config_last_reload_successful) != 1'
      for: 5m
      labels:
        severity: info
@ -70,7 +70,7 @@ groups:
        description: "Thanos Rule {{$labels.job}} has not been able to reload its configuration.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ThanosRuleQueryHighDNSFailures
-      expr: '(sum by (job, instance) (rate(thanos_rule_query_apis_dns_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) * 100 > 1) and sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) > 0'
+      expr: '(sum by (job, instance) (rate(thanos_rule_query_apis_dns_failures_total[5m])) / sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total[5m])) * 100 > 1) and sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total[5m])) > 0'
      for: 15m
      labels:
        severity: warning
@ -79,7 +79,7 @@ groups:
        description: "Thanos Rule {{$labels.job}} has {{$value | humanize}}% of failing DNS queries for query endpoints.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ThanosRuleAlertmanagerHighDNSFailures
-      expr: '(sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) * 100 > 1) and sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) > 0'
+      expr: '(sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_failures_total[5m])) / sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total[5m])) * 100 > 1) and sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total[5m])) > 0'
      for: 15m
      labels:
        severity: warning
@ -97,7 +97,7 @@ groups:
        description: "Thanos Rule {{$labels.job}} has rule groups that did not evaluate for at least 10x of their expected interval.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ThanosNoRuleEvaluations
-      expr: 'sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) <= 0  and sum by (job, instance) (thanos_rule_loaded_rules{job=~".*thanos-rule.*"}) > 0'
+      expr: 'sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) <= 0  and sum by (job, instance) (thanos_rule_loaded_rules) > 0'
      for: 5m
      labels:
        severity: critical
--- a/dist/rules/thanos/thanos-sidecar.yml
+++ b/dist/rules/thanos/thanos-sidecar.yml
@ -16,7 +16,7 @@ groups:
        description: "Thanos Sidecar {{$labels.instance}} bucket operations are failing ({{ $value | humanize }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ThanosSidecarNoConnectionToStartedPrometheus
-      expr: 'thanos_sidecar_prometheus_up{job=~".*thanos-sidecar.*"} == 0 and on (namespace, pod)prometheus_tsdb_data_replay_duration_seconds != 0'
+      expr: 'thanos_sidecar_prometheus_up == 0 and on (namespace, pod) prometheus_tsdb_data_replay_duration_seconds != 0'
      for: 5m
      labels:
        severity: critical
--- a/dist/rules/thanos/thanos-store.yml
+++ b/dist/rules/thanos/thanos-store.yml
@ -15,7 +15,7 @@ groups:
        description: "Thanos Store {{$labels.job}} is failing to handle {{$value | humanize}}% of requests.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ThanosStoreSeriesGateLatencyHigh
-      expr: '(histogram_quantile(0.99, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{job=~".*thanos-store.*"}[5m]))) > 2 and sum by (job) (rate(thanos_bucket_store_series_gate_duration_seconds_count{job=~".*thanos-store.*"}[5m])) > 0)'
+      expr: '(histogram_quantile(0.99, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket[5m]))) > 2 and sum by (job) (rate(thanos_bucket_store_series_gate_duration_seconds_count[5m])) > 0)'
      for: 10m
      labels:
        severity: warning
--- a/dist/rules/zfs/zfs_exporter.yml
+++ b/dist/rules/zfs/zfs_exporter.yml
@ -6,13 +6,13 @@ groups:
  rules:

    - alert: ZfsPoolOutOfSpace
-      expr: 'zfs_pool_free_bytes * 100 / zfs_pool_size_bytes < 10 and ON (instance, device, mountpoint) zfs_pool_readonly == 0 and zfs_pool_size_bytes > 0'
+      expr: 'zfs_pool_free_bytes * 100 / zfs_pool_size_bytes < 10 and zfs_pool_readonly == 0 and zfs_pool_size_bytes > 0'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: ZFS pool out of space (instance {{ $labels.instance }})
-        description: "Disk is almost full (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "ZFS pool {{ $labels.pool }} is almost full (< 10% left).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # 0: ONLINE
    # 1: DEGRADED
--- a/dist/rules/zookeeper/dabealu-zookeeper-exporter.yml
+++ b/dist/rules/zookeeper/dabealu-zookeeper-exporter.yml
@ -31,7 +31,7 @@ groups:
        severity: critical
      annotations:
        summary: Zookeeper Too Many Leaders (instance {{ $labels.instance }})
-        description: "Zookeeper cluster has too many nodes marked as leader\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Zookeeper cluster has {{ $value }} nodes marked as leader (expected 1), indicating a split-brain\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ZookeeperNotOk
      expr: 'zk_ruok == 0'
@ -40,4 +40,4 @@ groups:
        severity: warning
      annotations:
        summary: Zookeeper Not Ok (instance {{ $labels.instance }})
-        description: "Zookeeper instance is not ok\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Zookeeper instance {{ $labels.instance }} is not ok (ruok check failed)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"