Publish

2026-06-21 00:47:18 +08:00 · 2026-03-18 20:41:01 +00:00 · 2026-03-18 20:41:01 +00:00 · af2f277830
commit af2f277830
parent e3a7165a65
32 changed files with 226 additions and 217 deletions
--- a/dist/rules/blackbox/blackbox-exporter.yml
+++ b/dist/rules/blackbox/blackbox-exporter.yml
@@ -24,7 +24,7 @@ groups:
        description: "Blackbox configuration reload failure\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: BlackboxSlowProbe
-      expr: 'avg_over_time(probe_duration_seconds[1m]) > 1'
+      expr: 'probe_duration_seconds > 1'
      for: 1m
      labels:
        severity: warning
@@ -73,7 +73,7 @@ groups:
        description: "SSL certificate has expired already\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: BlackboxProbeSlowHttp
-      expr: 'avg_over_time(probe_http_duration_seconds[1m]) > 1'
+      expr: 'probe_http_duration_seconds > 1'
      for: 1m
      labels:
        severity: warning
@@ -82,7 +82,7 @@ groups:
        description: "HTTP request took more than 1s\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: BlackboxProbeSlowPing
-      expr: 'avg_over_time(probe_icmp_duration_seconds[1m]) > 1'
+      expr: 'probe_icmp_duration_seconds > 1'
      for: 1m
      labels:
        severity: warning
--- a/dist/rules/cassandra/criteo-cassandra-exporter.yml
+++ b/dist/rules/cassandra/criteo-cassandra-exporter.yml
@@ -51,31 +51,31 @@ groups:
        summary: Cassandra node down (instance {{ $labels.instance }})
        description: "Cassandra node down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: CassandraCommitlogPendingTasks
+    - alert: CassandraCommitlogPendingTasks(criteo)
      expr: 'cassandra_stats{name="org:apache:cassandra:metrics:commitlog:pendingtasks:value"} > 15'
      for: 2m
      labels:
        severity: warning
      annotations:
-        summary: Cassandra commitlog pending tasks (instance {{ $labels.instance }})
+        summary: Cassandra commitlog pending tasks (Criteo) (instance {{ $labels.instance }})
        description: "Unexpected number of Cassandra commitlog pending tasks\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: CassandraCompactionExecutorBlockedTasks
+    - alert: CassandraCompactionExecutorBlockedTasks(criteo)
      expr: 'cassandra_stats{name="org:apache:cassandra:metrics:threadpools:internal:compactionexecutor:currentlyblockedtasks:count"} > 0'
      for: 2m
      labels:
        severity: warning
      annotations:
-        summary: Cassandra compaction executor blocked tasks (instance {{ $labels.instance }})
+        summary: Cassandra compaction executor blocked tasks (Criteo) (instance {{ $labels.instance }})
        description: "Some Cassandra compaction executor tasks are blocked\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: CassandraFlushWriterBlockedTasks
+    - alert: CassandraFlushWriterBlockedTasks(criteo)
      expr: 'cassandra_stats{name="org:apache:cassandra:metrics:threadpools:internal:memtableflushwriter:currentlyblockedtasks:count"} > 0'
      for: 2m
      labels:
        severity: warning
      annotations:
-        summary: Cassandra flush writer blocked tasks (instance {{ $labels.instance }})
+        summary: Cassandra flush writer blocked tasks (Criteo) (instance {{ $labels.instance }})
        description: "Some Cassandra flush writer tasks are blocked\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CassandraRepairPendingTasks
@@ -96,67 +96,67 @@ groups:
        summary: Cassandra repair blocked tasks (instance {{ $labels.instance }})
        description: "Some Cassandra repair tasks are blocked\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: CassandraConnectionTimeoutsTotal
+    - alert: CassandraConnectionTimeoutsTotal(criteo)
      expr: 'rate(cassandra_stats{name="org:apache:cassandra:metrics:connection:totaltimeouts:count"}[1m]) > 5'
      for: 2m
      labels:
        severity: critical
      annotations:
-        summary: Cassandra connection timeouts total (instance {{ $labels.instance }})
+        summary: Cassandra connection timeouts total (Criteo) (instance {{ $labels.instance }})
        description: "Some connection between nodes are ending in timeout\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: CassandraStorageExceptions
+    - alert: CassandraStorageExceptions(criteo)
      expr: 'changes(cassandra_stats{name="org:apache:cassandra:metrics:storage:exceptions:count"}[1m]) > 1'
      for: 0m
      labels:
        severity: critical
      annotations:
-        summary: Cassandra storage exceptions (instance {{ $labels.instance }})
+        summary: Cassandra storage exceptions (Criteo) (instance {{ $labels.instance }})
        description: "Something is going wrong with cassandra storage\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: CassandraTombstoneDump
+    - alert: CassandraTombstoneDump(criteo)
      expr: 'cassandra_stats{name="org:apache:cassandra:metrics:table:tombstonescannedhistogram:99thpercentile"} > 1000'
      for: 0m
      labels:
        severity: critical
      annotations:
-        summary: Cassandra tombstone dump (instance {{ $labels.instance }})
+        summary: Cassandra tombstone dump (Criteo) (instance {{ $labels.instance }})
        description: "Too much tombstones scanned in queries\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: CassandraClientRequestUnavailableWrite
+    - alert: CassandraClientRequestUnavailableWrite(criteo)
      expr: 'changes(cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:write:unavailables:count"}[1m]) > 0'
      for: 0m
      labels:
        severity: critical
      annotations:
-        summary: Cassandra client request unavailable write (instance {{ $labels.instance }})
+        summary: Cassandra client request unavailable write (Criteo) (instance {{ $labels.instance }})
        description: "Write failures have occurred because too many nodes are unavailable\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: CassandraClientRequestUnavailableRead
+    - alert: CassandraClientRequestUnavailableRead(criteo)
      expr: 'changes(cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:read:unavailables:count"}[1m]) > 0'
      for: 0m
      labels:
        severity: critical
      annotations:
-        summary: Cassandra client request unavailable read (instance {{ $labels.instance }})
+        summary: Cassandra client request unavailable read (Criteo) (instance {{ $labels.instance }})
        description: "Read failures have occurred because too many nodes are unavailable\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: CassandraClientRequestWriteFailure
+    - alert: CassandraClientRequestWriteFailure(criteo)
      expr: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:write:failures:oneminuterate"} > 0'
      for: 0m
      labels:
        severity: critical
      annotations:
-        summary: Cassandra client request write failure (instance {{ $labels.instance }})
+        summary: Cassandra client request write failure (Criteo) (instance {{ $labels.instance }})
        description: "A lot of write failures encountered. A write failure is a non-timeout exception encountered during a write request. Examine the reason map to find to the root cause. The most common cause for this type of error is when batch sizes are too large.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: CassandraClientRequestReadFailure
+    - alert: CassandraClientRequestReadFailure(criteo)
      expr: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:read:failures:oneminuterate"} > 0'
      for: 0m
      labels:
        severity: critical
      annotations:
-        summary: Cassandra client request read failure (instance {{ $labels.instance }})
+        summary: Cassandra client request read failure (Criteo) (instance {{ $labels.instance }})
        description: "A lot of read failures encountered. A read failure is a non-timeout exception encountered during a read request. Examine the reason map to find to the root cause. The most common cause for this type of error is when batch sizes are too large.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CassandraCacheHitRateKeyCache
--- a/dist/rules/cassandra/instaclustr-cassandra-exporter.yml
+++ b/dist/rules/cassandra/instaclustr-cassandra-exporter.yml
@@ -24,92 +24,92 @@ groups:
        summary: Cassandra many compaction tasks are pending (instance {{ $labels.instance }})
        description: "Many Cassandra compaction tasks are pending - {{ $labels.cassandra_cluster }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: CassandraCommitlogPendingTasks
+    - alert: CassandraCommitlogPendingTasks(instaclustr)
      expr: 'cassandra_commit_log_pending_tasks > 15'
      for: 2m
      labels:
        severity: warning
      annotations:
-        summary: Cassandra commitlog pending tasks (instance {{ $labels.instance }})
+        summary: Cassandra commitlog pending tasks (Instaclustr) (instance {{ $labels.instance }})
        description: "Cassandra commitlog pending tasks - {{ $labels.cassandra_cluster }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: CassandraCompactionExecutorBlockedTasks
+    - alert: CassandraCompactionExecutorBlockedTasks(instaclustr)
      expr: 'cassandra_thread_pool_blocked_tasks{pool="CompactionExecutor"} > 15'
      for: 2m
      labels:
        severity: warning
      annotations:
-        summary: Cassandra compaction executor blocked tasks (instance {{ $labels.instance }})
+        summary: Cassandra compaction executor blocked tasks (Instaclustr) (instance {{ $labels.instance }})
        description: "Some Cassandra compaction executor tasks are blocked - {{ $labels.cassandra_cluster }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: CassandraFlushWriterBlockedTasks
+    - alert: CassandraFlushWriterBlockedTasks(instaclustr)
      expr: 'cassandra_thread_pool_blocked_tasks{pool="MemtableFlushWriter"} > 15'
      for: 2m
      labels:
        severity: warning
      annotations:
-        summary: Cassandra flush writer blocked tasks (instance {{ $labels.instance }})
+        summary: Cassandra flush writer blocked tasks (Instaclustr) (instance {{ $labels.instance }})
        description: "Some Cassandra flush writer tasks are blocked - {{ $labels.cassandra_cluster }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: CassandraConnectionTimeoutsTotal
+    - alert: CassandraConnectionTimeoutsTotal(instaclustr)
      expr: 'sum by (cassandra_cluster,instance) (rate(cassandra_client_request_timeouts_total[5m])) > 5'
      for: 2m
      labels:
        severity: critical
      annotations:
-        summary: Cassandra connection timeouts total (instance {{ $labels.instance }})
+        summary: Cassandra connection timeouts total (Instaclustr) (instance {{ $labels.instance }})
        description: "Some connection between nodes are ending in timeout - {{ $labels.cassandra_cluster }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: CassandraStorageExceptions
+    - alert: CassandraStorageExceptions(instaclustr)
      expr: 'changes(cassandra_storage_exceptions_total[1m]) > 1'
      for: 0m
      labels:
        severity: critical
      annotations:
-        summary: Cassandra storage exceptions (instance {{ $labels.instance }})
+        summary: Cassandra storage exceptions (Instaclustr) (instance {{ $labels.instance }})
        description: "Something is going wrong with cassandra storage - {{ $labels.cassandra_cluster }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: CassandraTombstoneDump
+    - alert: CassandraTombstoneDump(instaclustr)
      expr: 'avg(cassandra_table_tombstones_scanned{quantile="0.99"}) by (instance,cassandra_cluster,keyspace) > 100'
      for: 2m
      labels:
        severity: critical
      annotations:
-        summary: Cassandra tombstone dump (instance {{ $labels.instance }})
+        summary: Cassandra tombstone dump (Instaclustr) (instance {{ $labels.instance }})
        description: "Cassandra tombstone dump - {{ $labels.cassandra_cluster }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: CassandraClientRequestUnavailableWrite
+    - alert: CassandraClientRequestUnavailableWrite(instaclustr)
      expr: 'changes(cassandra_client_request_unavailable_exceptions_total{operation="write"}[1m]) > 0'
      for: 2m
      labels:
        severity: critical
      annotations:
-        summary: Cassandra client request unavailable write (instance {{ $labels.instance }})
+        summary: Cassandra client request unavailable write (Instaclustr) (instance {{ $labels.instance }})
        description: "Some Cassandra client requests are unavailable to write - {{ $labels.cassandra_cluster }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: CassandraClientRequestUnavailableRead
+    - alert: CassandraClientRequestUnavailableRead(instaclustr)
      expr: 'changes(cassandra_client_request_unavailable_exceptions_total{operation="read"}[1m]) > 0'
      for: 2m
      labels:
        severity: critical
      annotations:
-        summary: Cassandra client request unavailable read (instance {{ $labels.instance }})
+        summary: Cassandra client request unavailable read (Instaclustr) (instance {{ $labels.instance }})
        description: "Some Cassandra client requests are unavailable to read - {{ $labels.cassandra_cluster }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: CassandraClientRequestWriteFailure
+    - alert: CassandraClientRequestWriteFailure(instaclustr)
      expr: 'increase(cassandra_client_request_failures_total{operation="write"}[1m]) > 0'
      for: 2m
      labels:
        severity: critical
      annotations:
-        summary: Cassandra client request write failure (instance {{ $labels.instance }})
+        summary: Cassandra client request write failure (Instaclustr) (instance {{ $labels.instance }})
        description: "Write failures have occurred, ensure there are not too many unavailable nodes - {{ $labels.cassandra_cluster }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: CassandraClientRequestReadFailure
+    - alert: CassandraClientRequestReadFailure(instaclustr)
      expr: 'increase(cassandra_client_request_failures_total{operation="read"}[1m]) > 0'
      for: 2m
      labels:
        severity: critical
      annotations:
-        summary: Cassandra client request read failure (instance {{ $labels.instance }})
+        summary: Cassandra client request read failure (Instaclustr) (instance {{ $labels.instance }})
        description: "Read failures have occurred, ensure there are not too many unavailable nodes - {{ $labels.cassandra_cluster }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
--- a/dist/rules/clickhouse/embedded-exporter.yml
+++ b/dist/rules/clickhouse/embedded-exporter.yml
@@ -117,7 +117,7 @@ groups:
        description: "ClickHouse is experiencing issues with ZooKeeper connections, which may affect cluster state and coordination.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ClickhouseAuthenticationFailures
-      expr: 'increase(ClickHouseErrorMetric_AUTHENTICATION_FAILED[5m]) > 0'
+      expr: 'increase(ClickHouseErrorMetric_AUTHENTICATION_FAILED[5m]) > 3'
      for: 0m
      labels:
        severity: info
@@ -126,7 +126,7 @@ groups:
        description: "Authentication failures detected, indicating potential security issues or misconfiguration.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ClickhouseAccessDeniedErrors
-      expr: 'increase(ClickHouseErrorMetric_RESOURCE_ACCESS_DENIED[5m]) > 0'
+      expr: 'increase(ClickHouseErrorMetric_RESOURCE_ACCESS_DENIED[5m]) > 3'
      for: 0m
      labels:
        severity: info
--- a/dist/rules/cortex/embedded-exporter.yml
+++ b/dist/rules/cortex/embedded-exporter.yml
@@ -31,7 +31,7 @@ groups:
        severity: critical
      annotations:
        summary: Cortex notification are being dropped (instance {{ $labels.instance }})
-        description: "Cortex notification are being dropped due to errors (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Cortex notification are being dropped due to errors (instance {{ $labels.instance }}, {{ $value | humanize }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Threshold of 0.05/s avoids firing on transient single-event spikes.
    - alert: CortexNotificationError
@@ -41,7 +41,7 @@ groups:
        severity: critical
      annotations:
        summary: Cortex notification error (instance {{ $labels.instance }})
-        description: "Cortex is failing when sending alert notifications (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Cortex is failing when sending alert notifications (instance {{ $labels.instance }}, {{ $value | humanize }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CortexIngesterUnhealthy
      expr: 'cortex_ring_members{state="Unhealthy", name="ingester"} > 0'
--- a/dist/rules/docker-containers/google-cadvisor.yml
+++ b/dist/rules/docker-containers/google-cadvisor.yml
@@ -33,7 +33,7 @@ groups:
        severity: warning
      annotations:
        summary: Container High CPU utilization (instance {{ $labels.instance }})
-        description: "Container CPU utilization is above 80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Container CPU utilization is above 80% (current: {{ $value | printf \"%.2f\" }}%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # See https://medium.com/faun/how-much-is-too-much-the-linux-oomkiller-and-used-memory-d32186f29c9d
    - alert: ContainerHighMemoryUsage
@@ -55,13 +55,13 @@ groups:
        description: "Container Volume usage is above 80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ContainerHighThrottleRate
-      expr: 'sum(increase(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) by (container, pod, namespace) / sum(increase(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > ( 25 / 100 ) and sum(increase(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > 0'
+      expr: 'sum(rate(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) by (container, pod, namespace) / sum(rate(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > ( 25 / 100 ) and sum(rate(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Container high throttle rate (instance {{ $labels.instance }})
-        description: "Container is being throttled\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Container is being throttled ({{ $value | humanizePercentage }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ContainerHighLowChangeCpuUsage
      expr: '(abs((sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[1m])) * 100) - (sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[1m] offset 1m)) * 100)) or abs((sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[1m])) * 100) - (sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[5m] offset 1m)) * 100))) > 25'
@@ -79,7 +79,7 @@ groups:
        severity: info
      annotations:
        summary: Container Low CPU utilization (instance {{ $labels.instance }})
-        description: "Container CPU utilization is under 20% for 1 week. Consider reducing the allocated CPU.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Container CPU utilization is under 20% for 1 week. Consider reducing the allocated CPU. (current: {{ $value | printf \"%.2f\" }}%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ContainerLowMemoryUsage
      expr: '(sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) < 20'
--- a/dist/rules/elasticsearch/prometheus-community-elasticsearch-exporter.yml
+++ b/dist/rules/elasticsearch/prometheus-community-elasticsearch-exporter.yml
@@ -143,13 +143,13 @@ groups:
        description: "No new documents for 10 min!\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ElasticsearchHighIndexingLatency
-      expr: 'increase(elasticsearch_indices_indexing_index_time_seconds_total[1m]) / increase(elasticsearch_indices_indexing_index_total[1m]) > 0.0005 and increase(elasticsearch_indices_indexing_index_total[1m]) > 0'
+      expr: 'rate(elasticsearch_indices_indexing_index_time_seconds_total[1m]) / rate(elasticsearch_indices_indexing_index_total[1m]) > 0.0005 and rate(elasticsearch_indices_indexing_index_total[1m]) > 0'
      for: 10m
      labels:
        severity: warning
      annotations:
        summary: Elasticsearch High Indexing Latency (instance {{ $labels.instance }})
-        description: "The indexing latency on Elasticsearch cluster is higher than the threshold.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "The indexing latency on Elasticsearch cluster is higher than the threshold (current value: {{ $value }}s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ElasticsearchHighIndexingRate
      expr: 'sum(rate(elasticsearch_indices_indexing_index_total[1m]))> 10000'
@@ -170,10 +170,10 @@ groups:
        description: "The query rate on Elasticsearch cluster is higher than the threshold.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ElasticsearchHighQueryLatency
-      expr: 'increase(elasticsearch_indices_search_query_time_seconds[1m]) / increase(elasticsearch_indices_search_query_total[1m]) > 1 and increase(elasticsearch_indices_search_query_total[1m]) > 0'
+      expr: 'rate(elasticsearch_indices_search_query_time_seconds[1m]) / rate(elasticsearch_indices_search_query_total[1m]) > 1 and rate(elasticsearch_indices_search_query_total[1m]) > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Elasticsearch High Query Latency (instance {{ $labels.instance }})
-        description: "The query latency on Elasticsearch cluster is higher than the threshold.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "The query latency on Elasticsearch cluster is higher than the threshold (current value: {{ $value }}s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
--- a/dist/rules/envoy/embedded-exporter.yml
+++ b/dist/rules/envoy/embedded-exporter.yml
@@ -42,13 +42,13 @@ groups:
        description: "More than 10% of downstream HTTP responses are 4xx on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: EnvoyDownstreamConnectionsOverflowing
-      expr: 'increase(envoy_listener_downstream_cx_overflow[5m]) > 0'
+      expr: 'increase(envoy_listener_downstream_cx_overflow[5m]) > 5'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Envoy downstream connections overflowing (instance {{ $labels.instance }})
-        description: "Downstream connections are being rejected due to listener overflow on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Downstream connections are being rejected due to listener overflow on {{ $labels.instance }} ({{ $value }} in the last 5m)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: EnvoyClusterMembershipEmpty
      expr: 'envoy_cluster_membership_healthy == 0'
@@ -75,10 +75,10 @@ groups:
        severity: warning
      annotations:
        summary: Envoy high cluster upstream connection failures (instance {{ $labels.instance }})
-        description: "High rate of upstream connection failures in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "High rate of upstream connection failures in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} ({{ $value }} in the last 5m)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: EnvoyHighClusterUpstreamRequestTimeoutRate
-      expr: 'increase(envoy_cluster_upstream_rq_timeout[5m]) / increase(envoy_cluster_upstream_rq_completed[5m]) * 100 > 5 and increase(envoy_cluster_upstream_rq_completed[5m]) > 0'
+      expr: 'rate(envoy_cluster_upstream_rq_timeout[5m]) / rate(envoy_cluster_upstream_rq_completed[5m]) * 100 > 5 and rate(envoy_cluster_upstream_rq_completed[5m]) > 0'
      for: 5m
      labels:
        severity: warning
@@ -87,7 +87,7 @@ groups:
        description: "More than 5% of upstream requests are timing out in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: EnvoyHighClusterUpstream5xxErrorRate
-      expr: 'increase(envoy_cluster_upstream_rq_xx{envoy_response_code_class="5"}[5m]) / increase(envoy_cluster_upstream_rq_completed[5m]) * 100 > 5 and increase(envoy_cluster_upstream_rq_completed[5m]) > 0'
+      expr: 'rate(envoy_cluster_upstream_rq_xx{envoy_response_code_class="5"}[5m]) / rate(envoy_cluster_upstream_rq_completed[5m]) * 100 > 5 and rate(envoy_cluster_upstream_rq_completed[5m]) > 0'
      for: 1m
      labels:
        severity: critical
@@ -102,7 +102,7 @@ groups:
        severity: warning
      annotations:
        summary: Envoy cluster health check failures (instance {{ $labels.instance }})
-        description: "Health checks are consistently failing in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Health checks are consistently failing in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} ({{ $value }} in the last 5m)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: EnvoyClusterOutlierDetectionEjectionsActive
      expr: 'envoy_cluster_outlier_detection_ejections_active > 0'
@@ -114,22 +114,22 @@ groups:
        description: "There are active outlier detection ejections in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: EnvoyListenerSslConnectionErrors
-      expr: 'increase(envoy_listener_ssl_connection_error[5m]) > 0'
+      expr: 'increase(envoy_listener_ssl_connection_error[5m]) > 5'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Envoy listener SSL connection errors (instance {{ $labels.instance }})
-        description: "Envoy listener is experiencing SSL/TLS connection errors on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Envoy listener is experiencing SSL/TLS connection errors on {{ $labels.instance }} ({{ $value }} in the last 5m)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: EnvoyGlobalDownstreamConnectionsOverflowing
-      expr: 'increase(envoy_listener_downstream_global_cx_overflow[5m]) > 0'
+      expr: 'increase(envoy_listener_downstream_global_cx_overflow[5m]) > 5'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Envoy global downstream connections overflowing (instance {{ $labels.instance }})
-        description: "Downstream connections are being rejected due to global connection limit on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Downstream connections are being rejected due to global connection limit on {{ $labels.instance }} ({{ $value }} in the last 5m)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: EnvoySslCertificateExpiringSoon
      expr: 'envoy_server_days_until_first_cert_expiring < 7'
@@ -165,7 +165,7 @@ groups:
        severity: critical
      annotations:
        summary: Envoy no healthy upstream (instance {{ $labels.instance }})
-        description: "Upstream connection attempts failed because no healthy upstream was available in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Upstream connection attempts failed because no healthy upstream was available in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} ({{ $value }} in the last 5m)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: EnvoyHighDownstreamRequestTimeoutRate
      expr: 'increase(envoy_http_downstream_rq_timeout[5m]) > 5'
@@ -174,4 +174,4 @@ groups:
        severity: warning
      annotations:
        summary: Envoy high downstream request timeout rate (instance {{ $labels.instance }})
-        description: "Downstream requests are timing out on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Downstream requests are timing out on {{ $labels.instance }} ({{ $value }} in the last 5m)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
--- a/dist/rules/etcd/embedded-exporter.yml
+++ b/dist/rules/etcd/embedded-exporter.yml
@@ -30,26 +30,26 @@ groups:
        severity: warning
      annotations:
        summary: Etcd high number of leader changes (instance {{ $labels.instance }})
-        description: "Etcd leader changed more than 2 times during 10 minutes\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Etcd leader changed {{ $value }} times during 10 minutes\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Filters to actual error codes. grpc_code!="OK" includes benign codes like NotFound, AlreadyExists, and Cancelled.
-    - alert: EtcdHighNumberOfFailedGrpcRequests
+    - alert: EtcdHighNumberOfFailedGrpcRequestsWarning
      expr: 'sum(rate(grpc_server_handled_total{grpc_code=~"Internal|Unavailable|DeadlineExceeded|ResourceExhausted|Aborted|Unknown"}[1m])) BY (grpc_service, grpc_method) / sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0.01 and sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0'
      for: 2m
      labels:
        severity: warning
      annotations:
-        summary: Etcd high number of failed GRPC requests (instance {{ $labels.instance }})
+        summary: Etcd high number of failed GRPC requests warning (instance {{ $labels.instance }})
        description: "More than 1% GRPC request failure detected in Etcd\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Filters to actual error codes. grpc_code!="OK" includes benign codes like NotFound, AlreadyExists, and Cancelled.
-    - alert: EtcdHighNumberOfFailedGrpcRequests
+    - alert: EtcdHighNumberOfFailedGrpcRequestsCritical
      expr: 'sum(rate(grpc_server_handled_total{grpc_code=~"Internal|Unavailable|DeadlineExceeded|ResourceExhausted|Aborted|Unknown"}[1m])) BY (grpc_service, grpc_method) / sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0.05 and sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0'
      for: 2m
      labels:
        severity: critical
      annotations:
-        summary: Etcd high number of failed GRPC requests (instance {{ $labels.instance }})
+        summary: Etcd high number of failed GRPC requests critical (instance {{ $labels.instance }})
        description: "More than 5% GRPC request failure detected in Etcd\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: EtcdGrpcRequestsSlow
@ -61,22 +61,22 @@ groups:
        summary: Etcd GRPC requests slow (instance {{ $labels.instance }})
        description: "GRPC requests slowing down, 99th percentile is over 0.15s\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: EtcdHighNumberOfFailedHttpRequests
+    - alert: EtcdHighNumberOfFailedHttpRequestsWarning
      expr: 'sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.01 and sum(rate(etcd_http_received_total[1m])) BY (method) > 0'
      for: 2m
      labels:
        severity: warning
      annotations:
-        summary: Etcd high number of failed HTTP requests (instance {{ $labels.instance }})
+        summary: Etcd high number of failed HTTP requests warning (instance {{ $labels.instance }})
        description: "More than 1% HTTP failure detected in Etcd\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: EtcdHighNumberOfFailedHttpRequests
+    - alert: EtcdHighNumberOfFailedHttpRequestsCritical
      expr: 'sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.05 and sum(rate(etcd_http_received_total[1m])) BY (method) > 0'
      for: 2m
      labels:
        severity: critical
      annotations:
-        summary: Etcd high number of failed HTTP requests (instance {{ $labels.instance }})
+        summary: Etcd high number of failed HTTP requests critical (instance {{ $labels.instance }})
        description: "More than 5% HTTP failure detected in Etcd\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: EtcdHttpRequestsSlow
@ -104,7 +104,7 @@ groups:
        severity: warning
      annotations:
        summary: Etcd high number of failed proposals (instance {{ $labels.instance }})
-        description: "Etcd server got more than 5 failed proposals past hour\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Etcd server got {{ $value }} failed proposals in the past hour\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: EtcdHighFsyncDurations
      expr: 'histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) > 0.5'
--- a/dist/rules/gitlab-ci/gitlab-built-in-exporter.yml
+++ b/dist/rules/gitlab-ci/gitlab-built-in-exporter.yml
@ -8,7 +8,7 @@ groups:
    # Queued connections indicate Puma workers are saturated.
    # Consider increasing puma['worker_processes'] or puma['max_threads'] in gitlab.rb.
    - alert: GitlabPumaHighQueuedConnections
-      expr: 'avg_over_time(puma_queued_connections[5m]) > 5'
+      expr: 'puma_queued_connections > 5'
      for: 5m
      labels:
        severity: warning
@ -85,7 +85,7 @@ groups:
        severity: warning
      annotations:
        summary: GitLab Sidekiq high job completion time (instance {{ $labels.instance }})
-        description: "GitLab Sidekiq job average completion time on {{ $labels.instance }} is above 5 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "GitLab Sidekiq job p95 completion time on {{ $labels.instance }} is above 5 minutes ({{ $value | humanizeDuration }}).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # This metric requires the emit_sidekiq_histogram_metrics feature flag to be enabled.
    # High queue latency means jobs are stuck waiting. Check Sidekiq concurrency and queue sizes.
--- a/dist/rules/grafana-mimir/embedded-exporter.yml
+++ b/dist/rules/grafana-mimir/embedded-exporter.yml
@ -105,14 +105,15 @@ groups:
        summary: Mimir ingested data too far in the future (instance {{ $labels.instance }})
        description: "Mimir ingester {{ $labels.job }} has ingested samples with timestamps more than 1 hour in the future.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # Threshold of 0.05/s avoids firing on transient single-event spikes.
    - alert: MimirStoreGatewayTooManyFailedOperations
-      expr: 'sum by (job) (rate(thanos_objstore_bucket_operation_failures_total[5m])) > 0'
+      expr: 'sum by (job) (rate(thanos_objstore_bucket_operation_failures_total[5m])) > 0.05'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Mimir store gateway too many failed operations (instance {{ $labels.instance }})
-        description: "Mimir store-gateway {{ $labels.job }} bucket operations are failing.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Mimir store-gateway {{ $labels.job }} bucket operations are failing ({{ $value | humanize }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: MimirRingMembersMismatch
      expr: 'max by (name, job) (sum by (name, job, instance) (cortex_ring_members)) != min by (name, job) (sum by (name, job, instance) (cortex_ring_members))'
@ -184,7 +185,7 @@ groups:
        severity: critical
      annotations:
        summary: Mimir ingester TSDB head compaction failed (instance {{ $labels.instance }})
-        description: "Mimir ingester {{ $labels.instance }} is failing to compact TSDB head.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Mimir ingester {{ $labels.instance }} is failing to compact TSDB head ({{ $value | humanize }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: MimirIngesterTsdbHeadTruncationFailed
      expr: 'rate(cortex_ingester_tsdb_head_truncations_failed_total[5m]) > 0'
@ -193,7 +194,7 @@ groups:
        severity: critical
      annotations:
        summary: Mimir ingester TSDB head truncation failed (instance {{ $labels.instance }})
-        description: "Mimir ingester {{ $labels.instance }} is failing to truncate TSDB head.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Mimir ingester {{ $labels.instance }} is failing to truncate TSDB head ({{ $value | humanize }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: MimirIngesterTsdbCheckpointCreationFailed
      expr: 'rate(cortex_ingester_tsdb_checkpoint_creations_failed_total[5m]) > 0'
@ -202,7 +203,7 @@ groups:
        severity: critical
      annotations:
        summary: Mimir ingester TSDB checkpoint creation failed (instance {{ $labels.instance }})
-        description: "Mimir ingester {{ $labels.instance }} is failing to create TSDB checkpoints.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Mimir ingester {{ $labels.instance }} is failing to create TSDB checkpoints ({{ $value | humanize }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: MimirIngesterTsdbCheckpointDeletionFailed
      expr: 'rate(cortex_ingester_tsdb_checkpoint_deletions_failed_total[5m]) > 0'
@ -211,7 +212,7 @@ groups:
        severity: critical
      annotations:
        summary: Mimir ingester TSDB checkpoint deletion failed (instance {{ $labels.instance }})
-        description: "Mimir ingester {{ $labels.instance }} is failing to delete TSDB checkpoints.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Mimir ingester {{ $labels.instance }} is failing to delete TSDB checkpoints ({{ $value | humanize }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: MimirIngesterTsdbWalTruncationFailed
      expr: 'rate(cortex_ingester_tsdb_wal_truncations_failed_total[5m]) > 0'
@ -220,7 +221,7 @@ groups:
        severity: warning
      annotations:
        summary: Mimir ingester TSDB WAL truncation failed (instance {{ $labels.instance }})
-        description: "Mimir ingester {{ $labels.instance }} is failing to truncate TSDB WAL.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Mimir ingester {{ $labels.instance }} is failing to truncate TSDB WAL ({{ $value | humanize }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: MimirIngesterTsdbWalWritesFailed
      expr: 'rate(cortex_ingester_tsdb_wal_writes_failed_total[1m]) > 0'
@ -229,7 +230,7 @@ groups:
        severity: critical
      annotations:
        summary: Mimir ingester TSDB WAL writes failed (instance {{ $labels.instance }})
-        description: "Mimir ingester {{ $labels.instance }} is failing to write to TSDB WAL.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Mimir ingester {{ $labels.instance }} is failing to write to TSDB WAL ({{ $value | humanize }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Threshold aligned with official Mimir mixin (30 minutes).
    - alert: MimirStoreGatewayHasNotSyncedBucket
@ -284,7 +285,7 @@ groups:
        severity: critical
      annotations:
        summary: Mimir compactor has consecutive failures (instance {{ $labels.instance }})
-        description: "Mimir compactor {{ $labels.instance }} has had 2+ compaction failures in the last 2 hours.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Mimir compactor {{ $labels.instance }} has had {{ $value }} compaction failures in the last 2 hours.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: MimirCompactorHasRunOutOfDiskSpace
      expr: 'increase(cortex_compactor_disk_out_of_space_errors_total[24h]) >= 1'
@ -312,7 +313,7 @@ groups:
        severity: warning
      annotations:
        summary: Mimir compactor skipped blocks (instance {{ $labels.instance }})
-        description: "Mimir compactor has found blocks that cannot be compacted (reason {{ $labels.reason }}).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Mimir compactor has found {{ $value }} blocks that cannot be compacted (reason {{ $labels.reason }}).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: MimirRulerTooManyFailedPushes
      expr: '100 * sum by (instance, job) (rate(cortex_ruler_write_requests_failed_total[5m])) / sum by (instance, job) (rate(cortex_ruler_write_requests_total[5m])) > 1 and sum by (instance, job) (rate(cortex_ruler_write_requests_total[5m])) > 0'
@ -341,14 +342,15 @@ groups:
        summary: Mimir ruler missed evaluations (instance {{ $labels.instance }})
        description: "Mimir ruler {{ $labels.instance }} is missing {{ printf \"%.2f\" $value }}% of rule group evaluations.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # Threshold of 0.05/s avoids firing on transient single-event spikes.
    - alert: MimirRulerFailedRingCheck
-      expr: 'sum by (job) (rate(cortex_ruler_ring_check_errors_total[5m])) > 0'
+      expr: 'sum by (job) (rate(cortex_ruler_ring_check_errors_total[5m])) > 0.05'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Mimir ruler failed ring check (instance {{ $labels.instance }})
-        description: "Mimir ruler {{ $labels.job }} is failing ring checks.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Mimir ruler {{ $labels.job }} is failing ring checks ({{ $value | humanize }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: MimirAlertmanagerSyncConfigsFailing
      expr: 'rate(cortex_alertmanager_sync_configs_failed_total[5m]) > 0'
@ -357,7 +359,7 @@ groups:
        severity: critical
      annotations:
        summary: Mimir alertmanager sync configs failing (instance {{ $labels.instance }})
-        description: "Mimir alertmanager {{ $labels.job }} is failing to sync configs.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Mimir alertmanager {{ $labels.job }} is failing to sync configs ({{ $value | humanize }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: MimirAlertmanagerRingCheckFailing
      expr: 'rate(cortex_alertmanager_ring_check_errors_total[5m]) > 0'
@ -366,7 +368,7 @@ groups:
        severity: critical
      annotations:
        summary: Mimir alertmanager ring check failing (instance {{ $labels.instance }})
-        description: "Mimir alertmanager {{ $labels.job }} is failing ring checks.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Mimir alertmanager {{ $labels.job }} is failing ring checks ({{ $value | humanize }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: MimirAlertmanagerStateMergeFailing
      expr: 'rate(cortex_alertmanager_partial_state_merges_failed_total[5m]) > 0'
@ -375,7 +377,7 @@ groups:
        severity: critical
      annotations:
        summary: Mimir alertmanager state merge failing (instance {{ $labels.instance }})
-        description: "Mimir alertmanager {{ $labels.job }} is failing to merge state updates.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Mimir alertmanager {{ $labels.job }} is failing to merge state updates ({{ $value | humanize }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: MimirAlertmanagerReplicationFailing
      expr: 'rate(cortex_alertmanager_state_replication_failed_total[5m]) > 0'
@ -384,7 +386,7 @@ groups:
        severity: critical
      annotations:
        summary: Mimir alertmanager replication failing (instance {{ $labels.instance }})
-        description: "Mimir alertmanager {{ $labels.job }} is failing to replicate state.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Mimir alertmanager {{ $labels.job }} is failing to replicate state ({{ $value | humanize }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: MimirAlertmanagerPersistStateFailing
      expr: 'rate(cortex_alertmanager_state_persist_failed_total[15m]) > 0'
@ -393,7 +395,7 @@ groups:
        severity: critical
      annotations:
        summary: Mimir alertmanager persist state failing (instance {{ $labels.instance }})
-        description: "Mimir alertmanager {{ $labels.job }} is failing to persist state.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Mimir alertmanager {{ $labels.job }} is failing to persist state ({{ $value | humanize }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: MimirAlertmanagerInitialSyncFailed
      expr: 'increase(cortex_alertmanager_state_initial_sync_completed_total{outcome="failed"}[1m]) > 0'
--- a/dist/rules/grafana-tempo/embedded-exporter.yml
+++ b/dist/rules/grafana-tempo/embedded-exporter.yml
@ -40,7 +40,7 @@ groups:
        severity: critical
      annotations:
        summary: Tempo compactions failing (instance {{ $labels.instance }})
-        description: "Greater than 2 compactions have failed in the past hour.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "{{ $value }} compactions have failed in the past hour.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: TempoPollsFailing
      expr: 'sum by (job) (increase(tempodb_blocklist_poll_errors_total[1h])) > 2 and sum by (job) (increase(tempodb_blocklist_poll_errors_total[5m])) > 0'
@ -49,7 +49,7 @@ groups:
        severity: critical
      annotations:
        summary: Tempo polls failing (instance {{ $labels.instance }})
-        description: "Greater than 2 blocklist polls have failed in the past hour.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "{{ $value }} blocklist polls have failed in the past hour.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: TempoTenantIndexFailures
      expr: 'sum by (job) (increase(tempodb_blocklist_tenant_index_errors_total[1h])) > 2 and sum by (job) (increase(tempodb_blocklist_tenant_index_errors_total[5m])) > 0'
@ -58,7 +58,7 @@ groups:
        severity: critical
      annotations:
        summary: Tempo tenant index failures (instance {{ $labels.instance }})
-        description: "Greater than 2 tenant index failures in the past hour.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "{{ $value }} tenant index failures in the past hour.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: TempoNoTenantIndexBuilders
      expr: 'sum by (tenant) (tempodb_blocklist_tenant_index_builder) == 0 and on() max(tempodb_blocklist_length) > 0'
@ -105,7 +105,7 @@ groups:
        severity: critical
      annotations:
        summary: Tempo user configurable overrides reload failing (instance {{ $labels.instance }})
-        description: "Greater than 5 user-configurable overrides reloads have failed in the past hour.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "{{ $value }} user-configurable overrides reloads have failed in the past hour.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Threshold of 100 blocks per compactor instance. Adjust based on your environment.
    - alert: TempoCompactionTooManyOutstandingBlocksWarning
@ -134,7 +134,7 @@ groups:
        severity: critical
      annotations:
        summary: Tempo distributor usage tracker errors (instance {{ $labels.instance }})
-        description: "Tempo distributor usage tracker errors for {{ $labels.job }} (reason {{ $labels.reason }}).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Tempo distributor usage tracker errors for {{ $labels.job }} at {{ $value | humanize }}/s (reason {{ $labels.reason }}).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: TempoMetricsGeneratorProcessorUpdatesFailing
      expr: 'sum by (job) (increase(tempo_metrics_generator_active_processors_update_failed_total[5m])) > 0'
@ -143,7 +143,7 @@ groups:
        severity: critical
      annotations:
        summary: Tempo metrics generator processor updates failing (instance {{ $labels.instance }})
-        description: "Tempo metrics generator processor updates are failing for {{ $labels.job }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Tempo metrics generator processor updates are failing for {{ $labels.job }} ({{ $value }} failures in 5m).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: TempoMetricsGeneratorServiceGraphsDroppingSpans
      expr: '100 * sum by (job) (rate(tempo_metrics_generator_processor_service_graphs_dropped_spans[5m])) / sum by (job) (rate(tempo_metrics_generator_spans_received_total[5m])) > 0.5 and sum by (job) (rate(tempo_metrics_generator_spans_received_total[5m])) > 0'
@ -161,7 +161,7 @@ groups:
        severity: critical
      annotations:
        summary: Tempo metrics generator collections failing (instance {{ $labels.instance }})
-        description: "Tempo metrics generator collections are failing for {{ $labels.job }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Tempo metrics generator collections are failing for {{ $labels.job }} ({{ $value }} failures in 5m).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Fires when the memcached error rate exceeds 20%. Only relevant if Tempo is configured with memcached caching.
    - alert: TempoMemcachedErrorsElevated
--- a/dist/rules/haproxy/embedded-exporter-v2.yml
+++ b/dist/rules/haproxy/embedded-exporter-v2.yml
@ -130,4 +130,4 @@ groups:
        severity: warning
      annotations:
        summary: HAProxy server healthcheck failure (instance {{ $labels.instance }})
-        description: "Some server healthcheck are failing on {{ $labels.server }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Some server healthcheck are failing on {{ $labels.server }} ({{ $value }} in the last 1m)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
--- a/dist/rules/haproxy/haproxy-exporter-v1.yml
+++ b/dist/rules/haproxy/haproxy-exporter-v1.yml
@ -14,71 +14,71 @@ groups:
        summary: HAProxy down (instance {{ $labels.instance }})
        description: "HAProxy down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: HaproxyHighHttp4xxErrorRateBackend
+    - alert: HaproxyHighHttp4xxErrorRateBackend(v1)
-      expr: 'sum by (backend) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 0'
+      expr: 'sum by (backend) (rate(haproxy_server_http_responses_total{code="4xx"}[1m]) * 100) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 0'
      for: 1m
      labels:
        severity: critical
      annotations:
-        summary: HAProxy high HTTP 4xx error rate backend (instance {{ $labels.instance }})
+        summary: HAProxy high HTTP 4xx error rate backend (v1) (instance {{ $labels.instance }})
        description: "Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: HaproxyHighHttp5xxErrorRateBackend
+    - alert: HaproxyHighHttp5xxErrorRateBackend(v1)
-      expr: 'sum by (backend) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 0'
+      expr: 'sum by (backend) (rate(haproxy_server_http_responses_total{code="5xx"}[1m]) * 100) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 0'
      for: 1m
      labels:
        severity: critical
      annotations:
-        summary: HAProxy high HTTP 5xx error rate backend (instance {{ $labels.instance }})
+        summary: HAProxy high HTTP 5xx error rate backend (v1) (instance {{ $labels.instance }})
        description: "Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: HaproxyHighHttp4xxErrorRateServer
+    - alert: HaproxyHighHttp4xxErrorRateServer(v1)
      expr: 'sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0'
      for: 1m
      labels:
        severity: critical
      annotations:
-        summary: HAProxy high HTTP 4xx error rate server (instance {{ $labels.instance }})
+        summary: HAProxy high HTTP 4xx error rate server (v1) (instance {{ $labels.instance }})
        description: "Too many HTTP requests with status 4xx (> 5%) on server {{ $labels.server }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: HaproxyHighHttp5xxErrorRateServer
+    - alert: HaproxyHighHttp5xxErrorRateServer(v1)
      expr: 'sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0'
      for: 1m
      labels:
        severity: critical
      annotations:
-        summary: HAProxy high HTTP 5xx error rate server (instance {{ $labels.instance }})
+        summary: HAProxy high HTTP 5xx error rate server (v1) (instance {{ $labels.instance }})
        description: "Too many HTTP requests with status 5xx (> 5%) on server {{ $labels.server }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: HaproxyServerResponseErrors
+    - alert: HaproxyServerResponseErrors(v1)
      expr: 'sum by (server) (rate(haproxy_server_response_errors_total[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0'
      for: 1m
      labels:
        severity: critical
      annotations:
-        summary: HAProxy server response errors (instance {{ $labels.instance }})
+        summary: HAProxy server response errors (v1) (instance {{ $labels.instance }})
        description: "Too many response errors to {{ $labels.server }} server (> 5%).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: HaproxyBackendConnectionErrors
+    - alert: HaproxyBackendConnectionErrors(v1)
      expr: 'sum by (backend) (rate(haproxy_backend_connection_errors_total[1m])) > 100'
      for: 1m
      labels:
        severity: critical
      annotations:
-        summary: HAProxy backend connection errors (instance {{ $labels.instance }})
+        summary: HAProxy backend connection errors (v1) (instance {{ $labels.instance }})
        description: "Too many connection errors to {{ $labels.fqdn }}/{{ $labels.backend }} backend (> 100 req/s). Request throughput may be too high.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: HaproxyServerConnectionErrors
+    - alert: HaproxyServerConnectionErrors(v1)
      expr: 'sum by (server) (rate(haproxy_server_connection_errors_total[1m])) > 100'
      for: 0m
      labels:
        severity: critical
      annotations:
-        summary: HAProxy server connection errors (instance {{ $labels.instance }})
+        summary: HAProxy server connection errors (v1) (instance {{ $labels.instance }})
        description: "Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HaproxyBackendMaxActiveSession
-      expr: '((sum by (backend) (avg_over_time(haproxy_backend_current_sessions[2m]) * 100) / sum by (backend) (avg_over_time(haproxy_backend_limit_sessions[2m])))) > 80 and sum by (backend) (avg_over_time(haproxy_backend_limit_sessions[2m])) > 0'
+      expr: '((sum by (backend) (haproxy_backend_current_sessions * 100) / sum by (backend) (haproxy_backend_limit_sessions))) > 80 and sum by (backend) (haproxy_backend_limit_sessions) > 0'
      for: 2m
      labels:
        severity: warning
@ -86,31 +86,31 @@ groups:
        summary: HAProxy backend max active session (instance {{ $labels.instance }})
        description: "HAproxy backend {{ $labels.fqdn }}/{{ $labels.backend }} is reaching session limit (> 80%).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: HaproxyPendingRequests
+    - alert: HaproxyPendingRequests(v1)
      expr: 'sum by (backend) (haproxy_backend_current_queue) > 0'
      for: 2m
      labels:
        severity: warning
      annotations:
-        summary: HAProxy pending requests (instance {{ $labels.instance }})
+        summary: HAProxy pending requests (v1) (instance {{ $labels.instance }})
        description: "Some HAProxy requests are pending on {{ $labels.fqdn }}/{{ $labels.backend }} backend\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: HaproxyHttpSlowingDown
+    - alert: HaproxyHttpSlowingDown(v1)
      expr: 'avg by (backend) (haproxy_backend_http_total_time_average_seconds) > 1'
      for: 1m
      labels:
        severity: warning
      annotations:
-        summary: HAProxy HTTP slowing down (instance {{ $labels.instance }})
+        summary: HAProxy HTTP slowing down (v1) (instance {{ $labels.instance }})
        description: "Average request time is increasing\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: HaproxyRetryHigh
+    - alert: HaproxyRetryHigh(v1)
      expr: 'sum by (backend) (rate(haproxy_backend_retry_warnings_total[1m])) > 10'
      for: 2m
      labels:
        severity: warning
      annotations:
-        summary: HAProxy retry high (instance {{ $labels.instance }})
+        summary: HAProxy retry high (v1) (instance {{ $labels.instance }})
        description: "High rate of retry on {{ $labels.fqdn }}/{{ $labels.backend }} backend\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HaproxyBackendDown
@ -131,20 +131,20 @@ groups:
        summary: HAProxy server down (instance {{ $labels.instance }})
        description: "HAProxy server is down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: HaproxyFrontendSecurityBlockedRequests
+    - alert: HaproxyFrontendSecurityBlockedRequests(v1)
      expr: 'sum by (frontend) (rate(haproxy_frontend_requests_denied_total[2m])) > 10'
      for: 2m
      labels:
        severity: warning
      annotations:
-        summary: HAProxy frontend security blocked requests (instance {{ $labels.instance }})
+        summary: HAProxy frontend security blocked requests (v1) (instance {{ $labels.instance }})
        description: "HAProxy is blocking requests for security reason\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: HaproxyServerHealthcheckFailure
+    - alert: HaproxyServerHealthcheckFailure(v1)
      expr: 'increase(haproxy_server_check_failures_total[1m]) > 0'
      for: 1m
      labels:
        severity: warning
      annotations:
-        summary: HAProxy server healthcheck failure (instance {{ $labels.instance }})
-        description: "Some server healthcheck are failing on {{ $labels.server }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        summary: HAProxy server healthcheck failure (v1) (instance {{ $labels.instance }})
+        description: "Some server healthcheck are failing on {{ $labels.server }} ({{ $value }} in the last 1m)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
--- a/dist/rules/host-and-hardware/node-exporter.yml
+++ b/dist/rules/host-and-hardware/node-exporter.yml
@ -21,7 +21,7 @@ groups:
        severity: warning
      annotations:
        summary: Host memory under memory pressure (instance {{ $labels.instance }})
-        description: "The node is under heavy memory pressure. High rate of loading memory pages from disk.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "The node is under heavy memory pressure. High rate of major page faults ({{ $value }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
    - alert: HostMemoryIsUnderutilized
--- a/dist/rules/juniper/czerwonk-junos-exporter.yml
+++ b/dist/rules/juniper/czerwonk-junos-exporter.yml
@ -14,20 +14,20 @@ groups:
        summary: Juniper switch down (instance {{ $labels.instance }})
        description: "The switch appears to be down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: JuniperHighBandwidthUsage1gib
+    - alert: JuniperCriticalBandwidthUsage1gib
      expr: 'rate(junos_interface_transmit_bytes[1m]) * 8 > 1e+9 * 0.90'
      for: 1m
      labels:
        severity: critical
      annotations:
-        summary: Juniper high Bandwidth Usage 1GiB (instance {{ $labels.instance }})
+        summary: Juniper critical Bandwidth Usage 1GiB (instance {{ $labels.instance }})
-        description: "Interface is highly saturated. (> 0.90GiB/s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Interface is highly saturated. (> 0.90 Gbit/s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: JuniperHighBandwidthUsage1gib
+    - alert: JuniperWarningBandwidthUsage1gib
      expr: 'rate(junos_interface_transmit_bytes[1m]) * 8 > 1e+9 * 0.80'
      for: 1m
      labels:
        severity: warning
      annotations:
-        summary: Juniper high Bandwidth Usage 1GiB (instance {{ $labels.instance }})
+        summary: Juniper warning Bandwidth Usage 1GiB (instance {{ $labels.instance }})
-        description: "Interface is getting saturated. (> 0.80GiB/s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Interface is getting saturated. (> 0.80 Gbit/s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
--- a/dist/rules/kubernetes/kubestate-exporter.yml
+++ b/dist/rules/kubernetes/kubestate-exporter.yml
@ -11,7 +11,7 @@ groups:
      labels:
        severity: critical
      annotations:
-        summary: Kubernetes Node ready (node {{ $labels.node }})
+        summary: Kubernetes Node not ready (instance {{ $labels.instance }})
        description: "Node {{ $labels.node }} has been unready for a long time\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Kubernetes Node with disabled schedules are fine.
@ -22,7 +22,7 @@ groups:
      labels:
        severity: warning
      annotations:
-        summary: Kubernetes node scheduling disabled (node {{ $labels.node }})
+        summary: Kubernetes Node scheduling disabled (instance {{ $labels.instance }})
        description: "Node {{ $labels.node }} has been marked as unschedulable for more than 30 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: KubernetesNodeMemoryPressure
@ -31,7 +31,7 @@ groups:
      labels:
        severity: critical
      annotations:
-        summary: Kubernetes memory pressure (node {{ $labels.node }})
+        summary: Kubernetes Node memory pressure (instance {{ $labels.instance }})
        description: "Node {{ $labels.node }} has MemoryPressure condition\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: KubernetesNodeDiskPressure
@ -40,7 +40,7 @@ groups:
      labels:
        severity: critical
      annotations:
-        summary: Kubernetes disk pressure (node {{ $labels.node }})
+        summary: Kubernetes Node disk pressure (instance {{ $labels.instance }})
        description: "Node {{ $labels.node }} has DiskPressure condition\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: KubernetesNodeNetworkUnavailable
@ -67,7 +67,7 @@ groups:
      labels:
        severity: warning
      annotations:
-        summary: Kubernetes container oom killer ({{ $labels.namespace }}/{{ $labels.pod }}:{{ $labels.container }})
+        summary: Kubernetes Container oom killer (instance {{ $labels.instance }})
        description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: KubernetesJobFailed
@ -76,7 +76,7 @@ groups:
      labels:
        severity: warning
      annotations:
-        summary: Kubernetes Job failed ({{ $labels.namespace }}/{{ $labels.job_name }})
+        summary: Kubernetes Job failed (instance {{ $labels.instance }})
        description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: KubernetesJobNotStarting
@ -85,7 +85,7 @@ groups:
      labels:
        severity: warning
      annotations:
-        summary: Kubernetes Job not starting ({{ $labels.namespace }}/{{ $labels.job_name }})
+        summary: Kubernetes Job not starting (instance {{ $labels.instance }})
        description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} did not start for 10 minutes\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: KubernetesCronjobFailing
@ -94,7 +94,7 @@ groups:
      labels:
        severity: critical
      annotations:
-        summary: Kubernetes CronJob failing ({{ $labels.namespace }}/{{ $labels.cronjob }})
+        summary: Kubernetes CronJob failing (instance {{ $labels.instance }})
        description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is failing\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: KubernetesCronjobSuspended
@ -103,7 +103,7 @@ groups:
      labels:
        severity: warning
      annotations:
-        summary: Kubernetes CronJob suspended ({{ $labels.namespace }}/{{ $labels.cronjob }})
+        summary: Kubernetes CronJob suspended (instance {{ $labels.instance }})
        description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is suspended\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: KubernetesPersistentvolumeclaimPending
@ -112,7 +112,7 @@ groups:
      labels:
        severity: warning
      annotations:
-        summary: Kubernetes PersistentVolumeClaim pending ({{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }})
+        summary: Kubernetes PersistentVolumeClaim pending (instance {{ $labels.instance }})
        description: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pending\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: KubernetesVolumeOutOfDiskSpace
@ -139,7 +139,7 @@ groups:
      labels:
        severity: critical
      annotations:
-        summary: Kubernetes PersistentVolumeClaim pending ({{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }})
+        summary: Kubernetes PersistentVolume error (instance {{ $labels.instance }})
        description: "Persistent volume {{ $labels.persistentvolume }} is in bad state\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: KubernetesStatefulsetDown
@ -148,7 +148,7 @@ groups:
      labels:
        severity: critical
      annotations:
-        summary: Kubernetes StatefulSet down ({{ $labels.namespace }}/{{ $labels.statefulset }})
+        summary: Kubernetes StatefulSet down (instance {{ $labels.instance }})
        description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} went down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: KubernetesHpaScaleInability
@ -193,7 +193,7 @@ groups:
      labels:
        severity: critical
      annotations:
-        summary: Kubernetes Pod not healthy ({{ $labels.namespace }}/{{ $labels.pod }})
+        summary: Kubernetes Pod not healthy (instance {{ $labels.instance }})
        description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-running state for longer than 15 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: KubernetesPodCrashLooping
@ -202,7 +202,7 @@ groups:
      labels:
        severity: warning
      annotations:
-        summary: Kubernetes pod crash looping ({{ $labels.namespace }}/{{ $labels.pod }})
+        summary: Kubernetes Pod crash looping (instance {{ $labels.instance }})
        description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: KubernetesReplicasetReplicasMismatch
@ -211,7 +211,7 @@ groups:
      labels:
        severity: warning
      annotations:
-        summary: Kubernetes ReplicasSet mismatch ({{ $labels.namespace }}/{{ $labels.replicaset }})
+        summary: Kubernetes ReplicaSet replicas mismatch (instance {{ $labels.instance }})
        description: "ReplicaSet {{ $labels.namespace }}/{{ $labels.replicaset }} replicas mismatch\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: KubernetesDeploymentReplicasMismatch
@ -220,7 +220,7 @@ groups:
      labels:
        severity: warning
      annotations:
-        summary: Kubernetes Deployment replicas mismatch ({{ $labels.namespace }}/{{ $labels.deployment }})
+        summary: Kubernetes Deployment replicas mismatch (instance {{ $labels.instance }})
        description: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replicas mismatch\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: KubernetesStatefulsetReplicasMismatch
@ -238,7 +238,7 @@ groups:
      labels:
        severity: critical
      annotations:
-        summary: Kubernetes Deployment generation mismatch ({{ $labels.namespace }}/{{ $labels.deployment }})
+        summary: Kubernetes Deployment generation mismatch (instance {{ $labels.instance }})
        description: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has failed but has not been rolled back.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: KubernetesStatefulsetGenerationMismatch
@ -247,7 +247,7 @@ groups:
      labels:
        severity: critical
      annotations:
-        summary: Kubernetes StatefulSet generation mismatch ({{ $labels.namespace }}/{{ $labels.statefulset }})
+        summary: Kubernetes StatefulSet generation mismatch (instance {{ $labels.instance }})
        description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has failed but has not been rolled back.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: KubernetesStatefulsetUpdateNotRolledOut
@ -256,7 +256,7 @@ groups:
      labels:
        severity: warning
      annotations:
-        summary: Kubernetes StatefulSet update not rolled out ({{ $labels.namespace }}/{{ $labels.statefulset }})
+        summary: Kubernetes StatefulSet update not rolled out (instance {{ $labels.instance }})
        description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: KubernetesDaemonsetRolloutStuck
@ -265,7 +265,7 @@ groups:
      labels:
        severity: warning
      annotations:
-        summary: Kubernetes DaemonSet rollout stuck ({{ $labels.namespace }}/{{ $labels.daemonset }})
+        summary: Kubernetes DaemonSet rollout stuck (instance {{ $labels.instance }})
        description: "Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled or not ready\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: KubernetesDaemonsetMisscheduled
@ -274,7 +274,7 @@ groups:
      labels:
        severity: critical
      annotations:
-        summary: Kubernetes DaemonSet misscheduled ({{ $labels.namespace }}/{{ $labels.daemonset }})
+        summary: Kubernetes DaemonSet misscheduled (instance {{ $labels.instance }})
        description: "Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Threshold should be customized for each cronjob name.
@ -284,7 +284,7 @@ groups:
      labels:
        severity: warning
      annotations:
-        summary: Kubernetes CronJob too long ({{ $labels.namespace }}/{{ $labels.cronjob }})
+        summary: Kubernetes CronJob too long (instance {{ $labels.instance }})
        description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: KubernetesJobSlowCompletion
@ -293,7 +293,7 @@ groups:
      labels:
        severity: critical
      annotations:
-        summary: Kubernetes job slow completion ({{ $labels.namespace }}/{{ $labels.job_name }})
+        summary: Kubernetes Job slow completion (instance {{ $labels.instance }})
        description: "Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} did not complete in time.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: KubernetesApiServerErrors
@ -303,7 +303,7 @@ groups:
        severity: critical
      annotations:
        summary: Kubernetes API server errors (instance {{ $labels.instance }})
-        description: "Kubernetes API server is experiencing high error rate\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Kubernetes API server is experiencing {{ $value | humanize }}% error rate\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: KubernetesApiClientErrors
      expr: '(sum(rate(rest_client_requests_total{code=~"(4|5).."}[1m])) by (instance, job) / sum(rate(rest_client_requests_total[1m])) by (instance, job)) * 100 > 1 and sum(rate(rest_client_requests_total[1m])) by (instance, job) > 0'
@ -312,7 +312,7 @@ groups:
        severity: critical
      annotations:
        summary: Kubernetes API client errors (instance {{ $labels.instance }})
-        description: "Kubernetes API client is experiencing high error rate\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Kubernetes API client is experiencing {{ $value | humanize }}% error rate\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: KubernetesClientCertificateExpiresNextWeek
      expr: 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60'
--- a/dist/rules/loki/embedded-exporter.yml
+++ b/dist/rules/loki/embedded-exporter.yml
@ -21,7 +21,7 @@ groups:
        severity: critical
      annotations:
        summary: Loki request errors (instance {{ $labels.instance }})
-        description: "The {{ $labels.job }} and {{ $labels.route }} are experiencing errors\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "The {{ $labels.job }} and {{ $labels.route }} are experiencing {{ printf \"%.2f\" $value }}% errors.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: LokiRequestPanic
      expr: 'sum(increase(loki_panic_total[10m])) by (namespace, job) > 0'
--- a/dist/rules/mongodb/dcu-mongodb-exporter.yml
+++ b/dist/rules/mongodb/dcu-mongodb-exporter.yml
@ -5,13 +5,13 @@ groups:
  
  rules:

-    - alert: MongodbReplicationLag
+    - alert: MongodbReplicationLag(dcu)
      expr: 'avg(mongodb_replset_member_optime_date{state="PRIMARY"}) - avg(mongodb_replset_member_optime_date{state="SECONDARY"}) > 10'
      for: 0m
      labels:
        severity: critical
      annotations:
-        summary: MongoDB replication lag (instance {{ $labels.instance }})
+        summary: MongoDB replication lag (DCU) (instance {{ $labels.instance }})
        description: "Mongodb replication lag is more than 10s\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: MongodbReplicationStatus3
@ -59,29 +59,29 @@ groups:
        summary: MongoDB replication Status 10 (instance {{ $labels.instance }})
        description: "MongoDB Replication set member was once in a replica set but was subsequently removed\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: MongodbNumberCursorsOpen
+    - alert: MongodbNumberCursorsOpen(dcu)
      expr: 'mongodb_metrics_cursor_open{state="total_open"} > 10000'
      for: 2m
      labels:
        severity: warning
      annotations:
-        summary: MongoDB number cursors open (instance {{ $labels.instance }})
+        summary: MongoDB number cursors open (DCU) (instance {{ $labels.instance }})
        description: "Too many cursors opened by MongoDB for clients (> 10k)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: MongodbCursorsTimeouts
+    - alert: MongodbCursorsTimeouts(dcu)
      expr: 'increase(mongodb_metrics_cursor_timed_out_total[1m]) > 100'
      for: 2m
      labels:
        severity: warning
      annotations:
-        summary: MongoDB cursors timeouts (instance {{ $labels.instance }})
-        description: "Too many cursors are timing out\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        summary: MongoDB cursors timeouts (DCU) (instance {{ $labels.instance }})
+        description: "Too many cursors are timing out ({{ $value }} in the last minute)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: MongodbTooManyConnections
+    - alert: MongodbTooManyConnections(dcu)
      expr: 'mongodb_connections{state="current"} / (mongodb_connections{state="current"} + mongodb_connections{state="available"}) * 100 > 80 and (mongodb_connections{state="current"} + mongodb_connections{state="available"}) > 0'
      for: 2m
      labels:
        severity: warning
      annotations:
-        summary: MongoDB too many connections (instance {{ $labels.instance }})
+        summary: MongoDB too many connections (DCU) (instance {{ $labels.instance }})
        description: "Too many connections (> 80%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
--- a/dist/rules/mongodb/percona-mongodb-exporter.yml
+++ b/dist/rules/mongodb/percona-mongodb-exporter.yml
@ -25,13 +25,13 @@ groups:
        summary: Mongodb replica member unhealthy (instance {{ $labels.instance }})
        description: "MongoDB replica member is not healthy\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: MongodbReplicationLag
+    - alert: MongodbReplicationLag(percona)
      expr: '(mongodb_rs_members_optimeDate{member_state="PRIMARY"} - on (set) group_right mongodb_rs_members_optimeDate{member_state="SECONDARY"}) / 1000 > 10'
      for: 0m
      labels:
        severity: critical
      annotations:
-        summary: MongoDB replication lag (instance {{ $labels.instance }})
+        summary: MongoDB replication lag (Percona) (instance {{ $labels.instance }})
        description: "Mongodb replication lag is more than 10s\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # This query mixes old (mongodb_mongod_*) and new (mongodb_rs_*) metric names. It requires the Percona exporter to run with --compatible-mode to expose both.
@ -44,29 +44,29 @@ groups:
        summary: MongoDB replication headroom (instance {{ $labels.instance }})
        description: "MongoDB replication headroom is <= 0\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: MongodbNumberCursorsOpen
+    - alert: MongodbNumberCursorsOpen(percona)
      expr: 'mongodb_ss_metrics_cursor_open{csr_type="total"} > 10 * 1000'
      for: 2m
      labels:
        severity: warning
      annotations:
-        summary: MongoDB number cursors open (instance {{ $labels.instance }})
+        summary: MongoDB number cursors open (Percona) (instance {{ $labels.instance }})
        description: "Too many cursors opened by MongoDB for clients (> 10k)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: MongodbCursorsTimeouts
+    - alert: MongodbCursorsTimeouts(percona)
      expr: 'increase(mongodb_ss_metrics_cursor_timedOut[1m]) > 100'
      for: 2m
      labels:
        severity: warning
      annotations:
-        summary: MongoDB cursors timeouts (instance {{ $labels.instance }})
-        description: "Too many cursors are timing out\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        summary: MongoDB cursors timeouts (Percona) (instance {{ $labels.instance }})
+        description: "Too many cursors are timing out ({{ $value }} in the last minute)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: MongodbTooManyConnections
+    - alert: MongodbTooManyConnections(percona)
      expr: 'mongodb_ss_connections{conn_type="current"} / (mongodb_ss_connections{conn_type="current"} + mongodb_ss_connections{conn_type="available"}) * 100 > 80 and (mongodb_ss_connections{conn_type="current"} + mongodb_ss_connections{conn_type="available"}) > 0'
      for: 2m
      labels:
        severity: warning
      annotations:
-        summary: MongoDB too many connections (instance {{ $labels.instance }})
+        summary: MongoDB too many connections (Percona) (instance {{ $labels.instance }})
        description: "Too many connections (> 80%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
--- a/dist/rules/mysql/mysqld-exporter.yml
+++ b/dist/rules/mysql/mysqld-exporter.yml
@ -78,7 +78,7 @@ groups:
        severity: warning
      annotations:
        summary: MySQL slow queries (instance {{ $labels.instance }})
-        description: "MySQL server mysql has some new slow query.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "MySQL server has new slow queries ({{ $value }} in the last minute).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: MysqlInnodbLogWaits
      expr: 'rate(mysql_global_status_innodb_log_waits[15m]) > 10'
@ -87,7 +87,7 @@ groups:
        severity: warning
      annotations:
        summary: MySQL InnoDB log waits (instance {{ $labels.instance }})
-        description: "MySQL innodb log writes stalling\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "MySQL innodb log writes stalling ({{ $value }} waits/s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: MysqlRestarted
      expr: 'mysql_global_status_uptime < 60'
--- a/dist/rules/nats/nats-exporter.yml
+++ b/dist/rules/nats/nats-exporter.yml
@ -103,7 +103,7 @@ groups:
        severity: warning
      annotations:
        summary: Nats too many errors (instance {{ $labels.instance }})
-        description: "NATS server has encountered errors in the last 5 minutes\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "NATS server has encountered {{ $value }} JetStream API errors in the last 5 minutes\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: NatsJetstreamAccountsExceeded
      expr: 'sum(gnatsd_varz_jetstream_stats_accounts) > 100'
--- a/dist/rules/netdata/embedded-exporter.yml
+++ b/dist/rules/netdata/embedded-exporter.yml
@ -15,13 +15,13 @@ groups:
        summary: Netdata high cpu usage (instance {{ $labels.instance }})
        description: "Netdata high CPU usage (> 80%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: HostCpuStealNoisyNeighbor
+    - alert: NetdataCpuStealNoisyNeighbor
      expr: 'netdata_cpu_cpu_percentage_average{dimension="steal"} > 10'
      for: 5m
      labels:
        severity: warning
      annotations:
-        summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
+        summary: Netdata CPU steal noisy neighbor (instance {{ $labels.instance }})
        description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: NetdataHighMemoryUsage
@ -67,7 +67,7 @@ groups:
        severity: info
      annotations:
        summary: Netdata disk reallocated sectors (instance {{ $labels.instance }})
-        description: "Reallocated sectors on disk\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Disk reallocated sectors detected ({{ $value }} sectors)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: NetdataDiskCurrentPendingSector
      expr: 'netdata_smartd_log_current_pending_sector_count_sectors_average > 0'
@ -85,4 +85,4 @@ groups:
        severity: warning
      annotations:
        summary: Netdata reported uncorrectable disk sectors (instance {{ $labels.instance }})
-        description: "Reported uncorrectable disk sectors\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Reported uncorrectable disk sectors ({{ $value }} sectors)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
--- a/dist/rules/opentelemetry-collector/embedded-exporter.yml
+++ b/dist/rules/opentelemetry-collector/embedded-exporter.yml
@ -24,7 +24,7 @@ groups:
        severity: critical
      annotations:
        summary: OpenTelemetry Collector receiver refused spans (instance {{ $labels.instance }})
-        description: "OpenTelemetry Collector is refusing spans on {{ $labels.receiver }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "OpenTelemetry Collector is refusing {{ $value | humanize }}/s spans on {{ $labels.receiver }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: OpentelemetryCollectorReceiverRefusedMetricPoints
      expr: 'rate(otelcol_receiver_refused_metric_points[5m]) > 0'
@ -33,7 +33,7 @@ groups:
        severity: critical
      annotations:
        summary: OpenTelemetry Collector receiver refused metric points (instance {{ $labels.instance }})
-        description: "OpenTelemetry Collector is refusing metric points on {{ $labels.receiver }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "OpenTelemetry Collector is refusing {{ $value | humanize }}/s metric points on {{ $labels.receiver }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: OpentelemetryCollectorReceiverRefusedLogRecords
      expr: 'rate(otelcol_receiver_refused_log_records[5m]) > 0'
@ -42,34 +42,37 @@ groups:
        severity: critical
      annotations:
        summary: OpenTelemetry Collector receiver refused log records (instance {{ $labels.instance }})
-        description: "OpenTelemetry Collector is refusing log records on {{ $labels.receiver }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "OpenTelemetry Collector is refusing {{ $value | humanize }}/s log records on {{ $labels.receiver }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # Threshold of 0.05/s avoids firing on transient single-event spikes.
    - alert: OpentelemetryCollectorExporterFailedSpans
-      expr: 'rate(otelcol_exporter_send_failed_spans[5m]) > 0'
+      expr: 'rate(otelcol_exporter_send_failed_spans[5m]) > 0.05'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: OpenTelemetry Collector exporter failed spans (instance {{ $labels.instance }})
-        description: "OpenTelemetry Collector failing to send spans via {{ $labels.exporter }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "OpenTelemetry Collector failing to send {{ $value | humanize }}/s spans via {{ $labels.exporter }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # Threshold of 0.05/s avoids firing on transient single-event spikes.
    - alert: OpentelemetryCollectorExporterFailedMetricPoints
-      expr: 'rate(otelcol_exporter_send_failed_metric_points[5m]) > 0'
+      expr: 'rate(otelcol_exporter_send_failed_metric_points[5m]) > 0.05'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: OpenTelemetry Collector exporter failed metric points (instance {{ $labels.instance }})
-        description: "OpenTelemetry Collector failing to send metric points via {{ $labels.exporter }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "OpenTelemetry Collector failing to send {{ $value | humanize }}/s metric points via {{ $labels.exporter }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # Threshold of 0.05/s avoids firing on transient single-event spikes.
    - alert: OpentelemetryCollectorExporterFailedLogRecords
-      expr: 'rate(otelcol_exporter_send_failed_log_records[5m]) > 0'
+      expr: 'rate(otelcol_exporter_send_failed_log_records[5m]) > 0.05'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: OpenTelemetry Collector exporter failed log records (instance {{ $labels.instance }})
-        description: "OpenTelemetry Collector failing to send log records via {{ $labels.exporter }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "OpenTelemetry Collector failing to send {{ $value | humanize }}/s log records via {{ $labels.exporter }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: OpentelemetryCollectorExporterQueueNearlyFull
      expr: '(otelcol_exporter_queue_size / on(instance, job, exporter) otelcol_exporter_queue_capacity) > 0.8 and otelcol_exporter_queue_capacity > 0'
@ -80,23 +83,25 @@ groups:
        summary: OpenTelemetry Collector exporter queue nearly full (instance {{ $labels.instance }})
        description: "OpenTelemetry Collector exporter {{ $labels.exporter }} queue is over 80% full\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # Threshold of 0.05/s avoids firing on transient single-event spikes.
    - alert: OpentelemetryCollectorProcessorRefusedSpans
-      expr: 'rate(otelcol_processor_refused_spans[5m]) > 0'
+      expr: 'rate(otelcol_processor_refused_spans[5m]) > 0.05'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: OpenTelemetry Collector processor refused spans (instance {{ $labels.instance }})
-        description: "OpenTelemetry Collector processor {{ $labels.processor }} is refusing spans, likely due to backpressure\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "OpenTelemetry Collector processor {{ $labels.processor }} is refusing spans ({{ $value | humanize }}/s), likely due to backpressure.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # Threshold of 0.05/s avoids firing on transient single-event spikes.
    - alert: OpentelemetryCollectorProcessorRefusedMetricPoints
-      expr: 'rate(otelcol_processor_refused_metric_points[5m]) > 0'
+      expr: 'rate(otelcol_processor_refused_metric_points[5m]) > 0.05'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: OpenTelemetry Collector processor refused metric points (instance {{ $labels.instance }})
-        description: "OpenTelemetry Collector processor {{ $labels.processor }} is refusing metric points, likely due to backpressure\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "OpenTelemetry Collector processor {{ $labels.processor }} is refusing metric points ({{ $value | humanize }}/s), likely due to backpressure.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: OpentelemetryCollectorHighMemoryUsage
      expr: '(otelcol_process_runtime_heap_alloc_bytes{job=~".*otel.*collector.*"} / on(instance, job) otelcol_process_runtime_total_sys_memory_bytes{job=~".*otel.*collector.*"}) > 0.9'
--- a/dist/rules/php-fpm/bakins-fpm-exporter.yml
+++ b/dist/rules/php-fpm/bakins-fpm-exporter.yml
@ -6,10 +6,10 @@ groups:
  rules:

    - alert: Php-fpmMax-childrenReached
-      expr: 'sum(increase(phpfpm_max_children_reached_total[5m])) by (instance) > 0'
+      expr: 'sum(increase(phpfpm_max_children_reached_total[5m])) by (instance) > 3'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: PHP-FPM max-children reached (instance {{ $labels.instance }})
-        description: "PHP-FPM reached max children - {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "PHP-FPM reached max children on {{ $labels.instance }} ({{ $value }} times in the last 5m)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
--- a/dist/rules/postgresql/postgres-exporter.yml
+++ b/dist/rules/postgresql/postgres-exporter.yml
@ -76,7 +76,7 @@ groups:
        severity: warning
      annotations:
        summary: Postgresql dead locks (instance {{ $labels.instance }})
-        description: "PostgreSQL has dead-locks\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "PostgreSQL has dead-locks ({{ $value }} in the last minute)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: PostgresqlHighRollbackRate
      expr: 'sum by (namespace,datname) ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) / ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) + (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m])))) > 0.02'
--- a/dist/rules/prometheus-self-monitoring/embedded-exporter.yml
+++ b/dist/rules/prometheus-self-monitoring/embedded-exporter.yml
@ -149,7 +149,7 @@ groups:
        severity: critical
      annotations:
        summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})
-        description: "Alertmanager is failing sending notifications\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Alertmanager is failing to send notifications ({{ $value }} notifications/s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: PrometheusTargetEmpty
      expr: 'prometheus_sd_discovered_targets == 0'
@ -176,16 +176,16 @@ groups:
        severity: warning
      annotations:
        summary: Prometheus large scrape (instance {{ $labels.instance }})
-        description: "Prometheus has many scrapes that exceed the sample limit\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Prometheus has many scrapes that exceed the sample limit ({{ $value }} scrapes)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: PrometheusTargetScrapeDuplicate
-      expr: 'increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0'
+      expr: 'increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 3'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Prometheus target scrape duplicate (instance {{ $labels.instance }})
-        description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Prometheus has many samples rejected due to duplicate timestamps but different values ({{ $value }} samples)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: PrometheusTsdbCheckpointCreationFailures
      expr: 'increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0'
--- a/dist/rules/rabbitmq/kbudde-rabbitmq-exporter.yml
+++ b/dist/rules/rabbitmq/kbudde-rabbitmq-exporter.yml
@ -43,13 +43,13 @@ groups:
        summary: RabbitMQ out of memory (instance {{ $labels.instance }})
        description: "Memory available for RabbitMQ is low (< 10%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: RabbitmqTooManyConnections
+    - alert: RabbitmqInstanceTooManyConnections
      expr: 'rabbitmq_connectionsTotal > 1000'
      for: 2m
      labels:
        severity: warning
      annotations:
-        summary: RabbitMQ too many connections (instance {{ $labels.instance }})
+        summary: RabbitMQ instance too many connections (instance {{ $labels.instance }})
        description: "RabbitMQ instance has too many connections (> 1000)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Indicate the queue name in dedicated label.
--- a/dist/rules/rabbitmq/rabbitmq-exporter.yml
+++ b/dist/rules/rabbitmq/rabbitmq-exporter.yml
@ -95,4 +95,4 @@ groups:
        severity: warning
      annotations:
        summary: RabbitMQ unroutable messages (instance {{ $labels.instance }})
-        description: "A queue has unroutable messages\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "A queue has unroutable messages ({{ $value }} in the last 1m)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
--- a/dist/rules/systemd/systemd-exporter.yml
+++ b/dist/rules/systemd/systemd-exporter.yml
@ -49,7 +49,7 @@ groups:
        severity: warning
      annotations:
        summary: Systemd socket refused connections (instance {{ $labels.instance }})
-        description: "Systemd socket {{ $labels.name }} is refusing connections. (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Systemd socket {{ $labels.name }} is refusing connections. ({{ $value }} refused in the last 5m, instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Threshold of 100 connections is arbitrary. Adjust to your workload.
    - alert: SystemdSocketHighConnections
--- a/dist/rules/thanos/thanos-ruler.yml
+++ b/dist/rules/thanos/thanos-ruler.yml
@ -12,7 +12,7 @@ groups:
        severity: critical
      annotations:
        summary: Thanos Rule Queue Is Dropping Alerts (instance {{ $labels.instance }})
-        description: "Thanos Rule {{$labels.instance}} is failing to queue alerts.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Thanos Rule {{$labels.instance}} is failing to queue alerts ({{ $value | humanize }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ThanosRuleSenderIsFailingAlerts
      expr: 'sum by (job, instance) (rate(thanos_alert_sender_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0'
@ -21,7 +21,7 @@ groups:
        severity: critical
      annotations:
        summary: Thanos Rule Sender Is Failing Alerts (instance {{ $labels.instance }})
-        description: "Thanos Rule {{$labels.instance}} is failing to send alerts to alertmanager.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Thanos Rule {{$labels.instance}} is failing to send alerts to alertmanager ({{ $value | humanize }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ThanosRuleHighRuleEvaluationFailures
      expr: '(sum by (job, instance) (rate(prometheus_rule_evaluation_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) * 100 > 5) and sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) > 0'
@ -30,16 +30,17 @@ groups:
        severity: critical
      annotations:
        summary: Thanos Rule High Rule Evaluation Failures (instance {{ $labels.instance }})
-        description: "Thanos Rule {{$labels.instance}} is failing to evaluate rules.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Thanos Rule {{$labels.instance}} is failing to evaluate {{$value | humanize}}% of rules.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # Threshold of 0.05/s (about 15 warnings over the 5m rate window) avoids firing on transient single-event spikes.
    - alert: ThanosRuleHighRuleEvaluationWarnings
-      expr: 'sum by (job, instance) (rate(thanos_rule_evaluation_with_warnings_total{job=~".*thanos-rule.*"}[5m])) > 0'
+      expr: 'sum by (job, instance) (rate(thanos_rule_evaluation_with_warnings_total{job=~".*thanos-rule.*"}[5m])) > 0.05'
      for: 15m
      labels:
        severity: info
      annotations:
        summary: Thanos Rule High Rule Evaluation Warnings (instance {{ $labels.instance }})
-        description: "Thanos Rule {{$labels.instance}} has high number of evaluation warnings.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Thanos Rule {{$labels.instance}} has high number of evaluation warnings ({{ $value | humanize }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ThanosRuleRuleEvaluationLatencyHigh
      expr: '(sum by (job, instance, rule_group) (prometheus_rule_group_last_duration_seconds{job=~".*thanos-rule.*"}) > sum by (job, instance, rule_group) (prometheus_rule_group_interval_seconds{job=~".*thanos-rule.*"}))'
--- a/dist/rules/thanos/thanos-sidecar.yml
+++ b/dist/rules/thanos/thanos-sidecar.yml
@ -5,14 +5,15 @@ groups:
  
  rules:

+    # Threshold of 0.05/s (about 15 failed operations over the 5m rate window) avoids firing on transient single-event spikes.
    - alert: ThanosSidecarBucketOperationsFailed
-      expr: 'sum by (job, instance) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-sidecar.*"}[5m])) > 0'
+      expr: 'sum by (job, instance) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-sidecar.*"}[5m])) > 0.05'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Thanos Sidecar Bucket Operations Failed (instance {{ $labels.instance }})
-        description: "Thanos Sidecar {{$labels.instance}} bucket operations are failing\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Thanos Sidecar {{$labels.instance}} bucket operations are failing ({{ $value | humanize }}/s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ThanosSidecarNoConnectionToStartedPrometheus
      expr: 'thanos_sidecar_prometheus_up{job=~".*thanos-sidecar.*"} == 0 and on (namespace, pod)prometheus_tsdb_data_replay_duration_seconds != 0'