From 4fb1aa9ae4ca58f25ca96bca2ca405dbdf1e6a47 Mon Sep 17 00:00:00 2001
From: samber <samber@users.noreply.github.com>
Date: Wed, 18 Mar 2026 11:23:25 +0000
Subject: [PATCH] Publish

---
 dist/rules/apache-flink/flink-prometheus-reporter.yml  | 10 ++++++----
 dist/rules/apache-spark/spark-prometheus.yml           |  9 +++++----
 .../aws-cloudwatch/prometheus-cloudwatch-exporter.yml  |  4 ++--
 dist/rules/cert-manager/embedded-exporter.yml          |  1 +
 dist/rules/cilium/embedded-exporter.yml                |  4 ++++
 dist/rules/digitalocean/digitalocean-exporter.yml      |  6 +++---
 dist/rules/ebpf/ebpf-exporter.yml                      |  6 +++---
 dist/rules/envoy/embedded-exporter.yml                 |  4 ++--
 dist/rules/gitlab-ci/gitaly.yml                        |  5 +++--
 dist/rules/gitlab-ci/gitlab-built-in-exporter.yml      |  7 ++++---
 dist/rules/gitlab-ci/workhorse.yml                     |  2 +-
 dist/rules/golang/golang-exporter.yml                  |  4 ++--
 dist/rules/grafana-alloy/embedded-exporter.yml         |  4 ++--
 dist/rules/grafana-mimir/embedded-exporter.yml         |  8 +++++---
 dist/rules/grafana-tempo/embedded-exporter.yml         |  1 +
 dist/rules/graph-node/embedded-exporter.yml            | 10 +++++-----
 dist/rules/ipmi/ipmi-exporter.yml                      |  2 +-
 dist/rules/keycloak/aerogear-keycloak-metrics-spi.yml  |  4 ++--
 dist/rules/linkerd/embedded-exporter.yml               |  2 +-
 dist/rules/openstack/openstack-exporter.yml            |  2 +-
 .../oracle-database/iamseth-oracledb-exporter.yml      |  7 +++----
 dist/rules/process-exporter/process-exporter.yml       | 10 +++++-----
 dist/rules/proxmox-ve/prometheus-pve-exporter.yml      |  2 +-
 dist/rules/python/python-exporter.yml                  |  1 +
 dist/rules/ruby/ruby-exporter.yml                      |  1 +
 dist/rules/snmp/snmp-exporter.yml                      |  4 ++--
 dist/rules/spinnaker/embedded-exporter.yml             |  2 +-
 dist/rules/systemd/systemd-exporter.yml                |  6 +++---
 .../mindflavor-prometheus-wireguard-exporter.yml       |  1 +
 29 files changed, 72 insertions(+), 57 deletions(-)

diff --git a/dist/rules/apache-flink/flink-prometheus-reporter.yml b/dist/rules/apache-flink/flink-prometheus-reporter.yml
index ffceaf8..c7cb9cd 100644
--- a/dist/rules/apache-flink/flink-prometheus-reporter.yml
+++ b/dist/rules/apache-flink/flink-prometheus-reporter.yml
@@ -33,9 +33,10 @@ groups:
         summary: Flink all task slots used (instance {{ $labels.instance }})
         description: "All Flink task slots are in use ({{ $value }} available). New jobs cannot be scheduled.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
+    # A single restart may be normal during deployments. Adjust threshold based on restart tolerance.
     - alert: FlinkJobRestartIncreasing
-      expr: 'increase(flink_jobmanager_job_numRestarts[5m]) > 0'
-      for: 0m
+      expr: 'increase(flink_jobmanager_job_numRestarts[5m]) > 1'
+      for: 5m
       labels:
         severity: warning
       annotations:
@@ -43,14 +44,15 @@ groups:
         description: "Flink job {{ $labels.job_name }} has restarted {{ $value }} times in the last 5 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: FlinkCheckpointFailures
-      expr: 'increase(flink_jobmanager_job_numberOfFailedCheckpoints[10m]) > 0'
-      for: 0m
+      expr: 'increase(flink_jobmanager_job_numberOfFailedCheckpoints[10m]) > 1'
+      for: 5m
       labels:
         severity: warning
       annotations:
         summary: Flink checkpoint failures (instance {{ $labels.instance }})
         description: "Flink job {{ $labels.job_name }} has {{ $value }} failed checkpoints in the last 10 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
+    # Value is in milliseconds, but humanizeDuration expects seconds: divide by 1000 before humanizing, otherwise the rendered duration is overstated 1000x.
     # Threshold is 60 seconds. Adjust based on your checkpoint interval and state size.
     - alert: FlinkCheckpointDurationHigh
       expr: 'flink_jobmanager_job_lastCheckpointDuration > 60000'
diff --git a/dist/rules/apache-spark/spark-prometheus.yml b/dist/rules/apache-spark/spark-prometheus.yml
index 5571ed3..7d7ac12 100644
--- a/dist/rules/apache-spark/spark-prometheus.yml
+++ b/dist/rules/apache-spark/spark-prometheus.yml
@@ -51,7 +51,7 @@ groups:
     # Fires when more than 10% of executor time is spent in garbage collection.
     # This metric comes from the PrometheusResource endpoint (/metrics/executors/prometheus/).
     - alert: SparkExecutorHighGcTime
-      expr: 'metrics_executor_totalGCTime / (metrics_executor_totalDuration > 0) > 0.1'
+      expr: 'metrics_executor_totalGCTime_seconds_total / (metrics_executor_totalDuration_seconds_total > 0) > 0.1'
       for: 5m
       labels:
         severity: warning
@@ -60,7 +60,7 @@ groups:
         description: "Spark executor {{ $labels.executor_id }} in {{ $labels.application_name }} is spending too much time in GC.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: SparkExecutorAllTasksFailing
-      expr: 'metrics_executor_failedTasks > 0 and metrics_executor_completedTasks == 0'
+      expr: 'metrics_executor_failedTasks_total > 0 and metrics_executor_completedTasks_total == 0'
       for: 5m
       labels:
         severity: critical
@@ -69,7 +69,7 @@ groups:
         description: "Spark executor {{ $labels.executor_id }} has only failing tasks ({{ $value }} failed, 0 completed).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: SparkExecutorHighTaskFailureRate
-      expr: 'metrics_executor_failedTasks / (metrics_executor_totalTasks > 0) > 0.1'
+      expr: 'metrics_executor_failedTasks_total / (metrics_executor_totalTasks_total > 0) > 0.1'
       for: 5m
       labels:
         severity: warning
@@ -77,9 +77,10 @@ groups:
         summary: Spark executor high task failure rate (instance {{ $labels.instance }})
         description: "Spark executor {{ $labels.executor_id }} has a task failure rate above 10%.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
+    # diskUsed is a gauge, not a counter — do not use rate(). Threshold of 1GB is a rough default.
     # Disk spilling indicates insufficient memory for the workload.
     - alert: SparkExecutorHighDiskSpill
-      expr: 'rate(metrics_executor_diskUsed_bytes[5m]) > 0'
+      expr: 'metrics_executor_diskUsed_bytes > 1e9'
       for: 5m
       labels:
         severity: warning
diff --git a/dist/rules/aws-cloudwatch/prometheus-cloudwatch-exporter.yml b/dist/rules/aws-cloudwatch/prometheus-cloudwatch-exporter.yml
index dad2f44..0a258f3 100644
--- a/dist/rules/aws-cloudwatch/prometheus-cloudwatch-exporter.yml
+++ b/dist/rules/aws-cloudwatch/prometheus-cloudwatch-exporter.yml
@@ -112,7 +112,7 @@ groups:
 
     # Requires ApplicationELB HTTPCode_ELB_5XX_Count and RequestCount metrics.
     - alert: AwsAlbHigh5xxErrorRate
-      expr: '(aws_applicationelb_httpcode_elb_5_xx_count_sum / aws_applicationelb_request_count_sum) * 100 > 5'
+      expr: '(aws_applicationelb_httpcode_elb_5_xx_count_sum / aws_applicationelb_request_count_sum) * 100 > 5 and aws_applicationelb_request_count_sum > 0'
       for: 5m
       labels:
         severity: critical
@@ -132,7 +132,7 @@ groups:
 
     # Requires Lambda Errors and Invocations metrics.
     - alert: AwsLambdaHighErrorRate
-      expr: '(aws_lambda_errors_sum / aws_lambda_invocations_sum) * 100 > 5'
+      expr: '(aws_lambda_errors_sum / aws_lambda_invocations_sum) * 100 > 5 and aws_lambda_invocations_sum > 0'
       for: 5m
       labels:
         severity: warning
diff --git a/dist/rules/cert-manager/embedded-exporter.yml b/dist/rules/cert-manager/embedded-exporter.yml
index 60e6f34..71edcf9 100644
--- a/dist/rules/cert-manager/embedded-exporter.yml
+++ b/dist/rules/cert-manager/embedded-exporter.yml
@@ -33,6 +33,7 @@ groups:
         summary: Cert-Manager certificate not ready (instance {{ $labels.instance }})
         description: "The certificate {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is not ready to serve traffic.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
+    # In cert-manager 1.19+, the metric was renamed (dropped http_ prefix). Verify metric name against your version.
     - alert: Cert-managerHittingAcmeRateLimits
       expr: 'sum by (host) (rate(certmanager_http_acme_client_request_count{status="429"}[5m])) > 0'
       for: 5m
diff --git a/dist/rules/cilium/embedded-exporter.yml b/dist/rules/cilium/embedded-exporter.yml
index aa788c3..bd6f786 100644
--- a/dist/rules/cilium/embedded-exporter.yml
+++ b/dist/rules/cilium/embedded-exporter.yml
@@ -5,6 +5,7 @@ groups:
   
   rules:
 
+    # Metric name depends on Cilium version. Use cilium_unreachable_nodes (older) or cilium_node_connectivity_status (1.14+).
     - alert: CiliumAgentUnreachableNodes
       expr: 'sum(cilium_unreachable_nodes{}) by (pod) > 0'
       for: 15m
@@ -14,6 +15,7 @@ groups:
         summary: Cilium agent unreachable nodes (instance {{ $labels.instance }})
         description: "Cilium agent {{ $labels.pod }} cannot reach {{ $value }} node(s). Check network connectivity and node health.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
+    # Metric name depends on Cilium version. Use cilium_unreachable_health_endpoints (older) or cilium_node_connectivity_status (1.14+).
     - alert: CiliumAgentUnreachableHealthEndpoints
       expr: 'sum(cilium_unreachable_health_endpoints{}) by (pod) > 0'
       for: 15m
@@ -23,6 +25,7 @@ groups:
         summary: Cilium agent unreachable health endpoints (instance {{ $labels.instance }})
         description: "Cilium agent {{ $labels.pod }} cannot reach {{ $value }} health endpoint(s). Node-to-node health probes are failing.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
+    # Metric name depends on Cilium version. Use cilium_controllers_failing (older) or cilium_controllers_runs_total (1.14+).
     - alert: CiliumAgentFailingControllers
       expr: 'sum(cilium_controllers_failing{}) by (pod) > 0'
       for: 5m
@@ -198,6 +201,7 @@ groups:
         summary: Cilium operator low available IPAM IPs (instance {{ $labels.instance }})
         description: "Cilium operator IPAM IP pool is over 90% utilized. Allocate more IPs to avoid exhaustion.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
+    # Some Cilium versions may not have a status label on this metric. Verify against your Cilium version.
     - alert: CiliumOperatorIpamInterfaceCreationFailures
       expr: 'sum(rate(cilium_operator_ipam_interface_creation_ops{status!="success"}[5m])) by () > 0'
       for: 10m
diff --git a/dist/rules/digitalocean/digitalocean-exporter.yml b/dist/rules/digitalocean/digitalocean-exporter.yml
index 3b88156..662adf0 100644
--- a/dist/rules/digitalocean/digitalocean-exporter.yml
+++ b/dist/rules/digitalocean/digitalocean-exporter.yml
@@ -16,7 +16,7 @@ groups:
 
     - alert: DigitaloceanAccountNotActive
       expr: 'digitalocean_account_active != 1'
-      for: 0m
+      for: 5m
       labels:
         severity: critical
       annotations:
@@ -52,7 +52,7 @@ groups:
 
     - alert: DigitaloceanLoadBalancerNoBackends
       expr: 'digitalocean_loadbalancer_droplets == 0'
-      for: 0m
+      for: 1m
       labels:
         severity: warning
       annotations:
@@ -79,7 +79,7 @@ groups:
 
     - alert: DigitaloceanExporterCollectionErrors
       expr: 'increase(digitalocean_errors_total[5m]) > 0'
-      for: 0m
+      for: 5m
       labels:
         severity: warning
       annotations:
diff --git a/dist/rules/ebpf/ebpf-exporter.yml b/dist/rules/ebpf/ebpf-exporter.yml
index 432ca9d..9c27343 100644
--- a/dist/rules/ebpf/ebpf-exporter.yml
+++ b/dist/rules/ebpf/ebpf-exporter.yml
@@ -13,7 +13,7 @@ groups:
         severity: warning
       annotations:
         summary: eBPF exporter program not attached (instance {{ $labels.instance }})
-        description: "eBPF program {{ $labels.name }} failed to attach. The program is not collecting data. (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "eBPF program {{ $labels.id }} failed to attach. The program is not collecting data. (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: EbpfExporterDecoderErrors
       expr: 'rate(ebpf_exporter_decoder_errors_total[5m]) > 0'
@@ -22,10 +22,10 @@ groups:
         severity: warning
       annotations:
         summary: eBPF exporter decoder errors (instance {{ $labels.instance }})
-        description: "eBPF exporter is experiencing decoder errors for program {{ $labels.name }}. Kernel data is not being correctly transformed into labels. (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "eBPF exporter is experiencing decoder errors for config {{ $labels.config }}. Kernel data is not being correctly transformed into labels. (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: EbpfExporterNoEnabledConfigs
-      expr: 'ebpf_exporter_enabled_configs == 0'
+      expr: 'absent(ebpf_exporter_enabled_configs)'
       for: 5m
       labels:
         severity: warning
diff --git a/dist/rules/envoy/embedded-exporter.yml b/dist/rules/envoy/embedded-exporter.yml
index f489b0c..cb1138a 100644
--- a/dist/rules/envoy/embedded-exporter.yml
+++ b/dist/rules/envoy/embedded-exporter.yml
@@ -24,7 +24,7 @@ groups:
         description: "Envoy memory allocated is above 90% of heap size on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: EnvoyHighDownstreamHttp5xxErrorRate
-      expr: 'sum by (instance) (rate(envoy_http_downstream_rq_xx{envoy_response_code_class="5"}[5m])) / sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) * 100 > 5'
+      expr: 'sum by (instance) (rate(envoy_http_downstream_rq_xx{envoy_response_code_class="5"}[5m])) / sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) * 100 > 5 and sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) > 0'
       for: 1m
       labels:
         severity: critical
@@ -33,7 +33,7 @@ groups:
         description: "More than 5% of downstream HTTP responses are 5xx on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: EnvoyHighDownstreamHttp4xxErrorRate
-      expr: 'sum by (instance) (rate(envoy_http_downstream_rq_xx{envoy_response_code_class="4"}[5m])) / sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) * 100 > 10'
+      expr: 'sum by (instance) (rate(envoy_http_downstream_rq_xx{envoy_response_code_class="4"}[5m])) / sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) * 100 > 10 and sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) > 0'
       for: 5m
       labels:
         severity: warning
diff --git a/dist/rules/gitlab-ci/gitaly.yml b/dist/rules/gitlab-ci/gitaly.yml
index 208a3d1..adb6f9e 100644
--- a/dist/rules/gitlab-ci/gitaly.yml
+++ b/dist/rules/gitlab-ci/gitaly.yml
@@ -5,8 +5,9 @@ groups:
   
   rules:
 
+    # grpc_code!="OK" includes non-error codes like NotFound, AlreadyExists. Consider filtering to specific error codes for less noise.
     - alert: GitlabGitalyHighGrpcErrorRate
-      expr: 'sum(rate(grpc_server_handled_total{job="gitaly",grpc_code!="OK"}[5m])) / sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) * 100 > 5'
+      expr: 'sum(rate(grpc_server_handled_total{job="gitaly",grpc_code!="OK"}[5m])) / sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) * 100 > 5 and sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) > 0'
       for: 5m
       labels:
         severity: warning
@@ -18,7 +19,7 @@ groups:
     # concurrency limits. This directly impacts users trying to push, pull, or clone.
     # This alert is derived from the GitLab Omnibus default rules.
     - alert: GitlabGitalyResourceExhausted
-      expr: 'sum(rate(grpc_server_handled_total{job="gitaly",grpc_code="ResourceExhausted"}[5m])) / sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) * 100 > 1'
+      expr: 'sum(rate(grpc_server_handled_total{job="gitaly",grpc_code="ResourceExhausted"}[5m])) / sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) * 100 > 1 and sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) > 0'
       for: 5m
       labels:
         severity: critical
diff --git a/dist/rules/gitlab-ci/gitlab-built-in-exporter.yml b/dist/rules/gitlab-ci/gitlab-built-in-exporter.yml
index fc46a92..6a8b34a 100644
--- a/dist/rules/gitlab-ci/gitlab-built-in-exporter.yml
+++ b/dist/rules/gitlab-ci/gitlab-built-in-exporter.yml
@@ -37,7 +37,7 @@ groups:
     # Threshold is 5% of all requests returning server errors.
     # Check GitLab logs at /var/log/gitlab/ for root cause.
     - alert: GitlabHighHttpErrorRate
-      expr: 'sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m])) * 100 > 5'
+      expr: 'sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m])) * 100 > 5 and sum(rate(http_requests_total[5m])) > 0'
       for: 5m
       labels:
         severity: critical
@@ -58,7 +58,7 @@ groups:
     # This metric requires the emit_sidekiq_histogram_metrics feature flag to be enabled.
     # A sustained failure rate indicates background processing issues.
     - alert: GitlabSidekiqJobsFailing
-      expr: 'rate(sidekiq_jobs_failed_total[5m]) > 0'
+      expr: 'rate(sidekiq_jobs_failed_total[5m]) > 0.1'
       for: 10m
       labels:
         severity: warning
@@ -136,6 +136,7 @@ groups:
         summary: GitLab CI pipeline creation slow (instance {{ $labels.instance }})
         description: "GitLab CI pipeline creation p95 latency on {{ $labels.instance }} is above 30 seconds.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
+    # This metric may not exist in all GitLab versions. Verify against your GitLab installation.
     - alert: GitlabCiPipelineFailuresIncreasing
       expr: 'rate(gitlab_ci_pipeline_failure_reasons[5m]) > 0'
       for: 10m
@@ -188,7 +189,7 @@ groups:
 
     # This may happen during a rolling deployment. If it persists, investigate incomplete upgrades.
     - alert: GitlabVersionMismatch
-      expr: 'count(count by (version) (deployments{version!=""})) > 1'
+      expr: 'count(count by (version) (gitlab_build_info)) > 1'
       for: 0m
       labels:
         severity: warning
diff --git a/dist/rules/gitlab-ci/workhorse.yml b/dist/rules/gitlab-ci/workhorse.yml
index e5a3b0d..5f44a39 100644
--- a/dist/rules/gitlab-ci/workhorse.yml
+++ b/dist/rules/gitlab-ci/workhorse.yml
@@ -8,7 +8,7 @@ groups:
     # Workhorse sits in front of Puma and handles Git HTTP, file uploads, and proxying.
     # Threshold from GitLab Omnibus default rules: 10% for high-traffic instances.
     - alert: GitlabWorkhorseHighErrorRate
-      expr: 'sum(rate(gitlab_workhorse_http_request_duration_seconds_count{code=~"5.."}[5m])) / sum(rate(gitlab_workhorse_http_request_duration_seconds_count[5m])) * 100 > 10'
+      expr: 'sum(rate(gitlab_workhorse_http_request_duration_seconds_count{code=~"5.."}[5m])) / sum(rate(gitlab_workhorse_http_request_duration_seconds_count[5m])) * 100 > 10 and sum(rate(gitlab_workhorse_http_request_duration_seconds_count[5m])) > 0'
       for: 5m
       labels:
         severity: critical
diff --git a/dist/rules/golang/golang-exporter.yml b/dist/rules/golang/golang-exporter.yml
index cd5e777..13b251c 100644
--- a/dist/rules/golang/golang-exporter.yml
+++ b/dist/rules/golang/golang-exporter.yml
@@ -39,13 +39,13 @@ groups:
 
     # Threshold is workload-dependent. Applications with heavy CGo or blocking I/O may legitimately use more OS threads. Adjust to match your baseline.
     - alert: GoThreadCountHigh
-      expr: 'go_threads > 50'
+      expr: 'go_threads > 500'
       for: 5m
       labels:
         severity: warning
       annotations:
         summary: Go thread count high (instance {{ $labels.instance }})
-        description: "Go OS thread count is high (> 50), potential blocking syscall or CGo leak\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Go OS thread count is high (> 500), potential blocking syscall or CGo leak\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     # Threshold is a rough default. Adjust based on your application's normal object count.
     - alert: GoHeapObjectsCountHigh
diff --git a/dist/rules/grafana-alloy/embedded-exporter.yml b/dist/rules/grafana-alloy/embedded-exporter.yml
index 99003ec..f62c8ce 100644
--- a/dist/rules/grafana-alloy/embedded-exporter.yml
+++ b/dist/rules/grafana-alloy/embedded-exporter.yml
@@ -6,10 +6,10 @@ groups:
   rules:
 
     - alert: GrafanaAlloyServiceDown
-      expr: 'count by (instance) (alloy_build_info) unless count by (instance) (alloy_build_info offset 2m)  '
+      expr: 'count by (instance) (alloy_build_info offset 2h) unless count by (instance) (alloy_build_info)'
       for: 0m
       labels:
         severity: critical
       annotations:
         summary: Grafana Alloy service down (instance {{ $labels.instance }})
-        description: "Alloy on (instance {{ $labels.instance }}) is not responding or has stopped running.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Alloy on instance {{ $labels.instance }} is not responding or has stopped running.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
diff --git a/dist/rules/grafana-mimir/embedded-exporter.yml b/dist/rules/grafana-mimir/embedded-exporter.yml
index bed1f46..bc1ce60 100644
--- a/dist/rules/grafana-mimir/embedded-exporter.yml
+++ b/dist/rules/grafana-mimir/embedded-exporter.yml
@@ -231,8 +231,9 @@ groups:
         summary: Mimir ingester TSDB WAL writes failed (instance {{ $labels.instance }})
         description: "Mimir ingester {{ $labels.instance }} is failing to write to TSDB WAL.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
+    # Threshold aligned with official Mimir mixin (30 minutes).
     - alert: MimirStoreGatewayHasNotSyncedBucket
-      expr: '(time() - cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 600) and cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 0'
+      expr: '(time() - cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 1800) and cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 0'
       for: 5m
       labels:
         severity: critical
@@ -277,7 +278,7 @@ groups:
         description: "Mimir compactor {{ $labels.instance }} has not run compaction in the last 24 hours.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: MimirCompactorHasConsecutiveFailures
-      expr: 'increase(cortex_compactor_runs_failed_total[2h]) > 1'
+      expr: 'increase(cortex_compactor_runs_failed_total{reason!="shutdown"}[2h]) > 1'
       for: 0m
       labels:
         severity: critical
@@ -303,8 +304,9 @@ groups:
         summary: Mimir compactor has not uploaded blocks (instance {{ $labels.instance }})
         description: "Mimir compactor {{ $labels.instance }} has not uploaded any block in the last 24 hours.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
+    # Using 24h window per official mixin — compaction skips are rare events.
     - alert: MimirCompactorSkippedBlocks
-      expr: 'increase(cortex_compactor_blocks_marked_for_no_compaction_total[5m]) > 0'
+      expr: 'increase(cortex_compactor_blocks_marked_for_no_compaction_total[24h]) > 0'
       for: 5m
       labels:
         severity: warning
diff --git a/dist/rules/grafana-tempo/embedded-exporter.yml b/dist/rules/grafana-tempo/embedded-exporter.yml
index a06f097..6e96623 100644
--- a/dist/rules/grafana-tempo/embedded-exporter.yml
+++ b/dist/rules/grafana-tempo/embedded-exporter.yml
@@ -117,6 +117,7 @@ groups:
         summary: Tempo compaction too many outstanding blocks warning (instance {{ $labels.instance }})
         description: "There are too many outstanding compaction blocks for {{ $labels.instance }}. Consider increasing compactor resources.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
+    # Official Tempo mixin normalizes by backend-worker count. Adjust threshold based on your compactor configuration.
     - alert: TempoCompactionTooManyOutstandingBlocksCritical
       expr: 'sum by (instance) (tempodb_compaction_outstanding_blocks) > 250'
       for: 24h
diff --git a/dist/rules/graph-node/embedded-exporter.yml b/dist/rules/graph-node/embedded-exporter.yml
index b902605..9158bd0 100644
--- a/dist/rules/graph-node/embedded-exporter.yml
+++ b/dist/rules/graph-node/embedded-exporter.yml
@@ -41,20 +41,20 @@ groups:
         summary: Provider failed because get genesis timeout (instance {{ $labels.instance }})
         description: "Timeout to get genesis for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    - alert: StoreConnectionIsTooSlow
+    - alert: StoreConnectionSlow
       expr: 'store_connection_wait_time_ms > 10'
       for: 0m
       labels:
         severity: warning
       annotations:
-        summary: Store connection is too slow (instance {{ $labels.instance }})
+        summary: Store connection slow (instance {{ $labels.instance }})
         description: "Store connection is too slow to `{{$labels.pool}}` pool, `{{$labels.shard}}` shard in Graph node `{{$labels.instance}}`\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    - alert: StoreConnectionIsTooSlow
+    - alert: StoreConnectionVerySlow
       expr: 'store_connection_wait_time_ms > 20'
       for: 0m
       labels:
         severity: critical
       annotations:
-        summary: Store connection is too slow (instance {{ $labels.instance }})
-        description: "Store connection is too slow to `{{$labels.pool}}` pool, `{{$labels.shard}}` shard in Graph node `{{$labels.instance}}`\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        summary: Store connection very slow (instance {{ $labels.instance }})
+        description: "Store connection is very slow to `{{$labels.pool}}` pool, `{{$labels.shard}}` shard in Graph node `{{$labels.instance}}`\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
diff --git a/dist/rules/ipmi/ipmi-exporter.yml b/dist/rules/ipmi/ipmi-exporter.yml
index 4aaa01e..7e34786 100644
--- a/dist/rules/ipmi/ipmi-exporter.yml
+++ b/dist/rules/ipmi/ipmi-exporter.yml
@@ -118,7 +118,7 @@ groups:
     # Catches any sensor type not covered by the specific temperature/fan/voltage/current/power alerts.
     - alert: IpmiGenericSensorCritical
       expr: 'ipmi_sensor_state == 2'
-      for: 0m
+      for: 5m
       labels:
         severity: critical
       annotations:
diff --git a/dist/rules/keycloak/aerogear-keycloak-metrics-spi.yml b/dist/rules/keycloak/aerogear-keycloak-metrics-spi.yml
index 0aa767c..57e271c 100644
--- a/dist/rules/keycloak/aerogear-keycloak-metrics-spi.yml
+++ b/dist/rules/keycloak/aerogear-keycloak-metrics-spi.yml
@@ -56,9 +56,9 @@ groups:
         summary: Keycloak high registration failure rate (instance {{ $labels.instance }})
         description: "More than 10% of registration attempts are failing in realm {{ $labels.realm }} (current value: {{ $value | printf \"%.1f\" }}%).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    # Threshold of 2 seconds is a rough default. Adjust based on your performance requirements.
+    # keycloak_request_duration is in milliseconds. Threshold of 2000ms (2 seconds) is a rough default.
     - alert: KeycloakSlowRequestResponseTime
-      expr: 'sum by (method) (rate(keycloak_request_duration_sum[5m])) / sum by (method) (rate(keycloak_request_duration_count[5m])) > 2 and sum by (method) (rate(keycloak_request_duration_count[5m])) > 0'
+      expr: 'sum by (method) (rate(keycloak_request_duration_sum[5m])) / sum by (method) (rate(keycloak_request_duration_count[5m])) > 2000 and sum by (method) (rate(keycloak_request_duration_count[5m])) > 0'
       for: 5m
       labels:
         severity: warning
diff --git a/dist/rules/linkerd/embedded-exporter.yml b/dist/rules/linkerd/embedded-exporter.yml
index 6afaaf4..054e461 100644
--- a/dist/rules/linkerd/embedded-exporter.yml
+++ b/dist/rules/linkerd/embedded-exporter.yml
@@ -12,4 +12,4 @@ groups:
         severity: warning
       annotations:
         summary: Linkerd high error rate (instance {{ $labels.instance }})
-        description: "Linkerd error rate for {{ $labels.deployment | $labels.statefulset | $labels.daemonset }} is over 10%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Linkerd error rate for {{ $labels.deployment }}{{ $labels.statefulset }}{{ $labels.daemonset }} is over 10%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
diff --git a/dist/rules/openstack/openstack-exporter.yml b/dist/rules/openstack/openstack-exporter.yml
index d55688b..a75ed7a 100644
--- a/dist/rules/openstack/openstack-exporter.yml
+++ b/dist/rules/openstack/openstack-exporter.yml
@@ -24,7 +24,7 @@ groups:
         description: "Nova agent {{ $labels.hostname }} ({{ $labels.service }}) is down in zone {{ $labels.zone }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: OpenstackNeutronAgentDown
-      expr: 'openstack_neutron_agent_state{adminState="enabled"} == 0'
+      expr: 'openstack_neutron_agent_state{adminState="up"} == 0'
       for: 2m
       labels:
         severity: critical
diff --git a/dist/rules/oracle-database/iamseth-oracledb-exporter.yml b/dist/rules/oracle-database/iamseth-oracledb-exporter.yml
index e5dce0f..2969215 100644
--- a/dist/rules/oracle-database/iamseth-oracledb-exporter.yml
+++ b/dist/rules/oracle-database/iamseth-oracledb-exporter.yml
@@ -65,7 +65,7 @@ groups:
 
     # Threshold is highly workload-dependent. Adjust 200 to suit your environment.
     - alert: OracleDbTooManyActiveSessions
-      expr: 'oracledb_sessions_activity{status="ACTIVE", type="USER"} > 200'
+      expr: 'oracledb_sessions_value{status="ACTIVE", type="USER"} > 200'
       for: 5m
       labels:
         severity: warning
@@ -73,10 +73,9 @@ groups:
         summary: Oracle DB too many active sessions (instance {{ $labels.instance }})
         description: "Oracle Database on {{ $labels.instance }} has too many active user sessions (current value: {{ $value }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    # High user I/O wait time indicates storage performance issues (slow disks, SAN latency, etc.).
-    # The metric is in centiseconds per second. Threshold 300 means 3 seconds of I/O wait per second of wall time.
+    # The metric from v$waitclassmetric is already a normalized rate (centiseconds per second). Threshold 300 means 3 seconds of I/O wait per second of wall time.
     - alert: OracleDbHighWaitTime(userI/o)
-      expr: 'rate(oracledb_wait_time_user_io[5m]) > 300'
+      expr: 'oracledb_wait_time_user_io > 300'
       for: 5m
       labels:
         severity: warning
diff --git a/dist/rules/process-exporter/process-exporter.yml b/dist/rules/process-exporter/process-exporter.yml
index 8603ede..e8ea717 100644
--- a/dist/rules/process-exporter/process-exporter.yml
+++ b/dist/rules/process-exporter/process-exporter.yml
@@ -7,9 +7,9 @@ groups:
 
     - alert: ProcessExporterGroupDown
       expr: 'namedprocess_namegroup_num_procs == 0'
-      for: 2m
+      for: 5m
       labels:
-        severity: critical
+        severity: warning
       annotations:
         summary: Process exporter group down (instance {{ $labels.instance }})
         description: "No processes found for group {{ $labels.groupname }}. The service may have stopped. (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
@@ -63,7 +63,7 @@ groups:
         description: "Process group {{ $labels.groupname }} is using {{ $value | humanize }}B of swap. (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: ProcessExporterZombieProcesses
-      expr: 'namedprocess_namegroup_states{state="Zombie"} > 0'
+      expr: 'namedprocess_namegroup_states{state="Zombie"} > 5'
       for: 5m
       labels:
         severity: warning
@@ -71,9 +71,9 @@ groups:
         summary: Process exporter zombie processes (instance {{ $labels.instance }})
         description: "Process group {{ $labels.groupname }} has {{ $value }} zombie processes. (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    # Threshold of 10000 switches/s is a rough default. Adjust based on the workload profile.
+    # Only voluntary switches are counted (ctxswitchtype="voluntary") — nonvoluntary switches are normal under CPU contention. The threshold of 50000/s is a rough default. Adjust it based on your workload.
     - alert: ProcessExporterHighContextSwitching
-      expr: 'rate(namedprocess_namegroup_context_switches_total[5m]) > 10000'
+      expr: 'rate(namedprocess_namegroup_context_switches_total{ctxswitchtype="voluntary"}[5m]) > 50000'
       for: 5m
       labels:
         severity: warning
diff --git a/dist/rules/proxmox-ve/prometheus-pve-exporter.yml b/dist/rules/proxmox-ve/prometheus-pve-exporter.yml
index d0fbe27..2bfb8bb 100644
--- a/dist/rules/proxmox-ve/prometheus-pve-exporter.yml
+++ b/dist/rules/proxmox-ve/prometheus-pve-exporter.yml
@@ -36,7 +36,7 @@ groups:
         description: "Proxmox VE CPU usage is above 90% on {{ $labels.id }}. Current value: {{ $value | printf \"%.2f\" }}%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: PveHighMemoryUsage
-      expr: 'pve_memory_usage_bytes / pve_memory_size_bytes * 100 > 90'
+      expr: 'pve_memory_usage_bytes / pve_memory_size_bytes * 100 > 90 and pve_memory_size_bytes > 0'
       for: 5m
       labels:
         severity: warning
diff --git a/dist/rules/python/python-exporter.yml b/dist/rules/python/python-exporter.yml
index 2230ae5..d4211d5 100644
--- a/dist/rules/python/python-exporter.yml
+++ b/dist/rules/python/python-exporter.yml
@@ -33,6 +33,7 @@ groups:
         summary: Python file descriptors exhaustion (instance {{ $labels.instance }})
         description: "Python process is running out of file descriptors (> 90% used)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
+    # Gen2 collection rate > 1/s is very high. In most applications, gen2 runs are infrequent. Adjust threshold based on your workload.
     - alert: PythonGcGeneration2CollectionsHigh
       expr: 'rate(python_gc_collections_total{generation="2"}[5m]) > 1'
       for: 5m
diff --git a/dist/rules/ruby/ruby-exporter.yml b/dist/rules/ruby/ruby-exporter.yml
index 0b526cb..1ac782e 100644
--- a/dist/rules/ruby/ruby-exporter.yml
+++ b/dist/rules/ruby/ruby-exporter.yml
@@ -24,6 +24,7 @@ groups:
         summary: Ruby heap free slots high (instance {{ $labels.instance }})
         description: "Ruby heap has too many free slots (> 500k), memory fragmentation after large allocations\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
+    # Major GC rate > 5/s is extremely high. Consider lowering to > 1 or > 2 for earlier detection.
     - alert: RubyMajorGcRateHigh
       expr: 'rate(ruby_major_gc_ops_total[5m]) > 5'
       for: 5m
diff --git a/dist/rules/snmp/snmp-exporter.yml b/dist/rules/snmp/snmp-exporter.yml
index c7324e9..920f714 100644
--- a/dist/rules/snmp/snmp-exporter.yml
+++ b/dist/rules/snmp/snmp-exporter.yml
@@ -46,7 +46,7 @@ groups:
         summary: SNMP interface high outbound error rate (instance {{ $labels.instance }})
         description: "Interface {{ $labels.ifDescr }} on {{ $labels.instance }} has an outbound error rate above 5%.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    # Threshold is a rough default. Adjust based on your link capacity and traffic patterns.
+    # Threshold is a rough default. ifSpeed is a Gauge32 that maxes out at ~4.29 Gbps. For 10G+ interfaces, use ifHighSpeed (in Mbps) instead.
     - alert: SnmpInterfaceHighBandwidthUsageInbound
       expr: 'rate(ifHCInOctets{job=~"snmp.*"}[5m]) * 8 / ifSpeed > 0.80 and ifSpeed > 0'
       for: 15m
@@ -56,7 +56,7 @@ groups:
         summary: SNMP interface high bandwidth usage inbound (instance {{ $labels.instance }})
         description: "Interface {{ $labels.ifDescr }} on {{ $labels.instance }} inbound utilization is above 80%.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    # Threshold is a rough default. Adjust based on your link capacity and traffic patterns.
+    # Threshold is a rough default. ifSpeed is a Gauge32 that maxes out at ~4.29 Gbps. For 10G+ interfaces, use ifHighSpeed (in Mbps) instead.
     - alert: SnmpInterfaceHighBandwidthUsageOutbound
       expr: 'rate(ifHCOutOctets{job=~"snmp.*"}[5m]) * 8 / ifSpeed > 0.80 and ifSpeed > 0'
       for: 15m
diff --git a/dist/rules/spinnaker/embedded-exporter.yml b/dist/rules/spinnaker/embedded-exporter.yml
index e2b2e36..dac2885 100644
--- a/dist/rules/spinnaker/embedded-exporter.yml
+++ b/dist/rules/spinnaker/embedded-exporter.yml
@@ -27,7 +27,7 @@ groups:
 
     # The 30s threshold is a rough default. Adjust based on your pipeline SLOs.
     - alert: SpinnakerOrcaQueueMessageLagHigh
-      expr: 'rate(queue_message_lag_seconds_sum[5m]) / rate(queue_message_lag_seconds_count[5m]) > 30'
+      expr: 'rate(queue_message_lag_seconds_sum[5m]) / rate(queue_message_lag_seconds_count[5m]) > 30 and rate(queue_message_lag_seconds_count[5m]) > 0'
       for: 5m
       labels:
         severity: warning
diff --git a/dist/rules/systemd/systemd-exporter.yml b/dist/rules/systemd/systemd-exporter.yml
index ab71897..f8765e5 100644
--- a/dist/rules/systemd/systemd-exporter.yml
+++ b/dist/rules/systemd/systemd-exporter.yml
@@ -34,7 +34,7 @@ groups:
         description: "Systemd service {{ $labels.name }} has restarted {{ $value }} times in the last hour. (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: SystemdUnitTasksNearLimit
-      expr: 'systemd_unit_tasks_current / systemd_unit_tasks_max > 0.9 and systemd_unit_tasks_max > 0'
+      expr: 'systemd_unit_tasks_current / ignoring(type) systemd_unit_tasks_max > 0.9 and ignoring(type) systemd_unit_tasks_max > 0'  # the division drops `type`, so the guard must ignore it as well
       for: 5m
       labels:
         severity: warning
@@ -44,7 +44,7 @@ groups:
 
     - alert: SystemdSocketRefusedConnections
       expr: 'increase(systemd_socket_refused_connections_total[5m]) > 0'
-      for: 0m
+      for: 2m
       labels:
         severity: warning
       annotations:
@@ -54,7 +54,7 @@ groups:
     # Threshold of 100 connections is arbitrary. Adjust to your workload.
     - alert: SystemdSocketHighConnections
       expr: 'systemd_socket_current_connections > 100'
-      for: 0m
+      for: 2m
       labels:
         severity: warning
       annotations:
diff --git a/dist/rules/wireguard/mindflavor-prometheus-wireguard-exporter.yml b/dist/rules/wireguard/mindflavor-prometheus-wireguard-exporter.yml
index 71a5ba1..b197a57 100644
--- a/dist/rules/wireguard/mindflavor-prometheus-wireguard-exporter.yml
+++ b/dist/rules/wireguard/mindflavor-prometheus-wireguard-exporter.yml
@@ -17,6 +17,7 @@ groups:
         summary: WireGuard peer handshake too old (instance {{ $labels.instance }})
         description: "WireGuard peer {{ $labels.public_key }} on interface {{ $labels.interface }} has not had a handshake for over 5 minutes. The tunnel may be down.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
+    # This alert also fires for roaming peers (mobile/laptop) that have not completed a handshake since the interface came up. Consider filtering to peers that are expected to be online.
     - alert: WireguardPeerHandshakeNeverEstablished
       expr: 'wireguard_latest_handshake_seconds == 0'
       for: 5m