Publish

2026-06-21 00:47:18 +08:00 · 2026-03-18 11:23:25 +00:00 · 2026-03-18 11:23:25 +00:00 · 4fb1aa9ae4
commit 4fb1aa9ae4
parent a4581ed322
29 changed files with 72 additions and 57 deletions
--- a/dist/rules/apache-flink/flink-prometheus-reporter.yml
+++ b/dist/rules/apache-flink/flink-prometheus-reporter.yml
@ -33,9 +33,10 @@ groups:
        summary: Flink all task slots used (instance {{ $labels.instance }})
        description: "All Flink task slots are in use ({{ $value }} available). New jobs cannot be scheduled.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # A single restart may be normal during deployments. Adjust threshold based on restart tolerance.
    - alert: FlinkJobRestartIncreasing
-      expr: 'increase(flink_jobmanager_job_numRestarts[5m]) > 0'
-      for: 0m
+      expr: 'increase(flink_jobmanager_job_numRestarts[5m]) > 1'
+      for: 5m
      labels:
        severity: warning
      annotations:
@ -43,14 +44,15 @@ groups:
        description: "Flink job {{ $labels.job_name }} has restarted {{ $value }} times in the last 5 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: FlinkCheckpointFailures
-      expr: 'increase(flink_jobmanager_job_numberOfFailedCheckpoints[10m]) > 0'
-      for: 0m
+      expr: 'increase(flink_jobmanager_job_numberOfFailedCheckpoints[10m]) > 1'
+      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Flink checkpoint failures (instance {{ $labels.instance }})
        description: "Flink job {{ $labels.job_name }} has {{ $value }} failed checkpoints in the last 10 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # The metric value is in milliseconds, but humanizeDuration expects seconds: a value passed to it unchanged is overstated 1000x, so divide by 1000 first.
    # Threshold is 60 seconds. Adjust based on your checkpoint interval and state size.
    - alert: FlinkCheckpointDurationHigh
      expr: 'flink_jobmanager_job_lastCheckpointDuration > 60000'
--- a/dist/rules/apache-spark/spark-prometheus.yml
+++ b/dist/rules/apache-spark/spark-prometheus.yml
@ -51,7 +51,7 @@ groups:
    # Fires when more than 10% of executor time is spent in garbage collection.
    # This metric comes from the PrometheusResource endpoint (/metrics/executors/prometheus/).
    - alert: SparkExecutorHighGcTime
-      expr: 'metrics_executor_totalGCTime / (metrics_executor_totalDuration > 0) > 0.1'
+      expr: 'metrics_executor_totalGCTime_seconds_total / (metrics_executor_totalDuration > 0) > 0.1'
      for: 5m
      labels:
        severity: warning
@ -60,7 +60,7 @@ groups:
        description: "Spark executor {{ $labels.executor_id }} in {{ $labels.application_name }} is spending too much time in GC.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: SparkExecutorAllTasksFailing
-      expr: 'metrics_executor_failedTasks > 0 and metrics_executor_completedTasks == 0'
+      expr: 'metrics_executor_failedTasks_total > 0 and metrics_executor_completedTasks == 0'
      for: 5m
      labels:
        severity: critical
@ -69,7 +69,7 @@ groups:
        description: "Spark executor {{ $labels.executor_id }} has only failing tasks ({{ $value }} failed, 0 completed).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: SparkExecutorHighTaskFailureRate
-      expr: 'metrics_executor_failedTasks / (metrics_executor_totalTasks > 0) > 0.1'
+      expr: 'metrics_executor_failedTasks_total / (metrics_executor_totalTasks_total > 0) > 0.1'
      for: 5m
      labels:
        severity: warning
@ -77,9 +77,10 @@ groups:
        summary: Spark executor high task failure rate (instance {{ $labels.instance }})
        description: "Spark executor {{ $labels.executor_id }} has a task failure rate above 10%.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # diskUsed is a gauge, not a counter — do not use rate(). Threshold of 1GB is a rough default.
    # Disk spilling indicates insufficient memory for the workload.
    - alert: SparkExecutorHighDiskSpill
-      expr: 'rate(metrics_executor_diskUsed_bytes[5m]) > 0'
+      expr: 'metrics_executor_diskUsed_bytes > 1e9'
      for: 5m
      labels:
        severity: warning
--- a/dist/rules/aws-cloudwatch/prometheus-cloudwatch-exporter.yml
+++ b/dist/rules/aws-cloudwatch/prometheus-cloudwatch-exporter.yml
@ -112,7 +112,7 @@ groups:

    # Requires ApplicationELB HTTPCode_ELB_5XX_Count and RequestCount metrics.
    - alert: AwsAlbHigh5xxErrorRate
-      expr: '(aws_applicationelb_httpcode_elb_5_xx_count_sum / aws_applicationelb_request_count_sum) * 100 > 5'
+      expr: '(aws_applicationelb_httpcode_elb_5_xx_count_sum / aws_applicationelb_request_count_sum) * 100 > 5 and aws_applicationelb_request_count_sum > 0'
      for: 5m
      labels:
        severity: critical
@ -132,7 +132,7 @@ groups:

    # Requires Lambda Errors and Invocations metrics.
    - alert: AwsLambdaHighErrorRate
-      expr: '(aws_lambda_errors_sum / aws_lambda_invocations_sum) * 100 > 5'
+      expr: '(aws_lambda_errors_sum / aws_lambda_invocations_sum) * 100 > 5 and aws_lambda_invocations_sum > 0'
      for: 5m
      labels:
        severity: warning
--- a/dist/rules/cert-manager/embedded-exporter.yml
+++ b/dist/rules/cert-manager/embedded-exporter.yml
@ -33,6 +33,7 @@ groups:
        summary: Cert-Manager certificate not ready (instance {{ $labels.instance }})
        description: "The certificate {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is not ready to serve traffic.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # In cert-manager 1.19+, the metric was renamed (dropped http_ prefix). Verify metric name against your version.
    - alert: Cert-managerHittingAcmeRateLimits
      expr: 'sum by (host) (rate(certmanager_http_acme_client_request_count{status="429"}[5m])) > 0'
      for: 5m
--- a/dist/rules/cilium/embedded-exporter.yml
+++ b/dist/rules/cilium/embedded-exporter.yml
@ -5,6 +5,7 @@ groups:
  
  rules:

+    # Metric name depends on Cilium version. Use cilium_unreachable_nodes (older) or cilium_node_connectivity_status (1.14+).
    - alert: CiliumAgentUnreachableNodes
      expr: 'sum(cilium_unreachable_nodes{}) by (pod) > 0'
      for: 15m
@ -14,6 +15,7 @@ groups:
        summary: Cilium agent unreachable nodes (instance {{ $labels.instance }})
        description: "Cilium agent {{ $labels.pod }} cannot reach {{ $value }} node(s). Check network connectivity and node health.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # Metric name depends on Cilium version. Use cilium_unreachable_health_endpoints (older) or cilium_node_connectivity_status (1.14+).
    - alert: CiliumAgentUnreachableHealthEndpoints
      expr: 'sum(cilium_unreachable_health_endpoints{}) by (pod) > 0'
      for: 15m
@ -23,6 +25,7 @@ groups:
        summary: Cilium agent unreachable health endpoints (instance {{ $labels.instance }})
        description: "Cilium agent {{ $labels.pod }} cannot reach {{ $value }} health endpoint(s). Node-to-node health probes are failing.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # Metric name depends on Cilium version. Use cilium_controllers_failing (older) or cilium_controllers_runs_total (1.14+).
    - alert: CiliumAgentFailingControllers
      expr: 'sum(cilium_controllers_failing{}) by (pod) > 0'
      for: 5m
@ -198,6 +201,7 @@ groups:
        summary: Cilium operator low available IPAM IPs (instance {{ $labels.instance }})
        description: "Cilium operator IPAM IP pool is over 90% utilized. Allocate more IPs to avoid exhaustion.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # Some Cilium versions may not have a status label on this metric. Verify against your Cilium version.
    - alert: CiliumOperatorIpamInterfaceCreationFailures
      expr: 'sum(rate(cilium_operator_ipam_interface_creation_ops{status!="success"}[5m])) by () > 0'
      for: 10m
--- a/dist/rules/digitalocean/digitalocean-exporter.yml
+++ b/dist/rules/digitalocean/digitalocean-exporter.yml
@ -16,7 +16,7 @@ groups:

    - alert: DigitaloceanAccountNotActive
      expr: 'digitalocean_account_active != 1'
-      for: 0m
+      for: 5m
      labels:
        severity: critical
      annotations:
@ -52,7 +52,7 @@ groups:

    - alert: DigitaloceanLoadBalancerNoBackends
      expr: 'digitalocean_loadbalancer_droplets == 0'
-      for: 0m
+      for: 1m
      labels:
        severity: warning
      annotations:
@ -79,7 +79,7 @@ groups:

    - alert: DigitaloceanExporterCollectionErrors
      expr: 'increase(digitalocean_errors_total[5m]) > 0'
-      for: 0m
+      for: 5m
      labels:
        severity: warning
      annotations:
--- a/dist/rules/ebpf/ebpf-exporter.yml
+++ b/dist/rules/ebpf/ebpf-exporter.yml
@ -13,7 +13,7 @@ groups:
        severity: warning
      annotations:
        summary: eBPF exporter program not attached (instance {{ $labels.instance }})
-        description: "eBPF program {{ $labels.name }} failed to attach. The program is not collecting data. (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "eBPF program {{ $labels.id }} failed to attach. The program is not collecting data. (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: EbpfExporterDecoderErrors
      expr: 'rate(ebpf_exporter_decoder_errors_total[5m]) > 0'
@ -22,10 +22,10 @@ groups:
        severity: warning
      annotations:
        summary: eBPF exporter decoder errors (instance {{ $labels.instance }})
-        description: "eBPF exporter is experiencing decoder errors for program {{ $labels.name }}. Kernel data is not being correctly transformed into labels. (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "eBPF exporter is experiencing decoder errors for config {{ $labels.config }}. Kernel data is not being correctly transformed into labels. (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: EbpfExporterNoEnabledConfigs
-      expr: 'ebpf_exporter_enabled_configs == 0'
+      expr: 'absent(ebpf_exporter_enabled_configs)'
      for: 5m
      labels:
        severity: warning
--- a/dist/rules/envoy/embedded-exporter.yml
+++ b/dist/rules/envoy/embedded-exporter.yml
@ -24,7 +24,7 @@ groups:
        description: "Envoy memory allocated is above 90% of heap size on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: EnvoyHighDownstreamHttp5xxErrorRate
-      expr: 'sum by (instance) (rate(envoy_http_downstream_rq_xx{envoy_response_code_class="5"}[5m])) / sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) * 100 > 5'
+      expr: 'sum by (instance) (rate(envoy_http_downstream_rq_xx{envoy_response_code_class="5"}[5m])) / sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) * 100 > 5 and sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) > 0'
      for: 1m
      labels:
        severity: critical
@ -33,7 +33,7 @@ groups:
        description: "More than 5% of downstream HTTP responses are 5xx on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: EnvoyHighDownstreamHttp4xxErrorRate
-      expr: 'sum by (instance) (rate(envoy_http_downstream_rq_xx{envoy_response_code_class="4"}[5m])) / sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) * 100 > 10'
+      expr: 'sum by (instance) (rate(envoy_http_downstream_rq_xx{envoy_response_code_class="4"}[5m])) / sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) * 100 > 10 and sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) > 0'
      for: 5m
      labels:
        severity: warning
--- a/dist/rules/gitlab-ci/gitaly.yml
+++ b/dist/rules/gitlab-ci/gitaly.yml
@ -5,8 +5,9 @@ groups:
  
  rules:

+    # grpc_code!="OK" also matches codes that usually reflect expected client-side outcomes rather than server faults (e.g. NotFound, AlreadyExists). Consider filtering to specific error codes for less noise.
    - alert: GitlabGitalyHighGrpcErrorRate
-      expr: 'sum(rate(grpc_server_handled_total{job="gitaly",grpc_code!="OK"}[5m])) / sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) * 100 > 5'
+      expr: 'sum(rate(grpc_server_handled_total{job="gitaly",grpc_code!="OK"}[5m])) / sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) * 100 > 5 and sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) > 0'
      for: 5m
      labels:
        severity: warning
@ -18,7 +19,7 @@ groups:
    # concurrency limits. This directly impacts users trying to push, pull, or clone.
    # This alert is derived from the GitLab Omnibus default rules.
    - alert: GitlabGitalyResourceExhausted
-      expr: 'sum(rate(grpc_server_handled_total{job="gitaly",grpc_code="ResourceExhausted"}[5m])) / sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) * 100 > 1'
+      expr: 'sum(rate(grpc_server_handled_total{job="gitaly",grpc_code="ResourceExhausted"}[5m])) / sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) * 100 > 1 and sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) > 0'
      for: 5m
      labels:
        severity: critical
--- a/dist/rules/gitlab-ci/gitlab-built-in-exporter.yml
+++ b/dist/rules/gitlab-ci/gitlab-built-in-exporter.yml
@ -37,7 +37,7 @@ groups:
    # Threshold is 5% of all requests returning server errors.
    # Check GitLab logs at /var/log/gitlab/ for root cause.
    - alert: GitlabHighHttpErrorRate
-      expr: 'sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m])) * 100 > 5'
+      expr: 'sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m])) * 100 > 5 and sum(rate(http_requests_total[5m])) > 0'
      for: 5m
      labels:
        severity: critical
@ -58,7 +58,7 @@ groups:
    # This metric requires the emit_sidekiq_histogram_metrics feature flag to be enabled.
    # A sustained failure rate indicates background processing issues.
    - alert: GitlabSidekiqJobsFailing
-      expr: 'rate(sidekiq_jobs_failed_total[5m]) > 0'
+      expr: 'rate(sidekiq_jobs_failed_total[5m]) > 0.1'
      for: 10m
      labels:
        severity: warning
@ -136,6 +136,7 @@ groups:
        summary: GitLab CI pipeline creation slow (instance {{ $labels.instance }})
        description: "GitLab CI pipeline creation p95 latency on {{ $labels.instance }} is above 30 seconds.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # This metric may not exist in all GitLab versions. Verify against your GitLab installation.
    - alert: GitlabCiPipelineFailuresIncreasing
      expr: 'rate(gitlab_ci_pipeline_failure_reasons[5m]) > 0'
      for: 10m
@ -188,7 +189,7 @@ groups:

    # This may happen during a rolling deployment. If it persists, investigate incomplete upgrades.
    - alert: GitlabVersionMismatch
-      expr: 'count(count by (version) (deployments{version!=""})) > 1'
+      expr: 'count(count by (version) (gitlab_build_info)) > 1'
      for: 0m
      labels:
        severity: warning
--- a/dist/rules/gitlab-ci/workhorse.yml
+++ b/dist/rules/gitlab-ci/workhorse.yml
@ -8,7 +8,7 @@ groups:
    # Workhorse sits in front of Puma and handles Git HTTP, file uploads, and proxying.
    # Threshold from GitLab Omnibus default rules: 10% for high-traffic instances.
    - alert: GitlabWorkhorseHighErrorRate
-      expr: 'sum(rate(gitlab_workhorse_http_request_duration_seconds_count{code=~"5.."}[5m])) / sum(rate(gitlab_workhorse_http_request_duration_seconds_count[5m])) * 100 > 10'
+      expr: 'sum(rate(gitlab_workhorse_http_request_duration_seconds_count{code=~"5.."}[5m])) / sum(rate(gitlab_workhorse_http_request_duration_seconds_count[5m])) * 100 > 10 and sum(rate(gitlab_workhorse_http_request_duration_seconds_count[5m])) > 0'
      for: 5m
      labels:
        severity: critical
--- a/dist/rules/golang/golang-exporter.yml
+++ b/dist/rules/golang/golang-exporter.yml
@ -39,13 +39,13 @@ groups:

    # Threshold is workload-dependent. Applications with heavy CGo or blocking I/O may legitimately use more OS threads. Adjust to match your baseline.
    - alert: GoThreadCountHigh
-      expr: 'go_threads > 50'
+      expr: 'go_threads > 500'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Go thread count high (instance {{ $labels.instance }})
-        description: "Go OS thread count is high (> 50), potential blocking syscall or CGo leak\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Go OS thread count is high (> 500), potential blocking syscall or CGo leak\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    # Threshold is a rough default. Adjust based on your application's normal object count.
    - alert: GoHeapObjectsCountHigh
--- a/dist/rules/grafana-alloy/embedded-exporter.yml
+++ b/dist/rules/grafana-alloy/embedded-exporter.yml
@ -6,10 +6,10 @@ groups:
  rules:

    - alert: GrafanaAlloyServiceDown
-      expr: 'count by (instance) (alloy_build_info) unless count by (instance) (alloy_build_info offset 2m)  '
+      expr: 'count by (instance) (alloy_build_info offset 2h) unless count by (instance) (alloy_build_info)'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Grafana Alloy service down (instance {{ $labels.instance }})
-        description: "Alloy on (instance {{ $labels.instance }}) is not responding or has stopped running.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Alloy on instance {{ $labels.instance }} is not responding or has stopped running.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
--- a/dist/rules/grafana-mimir/embedded-exporter.yml
+++ b/dist/rules/grafana-mimir/embedded-exporter.yml
@ -231,8 +231,9 @@ groups:
        summary: Mimir ingester TSDB WAL writes failed (instance {{ $labels.instance }})
        description: "Mimir ingester {{ $labels.instance }} is failing to write to TSDB WAL.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # Threshold aligned with official Mimir mixin (30 minutes).
    - alert: MimirStoreGatewayHasNotSyncedBucket
-      expr: '(time() - cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 600) and cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 0'
+      expr: '(time() - cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 1800) and cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 0'
      for: 5m
      labels:
        severity: critical
@ -277,7 +278,7 @@ groups:
        description: "Mimir compactor {{ $labels.instance }} has not run compaction in the last 24 hours.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: MimirCompactorHasConsecutiveFailures
-      expr: 'increase(cortex_compactor_runs_failed_total[2h]) > 1'
+      expr: 'increase(cortex_compactor_runs_failed_total{reason!="shutdown"}[2h]) > 1'
      for: 0m
      labels:
        severity: critical
@ -303,8 +304,9 @@ groups:
        summary: Mimir compactor has not uploaded blocks (instance {{ $labels.instance }})
        description: "Mimir compactor {{ $labels.instance }} has not uploaded any block in the last 24 hours.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # Using 24h window per official mixin — compaction skips are rare events.
    - alert: MimirCompactorSkippedBlocks
-      expr: 'increase(cortex_compactor_blocks_marked_for_no_compaction_total[5m]) > 0'
+      expr: 'increase(cortex_compactor_blocks_marked_for_no_compaction_total[24h]) > 0'
      for: 5m
      labels:
        severity: warning
--- a/dist/rules/grafana-tempo/embedded-exporter.yml
+++ b/dist/rules/grafana-tempo/embedded-exporter.yml
@ -117,6 +117,7 @@ groups:
        summary: Tempo compaction too many outstanding blocks warning (instance {{ $labels.instance }})
        description: "There are too many outstanding compaction blocks for {{ $labels.instance }}. Consider increasing compactor resources.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # Official Tempo mixin normalizes by backend-worker count. Adjust threshold based on your compactor configuration.
    - alert: TempoCompactionTooManyOutstandingBlocksCritical
      expr: 'sum by (instance) (tempodb_compaction_outstanding_blocks) > 250'
      for: 24h
--- a/dist/rules/graph-node/embedded-exporter.yml
+++ b/dist/rules/graph-node/embedded-exporter.yml
@ -41,20 +41,20 @@ groups:
        summary: Provider failed because get genesis timeout (instance {{ $labels.instance }})
        description: "Timeout to get genesis for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: StoreConnectionIsTooSlow
+    - alert: StoreConnectionSlow
      expr: 'store_connection_wait_time_ms > 10'
      for: 0m
      labels:
        severity: warning
      annotations:
-        summary: Store connection is too slow (instance {{ $labels.instance }})
+        summary: Store connection slow (instance {{ $labels.instance }})
        description: "Store connection is too slow to `{{$labels.pool}}` pool, `{{$labels.shard}}` shard in Graph node `{{$labels.instance}}`\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: StoreConnectionIsTooSlow
+    - alert: StoreConnectionVerySlow
      expr: 'store_connection_wait_time_ms > 20'
      for: 0m
      labels:
        severity: critical
      annotations:
-        summary: Store connection is too slow (instance {{ $labels.instance }})
-        description: "Store connection is too slow to `{{$labels.pool}}` pool, `{{$labels.shard}}` shard in Graph node `{{$labels.instance}}`\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        summary: Store connection very slow (instance {{ $labels.instance }})
+        description: "Store connection is very slow to `{{$labels.pool}}` pool, `{{$labels.shard}}` shard in Graph node `{{$labels.instance}}`\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
--- a/dist/rules/ipmi/ipmi-exporter.yml
+++ b/dist/rules/ipmi/ipmi-exporter.yml
@ -118,7 +118,7 @@ groups:
    # Catches any sensor type not covered by the specific temperature/fan/voltage/current/power alerts.
    - alert: IpmiGenericSensorCritical
      expr: 'ipmi_sensor_state == 2'
-      for: 0m
+      for: 5m
      labels:
        severity: critical
      annotations:
--- a/dist/rules/keycloak/aerogear-keycloak-metrics-spi.yml
+++ b/dist/rules/keycloak/aerogear-keycloak-metrics-spi.yml
@ -56,9 +56,9 @@ groups:
        summary: Keycloak high registration failure rate (instance {{ $labels.instance }})
        description: "More than 10% of registration attempts are failing in realm {{ $labels.realm }} (current value: {{ $value | printf \"%.1f\" }}%).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    # Threshold of 2 seconds is a rough default. Adjust based on your performance requirements.
+    # keycloak_request_duration is in milliseconds. Threshold of 2000ms (2 seconds) is a rough default.
    - alert: KeycloakSlowRequestResponseTime
-      expr: 'sum by (method) (rate(keycloak_request_duration_sum[5m])) / sum by (method) (rate(keycloak_request_duration_count[5m])) > 2 and sum by (method) (rate(keycloak_request_duration_count[5m])) > 0'
+      expr: 'sum by (method) (rate(keycloak_request_duration_sum[5m])) / sum by (method) (rate(keycloak_request_duration_count[5m])) > 2000 and sum by (method) (rate(keycloak_request_duration_count[5m])) > 0'
      for: 5m
      labels:
        severity: warning
--- a/dist/rules/linkerd/embedded-exporter.yml
+++ b/dist/rules/linkerd/embedded-exporter.yml
@ -12,4 +12,4 @@ groups:
        severity: warning
      annotations:
        summary: Linkerd high error rate (instance {{ $labels.instance }})
-        description: "Linkerd error rate for {{ $labels.deployment | $labels.statefulset | $labels.daemonset }} is over 10%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Linkerd error rate for {{ $labels.deployment }}{{ $labels.statefulset }}{{ $labels.daemonset }} is over 10%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
--- a/dist/rules/openstack/openstack-exporter.yml
+++ b/dist/rules/openstack/openstack-exporter.yml
@ -24,7 +24,7 @@ groups:
        description: "Nova agent {{ $labels.hostname }} ({{ $labels.service }}) is down in zone {{ $labels.zone }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: OpenstackNeutronAgentDown
-      expr: 'openstack_neutron_agent_state{adminState="enabled"} == 0'
+      expr: 'openstack_neutron_agent_state{adminState="up"} == 0'
      for: 2m
      labels:
        severity: critical
--- a/dist/rules/oracle-database/iamseth-oracledb-exporter.yml
+++ b/dist/rules/oracle-database/iamseth-oracledb-exporter.yml
@ -65,7 +65,7 @@ groups:

    # Threshold is highly workload-dependent. Adjust 200 to suit your environment.
    - alert: OracleDbTooManyActiveSessions
-      expr: 'oracledb_sessions_activity{status="ACTIVE", type="USER"} > 200'
+      expr: 'oracledb_sessions_value{status="ACTIVE", type="USER"} > 200'
      for: 5m
      labels:
        severity: warning
@ -73,10 +73,9 @@ groups:
        summary: Oracle DB too many active sessions (instance {{ $labels.instance }})
        description: "Oracle Database on {{ $labels.instance }} has too many active user sessions (current value: {{ $value }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    # High user I/O wait time indicates storage performance issues (slow disks, SAN latency, etc.).
-    # The metric is in centiseconds per second. Threshold 300 means 3 seconds of I/O wait per second of wall time.
+    # The metric from v$waitclassmetric is already a normalized rate (centiseconds per second). Threshold 300 means 3 seconds of I/O wait per second of wall time.
    - alert: OracleDbHighWaitTime(userI/o)
-      expr: 'rate(oracledb_wait_time_user_io[5m]) > 300'
+      expr: 'oracledb_wait_time_user_io > 300'
      for: 5m
      labels:
        severity: warning
--- a/dist/rules/process-exporter/process-exporter.yml
+++ b/dist/rules/process-exporter/process-exporter.yml
@ -7,9 +7,9 @@ groups:

    - alert: ProcessExporterGroupDown
      expr: 'namedprocess_namegroup_num_procs == 0'
-      for: 2m
+      for: 5m
      labels:
-        severity: critical
+        severity: warning
      annotations:
        summary: Process exporter group down (instance {{ $labels.instance }})
        description: "No processes found for group {{ $labels.groupname }}. The service may have stopped. (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
@ -63,7 +63,7 @@ groups:
        description: "Process group {{ $labels.groupname }} is using {{ $value | humanize }}B of swap. (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: ProcessExporterZombieProcesses
-      expr: 'namedprocess_namegroup_states{state="Zombie"} > 0'
+      expr: 'namedprocess_namegroup_states{state="Zombie"} > 5'
      for: 5m
      labels:
        severity: warning
@ -71,9 +71,9 @@ groups:
        summary: Process exporter zombie processes (instance {{ $labels.instance }})
        description: "Process group {{ $labels.groupname }} has {{ $value }} zombie processes. (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    # Threshold of 10000 switches/s is a rough default. Adjust based on the workload profile.
+    # Filters to voluntary switches only — involuntary switches are normal under CPU contention. Threshold of 50000/s is a rough default. Adjust based on workload.
    - alert: ProcessExporterHighContextSwitching
-      expr: 'rate(namedprocess_namegroup_context_switches_total[5m]) > 10000'
+      expr: 'rate(namedprocess_namegroup_context_switches_total{ctxswitchtype="voluntary"}[5m]) > 50000'
      for: 5m
      labels:
        severity: warning
--- a/dist/rules/proxmox-ve/prometheus-pve-exporter.yml
+++ b/dist/rules/proxmox-ve/prometheus-pve-exporter.yml
@ -36,7 +36,7 @@ groups:
        description: "Proxmox VE CPU usage is above 90% on {{ $labels.id }}. Current value: {{ $value | printf \"%.2f\" }}%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: PveHighMemoryUsage
-      expr: 'pve_memory_usage_bytes / pve_memory_size_bytes * 100 > 90'
+      expr: 'pve_memory_usage_bytes / pve_memory_size_bytes * 100 > 90 and pve_memory_size_bytes > 0'
      for: 5m
      labels:
        severity: warning
--- a/dist/rules/python/python-exporter.yml
+++ b/dist/rules/python/python-exporter.yml
@ -33,6 +33,7 @@ groups:
        summary: Python file descriptors exhaustion (instance {{ $labels.instance }})
        description: "Python process is running out of file descriptors (> 90% used)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # Gen2 collection rate > 1/s is very high. In most applications, gen2 runs are infrequent. Adjust threshold based on your workload.
    - alert: PythonGcGeneration2CollectionsHigh
      expr: 'rate(python_gc_collections_total{generation="2"}[5m]) > 1'
      for: 5m
--- a/dist/rules/ruby/ruby-exporter.yml
+++ b/dist/rules/ruby/ruby-exporter.yml
@ -24,6 +24,7 @@ groups:
        summary: Ruby heap free slots high (instance {{ $labels.instance }})
        description: "Ruby heap has too many free slots (> 500k), memory fragmentation after large allocations\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # Major GC rate > 5/s is extremely high. Consider lowering to > 1 or > 2 for earlier detection.
    - alert: RubyMajorGcRateHigh
      expr: 'rate(ruby_major_gc_ops_total[5m]) > 5'
      for: 5m
--- a/dist/rules/snmp/snmp-exporter.yml
+++ b/dist/rules/snmp/snmp-exporter.yml
@ -46,7 +46,7 @@ groups:
        summary: SNMP interface high outbound error rate (instance {{ $labels.instance }})
        description: "Interface {{ $labels.ifDescr }} on {{ $labels.instance }} has an outbound error rate above 5%.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    # Threshold is a rough default. Adjust based on your link capacity and traffic patterns.
+    # Threshold is a rough default. ifSpeed is a Gauge32 that maxes out at ~4.29 Gbps. For 10G+ interfaces, use ifHighSpeed (in Mbps) instead.
    - alert: SnmpInterfaceHighBandwidthUsageInbound
      expr: 'rate(ifHCInOctets{job=~"snmp.*"}[5m]) * 8 / ifSpeed > 0.80 and ifSpeed > 0'
      for: 15m
@ -56,7 +56,7 @@ groups:
        summary: SNMP interface high bandwidth usage inbound (instance {{ $labels.instance }})
        description: "Interface {{ $labels.ifDescr }} on {{ $labels.instance }} inbound utilization is above 80%.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    # Threshold is a rough default. Adjust based on your link capacity and traffic patterns.
+    # Threshold is a rough default. ifSpeed is a Gauge32 that maxes out at ~4.29 Gbps. For 10G+ interfaces, use ifHighSpeed (in Mbps) instead.
    - alert: SnmpInterfaceHighBandwidthUsageOutbound
      expr: 'rate(ifHCOutOctets{job=~"snmp.*"}[5m]) * 8 / ifSpeed > 0.80 and ifSpeed > 0'
      for: 15m
--- a/dist/rules/spinnaker/embedded-exporter.yml
+++ b/dist/rules/spinnaker/embedded-exporter.yml
@ -27,7 +27,7 @@ groups:

    # The 30s threshold is a rough default. Adjust based on your pipeline SLOs.
    - alert: SpinnakerOrcaQueueMessageLagHigh
-      expr: 'rate(queue_message_lag_seconds_sum[5m]) / rate(queue_message_lag_seconds_count[5m]) > 30'
+      expr: 'rate(queue_message_lag_seconds_sum[5m]) / rate(queue_message_lag_seconds_count[5m]) > 30 and rate(queue_message_lag_seconds_count[5m]) > 0'
      for: 5m
      labels:
        severity: warning
--- a/dist/rules/systemd/systemd-exporter.yml
+++ b/dist/rules/systemd/systemd-exporter.yml
@ -34,7 +34,7 @@ groups:
        description: "Systemd service {{ $labels.name }} has restarted {{ $value }} times in the last hour. (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: SystemdUnitTasksNearLimit
-      expr: 'systemd_unit_tasks_current / systemd_unit_tasks_max > 0.9 and systemd_unit_tasks_max > 0'
+      expr: 'systemd_unit_tasks_current / ignoring(type) systemd_unit_tasks_max > 0.9 and systemd_unit_tasks_max > 0'
      for: 5m
      labels:
        severity: warning
@ -44,7 +44,7 @@ groups:

    - alert: SystemdSocketRefusedConnections
      expr: 'increase(systemd_socket_refused_connections_total[5m]) > 0'
-      for: 0m
+      for: 2m
      labels:
        severity: warning
      annotations:
@ -54,7 +54,7 @@ groups:
    # Threshold of 100 connections is arbitrary. Adjust to your workload.
    - alert: SystemdSocketHighConnections
      expr: 'systemd_socket_current_connections > 100'
-      for: 0m
+      for: 2m
      labels:
        severity: warning
      annotations:
--- a/dist/rules/wireguard/mindflavor-prometheus-wireguard-exporter.yml
+++ b/dist/rules/wireguard/mindflavor-prometheus-wireguard-exporter.yml
@ -17,6 +17,7 @@ groups:
        summary: WireGuard peer handshake too old (instance {{ $labels.instance }})
        description: "WireGuard peer {{ $labels.public_key }} on interface {{ $labels.interface }} has not had a handshake for over 5 minutes. The tunnel may be down.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+    # This alert fires for every configured peer with no recorded handshake (value 0), which includes mobile/laptop peers that are simply offline. Consider filtering to peers that are expected to be online.
    - alert: WireguardPeerHandshakeNeverEstablished
      expr: 'wireguard_latest_handshake_seconds == 0'
      for: 5m