From 4fb1aa9ae4ca58f25ca96bca2ca405dbdf1e6a47 Mon Sep 17 00:00:00 2001 From: samber Date: Wed, 18 Mar 2026 11:23:25 +0000 Subject: [PATCH] Publish --- dist/rules/apache-flink/flink-prometheus-reporter.yml | 10 ++++++---- dist/rules/apache-spark/spark-prometheus.yml | 9 +++++---- .../aws-cloudwatch/prometheus-cloudwatch-exporter.yml | 4 ++-- dist/rules/cert-manager/embedded-exporter.yml | 1 + dist/rules/cilium/embedded-exporter.yml | 4 ++++ dist/rules/digitalocean/digitalocean-exporter.yml | 6 +++--- dist/rules/ebpf/ebpf-exporter.yml | 6 +++--- dist/rules/envoy/embedded-exporter.yml | 4 ++-- dist/rules/gitlab-ci/gitaly.yml | 5 +++-- dist/rules/gitlab-ci/gitlab-built-in-exporter.yml | 7 ++++--- dist/rules/gitlab-ci/workhorse.yml | 2 +- dist/rules/golang/golang-exporter.yml | 4 ++-- dist/rules/grafana-alloy/embedded-exporter.yml | 4 ++-- dist/rules/grafana-mimir/embedded-exporter.yml | 8 +++++--- dist/rules/grafana-tempo/embedded-exporter.yml | 1 + dist/rules/graph-node/embedded-exporter.yml | 10 +++++----- dist/rules/ipmi/ipmi-exporter.yml | 2 +- dist/rules/keycloak/aerogear-keycloak-metrics-spi.yml | 4 ++-- dist/rules/linkerd/embedded-exporter.yml | 2 +- dist/rules/openstack/openstack-exporter.yml | 2 +- .../oracle-database/iamseth-oracledb-exporter.yml | 7 +++---- dist/rules/process-exporter/process-exporter.yml | 10 +++++----- dist/rules/proxmox-ve/prometheus-pve-exporter.yml | 2 +- dist/rules/python/python-exporter.yml | 1 + dist/rules/ruby/ruby-exporter.yml | 1 + dist/rules/snmp/snmp-exporter.yml | 4 ++-- dist/rules/spinnaker/embedded-exporter.yml | 2 +- dist/rules/systemd/systemd-exporter.yml | 6 +++--- .../mindflavor-prometheus-wireguard-exporter.yml | 1 + 29 files changed, 72 insertions(+), 57 deletions(-) diff --git a/dist/rules/apache-flink/flink-prometheus-reporter.yml b/dist/rules/apache-flink/flink-prometheus-reporter.yml index ffceaf8..c7cb9cd 100644 --- a/dist/rules/apache-flink/flink-prometheus-reporter.yml +++ 
b/dist/rules/apache-flink/flink-prometheus-reporter.yml @@ -33,9 +33,10 @@ groups: summary: Flink all task slots used (instance {{ $labels.instance }}) description: "All Flink task slots are in use ({{ $value }} available). New jobs cannot be scheduled.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # A single restart may be normal during deployments. Adjust threshold based on restart tolerance. - alert: FlinkJobRestartIncreasing - expr: 'increase(flink_jobmanager_job_numRestarts[5m]) > 0' - for: 0m + expr: 'increase(flink_jobmanager_job_numRestarts[5m]) > 1' + for: 5m labels: severity: warning annotations: @@ -43,14 +44,15 @@ groups: description: "Flink job {{ $labels.job_name }} has restarted {{ $value }} times in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: FlinkCheckpointFailures - expr: 'increase(flink_jobmanager_job_numberOfFailedCheckpoints[10m]) > 0' - for: 0m + expr: 'increase(flink_jobmanager_job_numberOfFailedCheckpoints[10m]) > 1' + for: 5m labels: severity: warning annotations: summary: Flink checkpoint failures (instance {{ $labels.instance }}) description: "Flink job {{ $labels.job_name }} has {{ $value }} failed checkpoints in the last 10 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Value is in milliseconds. humanizeDuration expects seconds, so the template output may be misleading. # Threshold is 60 seconds. Adjust based on your checkpoint interval and state size. - alert: FlinkCheckpointDurationHigh expr: 'flink_jobmanager_job_lastCheckpointDuration > 60000' diff --git a/dist/rules/apache-spark/spark-prometheus.yml b/dist/rules/apache-spark/spark-prometheus.yml index 5571ed3..7d7ac12 100644 --- a/dist/rules/apache-spark/spark-prometheus.yml +++ b/dist/rules/apache-spark/spark-prometheus.yml @@ -51,7 +51,7 @@ groups: # Fires when more than 10% of executor time is spent in garbage collection. # This metric comes from the PrometheusResource endpoint (/metrics/executors/prometheus/). 
- alert: SparkExecutorHighGcTime - expr: 'metrics_executor_totalGCTime / (metrics_executor_totalDuration > 0) > 0.1' + expr: 'metrics_executor_totalGCTime_seconds_total / (metrics_executor_totalDuration_seconds_total > 0) > 0.1' for: 5m labels: severity: warning @@ -60,7 +60,7 @@ groups: description: "Spark executor {{ $labels.executor_id }} in {{ $labels.application_name }} is spending too much time in GC.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: SparkExecutorAllTasksFailing - expr: 'metrics_executor_failedTasks > 0 and metrics_executor_completedTasks == 0' + expr: 'metrics_executor_failedTasks_total > 0 and metrics_executor_completedTasks_total == 0' for: 5m labels: severity: critical @@ -69,7 +69,7 @@ groups: description: "Spark executor {{ $labels.executor_id }} has only failing tasks ({{ $value }} failed, 0 completed).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: SparkExecutorHighTaskFailureRate - expr: 'metrics_executor_failedTasks / (metrics_executor_totalTasks > 0) > 0.1' + expr: 'metrics_executor_failedTasks_total / (metrics_executor_totalTasks_total > 0) > 0.1' for: 5m labels: severity: warning @@ -77,9 +77,10 @@ groups: summary: Spark executor high task failure rate (instance {{ $labels.instance }}) description: "Spark executor {{ $labels.executor_id }} has a task failure rate above 10%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # diskUsed is a gauge, not a counter — do not use rate(). Threshold of 1GB is a rough default. # Disk spilling indicates insufficient memory for the workload. 
- alert: SparkExecutorHighDiskSpill - expr: 'rate(metrics_executor_diskUsed_bytes[5m]) > 0' + expr: 'metrics_executor_diskUsed_bytes > 1e9' for: 5m labels: severity: warning diff --git a/dist/rules/aws-cloudwatch/prometheus-cloudwatch-exporter.yml b/dist/rules/aws-cloudwatch/prometheus-cloudwatch-exporter.yml index dad2f44..0a258f3 100644 --- a/dist/rules/aws-cloudwatch/prometheus-cloudwatch-exporter.yml +++ b/dist/rules/aws-cloudwatch/prometheus-cloudwatch-exporter.yml @@ -112,7 +112,7 @@ groups: # Requires ApplicationELB HTTPCode_ELB_5XX_Count and RequestCount metrics. - alert: AwsAlbHigh5xxErrorRate - expr: '(aws_applicationelb_httpcode_elb_5_xx_count_sum / aws_applicationelb_request_count_sum) * 100 > 5' + expr: '(aws_applicationelb_httpcode_elb_5_xx_count_sum / aws_applicationelb_request_count_sum) * 100 > 5 and aws_applicationelb_request_count_sum > 0' for: 5m labels: severity: critical @@ -132,7 +132,7 @@ groups: # Requires Lambda Errors and Invocations metrics. - alert: AwsLambdaHighErrorRate - expr: '(aws_lambda_errors_sum / aws_lambda_invocations_sum) * 100 > 5' + expr: '(aws_lambda_errors_sum / aws_lambda_invocations_sum) * 100 > 5 and aws_lambda_invocations_sum > 0' for: 5m labels: severity: warning diff --git a/dist/rules/cert-manager/embedded-exporter.yml b/dist/rules/cert-manager/embedded-exporter.yml index 60e6f34..71edcf9 100644 --- a/dist/rules/cert-manager/embedded-exporter.yml +++ b/dist/rules/cert-manager/embedded-exporter.yml @@ -33,6 +33,7 @@ groups: summary: Cert-Manager certificate not ready (instance {{ $labels.instance }}) description: "The certificate {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is not ready to serve traffic.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # In cert-manager 1.19+, the metric was renamed (dropped http_ prefix). Verify metric name against your version. 
- alert: Cert-managerHittingAcmeRateLimits expr: 'sum by (host) (rate(certmanager_http_acme_client_request_count{status="429"}[5m])) > 0' for: 5m diff --git a/dist/rules/cilium/embedded-exporter.yml b/dist/rules/cilium/embedded-exporter.yml index aa788c3..bd6f786 100644 --- a/dist/rules/cilium/embedded-exporter.yml +++ b/dist/rules/cilium/embedded-exporter.yml @@ -5,6 +5,7 @@ groups: rules: + # Metric name depends on Cilium version. Use cilium_unreachable_nodes (older) or cilium_node_connectivity_status (1.14+). - alert: CiliumAgentUnreachableNodes expr: 'sum(cilium_unreachable_nodes{}) by (pod) > 0' for: 15m @@ -14,6 +15,7 @@ groups: summary: Cilium agent unreachable nodes (instance {{ $labels.instance }}) description: "Cilium agent {{ $labels.pod }} cannot reach {{ $value }} node(s). Check network connectivity and node health.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Metric name depends on Cilium version. Use cilium_unreachable_health_endpoints (older) or cilium_node_connectivity_status (1.14+). - alert: CiliumAgentUnreachableHealthEndpoints expr: 'sum(cilium_unreachable_health_endpoints{}) by (pod) > 0' for: 15m @@ -23,6 +25,7 @@ groups: summary: Cilium agent unreachable health endpoints (instance {{ $labels.instance }}) description: "Cilium agent {{ $labels.pod }} cannot reach {{ $value }} health endpoint(s). Node-to-node health probes are failing.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Metric name depends on Cilium version. Use cilium_controllers_failing (older) or cilium_controllers_runs_total (1.14+). - alert: CiliumAgentFailingControllers expr: 'sum(cilium_controllers_failing{}) by (pod) > 0' for: 5m @@ -198,6 +201,7 @@ groups: summary: Cilium operator low available IPAM IPs (instance {{ $labels.instance }}) description: "Cilium operator IPAM IP pool is over 90% utilized. Allocate more IPs to avoid exhaustion.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Some Cilium versions may not have a status label on this metric. 
Verify against your Cilium version. - alert: CiliumOperatorIpamInterfaceCreationFailures expr: 'sum(rate(cilium_operator_ipam_interface_creation_ops{status!="success"}[5m])) by () > 0' for: 10m diff --git a/dist/rules/digitalocean/digitalocean-exporter.yml b/dist/rules/digitalocean/digitalocean-exporter.yml index 3b88156..662adf0 100644 --- a/dist/rules/digitalocean/digitalocean-exporter.yml +++ b/dist/rules/digitalocean/digitalocean-exporter.yml @@ -16,7 +16,7 @@ groups: - alert: DigitaloceanAccountNotActive expr: 'digitalocean_account_active != 1' - for: 0m + for: 5m labels: severity: critical annotations: @@ -52,7 +52,7 @@ groups: - alert: DigitaloceanLoadBalancerNoBackends expr: 'digitalocean_loadbalancer_droplets == 0' - for: 0m + for: 1m labels: severity: warning annotations: @@ -79,7 +79,7 @@ groups: - alert: DigitaloceanExporterCollectionErrors expr: 'increase(digitalocean_errors_total[5m]) > 0' - for: 0m + for: 5m labels: severity: warning annotations: diff --git a/dist/rules/ebpf/ebpf-exporter.yml b/dist/rules/ebpf/ebpf-exporter.yml index 432ca9d..9c27343 100644 --- a/dist/rules/ebpf/ebpf-exporter.yml +++ b/dist/rules/ebpf/ebpf-exporter.yml @@ -13,7 +13,7 @@ groups: severity: warning annotations: summary: eBPF exporter program not attached (instance {{ $labels.instance }}) - description: "eBPF program {{ $labels.name }} failed to attach. The program is not collecting data. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "eBPF program {{ $labels.id }} failed to attach. The program is not collecting data. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: EbpfExporterDecoderErrors expr: 'rate(ebpf_exporter_decoder_errors_total[5m]) > 0' @@ -22,10 +22,10 @@ groups: severity: warning annotations: summary: eBPF exporter decoder errors (instance {{ $labels.instance }}) - description: "eBPF exporter is experiencing decoder errors for program {{ $labels.name }}. 
Kernel data is not being correctly transformed into labels. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "eBPF exporter is experiencing decoder errors for config {{ $labels.config }}. Kernel data is not being correctly transformed into labels. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: EbpfExporterNoEnabledConfigs - expr: 'ebpf_exporter_enabled_configs == 0' + expr: 'absent(ebpf_exporter_enabled_configs)' for: 5m labels: severity: warning diff --git a/dist/rules/envoy/embedded-exporter.yml b/dist/rules/envoy/embedded-exporter.yml index f489b0c..cb1138a 100644 --- a/dist/rules/envoy/embedded-exporter.yml +++ b/dist/rules/envoy/embedded-exporter.yml @@ -24,7 +24,7 @@ groups: description: "Envoy memory allocated is above 90% of heap size on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: EnvoyHighDownstreamHttp5xxErrorRate - expr: 'sum by (instance) (rate(envoy_http_downstream_rq_xx{envoy_response_code_class="5"}[5m])) / sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) * 100 > 5' + expr: 'sum by (instance) (rate(envoy_http_downstream_rq_xx{envoy_response_code_class="5"}[5m])) / sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) * 100 > 5 and sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) > 0' for: 1m labels: severity: critical @@ -33,7 +33,7 @@ groups: description: "More than 5% of downstream HTTP responses are 5xx on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: EnvoyHighDownstreamHttp4xxErrorRate - expr: 'sum by (instance) (rate(envoy_http_downstream_rq_xx{envoy_response_code_class="4"}[5m])) / sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) * 100 > 10' + expr: 'sum by (instance) (rate(envoy_http_downstream_rq_xx{envoy_response_code_class="4"}[5m])) / sum by (instance) 
(rate(envoy_http_downstream_rq_completed[5m])) * 100 > 10 and sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) > 0' for: 5m labels: severity: warning diff --git a/dist/rules/gitlab-ci/gitaly.yml b/dist/rules/gitlab-ci/gitaly.yml index 208a3d1..adb6f9e 100644 --- a/dist/rules/gitlab-ci/gitaly.yml +++ b/dist/rules/gitlab-ci/gitaly.yml @@ -5,8 +5,9 @@ groups: rules: + # grpc_code!="OK" includes non-error codes like NotFound, AlreadyExists. Consider filtering to specific error codes for less noise. - alert: GitlabGitalyHighGrpcErrorRate - expr: 'sum(rate(grpc_server_handled_total{job="gitaly",grpc_code!="OK"}[5m])) / sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) * 100 > 5' + expr: 'sum(rate(grpc_server_handled_total{job="gitaly",grpc_code!="OK"}[5m])) / sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) * 100 > 5 and sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) > 0' for: 5m labels: severity: warning @@ -18,7 +19,7 @@ groups: # concurrency limits. This directly impacts users trying to push, pull, or clone. # This alert is derived from the GitLab Omnibus default rules. - alert: GitlabGitalyResourceExhausted - expr: 'sum(rate(grpc_server_handled_total{job="gitaly",grpc_code="ResourceExhausted"}[5m])) / sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) * 100 > 1' + expr: 'sum(rate(grpc_server_handled_total{job="gitaly",grpc_code="ResourceExhausted"}[5m])) / sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) * 100 > 1 and sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) > 0' for: 5m labels: severity: critical diff --git a/dist/rules/gitlab-ci/gitlab-built-in-exporter.yml b/dist/rules/gitlab-ci/gitlab-built-in-exporter.yml index fc46a92..6a8b34a 100644 --- a/dist/rules/gitlab-ci/gitlab-built-in-exporter.yml +++ b/dist/rules/gitlab-ci/gitlab-built-in-exporter.yml @@ -37,7 +37,7 @@ groups: # Threshold is 5% of all requests returning server errors. # Check GitLab logs at /var/log/gitlab/ for root cause. 
- alert: GitlabHighHttpErrorRate - expr: 'sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m])) * 100 > 5' + expr: 'sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m])) * 100 > 5 and sum(rate(http_requests_total[5m])) > 0' for: 5m labels: severity: critical @@ -58,7 +58,7 @@ groups: # This metric requires the emit_sidekiq_histogram_metrics feature flag to be enabled. # A sustained failure rate indicates background processing issues. - alert: GitlabSidekiqJobsFailing - expr: 'rate(sidekiq_jobs_failed_total[5m]) > 0' + expr: 'rate(sidekiq_jobs_failed_total[5m]) > 0.1' for: 10m labels: severity: warning @@ -136,6 +136,7 @@ groups: summary: GitLab CI pipeline creation slow (instance {{ $labels.instance }}) description: "GitLab CI pipeline creation p95 latency on {{ $labels.instance }} is above 30 seconds.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # This metric may not exist in all GitLab versions. Verify against your GitLab installation. - alert: GitlabCiPipelineFailuresIncreasing expr: 'rate(gitlab_ci_pipeline_failure_reasons[5m]) > 0' for: 10m @@ -188,7 +189,7 @@ groups: # This may happen during a rolling deployment. If it persists, investigate incomplete upgrades. - alert: GitlabVersionMismatch - expr: 'count(count by (version) (deployments{version!=""})) > 1' + expr: 'count(count by (version) (gitlab_build_info)) > 1' for: 0m labels: severity: warning diff --git a/dist/rules/gitlab-ci/workhorse.yml b/dist/rules/gitlab-ci/workhorse.yml index e5a3b0d..5f44a39 100644 --- a/dist/rules/gitlab-ci/workhorse.yml +++ b/dist/rules/gitlab-ci/workhorse.yml @@ -8,7 +8,7 @@ groups: # Workhorse sits in front of Puma and handles Git HTTP, file uploads, and proxying. # Threshold from GitLab Omnibus default rules: 10% for high-traffic instances. 
- alert: GitlabWorkhorseHighErrorRate - expr: 'sum(rate(gitlab_workhorse_http_request_duration_seconds_count{code=~"5.."}[5m])) / sum(rate(gitlab_workhorse_http_request_duration_seconds_count[5m])) * 100 > 10' + expr: 'sum(rate(gitlab_workhorse_http_request_duration_seconds_count{code=~"5.."}[5m])) / sum(rate(gitlab_workhorse_http_request_duration_seconds_count[5m])) * 100 > 10 and sum(rate(gitlab_workhorse_http_request_duration_seconds_count[5m])) > 0' for: 5m labels: severity: critical diff --git a/dist/rules/golang/golang-exporter.yml b/dist/rules/golang/golang-exporter.yml index cd5e777..13b251c 100644 --- a/dist/rules/golang/golang-exporter.yml +++ b/dist/rules/golang/golang-exporter.yml @@ -39,13 +39,13 @@ groups: # Threshold is workload-dependent. Applications with heavy CGo or blocking I/O may legitimately use more OS threads. Adjust to match your baseline. - alert: GoThreadCountHigh - expr: 'go_threads > 50' + expr: 'go_threads > 500' for: 5m labels: severity: warning annotations: summary: Go thread count high (instance {{ $labels.instance }}) - description: "Go OS thread count is high (> 50), potential blocking syscall or CGo leak\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Go OS thread count is high (> 500), potential blocking syscall or CGo leak\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Threshold is a rough default. Adjust based on your application's normal object count. 
- alert: GoHeapObjectsCountHigh diff --git a/dist/rules/grafana-alloy/embedded-exporter.yml b/dist/rules/grafana-alloy/embedded-exporter.yml index 99003ec..f62c8ce 100644 --- a/dist/rules/grafana-alloy/embedded-exporter.yml +++ b/dist/rules/grafana-alloy/embedded-exporter.yml @@ -6,10 +6,10 @@ groups: rules: - alert: GrafanaAlloyServiceDown - expr: 'count by (instance) (alloy_build_info) unless count by (instance) (alloy_build_info offset 2m) ' + expr: 'count by (instance) (alloy_build_info offset 2h) unless count by (instance) (alloy_build_info)' for: 0m labels: severity: critical annotations: summary: Grafana Alloy service down (instance {{ $labels.instance }}) - description: "Alloy on (instance {{ $labels.instance }}) is not responding or has stopped running.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Alloy on instance {{ $labels.instance }} is not responding or has stopped running.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/dist/rules/grafana-mimir/embedded-exporter.yml b/dist/rules/grafana-mimir/embedded-exporter.yml index bed1f46..bc1ce60 100644 --- a/dist/rules/grafana-mimir/embedded-exporter.yml +++ b/dist/rules/grafana-mimir/embedded-exporter.yml @@ -231,8 +231,9 @@ groups: summary: Mimir ingester TSDB WAL writes failed (instance {{ $labels.instance }}) description: "Mimir ingester {{ $labels.instance }} is failing to write to TSDB WAL.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Threshold aligned with official Mimir mixin (30 minutes). 
- alert: MimirStoreGatewayHasNotSyncedBucket - expr: '(time() - cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 600) and cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 0' + expr: '(time() - cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 1800) and cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 0' for: 5m labels: severity: critical @@ -277,7 +278,7 @@ groups: description: "Mimir compactor {{ $labels.instance }} has not run compaction in the last 24 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirCompactorHasConsecutiveFailures - expr: 'increase(cortex_compactor_runs_failed_total[2h]) > 1' + expr: 'increase(cortex_compactor_runs_failed_total{reason!="shutdown"}[2h]) > 1' for: 0m labels: severity: critical @@ -303,8 +304,9 @@ groups: summary: Mimir compactor has not uploaded blocks (instance {{ $labels.instance }}) description: "Mimir compactor {{ $labels.instance }} has not uploaded any block in the last 24 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Using 24h window per official mixin — compaction skips are rare events. - alert: MimirCompactorSkippedBlocks - expr: 'increase(cortex_compactor_blocks_marked_for_no_compaction_total[5m]) > 0' + expr: 'increase(cortex_compactor_blocks_marked_for_no_compaction_total[24h]) > 0' for: 5m labels: severity: warning diff --git a/dist/rules/grafana-tempo/embedded-exporter.yml b/dist/rules/grafana-tempo/embedded-exporter.yml index a06f097..6e96623 100644 --- a/dist/rules/grafana-tempo/embedded-exporter.yml +++ b/dist/rules/grafana-tempo/embedded-exporter.yml @@ -117,6 +117,7 @@ groups: summary: Tempo compaction too many outstanding blocks warning (instance {{ $labels.instance }}) description: "There are too many outstanding compaction blocks for {{ $labels.instance }}. 
Consider increasing compactor resources.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Official Tempo mixin normalizes by backend-worker count. Adjust threshold based on your compactor configuration. - alert: TempoCompactionTooManyOutstandingBlocksCritical expr: 'sum by (instance) (tempodb_compaction_outstanding_blocks) > 250' for: 24h diff --git a/dist/rules/graph-node/embedded-exporter.yml b/dist/rules/graph-node/embedded-exporter.yml index b902605..9158bd0 100644 --- a/dist/rules/graph-node/embedded-exporter.yml +++ b/dist/rules/graph-node/embedded-exporter.yml @@ -41,20 +41,20 @@ groups: summary: Provider failed because get genesis timeout (instance {{ $labels.instance }}) description: "Timeout to get genesis for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: StoreConnectionIsTooSlow + - alert: StoreConnectionSlow expr: 'store_connection_wait_time_ms > 10' for: 0m labels: severity: warning annotations: - summary: Store connection is too slow (instance {{ $labels.instance }}) + summary: Store connection slow (instance {{ $labels.instance }}) description: "Store connection is too slow to `{{$labels.pool}}` pool, `{{$labels.shard}}` shard in Graph node `{{$labels.instance}}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: StoreConnectionIsTooSlow + - alert: StoreConnectionVerySlow expr: 'store_connection_wait_time_ms > 20' for: 0m labels: severity: critical annotations: - summary: Store connection is too slow (instance {{ $labels.instance }}) - description: "Store connection is too slow to `{{$labels.pool}}` pool, `{{$labels.shard}}` shard in Graph node `{{$labels.instance}}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: Store connection very slow (instance {{ $labels.instance }}) + description: "Store connection is very slow to `{{$labels.pool}}` pool, `{{$labels.shard}}` shard in Graph node `{{$labels.instance}}`\n VALUE = {{ $value }}\n LABELS = {{ 
$labels }}" diff --git a/dist/rules/ipmi/ipmi-exporter.yml b/dist/rules/ipmi/ipmi-exporter.yml index 4aaa01e..7e34786 100644 --- a/dist/rules/ipmi/ipmi-exporter.yml +++ b/dist/rules/ipmi/ipmi-exporter.yml @@ -118,7 +118,7 @@ groups: # Catches any sensor type not covered by the specific temperature/fan/voltage/current/power alerts. - alert: IpmiGenericSensorCritical expr: 'ipmi_sensor_state == 2' - for: 0m + for: 5m labels: severity: critical annotations: diff --git a/dist/rules/keycloak/aerogear-keycloak-metrics-spi.yml b/dist/rules/keycloak/aerogear-keycloak-metrics-spi.yml index 0aa767c..57e271c 100644 --- a/dist/rules/keycloak/aerogear-keycloak-metrics-spi.yml +++ b/dist/rules/keycloak/aerogear-keycloak-metrics-spi.yml @@ -56,9 +56,9 @@ groups: summary: Keycloak high registration failure rate (instance {{ $labels.instance }}) description: "More than 10% of registration attempts are failing in realm {{ $labels.realm }} (current value: {{ $value | printf \"%.1f\" }}%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - # Threshold of 2 seconds is a rough default. Adjust based on your performance requirements. + # keycloak_request_duration is in milliseconds. Threshold of 2000ms (2 seconds) is a rough default. 
- alert: KeycloakSlowRequestResponseTime - expr: 'sum by (method) (rate(keycloak_request_duration_sum[5m])) / sum by (method) (rate(keycloak_request_duration_count[5m])) > 2 and sum by (method) (rate(keycloak_request_duration_count[5m])) > 0' + expr: 'sum by (method) (rate(keycloak_request_duration_sum[5m])) / sum by (method) (rate(keycloak_request_duration_count[5m])) > 2000 and sum by (method) (rate(keycloak_request_duration_count[5m])) > 0' for: 5m labels: severity: warning diff --git a/dist/rules/linkerd/embedded-exporter.yml b/dist/rules/linkerd/embedded-exporter.yml index 6afaaf4..054e461 100644 --- a/dist/rules/linkerd/embedded-exporter.yml +++ b/dist/rules/linkerd/embedded-exporter.yml @@ -12,4 +12,4 @@ groups: severity: warning annotations: summary: Linkerd high error rate (instance {{ $labels.instance }}) - description: "Linkerd error rate for {{ $labels.deployment | $labels.statefulset | $labels.daemonset }} is over 10%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Linkerd error rate for {{ $labels.deployment }}{{ $labels.statefulset }}{{ $labels.daemonset }} is over 10%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/dist/rules/openstack/openstack-exporter.yml b/dist/rules/openstack/openstack-exporter.yml index d55688b..a75ed7a 100644 --- a/dist/rules/openstack/openstack-exporter.yml +++ b/dist/rules/openstack/openstack-exporter.yml @@ -24,7 +24,7 @@ groups: description: "Nova agent {{ $labels.hostname }} ({{ $labels.service }}) is down in zone {{ $labels.zone }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: OpenstackNeutronAgentDown - expr: 'openstack_neutron_agent_state{adminState="enabled"} == 0' + expr: 'openstack_neutron_agent_state{adminState="up"} == 0' for: 2m labels: severity: critical diff --git a/dist/rules/oracle-database/iamseth-oracledb-exporter.yml b/dist/rules/oracle-database/iamseth-oracledb-exporter.yml index e5dce0f..2969215 100644 --- 
a/dist/rules/oracle-database/iamseth-oracledb-exporter.yml +++ b/dist/rules/oracle-database/iamseth-oracledb-exporter.yml @@ -65,7 +65,7 @@ groups: # Threshold is highly workload-dependent. Adjust 200 to suit your environment. - alert: OracleDbTooManyActiveSessions - expr: 'oracledb_sessions_activity{status="ACTIVE", type="USER"} > 200' + expr: 'oracledb_sessions_value{status="ACTIVE", type="USER"} > 200' for: 5m labels: severity: warning @@ -73,10 +73,9 @@ groups: summary: Oracle DB too many active sessions (instance {{ $labels.instance }}) description: "Oracle Database on {{ $labels.instance }} has too many active user sessions (current value: {{ $value }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - # High user I/O wait time indicates storage performance issues (slow disks, SAN latency, etc.). - # The metric is in centiseconds per second. Threshold 300 means 3 seconds of I/O wait per second of wall time. + # The metric from v$waitclassmetric is already a normalized rate (centiseconds per second). Threshold 300 means 3 seconds of I/O wait per second of wall time. - alert: OracleDbHighWaitTime(userI/o) - expr: 'rate(oracledb_wait_time_user_io[5m]) > 300' + expr: 'oracledb_wait_time_user_io > 300' for: 5m labels: severity: warning diff --git a/dist/rules/process-exporter/process-exporter.yml b/dist/rules/process-exporter/process-exporter.yml index 8603ede..e8ea717 100644 --- a/dist/rules/process-exporter/process-exporter.yml +++ b/dist/rules/process-exporter/process-exporter.yml @@ -7,9 +7,9 @@ groups: - alert: ProcessExporterGroupDown expr: 'namedprocess_namegroup_num_procs == 0' - for: 2m + for: 5m labels: - severity: critical + severity: warning annotations: summary: Process exporter group down (instance {{ $labels.instance }}) description: "No processes found for group {{ $labels.groupname }}. The service may have stopped. 
(instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" @@ -63,7 +63,7 @@ groups: description: "Process group {{ $labels.groupname }} is using {{ $value | humanize }}B of swap. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ProcessExporterZombieProcesses - expr: 'namedprocess_namegroup_states{state="Zombie"} > 0' + expr: 'namedprocess_namegroup_states{state="Zombie"} > 5' for: 5m labels: severity: warning @@ -71,9 +71,9 @@ groups: summary: Process exporter zombie processes (instance {{ $labels.instance }}) description: "Process group {{ $labels.groupname }} has {{ $value }} zombie processes. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - # Threshold of 10000 switches/s is a rough default. Adjust based on the workload profile. + # Filters to voluntary switches only — involuntary switches are normal under CPU contention. Threshold of 50000/s is a rough default. Adjust based on workload. - alert: ProcessExporterHighContextSwitching - expr: 'rate(namedprocess_namegroup_context_switches_total[5m]) > 10000' + expr: 'rate(namedprocess_namegroup_context_switches_total{ctxswitchtype="voluntary"}[5m]) > 50000' for: 5m labels: severity: warning diff --git a/dist/rules/proxmox-ve/prometheus-pve-exporter.yml b/dist/rules/proxmox-ve/prometheus-pve-exporter.yml index d0fbe27..2bfb8bb 100644 --- a/dist/rules/proxmox-ve/prometheus-pve-exporter.yml +++ b/dist/rules/proxmox-ve/prometheus-pve-exporter.yml @@ -36,7 +36,7 @@ groups: description: "Proxmox VE CPU usage is above 90% on {{ $labels.id }}. 
Current value: {{ $value | printf \"%.2f\" }}%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PveHighMemoryUsage - expr: 'pve_memory_usage_bytes / pve_memory_size_bytes * 100 > 90' + expr: 'pve_memory_usage_bytes / pve_memory_size_bytes * 100 > 90 and pve_memory_size_bytes > 0' for: 5m labels: severity: warning diff --git a/dist/rules/python/python-exporter.yml b/dist/rules/python/python-exporter.yml index 2230ae5..d4211d5 100644 --- a/dist/rules/python/python-exporter.yml +++ b/dist/rules/python/python-exporter.yml @@ -33,6 +33,7 @@ groups: summary: Python file descriptors exhaustion (instance {{ $labels.instance }}) description: "Python process is running out of file descriptors (> 90% used)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Gen2 collection rate > 1/s is very high. In most applications, gen2 runs are infrequent. Adjust threshold based on your workload. - alert: PythonGcGeneration2CollectionsHigh expr: 'rate(python_gc_collections_total{generation="2"}[5m]) > 1' for: 5m diff --git a/dist/rules/ruby/ruby-exporter.yml b/dist/rules/ruby/ruby-exporter.yml index 0b526cb..1ac782e 100644 --- a/dist/rules/ruby/ruby-exporter.yml +++ b/dist/rules/ruby/ruby-exporter.yml @@ -24,6 +24,7 @@ groups: summary: Ruby heap free slots high (instance {{ $labels.instance }}) description: "Ruby heap has too many free slots (> 500k), memory fragmentation after large allocations\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Major GC rate > 5/s is extremely high. Consider lowering to > 1 or > 2 for earlier detection. 
- alert: RubyMajorGcRateHigh expr: 'rate(ruby_major_gc_ops_total[5m]) > 5' for: 5m diff --git a/dist/rules/snmp/snmp-exporter.yml b/dist/rules/snmp/snmp-exporter.yml index c7324e9..920f714 100644 --- a/dist/rules/snmp/snmp-exporter.yml +++ b/dist/rules/snmp/snmp-exporter.yml @@ -46,7 +46,7 @@ groups: summary: SNMP interface high outbound error rate (instance {{ $labels.instance }}) description: "Interface {{ $labels.ifDescr }} on {{ $labels.instance }} has an outbound error rate above 5%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - # Threshold is a rough default. Adjust based on your link capacity and traffic patterns. + # Threshold is a rough default. ifSpeed is a Gauge32 that maxes out at ~4.29 Gbps. For 10G+ interfaces, use ifHighSpeed (in Mbps) instead. - alert: SnmpInterfaceHighBandwidthUsageInbound expr: 'rate(ifHCInOctets{job=~"snmp.*"}[5m]) * 8 / ifSpeed > 0.80 and ifSpeed > 0' for: 15m @@ -56,7 +56,7 @@ groups: summary: SNMP interface high bandwidth usage inbound (instance {{ $labels.instance }}) description: "Interface {{ $labels.ifDescr }} on {{ $labels.instance }} inbound utilization is above 80%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - # Threshold is a rough default. Adjust based on your link capacity and traffic patterns. + # Threshold is a rough default. ifSpeed is a Gauge32 that maxes out at ~4.29 Gbps. For 10G+ interfaces, use ifHighSpeed (in Mbps) instead. - alert: SnmpInterfaceHighBandwidthUsageOutbound expr: 'rate(ifHCOutOctets{job=~"snmp.*"}[5m]) * 8 / ifSpeed > 0.80 and ifSpeed > 0' for: 15m diff --git a/dist/rules/spinnaker/embedded-exporter.yml b/dist/rules/spinnaker/embedded-exporter.yml index e2b2e36..dac2885 100644 --- a/dist/rules/spinnaker/embedded-exporter.yml +++ b/dist/rules/spinnaker/embedded-exporter.yml @@ -27,7 +27,7 @@ groups: # The 30s threshold is a rough default. Adjust based on your pipeline SLOs. 
- alert: SpinnakerOrcaQueueMessageLagHigh - expr: 'rate(queue_message_lag_seconds_sum[5m]) / rate(queue_message_lag_seconds_count[5m]) > 30' + expr: 'rate(queue_message_lag_seconds_sum[5m]) / rate(queue_message_lag_seconds_count[5m]) > 30 and rate(queue_message_lag_seconds_count[5m]) > 0' for: 5m labels: severity: warning diff --git a/dist/rules/systemd/systemd-exporter.yml b/dist/rules/systemd/systemd-exporter.yml index ab71897..f8765e5 100644 --- a/dist/rules/systemd/systemd-exporter.yml +++ b/dist/rules/systemd/systemd-exporter.yml @@ -34,7 +34,7 @@ groups: description: "Systemd service {{ $labels.name }} has restarted {{ $value }} times in the last hour. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: SystemdUnitTasksNearLimit - expr: 'systemd_unit_tasks_current / systemd_unit_tasks_max > 0.9 and systemd_unit_tasks_max > 0' + expr: 'systemd_unit_tasks_current / ignoring(type) systemd_unit_tasks_max > 0.9 and ignoring(type) systemd_unit_tasks_max > 0' for: 5m labels: severity: warning @@ -44,7 +44,7 @@ groups: - alert: SystemdSocketRefusedConnections expr: 'increase(systemd_socket_refused_connections_total[5m]) > 0' - for: 0m + for: 2m labels: severity: warning annotations: @@ -54,7 +54,7 @@ groups: # Threshold of 100 connections is arbitrary. Adjust to your workload.
- alert: SystemdSocketHighConnections expr: 'systemd_socket_current_connections > 100' - for: 0m + for: 2m labels: severity: warning annotations: diff --git a/dist/rules/wireguard/mindflavor-prometheus-wireguard-exporter.yml b/dist/rules/wireguard/mindflavor-prometheus-wireguard-exporter.yml index 71a5ba1..b197a57 100644 --- a/dist/rules/wireguard/mindflavor-prometheus-wireguard-exporter.yml +++ b/dist/rules/wireguard/mindflavor-prometheus-wireguard-exporter.yml @@ -17,6 +17,7 @@ groups: summary: WireGuard peer handshake too old (instance {{ $labels.instance }}) description: "WireGuard peer {{ $labels.public_key }} on interface {{ $labels.interface }} has not had a handshake for over 5 minutes. The tunnel may be down.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # This alert will fire for all offline mobile/laptop peers. Consider filtering by expected-online peers. - alert: WireguardPeerHandshakeNeverEstablished expr: 'wireguard_latest_handshake_seconds == 0' for: 5m