From ed1515015a55bee02fe057c2e78317371682e48f Mon Sep 17 00:00:00 2001 From: samber Date: Mon, 6 Apr 2026 18:38:45 +0000 Subject: [PATCH] Publish --- .../flink-prometheus-reporter.yml | 14 +++--- .../apache/lusitaniae-apache-exporter.yml | 2 +- dist/rules/apc-ups/apcupsd_exporter.yml | 2 +- dist/rules/blackbox/blackbox-exporter.yml | 4 +- dist/rules/caddy/embedded-exporter.yml | 4 +- .../cassandra/criteo-cassandra-exporter.yml | 11 +++-- .../instaclustr-cassandra-exporter.yml | 4 +- dist/rules/ceph/embedded-exporter.yml | 23 ++++++---- dist/rules/cert-manager/embedded-exporter.yml | 5 +- dist/rules/cilium/embedded-exporter.yml | 34 +++++++------- dist/rules/clickhouse/embedded-exporter.yml | 6 +-- dist/rules/cortex/embedded-exporter.yml | 10 ++-- .../gesellix-couchdb-prometheus-exporter.yml | 4 +- .../digitalocean/digitalocean-exporter.yml | 2 +- .../docker-containers/google-cadvisor.yml | 2 +- dist/rules/ebpf/ebpf-exporter.yml | 2 +- ...theus-community-elasticsearch-exporter.yml | 5 +- dist/rules/envoy/embedded-exporter.yml | 4 +- dist/rules/etcd/embedded-exporter.yml | 9 ++-- .../freeswitch/znerol-freeswitch-exporter.yml | 4 +- dist/rules/gitlab-ci/gitaly.yml | 10 ++-- .../gitlab-ci/gitlab-built-in-exporter.yml | 4 +- dist/rules/golang/golang-exporter.yml | 24 ++++++---- .../rules/grafana-mimir/embedded-exporter.yml | 46 ++++++++++++------- .../rules/grafana-tempo/embedded-exporter.yml | 9 ++-- dist/rules/graph-node/embedded-exporter.yml | 2 + dist/rules/hadoop/jmx_exporter.yml | 10 +++- dist/rules/haproxy/embedded-exporter-v2.yml | 16 +++---- dist/rules/haproxy/haproxy-exporter-v1.yml | 20 ++++---- .../hashicorp-vault/embedded-exporter.yml | 10 ++-- .../rules/host-and-hardware/node-exporter.yml | 15 +++--- dist/rules/istio/embedded-exporter.yml | 27 ++++++----- dist/rules/jenkins/metric-plugin.yml | 2 +- dist/rules/kafka/danielqsj-kafka-exporter.yml | 2 +- dist/rules/kubernetes/kubestate-exporter.yml | 2 +- dist/rules/loki/embedded-exporter.yml | 10 ++-- 
dist/rules/memcached/memcached-exporter.yml | 8 ++-- dist/rules/mysql/mysqld-exporter.yml | 11 +++-- dist/rules/nats/nats-exporter.yml | 7 ++- dist/rules/nomad/embedded-exporter.yml | 8 ++-- dist/rules/openstack/openstack-exporter.yml | 1 + .../embedded-exporter.yml | 14 ++++-- dist/rules/postgresql/postgres-exporter.yml | 6 ++- .../embedded-exporter.yml | 2 +- dist/rules/promtail/embedded-exporter.yml | 2 +- dist/rules/pulsar/embedded-exporter.yml | 18 +++++--- dist/rules/python/python-exporter.yml | 4 +- .../rabbitmq/kbudde-rabbitmq-exporter.yml | 4 +- dist/rules/rabbitmq/rabbitmq-exporter.yml | 11 +++-- dist/rules/ruby/ruby-exporter.yml | 4 +- .../smartctl-exporter.yml | 12 ++--- .../rules/sidekiq/strech-sidekiq-exporter.yml | 6 +-- dist/rules/snmp/snmp-exporter.yml | 2 +- dist/rules/spinnaker/embedded-exporter.yml | 12 ++--- .../rules/ssl/tls/ribbybibby-ssl-exporter.yml | 8 ++-- dist/rules/systemd/systemd-exporter.yml | 3 +- dist/rules/thanos/thanos-bucket-replicate.yml | 4 +- dist/rules/thanos/thanos-compactor.yml | 4 +- dist/rules/thanos/thanos-query.yml | 7 +-- dist/rules/thanos/thanos-receiver.yml | 8 ++-- dist/rules/thanos/thanos-ruler.yml | 14 +++--- dist/rules/thanos/thanos-sidecar.yml | 2 +- dist/rules/thanos/thanos-store.yml | 2 +- dist/rules/zfs/zfs_exporter.yml | 4 +- .../zookeeper/dabealu-zookeeper-exporter.yml | 4 +- 65 files changed, 311 insertions(+), 241 deletions(-) diff --git a/dist/rules/apache-flink/flink-prometheus-reporter.yml b/dist/rules/apache-flink/flink-prometheus-reporter.yml index 3bd48ce..1d3854d 100644 --- a/dist/rules/apache-flink/flink-prometheus-reporter.yml +++ b/dist/rules/apache-flink/flink-prometheus-reporter.yml @@ -35,7 +35,7 @@ groups: # A single restart may be normal during deployments. Adjust threshold based on restart tolerance. 
- alert: FlinkJobRestartIncreasing - expr: 'increase(flink_jobmanager_job_numRestarts[5m]) > 1' + expr: 'delta(flink_jobmanager_job_numRestarts[5m]) > 1' for: 5m labels: severity: warning @@ -44,7 +44,7 @@ groups: description: "Flink job {{ $labels.job_name }} has restarted {{ $value }} times in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: FlinkCheckpointFailures - expr: 'increase(flink_jobmanager_job_numberOfFailedCheckpoints[10m]) > 1' + expr: 'delta(flink_jobmanager_job_numberOfFailedCheckpoints[10m]) > 1' for: 5m labels: severity: warning @@ -82,8 +82,9 @@ groups: summary: Flink task high backpressure time (instance {{ $labels.instance }}) description: "Flink task {{ $labels.task_name }} is spending {{ $value | humanize }}ms/sec in backpressure.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Flink TaskManagers manage their own memory pool. High JVM heap usage (outside managed memory) may indicate memory leaks or misconfiguration. - alert: FlinkTaskmanagerHeapMemoryHigh - expr: 'flink_taskmanager_Status_JVM_Memory_Heap_Used / flink_taskmanager_Status_JVM_Memory_Heap_Max > 0.9' + expr: 'flink_taskmanager_Status_JVM_Memory_Heap_Used / flink_taskmanager_Status_JVM_Memory_Heap_Max > 0.9 and flink_taskmanager_Status_JVM_Memory_Heap_Max > 0' for: 5m labels: severity: warning @@ -92,7 +93,7 @@ groups: description: "Flink TaskManager {{ $labels.instance }} heap memory usage is above 90%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: FlinkJobmanagerHeapMemoryHigh - expr: 'flink_jobmanager_Status_JVM_Memory_Heap_Used / flink_jobmanager_Status_JVM_Memory_Heap_Max > 0.9' + expr: 'flink_jobmanager_Status_JVM_Memory_Heap_Used / flink_jobmanager_Status_JVM_Memory_Heap_Max > 0.9 and flink_jobmanager_Status_JVM_Memory_Heap_Max > 0' for: 5m labels: severity: warning @@ -100,9 +101,10 @@ groups: summary: Flink JobManager heap memory high (instance {{ $labels.instance }}) description: "Flink JobManager {{ $labels.instance }} heap 
memory usage is above 90%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Flink exposes GC time as a gauge (cumulative milliseconds), so deriv() is used instead of rate(). # Threshold: more than 100ms/sec of GC time (10% of wall clock). Adjust based on your workload. - alert: FlinkTaskmanagerGcTimeHigh - expr: 'rate(flink_taskmanager_Status_JVM_GarbageCollector_All_Time[5m]) > 100' + expr: 'deriv(flink_taskmanager_Status_JVM_GarbageCollector_All_Time[5m]) > 100' for: 5m labels: severity: warning @@ -112,7 +114,7 @@ groups: # Only fires for tasks that have previously received records, to avoid false positives during startup. - alert: FlinkNoRecordsProcessed - expr: 'rate(flink_taskmanager_job_task_numRecordsIn[5m]) == 0 and flink_taskmanager_job_task_numRecordsIn > 0' + expr: 'delta(flink_taskmanager_job_task_numRecordsIn[5m]) == 0 and flink_taskmanager_job_task_numRecordsIn > 0' for: 5m labels: severity: warning diff --git a/dist/rules/apache/lusitaniae-apache-exporter.yml b/dist/rules/apache/lusitaniae-apache-exporter.yml index b17e11b..c633686 100644 --- a/dist/rules/apache/lusitaniae-apache-exporter.yml +++ b/dist/rules/apache/lusitaniae-apache-exporter.yml @@ -27,7 +27,7 @@ groups: expr: 'apache_uptime_seconds_total / 60 < 1' for: 0m labels: - severity: warning + severity: info annotations: summary: Apache restart (instance {{ $labels.instance }}) description: "Apache has just been restarted.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/dist/rules/apc-ups/apcupsd_exporter.yml b/dist/rules/apc-ups/apcupsd_exporter.yml index dcc192f..9918eb5 100644 --- a/dist/rules/apc-ups/apcupsd_exporter.yml +++ b/dist/rules/apc-ups/apcupsd_exporter.yml @@ -33,7 +33,7 @@ groups: description: "UPS now running on battery (since {{$value | humanizeDuration}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ApcUpsLowBatteryVoltage - expr: '(apcupsd_battery_volts / apcupsd_battery_nominal_volts) < 0.95' + expr: '(apcupsd_battery_volts / 
apcupsd_battery_nominal_volts) < 0.95 and apcupsd_battery_nominal_volts > 0' for: 0m labels: severity: warning diff --git a/dist/rules/blackbox/blackbox-exporter.yml b/dist/rules/blackbox/blackbox-exporter.yml index 48f69b0..2022918 100644 --- a/dist/rules/blackbox/blackbox-exporter.yml +++ b/dist/rules/blackbox/blackbox-exporter.yml @@ -7,7 +7,7 @@ groups: - alert: BlackboxProbeFailed expr: 'probe_success == 0' - for: 0m + for: 1m labels: severity: critical annotations: @@ -34,7 +34,7 @@ groups: - alert: BlackboxProbeHttpFailure expr: 'probe_http_status_code <= 199 OR probe_http_status_code >= 400' - for: 0m + for: 1m labels: severity: critical annotations: diff --git a/dist/rules/caddy/embedded-exporter.yml b/dist/rules/caddy/embedded-exporter.yml index 5348c0e..045e2ee 100644 --- a/dist/rules/caddy/embedded-exporter.yml +++ b/dist/rules/caddy/embedded-exporter.yml @@ -6,13 +6,13 @@ groups: rules: - alert: CaddyReverseProxyDown - expr: 'count(caddy_reverse_proxy_upstreams_healthy) by (upstream) == 0' + expr: 'caddy_reverse_proxy_upstreams_healthy == 0' for: 0m labels: severity: critical annotations: summary: Caddy Reverse Proxy Down (instance {{ $labels.instance }}) - description: "All Caddy reverse proxies are down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Caddy reverse proxy upstream {{ $labels.upstream }} is unhealthy\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CaddyHighHttp4xxErrorRateService expr: 'sum(rate(caddy_http_request_duration_seconds_count{code=~"4.."}[3m])) by (instance) / sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) * 100 > 5 and sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) > 0' diff --git a/dist/rules/cassandra/criteo-cassandra-exporter.yml b/dist/rules/cassandra/criteo-cassandra-exporter.yml index 02434b1..4a25ada 100644 --- a/dist/rules/cassandra/criteo-cassandra-exporter.yml +++ b/dist/rules/cassandra/criteo-cassandra-exporter.yml @@ -33,7 +33,7 @@ groups: 
description: "High viewwrite latency on {{ $labels.instance }} cassandra node\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CassandraAuthenticationFailures - expr: 'rate(cassandra_stats{name="org:apache:cassandra:metrics:client:authfailure:count"}[1m]) > 5' + expr: 'delta(cassandra_stats{name="org:apache:cassandra:metrics:client:authfailure:count"}[1m]) > 5' for: 2m labels: severity: warning @@ -97,7 +97,7 @@ groups: description: "Some Cassandra repair tasks are blocked\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CassandraConnectionTimeoutsTotal(criteo) - expr: 'rate(cassandra_stats{name="org:apache:cassandra:metrics:connection:totaltimeouts:count"}[1m]) > 5' + expr: 'delta(cassandra_stats{name="org:apache:cassandra:metrics:connection:totaltimeouts:count"}[1m]) > 5' for: 2m labels: severity: critical @@ -142,7 +142,7 @@ groups: description: "Read failures have occurred because too many nodes are unavailable\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CassandraClientRequestWriteFailure(criteo) - expr: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:write:failures:oneminuterate"} > 0' + expr: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:write:failures:oneminuterate"} > 0.05' for: 0m labels: severity: critical @@ -151,7 +151,7 @@ groups: description: "A lot of write failures encountered. A write failure is a non-timeout exception encountered during a write request. Examine the reason map to find to the root cause. 
The most common cause for this type of error is when batch sizes are too large.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CassandraClientRequestReadFailure(criteo) - expr: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:read:failures:oneminuterate"} > 0' + expr: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:read:failures:oneminuterate"} > 0.05' for: 0m labels: severity: critical @@ -159,11 +159,12 @@ groups: summary: Cassandra client request read failure (Criteo) (instance {{ $labels.instance }}) description: "A lot of read failures encountered. A read failure is a non-timeout exception encountered during a read request. Examine the reason map to find to the root cause. The most common cause for this type of error is when batch sizes are too large.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # A low key cache hit rate increases disk I/O. Threshold is workload-dependent — adjust based on your data access patterns. - alert: CassandraCacheHitRateKeyCache expr: 'cassandra_stats{name="org:apache:cassandra:metrics:cache:keycache:hitrate:value"} < .85' for: 2m labels: - severity: critical + severity: warning annotations: summary: Cassandra cache hit rate key cache (instance {{ $labels.instance }}) description: "Key cache hit rate is below 85%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/dist/rules/cassandra/instaclustr-cassandra-exporter.yml b/dist/rules/cassandra/instaclustr-cassandra-exporter.yml index 1ed8fa6..4c875ee 100644 --- a/dist/rules/cassandra/instaclustr-cassandra-exporter.yml +++ b/dist/rules/cassandra/instaclustr-cassandra-exporter.yml @@ -97,7 +97,7 @@ groups: description: "Some Cassandra client requests are unavailable to read - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CassandraClientRequestWriteFailure(instaclustr) - expr: 'increase(cassandra_client_request_failures_total{operation="write"}[1m]) > 0' + expr: 
'increase(cassandra_client_request_failures_total{operation="write"}[1m]) > 5' for: 2m labels: severity: critical @@ -106,7 +106,7 @@ groups: description: "Write failures have occurred, ensure there are not too many unavailable nodes - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CassandraClientRequestReadFailure(instaclustr) - expr: 'increase(cassandra_client_request_failures_total{operation="read"}[1m]) > 0' + expr: 'increase(cassandra_client_request_failures_total{operation="read"}[1m]) > 5' for: 2m labels: severity: critical diff --git a/dist/rules/ceph/embedded-exporter.yml b/dist/rules/ceph/embedded-exporter.yml index 48d433b..c56612d 100644 --- a/dist/rules/ceph/embedded-exporter.yml +++ b/dist/rules/ceph/embedded-exporter.yml @@ -5,9 +5,11 @@ groups: rules: + # ceph_health_status: 0=HEALTH_OK, 1=HEALTH_WARN, 2=HEALTH_ERR. + # This rule fires on any non-OK state. Split into separate warning/critical rules by using ==1 and ==2 thresholds if needed. - alert: CephState expr: 'ceph_health_status != 0' - for: 0m + for: 1m labels: severity: critical annotations: @@ -34,15 +36,16 @@ groups: - alert: CephOsdDown expr: 'ceph_osd_up == 0' - for: 0m + for: 1m labels: severity: critical annotations: summary: Ceph OSD Down (instance {{ $labels.instance }}) description: "Ceph Object Storage Daemon Down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Threshold of 5000ms (5 seconds). Adjust based on your expected OSD performance. - alert: CephHighOsdLatency - expr: 'ceph_osd_perf_apply_latency_seconds > 5' + expr: 'ceph_osd_apply_latency_ms > 5000' for: 1m labels: severity: warning @@ -50,14 +53,16 @@ groups: summary: Ceph high OSD latency (instance {{ $labels.instance }}) description: "Ceph Object Storage Daemon latency is high. 
Please check if it doesn't stuck in weird state.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: CephOsdLowSpace - expr: 'ceph_osd_utilization > 90' - for: 2m + # Ceph internally triggers OSD_NEARFULL based on the nearfull_ratio (default 85%). + # ceph_health_detail can also be used for more granular OSD space alerts. + - alert: CephOsdNearFull + expr: 'ceph_health_detail{name="OSD_NEARFULL"} == 1' + for: 5m labels: severity: warning annotations: - summary: Ceph OSD low space (instance {{ $labels.instance }}) - description: "Ceph Object Storage Daemon is going out of space. Please add more disks.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: Ceph OSD near full (instance {{ $labels.instance }}) + description: "A Ceph OSD is dangerously full. Please add more disks.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CephOsdReweighted expr: 'ceph_osd_weight < 1' @@ -115,7 +120,7 @@ groups: - alert: CephPgUnavailable expr: 'ceph_pg_total - ceph_pg_active > 0' - for: 0m + for: 1m labels: severity: critical annotations: diff --git a/dist/rules/cert-manager/embedded-exporter.yml b/dist/rules/cert-manager/embedded-exporter.yml index 71edcf9..23a23d3 100644 --- a/dist/rules/cert-manager/embedded-exporter.yml +++ b/dist/rules/cert-manager/embedded-exporter.yml @@ -33,9 +33,10 @@ groups: summary: Cert-Manager certificate not ready (instance {{ $labels.instance }}) description: "The certificate {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is not ready to serve traffic.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - # In cert-manager 1.19+, the metric was renamed (dropped http_ prefix). Verify metric name against your version. + # Metric renamed in cert-manager v1.19+ (dropped the http_ prefix): certmanager_acme_client_request_count. + # For cert-manager < v1.19, use: certmanager_http_acme_client_request_count. 
- alert: Cert-managerHittingAcmeRateLimits - expr: 'sum by (host) (rate(certmanager_http_acme_client_request_count{status="429"}[5m])) > 0' + expr: 'sum by (host) (rate(certmanager_acme_client_request_count{status="429"}[5m])) > 0' for: 5m labels: severity: critical diff --git a/dist/rules/cilium/embedded-exporter.yml b/dist/rules/cilium/embedded-exporter.yml index bd6f786..01a5d34 100644 --- a/dist/rules/cilium/embedded-exporter.yml +++ b/dist/rules/cilium/embedded-exporter.yml @@ -45,7 +45,7 @@ groups: description: "Cilium agent {{ $labels.pod }} has {{ $value }} endpoint(s) in invalid state.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CiliumAgentEndpointRegenerationFailures - expr: 'sum(rate(cilium_endpoint_regenerations_total{outcome="fail"}[5m])) by (pod) > 0' + expr: 'sum(rate(cilium_endpoint_regenerations_total{outcome="fail"}[5m])) by (pod) > 0.05' for: 5m labels: severity: warning @@ -54,7 +54,7 @@ groups: description: "Cilium agent {{ $labels.pod }} is failing to regenerate endpoints. 
Network policy enforcement may be stale.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CiliumAgentEndpointUpdateFailure - expr: 'sum(rate(cilium_k8s_client_api_calls_total{method=~"(PUT|POST|PATCH)", endpoint="endpoint", return_code!~"2[0-9][0-9]"}[5m])) by (pod, method, return_code) > 0' + expr: 'sum(rate(cilium_k8s_client_api_calls_total{method=~"(PUT|POST|PATCH)", endpoint="endpoint", return_code!~"2[0-9][0-9]"}[5m])) by (pod, method, return_code) > 0.05' for: 5m labels: severity: warning @@ -63,7 +63,7 @@ groups: description: "Cilium agent {{ $labels.pod }} is failing K8s endpoint update API calls ({{ $labels.method }} {{ $labels.return_code }}).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CiliumAgentEndpointCreateFailure - expr: 'sum(rate(cilium_api_limiter_processed_requests_total{api_call=~"endpoint-create", outcome="fail"}[1m])) by (pod, api_call) > 0' + expr: 'sum(rate(cilium_api_limiter_processed_requests_total{api_call=~"endpoint-create", outcome="fail"}[1m])) by (pod, api_call) > 0.05' for: 5m labels: severity: info @@ -72,7 +72,7 @@ groups: description: "Cilium agent {{ $labels.pod }} is failing CNI endpoint-create calls. New pods may fail to get networking.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CiliumAgentMapOperationFailures - expr: 'sum(rate(cilium_bpf_map_ops_total{outcome="fail"}[5m])) by (map_name, pod) > 0' + expr: 'sum(rate(cilium_bpf_map_ops_total{outcome="fail"}[5m])) by (map_name, pod) > 0.05' for: 5m labels: severity: warning @@ -100,7 +100,7 @@ groups: description: "Cilium agent {{ $labels.pod }} conntrack table is full, causing packet drops. 
Increase CT map size or investigate connection leaks.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CiliumAgentConntrackFailedGarbageCollection - expr: 'sum(rate(cilium_datapath_conntrack_gc_runs_total{status="uncompleted"}[5m])) by (pod) > 0' + expr: 'sum(rate(cilium_datapath_conntrack_gc_runs_total{status="uncompleted"}[5m])) by (pod) > 0.05' for: 5m labels: severity: warning @@ -128,7 +128,7 @@ groups: description: "Cilium agent {{ $labels.pod }} is dropping packets due to policy denial. Verify network policies are correct.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CiliumAgentHighDropRate - expr: 'sum(rate(cilium_drop_count_total{reason!~"Policy denied"}[5m])) by (pod, reason) > 0' + expr: 'sum(rate(cilium_drop_count_total{reason!~"Policy denied"}[5m])) by (pod, reason) > 0.05' for: 5m labels: severity: warning @@ -146,7 +146,7 @@ groups: description: "Cilium agent {{ $labels.pod }} policy BPF map is above 90% utilization. New policies may fail to apply.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CiliumAgentPolicyImportErrors - expr: 'sum(rate(cilium_policy_change_total{outcome="fail"}[5m])) by (pod) > 0' + expr: 'sum(rate(cilium_policy_change_total{outcome="fail"}[5m])) by (pod) > 0.05' for: 5m labels: severity: warning @@ -156,7 +156,7 @@ groups: # Threshold of 60s is a rough default. Adjust based on cluster size and policy complexity. - alert: CiliumAgentPolicyImplementationDelay - expr: 'histogram_quantile(0.99, sum(rate(cilium_policy_implementation_delay[5m])) by (le, pod)) > 60' + expr: 'histogram_quantile(0.99, sum(rate(cilium_policy_implementation_delay_bucket[5m])) by (le, pod)) > 60' for: 5m labels: severity: warning @@ -203,7 +203,7 @@ groups: # Some Cilium versions may not have a status label on this metric. Verify against your Cilium version. 
- alert: CiliumOperatorIpamInterfaceCreationFailures - expr: 'sum(rate(cilium_operator_ipam_interface_creation_ops{status!="success"}[5m])) by () > 0' + expr: 'sum(rate(cilium_operator_ipam_interface_creation_ops{status!="success"}[5m])) by () > 0.05' for: 10m labels: severity: warning @@ -212,7 +212,7 @@ groups: description: "Cilium operator is failing to create IPAM network interfaces. IP allocation may be impacted.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CiliumAgentApiErrors - expr: 'sum(rate(cilium_agent_api_process_time_seconds_count{return_code=~"5[0-9][0-9]"}[5m])) by (pod, return_code) > 0' + expr: 'sum(rate(cilium_agent_api_process_time_seconds_count{return_code=~"5[0-9][0-9]"}[5m])) by (pod, return_code) > 0.05' for: 5m labels: severity: warning @@ -221,7 +221,7 @@ groups: description: "Cilium agent {{ $labels.pod }} API is returning 5xx errors ({{ $labels.return_code }}). Agent may be unhealthy.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CiliumAgentKubernetesClientErrors - expr: 'sum(rate(cilium_k8s_client_api_calls_total{endpoint!="metrics", return_code!~"2[0-9][0-9]"}[5m])) by (pod, endpoint, return_code) > 0' + expr: 'sum(rate(cilium_k8s_client_api_calls_total{endpoint!="metrics", return_code!~"2[0-9][0-9]"}[5m])) by (pod, endpoint, return_code) > 0.05' for: 5m labels: severity: info @@ -239,13 +239,13 @@ groups: description: "Cilium ClusterMesh remote cluster {{ $labels.target_cluster }} is not ready from {{ $labels.source_cluster }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CiliumClustermeshRemoteClusterFailing - expr: 'sum(rate(cilium_clustermesh_remote_cluster_failures[5m])) by (source_cluster, target_cluster) > 0' + expr: 'sum(cilium_clustermesh_remote_cluster_failures) by (source_cluster, target_cluster) > 0' for: 5m labels: severity: critical annotations: summary: Cilium ClusterMesh remote cluster failing (instance {{ $labels.instance }}) - description: "Cilium ClusterMesh connectivity to remote 
cluster {{ $labels.target_cluster }} from {{ $labels.source_cluster }} is failing.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Cilium ClusterMesh connectivity to remote cluster {{ $labels.target_cluster }} from {{ $labels.source_cluster }} is failing ({{ $value }} failures).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CiliumKvstoremeshRemoteClusterNotReady expr: 'count(cilium_kvstoremesh_remote_cluster_readiness_status < 1) by (source_cluster, target_cluster) > 0' @@ -257,16 +257,16 @@ groups: description: "Cilium KVStoreMesh remote cluster {{ $labels.target_cluster }} is not ready from {{ $labels.source_cluster }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CiliumKvstoremeshRemoteClusterFailing - expr: 'sum(rate(cilium_kvstoremesh_remote_cluster_failures[5m])) by (source_cluster, target_cluster) > 0' + expr: 'sum(cilium_kvstoremesh_remote_cluster_failures) by (source_cluster, target_cluster) > 0' for: 5m labels: severity: critical annotations: summary: Cilium KVStoreMesh remote cluster failing (instance {{ $labels.instance }}) - description: "Cilium KVStoreMesh remote cluster {{ $labels.target_cluster }} from {{ $labels.source_cluster }} is experiencing failures.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Cilium KVStoreMesh remote cluster {{ $labels.target_cluster }} from {{ $labels.source_cluster }} is experiencing failures ({{ $value }} failures).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CiliumKvstoremeshSyncErrors - expr: 'sum(rate(cilium_kvstoremesh_kvstore_sync_errors_total[5m])) by (source_cluster) > 0' + expr: 'sum(rate(cilium_kvstoremesh_kvstore_sync_errors_total[5m])) by (source_cluster) > 0.05' for: 5m labels: severity: critical @@ -275,7 +275,7 @@ groups: description: "Cilium KVStoreMesh from {{ $labels.source_cluster }} is experiencing kvstore sync errors.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CiliumHubbleLostEvents - expr: 
'sum(rate(hubble_lost_events_total[5m])) by (pod) > 0' + expr: 'sum(rate(hubble_lost_events_total[5m])) by (pod) > 0.05' for: 5m labels: severity: warning diff --git a/dist/rules/clickhouse/embedded-exporter.yml b/dist/rules/clickhouse/embedded-exporter.yml index 7883b36..2ec1052 100644 --- a/dist/rules/clickhouse/embedded-exporter.yml +++ b/dist/rules/clickhouse/embedded-exporter.yml @@ -135,7 +135,7 @@ groups: description: "Access denied errors have been logged, which could indicate permission issues or unauthorized access attempts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ClickhouseRejectedInsertQueries - expr: 'increase(ClickHouseProfileEvents_RejectedInserts[1m]) > 0' + expr: 'increase(ClickHouseProfileEvents_RejectedInserts[1m]) > 2' for: 1m labels: severity: warning @@ -144,7 +144,7 @@ groups: description: "INSERTs rejected due to too many active data parts. Reduce insert frequency.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ClickhouseDelayedInsertQueries - expr: 'increase(ClickHouseProfileEvents_DelayedInserts[5m]) > 0' + expr: 'increase(ClickHouseProfileEvents_DelayedInserts[5m]) > 10' for: 2m labels: severity: warning @@ -172,7 +172,7 @@ groups: description: "High network usage. 
ClickHouse network usage exceeds 100MB/s.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ClickhouseDistributedRejectedInserts - expr: 'increase(ClickHouseProfileEvents_DistributedRejectedInserts[5m]) > 0' + expr: 'increase(ClickHouseProfileEvents_DistributedRejectedInserts[5m]) > 3' for: 2m labels: severity: critical diff --git a/dist/rules/cortex/embedded-exporter.yml b/dist/rules/cortex/embedded-exporter.yml index ebaaf88..9917da1 100644 --- a/dist/rules/cortex/embedded-exporter.yml +++ b/dist/rules/cortex/embedded-exporter.yml @@ -24,23 +24,23 @@ groups: description: "Cortex not connected to Alertmanager (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Threshold of 0.05/s avoids firing on transient single-event spikes. - - alert: CortexNotificationAreBeingDropped + - alert: CortexNotificationsAreBeingDropped expr: 'rate(cortex_prometheus_notifications_dropped_total[5m]) > 0.05' for: 0m labels: severity: critical annotations: - summary: Cortex notification are being dropped (instance {{ $labels.instance }}) - description: "Cortex notification are being dropped due to errors (instance {{ $labels.instance }}, {{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: Cortex notifications are being dropped (instance {{ $labels.instance }}) + description: "Cortex notifications are being dropped due to errors (instance {{ $labels.instance }}, {{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Threshold of 0.05/s avoids firing on transient single-event spikes. 
- - alert: CortexNotificationError + - alert: CortexNotificationErrors expr: 'rate(cortex_prometheus_notifications_errors_total[5m]) > 0.05' for: 0m labels: severity: critical annotations: - summary: Cortex notification error (instance {{ $labels.instance }}) + summary: Cortex notification errors (instance {{ $labels.instance }}) description: "Cortex is failing when sending alert notifications (instance {{ $labels.instance }}, {{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CortexIngesterUnhealthy diff --git a/dist/rules/couchdb/gesellix-couchdb-prometheus-exporter.yml b/dist/rules/couchdb/gesellix-couchdb-prometheus-exporter.yml index 3ae6ed6..4ac34a1 100644 --- a/dist/rules/couchdb/gesellix-couchdb-prometheus-exporter.yml +++ b/dist/rules/couchdb/gesellix-couchdb-prometheus-exporter.yml @@ -23,6 +23,7 @@ groups: summary: CouchDB atom memory usage critical (instance {{ $labels.instance }}) description: "Atom memory usage is above 90% of limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # The default max_dbs_open is 500. Adjust the threshold (currently 0.9 * 1000) to match your max_dbs_open setting. - alert: CouchdbOpenDatabasesCritical expr: 'couchdb_httpd_open_databases > 0.9 * 1000' for: 5m @@ -32,6 +33,7 @@ groups: summary: CouchDB open databases critical (instance {{ $labels.instance }}) description: "Number of open databases exceeds 90% of node capacity\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Adjust 65535 to match your system's file descriptor limit (ulimit -n). 
- alert: CouchdbOpenOsFilesCritical expr: 'couchdb_httpd_open_os_files > 0.9 * 65535' for: 5m @@ -159,7 +161,7 @@ groups: description: "CouchDB process has restarted recently\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CouchdbCriticalLogEntries - expr: 'increase(couchdb_server_couch_log{level=~"error|critical"}[5m]) > 0' + expr: 'increase(couchdb_server_couch_log{level=~"error|critical"}[5m]) > 5' for: 1m labels: severity: critical diff --git a/dist/rules/digitalocean/digitalocean-exporter.yml b/dist/rules/digitalocean/digitalocean-exporter.yml index 3502587..9f85bd2 100644 --- a/dist/rules/digitalocean/digitalocean-exporter.yml +++ b/dist/rules/digitalocean/digitalocean-exporter.yml @@ -78,7 +78,7 @@ groups: description: "DigitalOcean platform has {{ $value }} active incident(s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: DigitaloceanExporterCollectionErrors - expr: 'increase(digitalocean_errors_total[5m]) > 0' + expr: 'increase(digitalocean_errors_total[5m]) > 3' for: 5m labels: severity: warning diff --git a/dist/rules/docker-containers/google-cadvisor.yml b/dist/rules/docker-containers/google-cadvisor.yml index ecd9d24..6927f36 100644 --- a/dist/rules/docker-containers/google-cadvisor.yml +++ b/dist/rules/docker-containers/google-cadvisor.yml @@ -73,7 +73,7 @@ groups: description: "This alert rule monitors the absolute change in CPU usage within a time window and triggers an alert when the change exceeds 25%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ContainerLowCpuUtilization - expr: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) < 20' + expr: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) < 20 and 
sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) > 0' for: 7d labels: severity: info diff --git a/dist/rules/ebpf/ebpf-exporter.yml b/dist/rules/ebpf/ebpf-exporter.yml index 79c8df0..5c18ab9 100644 --- a/dist/rules/ebpf/ebpf-exporter.yml +++ b/dist/rules/ebpf/ebpf-exporter.yml @@ -16,7 +16,7 @@ groups: description: "eBPF program {{ $labels.id }} failed to attach. The program is not collecting data. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: EbpfExporterDecoderErrors - expr: 'rate(ebpf_exporter_decoder_errors_total[5m]) > 0' + expr: 'rate(ebpf_exporter_decoder_errors_total[5m]) > 0.05' for: 5m labels: severity: warning diff --git a/dist/rules/elasticsearch/prometheus-community-elasticsearch-exporter.yml b/dist/rules/elasticsearch/prometheus-community-elasticsearch-exporter.yml index 77f322f..4ebc820 100644 --- a/dist/rules/elasticsearch/prometheus-community-elasticsearch-exporter.yml +++ b/dist/rules/elasticsearch/prometheus-community-elasticsearch-exporter.yml @@ -142,8 +142,9 @@ groups: summary: Elasticsearch no new documents (instance {{ $labels.instance }}) description: "No new documents for 10 min!\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Threshold of 10ms (0.01s) per indexing operation is a rough default. Adjust based on your document size and cluster performance. 
- alert: ElasticsearchHighIndexingLatency - expr: 'rate(elasticsearch_indices_indexing_index_time_seconds_total[1m]) / rate(elasticsearch_indices_indexing_index_total[1m]) > 0.0005 and rate(elasticsearch_indices_indexing_index_total[1m]) > 0' + expr: 'rate(elasticsearch_indices_indexing_index_time_seconds_total[5m]) / rate(elasticsearch_indices_indexing_index_total[5m]) > 0.01 and rate(elasticsearch_indices_indexing_index_total[5m]) > 0' for: 10m labels: severity: warning @@ -151,6 +152,7 @@ groups: summary: Elasticsearch High Indexing Latency (instance {{ $labels.instance }}) description: "The indexing latency on Elasticsearch cluster is higher than the threshold (current value: {{ $value }}s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Threshold of 10000 ops/s is a rough default. Adjust based on your cluster capacity and expected workload. - alert: ElasticsearchHighIndexingRate expr: 'sum(rate(elasticsearch_indices_indexing_index_total[1m]))> 10000' for: 5m @@ -160,6 +162,7 @@ groups: summary: Elasticsearch High Indexing Rate (instance {{ $labels.instance }}) description: "The indexing rate on Elasticsearch cluster is higher than the threshold.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Threshold of 100 queries/s is very low for most production clusters. Adjust based on your expected query volume. 
- alert: ElasticsearchHighQueryRate expr: 'sum(rate(elasticsearch_indices_search_query_total[1m])) > 100' for: 5m diff --git a/dist/rules/envoy/embedded-exporter.yml b/dist/rules/envoy/embedded-exporter.yml index 07adfad..13e737b 100644 --- a/dist/rules/envoy/embedded-exporter.yml +++ b/dist/rules/envoy/embedded-exporter.yml @@ -66,7 +66,7 @@ groups: severity: warning annotations: summary: Envoy cluster membership degraded (instance {{ $labels.instance }}) - description: "More than 25% of members in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} are unhealthy\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Only {{ $value | printf \"%.1f\" }}% of members in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} are healthy (threshold: 75%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: EnvoyHighClusterUpstreamConnectionFailures expr: 'increase(envoy_cluster_upstream_cx_connect_fail[5m]) > 10' @@ -159,7 +159,7 @@ groups: description: "Circuit breaker is open for cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: EnvoyNoHealthyUpstream - expr: 'increase(envoy_cluster_upstream_cx_none_healthy[5m]) > 0' + expr: 'increase(envoy_cluster_upstream_cx_none_healthy[5m]) > 3' for: 0m labels: severity: critical diff --git a/dist/rules/etcd/embedded-exporter.yml b/dist/rules/etcd/embedded-exporter.yml index 28c4ace..bcfca36 100644 --- a/dist/rules/etcd/embedded-exporter.yml +++ b/dist/rules/etcd/embedded-exporter.yml @@ -61,6 +61,7 @@ groups: summary: Etcd GRPC requests slow (instance {{ $labels.instance }}) description: "GRPC requests slowing down, 99th percentile is over 0.15s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # These etcd_http_* metrics are from the etcd v2 API and do not exist in etcd 3.x. Remove these rules if running etcd 3.x. 
- alert: EtcdHighNumberOfFailedHttpRequestsWarning expr: 'sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.01 and sum(rate(etcd_http_received_total[1m])) BY (method) > 0' for: 2m @@ -70,6 +71,7 @@ groups: summary: Etcd high number of failed HTTP requests warning (instance {{ $labels.instance }}) description: "More than 1% HTTP failure detected in Etcd\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # These etcd_http_* metrics are from the etcd v2 API and do not exist in etcd 3.x. Remove these rules if running etcd 3.x. - alert: EtcdHighNumberOfFailedHttpRequestsCritical expr: 'sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.05 and sum(rate(etcd_http_received_total[1m])) BY (method) > 0' for: 2m @@ -79,6 +81,7 @@ groups: summary: Etcd high number of failed HTTP requests critical (instance {{ $labels.instance }}) description: "More than 5% HTTP failure detected in Etcd\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # This etcd_http_* metric is from the etcd v2 API and does not exist in etcd 3.x. Remove this rule if running etcd 3.x. 
- alert: EtcdHttpRequestsSlow expr: 'histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[1m])) > 0.15' for: 2m @@ -89,7 +92,7 @@ groups: description: "HTTP requests slowing down, 99th percentile is over 0.15s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: EtcdMemberCommunicationSlow - expr: 'histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) > 0.15' + expr: 'histogram_quantile(0.99, sum(rate(etcd_network_peer_round_trip_time_seconds_bucket[5m])) by (instance, le)) > 0.15' for: 2m labels: severity: warning @@ -107,7 +110,7 @@ groups: description: "Etcd server got {{ $value }} failed proposals in the past hour\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: EtcdHighFsyncDurations - expr: 'histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) > 0.5' + expr: 'histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) by (instance, le)) > 0.5' for: 2m labels: severity: warning @@ -116,7 +119,7 @@ groups: description: "Etcd WAL fsync duration increasing, 99th percentile is over 0.5s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: EtcdHighCommitDurations - expr: 'histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[1m])) > 0.25' + expr: 'histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) by (instance, le)) > 0.25' for: 2m labels: severity: warning diff --git a/dist/rules/freeswitch/znerol-freeswitch-exporter.yml b/dist/rules/freeswitch/znerol-freeswitch-exporter.yml index c0b9db6..dd3600b 100644 --- a/dist/rules/freeswitch/znerol-freeswitch-exporter.yml +++ b/dist/rules/freeswitch/znerol-freeswitch-exporter.yml @@ -7,12 +7,12 @@ groups: - alert: FreeswitchDown expr: 'freeswitch_up == 0' - for: 0m + for: 1m labels: severity: critical annotations: summary: Freeswitch down (instance {{ $labels.instance }}) - description: "Freeswitch is unresponsive\n VALUE = {{ $value }}\n 
LABELS = {{ $labels }}" + description: "Freeswitch {{ $labels.instance }} is unresponsive.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: FreeswitchSessionsWarning expr: '(freeswitch_session_active * 100 / freeswitch_session_limit) > 80 and freeswitch_session_limit > 0' diff --git a/dist/rules/gitlab-ci/gitaly.yml b/dist/rules/gitlab-ci/gitaly.yml index adb6f9e..3c24364 100644 --- a/dist/rules/gitlab-ci/gitaly.yml +++ b/dist/rules/gitlab-ci/gitaly.yml @@ -5,9 +5,9 @@ groups: rules: - # grpc_code!="OK" includes non-error codes like NotFound, AlreadyExists. Consider filtering to specific error codes for less noise. + # Matches only genuine error codes. A blanket grpc_code!="OK" filter would also count benign codes like NotFound, AlreadyExists, and Cancelled. - alert: GitlabGitalyHighGrpcErrorRate - expr: 'sum(rate(grpc_server_handled_total{job="gitaly",grpc_code!="OK"}[5m])) / sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) * 100 > 5 and sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) > 0' + expr: 'sum(rate(grpc_server_handled_total{job="gitaly",grpc_code=~"Internal|Unavailable|DeadlineExceeded|ResourceExhausted|Aborted|Unknown|DataLoss"}[5m])) / sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) * 100 > 5 and sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) > 0' for: 5m labels: severity: warning @@ -17,7 +17,6 @@ groups: # ResourceExhausted errors from Gitaly mean Git operations are being rejected due to # concurrency limits. This directly impacts users trying to push, pull, or clone. - # This alert is derived from the GitLab Omnibus default rules. 
- alert: GitlabGitalyResourceExhausted expr: 'sum(rate(grpc_server_handled_total{job="gitaly",grpc_code="ResourceExhausted"}[5m])) / sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) * 100 > 1 and sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) > 0' for: 5m @@ -36,8 +35,9 @@ groups: summary: GitLab Gitaly high RPC latency (instance {{ $labels.instance }}) description: "Gitaly on {{ $labels.instance }} p95 unary RPC latency exceeds 1 second ({{ $value }}s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Brief throttling spikes are normal. Threshold of 0.1s/s (10% of CPU time throttled) filters out transient noise. - alert: GitlabGitalyCpuThrottled - expr: 'rate(gitaly_cgroup_cpu_cfs_throttled_seconds_total[5m]) > 0' + expr: 'rate(gitaly_cgroup_cpu_cfs_throttled_seconds_total[5m]) > 0.1' for: 5m labels: severity: warning @@ -46,7 +46,7 @@ groups: description: "Gitaly processes on {{ $labels.instance }} are being CPU throttled by cgroups.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: GitlabGitalyAuthenticationFailures - expr: 'increase(gitaly_authentications_total{status="failed"}[5m]) > 0' + expr: 'increase(gitaly_authentications_total{status="failed"}[5m]) > 3' for: 0m labels: severity: warning diff --git a/dist/rules/gitlab-ci/gitlab-built-in-exporter.yml b/dist/rules/gitlab-ci/gitlab-built-in-exporter.yml index a0d3f12..b75cfb6 100644 --- a/dist/rules/gitlab-ci/gitlab-built-in-exporter.yml +++ b/dist/rules/gitlab-ci/gitlab-built-in-exporter.yml @@ -138,7 +138,7 @@ groups: # This metric may not exist in all GitLab versions. Verify against your GitLab installation. - alert: GitlabCiPipelineFailuresIncreasing - expr: 'rate(gitlab_ci_pipeline_failure_reasons[5m]) > 0' + expr: 'deriv(gitlab_ci_pipeline_failure_reasons[5m]) > 0.05' for: 10m labels: severity: warning @@ -179,7 +179,7 @@ groups: description: "GitLab Ruby heap fragmentation on {{ $labels.instance }} is {{ $value }}. 
High fragmentation wastes memory.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: GitlabRackUncaughtErrors - expr: 'rate(rack_uncaught_errors_total[5m]) > 0' + expr: 'rate(rack_uncaught_errors_total[5m]) > 0.05' for: 5m labels: severity: warning diff --git a/dist/rules/golang/golang-exporter.yml b/dist/rules/golang/golang-exporter.yml index 13b251c..dfd894c 100644 --- a/dist/rules/golang/golang-exporter.yml +++ b/dist/rules/golang/golang-exporter.yml @@ -57,10 +57,10 @@ groups: summary: Go heap objects count high (instance {{ $labels.instance }}) description: "Go heap has too many live objects (> 10M), high GC pressure\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - # go_memstats_gc_cpu_fraction is deprecated since Go 1.20 and may return 0 in newer versions. - # Consider using runtime/metrics-based alternatives if running Go >= 1.20. + # rate(go_gc_duration_seconds_sum) approximates the fraction of wall-clock time spent in GC. + # This replaces go_memstats_gc_cpu_fraction which was removed in client_golang v1.12+. - alert: GoGcCpuFractionHigh - expr: 'go_memstats_gc_cpu_fraction > 0.05' + expr: 'rate(go_gc_duration_seconds_sum[5m]) > 0.05' for: 5m labels: severity: warning @@ -68,23 +68,27 @@ groups: summary: Go GC CPU fraction high (instance {{ $labels.instance }}) description: "Go GC is consuming too much CPU (> 5%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # A threshold of 100/s only catches catastrophic leaks (30k goroutines in 5m). 10/s catches gradual leaks (~3k in 5m). + # Adjust based on your application's expected concurrency patterns. 
- alert: GoGoroutineSpike - expr: 'deriv(go_goroutines[5m]) > 100' + expr: 'deriv(go_goroutines[5m]) > 10' for: 5m labels: severity: warning annotations: summary: Go goroutine spike (instance {{ $labels.instance }}) - description: "Go goroutine count is growing rapidly\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Go goroutine count is growing rapidly ({{ $value | printf \"%.0f\" }} goroutines/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: GoHeapFragmentation - expr: 'go_memstats_heap_idle_bytes / go_memstats_heap_sys_bytes > 0.9' - for: 5m + # Alerts when heap in-use grows by more than 10MB/s sustained over 10 minutes. + # Adjust threshold based on your workload. + - alert: GoHeapIn-useGrowing + expr: 'deriv(go_memstats_heap_inuse_bytes[10m]) > 1e7' + for: 0m labels: severity: warning annotations: - summary: Go heap fragmentation (instance {{ $labels.instance }}) - description: "Go heap has high idle ratio (> 90%), indicating memory fragmentation\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: Go heap in-use growing (instance {{ $labels.instance }}) + description: "Go heap in-use memory is growing steadily, potential memory leak or under-sized heap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: GoMemoryLeak expr: 'rate(go_memstats_alloc_bytes_total[5m]) > 1e9' diff --git a/dist/rules/grafana-mimir/embedded-exporter.yml b/dist/rules/grafana-mimir/embedded-exporter.yml index d401079..1754fff 100644 --- a/dist/rules/grafana-mimir/embedded-exporter.yml +++ b/dist/rules/grafana-mimir/embedded-exporter.yml @@ -178,8 +178,9 @@ groups: summary: Mimir distributor inflight requests high (instance {{ $labels.instance }}) description: "Mimir distributor {{ $labels.instance }} is using {{ printf \"%.0f\" $value }}% of its inflight push requests limit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Threshold of 0.05/s avoids firing on transient single-event spikes. 
- alert: MimirIngesterTsdbHeadCompactionFailed - expr: 'rate(cortex_ingester_tsdb_compactions_failed_total[5m]) > 0' + expr: 'rate(cortex_ingester_tsdb_compactions_failed_total[5m]) > 0.05' for: 15m labels: severity: critical @@ -187,26 +188,29 @@ groups: summary: Mimir ingester TSDB head compaction failed (instance {{ $labels.instance }}) description: "Mimir ingester {{ $labels.instance }} is failing to compact TSDB head ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Threshold of 0.05/s avoids firing on transient single-event spikes. - alert: MimirIngesterTsdbHeadTruncationFailed - expr: 'rate(cortex_ingester_tsdb_head_truncations_failed_total[5m]) > 0' - for: 0m + expr: 'rate(cortex_ingester_tsdb_head_truncations_failed_total[5m]) > 0.05' + for: 15m labels: severity: critical annotations: summary: Mimir ingester TSDB head truncation failed (instance {{ $labels.instance }}) description: "Mimir ingester {{ $labels.instance }} is failing to truncate TSDB head ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Threshold of 0.05/s avoids firing on transient single-event spikes. - alert: MimirIngesterTsdbCheckpointCreationFailed - expr: 'rate(cortex_ingester_tsdb_checkpoint_creations_failed_total[5m]) > 0' - for: 0m + expr: 'rate(cortex_ingester_tsdb_checkpoint_creations_failed_total[5m]) > 0.05' + for: 15m labels: severity: critical annotations: summary: Mimir ingester TSDB checkpoint creation failed (instance {{ $labels.instance }}) description: "Mimir ingester {{ $labels.instance }} is failing to create TSDB checkpoints ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Threshold of 0.05/s avoids firing on transient single-event spikes. 
- alert: MimirIngesterTsdbCheckpointDeletionFailed - expr: 'rate(cortex_ingester_tsdb_checkpoint_deletions_failed_total[5m]) > 0' + expr: 'rate(cortex_ingester_tsdb_checkpoint_deletions_failed_total[5m]) > 0.05' for: 0m labels: severity: critical @@ -214,8 +218,9 @@ groups: summary: Mimir ingester TSDB checkpoint deletion failed (instance {{ $labels.instance }}) description: "Mimir ingester {{ $labels.instance }} is failing to delete TSDB checkpoints ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Threshold of 0.05/s avoids firing on transient single-event spikes. - alert: MimirIngesterTsdbWalTruncationFailed - expr: 'rate(cortex_ingester_tsdb_wal_truncations_failed_total[5m]) > 0' + expr: 'rate(cortex_ingester_tsdb_wal_truncations_failed_total[5m]) > 0.05' for: 0m labels: severity: warning @@ -223,8 +228,9 @@ groups: summary: Mimir ingester TSDB WAL truncation failed (instance {{ $labels.instance }}) description: "Mimir ingester {{ $labels.instance }} is failing to truncate TSDB WAL ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Threshold of 0.05/s avoids firing on transient single-event spikes. - alert: MimirIngesterTsdbWalWritesFailed - expr: 'rate(cortex_ingester_tsdb_wal_writes_failed_total[1m]) > 0' + expr: 'rate(cortex_ingester_tsdb_wal_writes_failed_total[1m]) > 0.05' for: 3m labels: severity: critical @@ -232,7 +238,7 @@ groups: summary: Mimir ingester TSDB WAL writes failed (instance {{ $labels.instance }}) description: "Mimir ingester {{ $labels.instance }} is failing to write to TSDB WAL ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - # Threshold aligned with official Mimir mixin (30 minutes). + # Threshold of 30 minutes. Adjust based on your sync interval. 
- alert: MimirStoreGatewayHasNotSyncedBucket expr: '(time() - cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 1800) and cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 0' for: 5m @@ -240,7 +246,7 @@ groups: severity: critical annotations: summary: Mimir store gateway has not synced bucket (instance {{ $labels.instance }}) - description: "Mimir store-gateway {{ $labels.instance }} has not synced the bucket for more than 10 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Mimir store-gateway {{ $labels.instance }} has not synced the bucket for more than 30 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirStoreGatewayNoSyncedTenants expr: '(min by (instance, job) (cortex_bucket_stores_tenants_synced{component="store-gateway"}) == 0) and on (instance) (cortex_bucket_stores_tenants_synced{component="store-gateway"} offset 1h > 0)' @@ -287,8 +293,9 @@ groups: summary: Mimir compactor has consecutive failures (instance {{ $labels.instance }}) description: "Mimir compactor {{ $labels.instance }} has had {{ $value }} compaction failures in the last 2 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # cortex_compactor_disk_out_of_space_errors_total is declared as gauge by Mimir despite the _total suffix, so delta() is used instead of increase(). - alert: MimirCompactorHasRunOutOfDiskSpace - expr: 'increase(cortex_compactor_disk_out_of_space_errors_total[24h]) >= 1' + expr: 'delta(cortex_compactor_disk_out_of_space_errors_total[24h]) >= 1' for: 0m labels: severity: critical @@ -305,7 +312,7 @@ groups: summary: Mimir compactor has not uploaded blocks (instance {{ $labels.instance }}) description: "Mimir compactor {{ $labels.instance }} has not uploaded any block in the last 24 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - # Using 24h window per official mixin — compaction skips are rare events. 
+ # Using a 24h window as compaction skips are rare events. - alert: MimirCompactorSkippedBlocks expr: 'increase(cortex_compactor_blocks_marked_for_no_compaction_total[24h]) > 0' for: 5m @@ -352,8 +359,9 @@ groups: summary: Mimir ruler failed ring check (instance {{ $labels.instance }}) description: "Mimir ruler {{ $labels.job }} is failing ring checks ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Threshold of 0.05/s avoids firing on transient single-event spikes. - alert: MimirAlertmanagerSyncConfigsFailing - expr: 'rate(cortex_alertmanager_sync_configs_failed_total[5m]) > 0' + expr: 'rate(cortex_alertmanager_sync_configs_failed_total[5m]) > 0.05' for: 30m labels: severity: critical @@ -361,8 +369,9 @@ groups: summary: Mimir alertmanager sync configs failing (instance {{ $labels.instance }}) description: "Mimir alertmanager {{ $labels.job }} is failing to sync configs ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Threshold of 0.05/s avoids firing on transient single-event spikes. - alert: MimirAlertmanagerRingCheckFailing - expr: 'rate(cortex_alertmanager_ring_check_errors_total[5m]) > 0' + expr: 'rate(cortex_alertmanager_ring_check_errors_total[5m]) > 0.05' for: 10m labels: severity: critical @@ -370,8 +379,9 @@ groups: summary: Mimir alertmanager ring check failing (instance {{ $labels.instance }}) description: "Mimir alertmanager {{ $labels.job }} is failing ring checks ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Threshold of 0.05/s avoids firing on transient single-event spikes. 
- alert: MimirAlertmanagerStateMergeFailing - expr: 'rate(cortex_alertmanager_partial_state_merges_failed_total[5m]) > 0' + expr: 'rate(cortex_alertmanager_partial_state_merges_failed_total[5m]) > 0.05' for: 10m labels: severity: critical @@ -379,8 +389,9 @@ groups: summary: Mimir alertmanager state merge failing (instance {{ $labels.instance }}) description: "Mimir alertmanager {{ $labels.job }} is failing to merge state updates ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Threshold of 0.05/s avoids firing on transient single-event spikes. - alert: MimirAlertmanagerReplicationFailing - expr: 'rate(cortex_alertmanager_state_replication_failed_total[5m]) > 0' + expr: 'rate(cortex_alertmanager_state_replication_failed_total[5m]) > 0.05' for: 10m labels: severity: critical @@ -388,8 +399,9 @@ groups: summary: Mimir alertmanager replication failing (instance {{ $labels.instance }}) description: "Mimir alertmanager {{ $labels.job }} is failing to replicate state ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Threshold of 0.05/s avoids firing on transient single-event spikes. - alert: MimirAlertmanagerPersistStateFailing - expr: 'rate(cortex_alertmanager_state_persist_failed_total[15m]) > 0' + expr: 'rate(cortex_alertmanager_state_persist_failed_total[15m]) > 0.05' for: 1h labels: severity: critical diff --git a/dist/rules/grafana-tempo/embedded-exporter.yml b/dist/rules/grafana-tempo/embedded-exporter.yml index 256a044..c1dcff5 100644 --- a/dist/rules/grafana-tempo/embedded-exporter.yml +++ b/dist/rules/grafana-tempo/embedded-exporter.yml @@ -117,7 +117,7 @@ groups: summary: Tempo compaction too many outstanding blocks warning (instance {{ $labels.instance }}) description: "There are too many outstanding compaction blocks for {{ $labels.instance }}. Consider increasing compactor resources.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - # Official Tempo mixin normalizes by backend-worker count. 
Adjust threshold based on your compactor configuration. + # Threshold of 250 blocks per compactor instance. Normalize by backend-worker count if needed. Adjust based on your environment. - alert: TempoCompactionTooManyOutstandingBlocksCritical expr: 'sum by (instance) (tempodb_compaction_outstanding_blocks) > 250' for: 24h @@ -127,8 +127,9 @@ groups: summary: Tempo compaction too many outstanding blocks critical (instance {{ $labels.instance }}) description: "There are too many outstanding compaction blocks for {{ $labels.instance }}. Increase compactor resources immediately.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Threshold of 0.05/s avoids firing on transient single-event spikes. - alert: TempoDistributorUsageTrackerErrors - expr: 'sum by (job, reason) (rate(tempo_distributor_usage_tracker_errors_total[5m])) > 0' + expr: 'sum by (job, reason) (rate(tempo_distributor_usage_tracker_errors_total[5m])) > 0.05' for: 30m labels: severity: critical @@ -137,7 +138,7 @@ groups: description: "Tempo distributor usage tracker errors for {{ $labels.job }} at {{ $value | humanize }}/s (reason {{ $labels.reason }}).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: TempoMetricsGeneratorProcessorUpdatesFailing - expr: 'sum by (job) (increase(tempo_metrics_generator_active_processors_update_failed_total[5m])) > 0' + expr: 'sum by (job) (increase(tempo_metrics_generator_active_processors_update_failed_total[5m])) > 2' for: 15m labels: severity: critical @@ -146,7 +147,7 @@ groups: description: "Tempo metrics generator processor updates are failing for {{ $labels.job }} ({{ $value }} failures in 5m).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: TempoMetricsGeneratorServiceGraphsDroppingSpans - expr: '100 * sum by (job) (rate(tempo_metrics_generator_processor_service_graphs_dropped_spans[5m])) / sum by (job) (rate(tempo_metrics_generator_spans_received_total[5m])) > 0.5 and sum by (job) (rate(tempo_metrics_generator_spans_received_total[5m])) > 0' + 
expr: '100 * sum by (job) (rate(tempo_metrics_generator_processor_service_graphs_dropped_spans_total[5m])) / sum by (job) (rate(tempo_metrics_generator_spans_received_total[5m])) > 0.5 and sum by (job) (rate(tempo_metrics_generator_spans_received_total[5m])) > 0' for: 15m labels: severity: warning diff --git a/dist/rules/graph-node/embedded-exporter.yml b/dist/rules/graph-node/embedded-exporter.yml index 9158bd0..b5ce3ab 100644 --- a/dist/rules/graph-node/embedded-exporter.yml +++ b/dist/rules/graph-node/embedded-exporter.yml @@ -41,6 +41,7 @@ groups: summary: Provider failed because get genesis timeout (instance {{ $labels.instance }}) description: "Timeout to get genesis for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Threshold of 10ms. Adjust based on your expected database latency. - alert: StoreConnectionSlow expr: 'store_connection_wait_time_ms > 10' for: 0m @@ -50,6 +51,7 @@ groups: summary: Store connection slow (instance {{ $labels.instance }}) description: "Store connection is too slow to `{{$labels.pool}}` pool, `{{$labels.shard}}` shard in Graph node `{{$labels.instance}}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Threshold of 20ms. Adjust based on your expected database latency. - alert: StoreConnectionVerySlow expr: 'store_connection_wait_time_ms > 20' for: 0m diff --git a/dist/rules/hadoop/jmx_exporter.yml b/dist/rules/hadoop/jmx_exporter.yml index c483ec9..272bcd3 100644 --- a/dist/rules/hadoop/jmx_exporter.yml +++ b/dist/rules/hadoop/jmx_exporter.yml @@ -5,6 +5,9 @@ groups: rules: + # When targets are managed via service discovery, a disappeared target goes stale rather than reporting up==0, + # so this alert may not fire. Prefer application-level availability metrics if available. + # Rename job="hadoop-namenode" to match the actual job name in your Prometheus scrape config. 
- alert: HadoopNameNodeDown expr: 'up{job="hadoop-namenode"} == 0' for: 5m @@ -14,6 +17,9 @@ groups: summary: Hadoop Name Node Down (instance {{ $labels.instance }}) description: "The Hadoop NameNode service is unavailable.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # When targets are managed via service discovery, a disappeared target goes stale rather than reporting up==0, + # so this alert may not fire. Prefer application-level availability metrics if available. + # Rename job="hadoop-resourcemanager" to match the actual job name in your Prometheus scrape config. - alert: HadoopResourceManagerDown expr: 'up{job="hadoop-resourcemanager"} == 0' for: 5m @@ -51,7 +57,7 @@ groups: description: "There is an unusually high number of MapReduce task failures.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HadoopResourceManagerMemoryHigh - expr: 'hadoop_resourcemanager_memory_bytes / hadoop_resourcemanager_memory_max_bytes > 0.8' + expr: 'hadoop_resourcemanager_memory_bytes / hadoop_resourcemanager_memory_max_bytes > 0.8 and hadoop_resourcemanager_memory_max_bytes > 0' for: 15m labels: severity: warning @@ -78,7 +84,7 @@ groups: description: "The HBase cluster has an unusually high number of regions.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HadoopHbaseRegionServerHeapLow - expr: 'hadoop_hbase_region_server_heap_bytes / hadoop_hbase_region_server_max_heap_bytes > 0.8' + expr: 'hadoop_hbase_region_server_heap_bytes / hadoop_hbase_region_server_max_heap_bytes > 0.8 and hadoop_hbase_region_server_max_heap_bytes > 0' for: 10m labels: severity: warning diff --git a/dist/rules/haproxy/embedded-exporter-v2.yml b/dist/rules/haproxy/embedded-exporter-v2.yml index d6b2bbf..0dda5bb 100644 --- a/dist/rules/haproxy/embedded-exporter-v2.yml +++ b/dist/rules/haproxy/embedded-exporter-v2.yml @@ -12,7 +12,7 @@ groups: severity: critical annotations: summary: HAProxy high HTTP 4xx error rate backend (instance {{ $labels.instance }}) - description: "Too 
many HTTP requests with status 4xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.proxy }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HaproxyHighHttp5xxErrorRateBackend expr: '((sum by (proxy) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 and sum by (proxy) (rate(haproxy_server_http_responses_total[1m])) > 0' @@ -21,7 +21,7 @@ groups: severity: critical annotations: summary: HAProxy high HTTP 5xx error rate backend (instance {{ $labels.instance }}) - description: "Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.proxy }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HaproxyHighHttp4xxErrorRateServer expr: '((sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0' @@ -57,7 +57,7 @@ groups: severity: critical annotations: summary: HAProxy backend connection errors (instance {{ $labels.instance }}) - description: "Too many connection errors to {{ $labels.fqdn }}/{{ $labels.backend }} backend (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Too many connection errors to {{ $labels.proxy }} backend (> 100 req/s). 
Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HaproxyServerConnectionErrors expr: '(sum by (proxy) (rate(haproxy_server_connection_errors_total[1m]))) > 100' @@ -66,10 +66,10 @@ groups: severity: critical annotations: summary: HAProxy server connection errors (instance {{ $labels.instance }}) - description: "Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Too many connection errors to {{ $labels.proxy }} (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HaproxyBackendMaxActiveSession>80% - expr: '((haproxy_backend_current_sessions >0) * 100) / (haproxy_backend_limit_sessions > 0) > 80' + expr: '(haproxy_backend_current_sessions / haproxy_backend_limit_sessions * 100) > 80 and haproxy_backend_limit_sessions > 0' for: 2m labels: severity: warning @@ -94,7 +94,7 @@ groups: severity: warning annotations: summary: HAProxy HTTP slowing down (instance {{ $labels.instance }}) - description: "Average request time is increasing - {{ $value | printf \"%.2f\"}}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "HAProxy backend max total time is above 1s on {{ $labels.proxy }} - {{ $value | printf \"%.2f\"}}s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HaproxyRetryHigh expr: 'sum by (proxy) (rate(haproxy_backend_retry_warnings_total[1m])) > 10' @@ -124,8 +124,8 @@ groups: description: "HAProxy is blocking requests for security reason\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HaproxyServerHealthcheckFailure - expr: 'increase(haproxy_server_check_failures_total[1m]) > 0' - for: 1m + expr: 'increase(haproxy_server_check_failures_total[1m]) > 2' + for: 0m labels: severity: warning annotations: diff --git a/dist/rules/haproxy/haproxy-exporter-v1.yml b/dist/rules/haproxy/haproxy-exporter-v1.yml index 
0285929..3975ca1 100644 --- a/dist/rules/haproxy/haproxy-exporter-v1.yml +++ b/dist/rules/haproxy/haproxy-exporter-v1.yml @@ -15,22 +15,22 @@ groups: description: "HAProxy down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HaproxyHighHttp4xxErrorRateBackend(v1) - expr: 'sum by (backend) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 0' + expr: 'sum by (backend) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) * 100 > 5 and sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 0' for: 1m labels: severity: critical annotations: summary: HAProxy high HTTP 4xx error rate backend (v1) (instance {{ $labels.instance }}) - description: "Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.backend }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HaproxyHighHttp5xxErrorRateBackend(v1) - expr: 'sum by (backend) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 0' + expr: 'sum by (backend) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) * 100 > 5 and sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 0' for: 1m labels: severity: critical annotations: summary: HAProxy high HTTP 5xx error rate backend (v1) (instance {{ $labels.instance }}) - description: "Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}\n VALUE = {{ $value }}\n LABELS = {{ $labels 
}}" + description: "Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.backend }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HaproxyHighHttp4xxErrorRateServer(v1) expr: 'sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0' @@ -66,7 +66,7 @@ groups: severity: critical annotations: summary: HAProxy backend connection errors (v1) (instance {{ $labels.instance }}) - description: "Too many connection errors to {{ $labels.fqdn }}/{{ $labels.backend }} backend (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Too many connection errors to {{ $labels.backend }} backend (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HaproxyServerConnectionErrors(v1) expr: 'sum by (server) (rate(haproxy_server_connection_errors_total[1m])) > 100' @@ -84,7 +84,7 @@ groups: severity: warning annotations: summary: HAProxy backend max active session (instance {{ $labels.instance }}) - description: "HAproxy backend {{ $labels.fqdn }}/{{ $labels.backend }} is reaching session limit (> 80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "HAProxy backend {{ $labels.backend }} is reaching session limit (> 80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HaproxyPendingRequests(v1) expr: 'sum by (backend) (haproxy_backend_current_queue) > 0' @@ -93,7 +93,7 @@ groups: severity: warning annotations: summary: HAProxy pending requests (v1) (instance {{ $labels.instance }}) - description: "Some HAProxy requests are pending on {{ $labels.fqdn }}/{{ $labels.backend }} backend\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Some HAProxy requests are pending on {{ $labels.backend }} backend\n VALUE = {{ $value }}\n LABELS = {{ $labels 
}}" - alert: HaproxyHttpSlowingDown(v1) expr: 'avg by (backend) (haproxy_backend_http_total_time_average_seconds) > 1' @@ -111,7 +111,7 @@ groups: severity: warning annotations: summary: HAProxy retry high (v1) (instance {{ $labels.instance }}) - description: "High rate of retry on {{ $labels.fqdn }}/{{ $labels.backend }} backend\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "High rate of retry on {{ $labels.backend }} backend\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HaproxyBackendDown expr: 'haproxy_backend_up == 0' @@ -141,8 +141,8 @@ groups: description: "HAProxy is blocking requests for security reason\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HaproxyServerHealthcheckFailure(v1) - expr: 'increase(haproxy_server_check_failures_total[1m]) > 0' - for: 1m + expr: 'increase(haproxy_server_check_failures_total[1m]) > 2' + for: 0m labels: severity: warning annotations: diff --git a/dist/rules/hashicorp-vault/embedded-exporter.yml b/dist/rules/hashicorp-vault/embedded-exporter.yml index 51fcb22..4941dea 100644 --- a/dist/rules/hashicorp-vault/embedded-exporter.yml +++ b/dist/rules/hashicorp-vault/embedded-exporter.yml @@ -7,7 +7,7 @@ groups: - alert: VaultSealed expr: 'vault_core_unsealed == 0' - for: 0m + for: 1m labels: severity: critical annotations: @@ -21,7 +21,7 @@ groups: severity: warning annotations: summary: Vault too many pending tokens (instance {{ $labels.instance }}) - description: "Too many pending tokens {{ $labels.instance }}: {{ $value | printf \"%.2f\"}}%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Too many pending tokens on {{ $labels.instance }}: {{ $value }} tokens created but not yet stored.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: VaultTooManyInfinityTokens expr: 'vault_token_count_by_ttl{creation_ttl="+Inf"} > 3' @@ -30,13 +30,13 @@ groups: severity: warning annotations: summary: Vault too many infinity tokens (instance {{ $labels.instance }}) - 
description: "Too many infinity tokens {{ $labels.instance }}: {{ $value | printf \"%.2f\"}}%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Too many non-expiring tokens on {{ $labels.instance }}: {{ $value }} tokens with infinite TTL.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: VaultClusterHealth - expr: 'sum(vault_core_active) / count(vault_core_active) <= 0.5' + expr: 'sum(vault_core_active) / count(vault_core_active) <= 0.5 and count(vault_core_active) > 0' for: 0m labels: severity: critical annotations: summary: Vault cluster health (instance {{ $labels.instance }}) - description: "Vault cluster is not healthy {{ $labels.instance }}: {{ $value | printf \"%.2f\"}}%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Vault cluster is not healthy: only {{ $value | humanizePercentage }} of nodes are active.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/dist/rules/host-and-hardware/node-exporter.yml b/dist/rules/host-and-hardware/node-exporter.yml index 9cb7584..a06a16d 100644 --- a/dist/rules/host-and-hardware/node-exporter.yml +++ b/dist/rules/host-and-hardware/node-exporter.yml @@ -14,8 +14,9 @@ groups: summary: Host out of memory (instance {{ $labels.instance }}) description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # node_vmstat_pgmajfault is exposed as untyped/gauge by node_exporter (from /proc/vmstat), so deriv() is used instead of rate(). - alert: HostMemoryUnderMemoryPressure - expr: '(rate(node_vmstat_pgmajfault[5m]) > 1000)' + expr: '(deriv(node_vmstat_pgmajfault[5m]) > 1000)' for: 0m labels: severity: warning @@ -173,13 +174,13 @@ groups: severity: warning annotations: summary: Host unusual disk IO (instance {{ $labels.instance }}) - description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Disk usage >80%. 
Check storage for issues or increase IOPS capabilities.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # x2 context switches is an arbitrary number. # The alert threshold depends on the nature of the application. # Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58 - alert: HostContextSwitchingHigh - expr: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2' + expr: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2 and rate(node_context_switches_total[1d]) > 0' for: 0m labels: severity: warning @@ -223,7 +224,7 @@ groups: summary: Host node overtemperature alarm (instance {{ $labels.instance }}) description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - # Uses ignoring(state) to handle additional labels on node_md_disks. Matches the official node-exporter mixin. + # Uses ignoring(state) to handle additional labels on node_md_disks. - alert: HostSoftwareRaidInsufficientDrives expr: '((node_md_disks_required - ignoring(state) node_md_disks{state="active"}) > 0)' for: 0m @@ -253,7 +254,7 @@ groups: # When a machine runs out of memory, the node exporter can become unresponsive for several minutes. Even if the system takes 15–20 minutes to recover, the alert should still trigger. 
- alert: HostOomKillDetected - expr: '(increase(node_vmstat_oom_kill[30m]) > 0)' + expr: '(delta(node_vmstat_oom_kill[30m]) > 0)' for: 0m labels: severity: warning @@ -268,7 +269,7 @@ groups: severity: info annotations: summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }}) - description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 1 minute.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostEdacUncorrectableErrorsDetected expr: '(node_edac_uncorrectable_errors_total > 0)' @@ -277,7 +278,7 @@ groups: severity: warning annotations: summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}) - description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostNetworkReceiveErrors expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) and rate(node_network_receive_packets_total[2m]) > 0' diff --git a/dist/rules/istio/embedded-exporter.yml b/dist/rules/istio/embedded-exporter.yml index cc05444..56ee1e0 100644 --- a/dist/rules/istio/embedded-exporter.yml +++ b/dist/rules/istio/embedded-exporter.yml @@ -12,17 +12,18 @@ groups: severity: warning annotations: summary: Istio Kubernetes gateway availability drop (instance {{ $labels.instance }}) - description: "Gateway pods have dropped. 
Inbound traffic will likely be affected.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Istio ingress gateway has only {{ $value }} available pod(s). Inbound traffic will likely be affected.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: IstioPilotHighTotalRequestRate + - alert: IstioPilotHighPushErrorRate expr: 'sum(rate(pilot_xds_push_errors[1m])) / sum(rate(pilot_xds_pushes[1m])) * 100 > 5 and sum(rate(pilot_xds_pushes[1m])) > 0' for: 1m labels: severity: warning annotations: - summary: Istio Pilot high total request rate (instance {{ $labels.instance }}) + summary: Istio Pilot high push error rate (instance {{ $labels.instance }}) description: "Number of Istio Pilot push errors is too high (> 5%). Envoy sidecars might have outdated configuration.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Mixer was deprecated in Istio 1.5 and removed in Istio 1.8+. This alert only applies to Istio < 1.8. - alert: IstioMixerPrometheusDispatchesLow expr: 'sum(rate(mixer_runtime_dispatches_total{adapter=~"prometheus"}[1m])) < 180' for: 1m @@ -32,6 +33,7 @@ groups: summary: Istio Mixer Prometheus dispatches low (instance {{ $labels.instance }}) description: "Number of Mixer dispatches to Prometheus is too low. Istio metrics might not be being exported properly.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Threshold of 1000 req/s is a rough default. Adjust to your expected peak traffic. 
- alert: IstioHighTotalRequestRate expr: 'sum(rate(istio_requests_total{reporter="destination"}[5m])) > 1000' for: 2m @@ -39,8 +41,9 @@ groups: severity: warning annotations: summary: Istio high total request rate (instance {{ $labels.instance }}) - description: "Global request rate in the service mesh is unusually high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Global request rate in the service mesh is unusually high ({{ $value | printf \"%.2f\" }} req/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Threshold of 100 req/s is a rough default. Adjust to your expected baseline traffic. This alert may fire on startup or low-traffic environments. - alert: IstioLowTotalRequestRate expr: 'sum(rate(istio_requests_total{reporter="destination"}[5m])) < 100' for: 2m @@ -48,7 +51,7 @@ groups: severity: warning annotations: summary: Istio low total request rate (instance {{ $labels.instance }}) - description: "Global request rate in the service mesh is unusually low.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Global request rate in the service mesh is unusually low ({{ $value | printf \"%.2f\" }} req/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: IstioHigh4xxErrorRate expr: 'sum(rate(istio_requests_total{reporter="destination", response_code=~"4.*"}[5m])) / sum(rate(istio_requests_total{reporter="destination"}[5m])) * 100 > 5 and sum(rate(istio_requests_total{reporter="destination"}[5m])) > 0' @@ -57,7 +60,7 @@ groups: severity: warning annotations: summary: Istio high 4xx error rate (instance {{ $labels.instance }}) - description: "High percentage of HTTP 4xx responses in Istio (> 5%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "High percentage of HTTP 4xx responses in Istio ({{ $value | printf \"%.1f\" }}% > 5%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: IstioHigh5xxErrorRate expr: 'sum(rate(istio_requests_total{reporter="destination", response_code=~"5.*"}[5m])) / 
sum(rate(istio_requests_total{reporter="destination"}[5m])) * 100 > 5 and sum(rate(istio_requests_total{reporter="destination"}[5m])) > 0' @@ -66,7 +69,7 @@ groups: severity: warning annotations: summary: Istio high 5xx error rate (instance {{ $labels.instance }}) - description: "High percentage of HTTP 5xx responses in Istio (> 5%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "High percentage of HTTP 5xx responses in Istio ({{ $value | printf \"%.1f\" }}% > 5%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: IstioHighRequestLatency expr: 'rate(istio_request_duration_milliseconds_sum{reporter="destination"}[1m]) / rate(istio_request_duration_milliseconds_count{reporter="destination"}[1m]) > 100 and rate(istio_request_duration_milliseconds_count{reporter="destination"}[1m]) > 0' @@ -75,22 +78,22 @@ groups: severity: warning annotations: summary: Istio high request latency (instance {{ $labels.instance }}) - description: "Istio average requests execution is longer than 100ms.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Istio average request duration is {{ $value }}ms (> 100ms).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: IstioLatency99Percentile - expr: 'histogram_quantile(0.99, sum(rate(istio_request_duration_milliseconds_bucket[1m])) by (destination_canonical_service, destination_workload_namespace, source_canonical_service, source_workload_namespace, le)) > 1000' + expr: 'histogram_quantile(0.99, sum(rate(istio_request_duration_milliseconds_bucket[1m])) by (destination_canonical_service, destination_workload_namespace, le)) > 1000' for: 1m labels: severity: warning annotations: summary: Istio latency 99 percentile (instance {{ $labels.instance }}) - description: "Istio 1% slowest requests are longer than 1000ms.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Istio p99 request latency is {{ $value }}ms (threshold: 1000ms).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: 
IstioPilotDuplicateEntry - expr: 'sum(rate(pilot_duplicate_envoy_clusters{}[5m])) > 0' + expr: 'sum(pilot_duplicate_envoy_clusters{}) > 0' for: 0m labels: severity: critical annotations: summary: Istio Pilot Duplicate Entry (instance {{ $labels.instance }}) - description: "Istio pilot duplicate entry error.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Istio Pilot has detected {{ $value }} duplicate Envoy cluster(s), indicating misconfigured DestinationRules or ServiceEntries.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/dist/rules/jenkins/metric-plugin.yml b/dist/rules/jenkins/metric-plugin.yml index 5271e1e..6b9e476 100644 --- a/dist/rules/jenkins/metric-plugin.yml +++ b/dist/rules/jenkins/metric-plugin.yml @@ -51,7 +51,7 @@ groups: description: "Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: JenkinsRunFailureTotal - expr: 'delta(jenkins_runs_failure_total[1h]) > 100' + expr: 'increase(jenkins_runs_failure_total[1h]) > 100' for: 0m labels: severity: warning diff --git a/dist/rules/kafka/danielqsj-kafka-exporter.yml b/dist/rules/kafka/danielqsj-kafka-exporter.yml index 69f4cd0..930d216 100644 --- a/dist/rules/kafka/danielqsj-kafka-exporter.yml +++ b/dist/rules/kafka/danielqsj-kafka-exporter.yml @@ -12,7 +12,7 @@ groups: severity: critical annotations: summary: Kafka topics replicas (instance {{ $labels.instance }}) - description: "Kafka topic in-sync partition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Kafka topic {{ $labels.topic }} has fewer than 3 in-sync replicas ({{ $value }}), data durability is at risk.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KafkaConsumerGroupLag expr: 'sum(kafka_consumergroup_lag) by (consumergroup) > 10000' diff --git a/dist/rules/kubernetes/kubestate-exporter.yml b/dist/rules/kubernetes/kubestate-exporter.yml index 34c96bd..3a6e3dd 100644 --- 
a/dist/rules/kubernetes/kubestate-exporter.yml +++ b/dist/rules/kubernetes/kubestate-exporter.yml @@ -134,7 +134,7 @@ groups: description: "Volume under {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to fill up within four days. Currently {{ $value | humanize }}% is available.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesPersistentvolumeError - expr: 'kube_persistentvolume_status_phase{phase=~"Failed|Pending", job="kube-state-metrics"} > 0' + expr: 'kube_persistentvolume_status_phase{phase=~"Failed|Pending"} > 0' for: 0m labels: severity: critical diff --git a/dist/rules/loki/embedded-exporter.yml b/dist/rules/loki/embedded-exporter.yml index 166cfa2..6ee997b 100644 --- a/dist/rules/loki/embedded-exporter.yml +++ b/dist/rules/loki/embedded-exporter.yml @@ -24,19 +24,19 @@ groups: description: "The {{ $labels.job }} and {{ $labels.route }} are experiencing {{ printf \"%.2f\" $value }}% errors.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: LokiRequestPanic - expr: 'sum(increase(loki_panic_total[10m])) by (namespace, job) > 0' - for: 5m + expr: 'sum(increase(loki_panic_total[5m])) by (namespace, job) > 0' + for: 0m labels: severity: critical annotations: summary: Loki request panic (instance {{ $labels.instance }}) - description: "The {{ $labels.job }} is experiencing {{ printf \"%.2f\" $value }}% increase of panics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "{{ $labels.job }} is experiencing {{ $value | humanize }} panic(s) in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: LokiRequestLatency - expr: '(histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (le))) > 1' + expr: 'histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (namespace, job, route, le)) > 1' for: 5m labels: severity: critical annotations: summary: Loki request latency (instance {{ 
$labels.instance }}) - description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/dist/rules/memcached/memcached-exporter.yml b/dist/rules/memcached/memcached-exporter.yml index 91e659a..955ff82 100644 --- a/dist/rules/memcached/memcached-exporter.yml +++ b/dist/rules/memcached/memcached-exporter.yml @@ -34,13 +34,13 @@ groups: description: "Memcached connection usage is above 95% on {{ $labels.instance }} (current value: {{ $value }}%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MemcachedOutOfMemoryErrors - expr: 'sum without (slab) (rate(memcached_slab_items_outofmemory_total[5m])) > 0' + expr: 'sum without (slab) (rate(memcached_slab_items_outofmemory_total[5m])) > 0.05' for: 5m labels: severity: warning annotations: summary: Memcached out of memory errors (instance {{ $labels.instance }}) - description: "Memcached is returning out-of-memory errors on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Memcached is returning out-of-memory errors on {{ $labels.instance }} ({{ $value }} errors/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # High memory usage is expected if the cache is well-utilized. This alert fires when it approaches the configured limit, which may cause evictions. 
- alert: MemcachedMemoryUsageHigh(>90%) @@ -73,7 +73,7 @@ groups: description: "Memcached cache hit rate is below 80% on {{ $labels.instance }} (current value: {{ $value }}%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MemcachedConnectionsRejected - expr: 'increase(memcached_connections_rejected_total[5m]) > 0' + expr: 'increase(memcached_connections_rejected_total[5m]) > 3' for: 5m labels: severity: warning @@ -82,7 +82,7 @@ groups: description: "Memcached is rejecting connections on {{ $labels.instance }} ({{ $value }} rejections in the last 5m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MemcachedItemsTooLarge - expr: 'increase(memcached_item_too_large_total[5m]) > 0' + expr: 'increase(memcached_item_too_large_total[5m]) > 3' for: 5m labels: severity: info diff --git a/dist/rules/mysql/mysqld-exporter.yml b/dist/rules/mysql/mysqld-exporter.yml index 89f9eea..0aec1d7 100644 --- a/dist/rules/mysql/mysqld-exporter.yml +++ b/dist/rules/mysql/mysqld-exporter.yml @@ -71,17 +71,19 @@ groups: summary: MySQL Slave replication lag (instance {{ $labels.instance }}) description: "MySQL replication lag on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # mysqld_exporter exposes SHOW GLOBAL STATUS variables as untyped/gauge, so delta() is used instead of increase(). - alert: MysqlSlowQueries - expr: 'increase(mysql_global_status_slow_queries[1m]) > 0' + expr: 'delta(mysql_global_status_slow_queries[1m]) > 0' for: 2m labels: severity: warning annotations: summary: MySQL slow queries (instance {{ $labels.instance }}) - description: "MySQL server mysql has some new slow query ({{ $value }} in the last minute).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "MySQL server has some new slow queries ({{ $value }} in the last minute).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # mysqld_exporter exposes SHOW GLOBAL STATUS variables as untyped/gauge, so deriv() is used instead of rate(). 
- alert: MysqlInnodbLogWaits - expr: 'rate(mysql_global_status_innodb_log_waits[15m]) > 10' + expr: 'deriv(mysql_global_status_innodb_log_waits[15m]) > 10' for: 0m labels: severity: warning @@ -98,8 +100,9 @@ groups: summary: MySQL restarted (instance {{ $labels.instance }}) description: "MySQL has just been restarted, less than one minute ago on {{ $labels.instance }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # mysqld_exporter exposes SHOW GLOBAL STATUS variables as untyped/gauge, so deriv() is used instead of irate(). - alert: MysqlHighQps - expr: 'irate(mysql_global_status_questions[1m]) > 10000' + expr: 'deriv(mysql_global_status_questions[1m]) > 10000' for: 2m labels: severity: info diff --git a/dist/rules/nats/nats-exporter.yml b/dist/rules/nats/nats-exporter.yml index e077089..6a52a79 100644 --- a/dist/rules/nats/nats-exporter.yml +++ b/dist/rules/nats/nats-exporter.yml @@ -32,6 +32,7 @@ groups: summary: Nats slow consumers (instance {{ $labels.instance }}) description: "There are slow consumers in NATS for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Replace job="nats" with the actual job name in your Prometheus configuration. 
- alert: NatsServerDown expr: 'absent(up{job="nats"})' for: 5m @@ -79,7 +80,7 @@ groups: description: "JetStream memory usage is over 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: NatsHighNumberOfSubscriptions - expr: 'gnatsd_connz_subscriptions > 1000' + expr: 'gnatsd_varz_subscriptions > 1000' for: 5m labels: severity: warning @@ -97,7 +98,7 @@ groups: description: "NATS server has more than 100,000 pending bytes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: NatsTooManyErrors - expr: 'increase(gnatsd_varz_jetstream_stats_api_errors[5m]) > 0' + expr: 'increase(gnatsd_varz_jetstream_stats_api_errors[5m]) > 5' for: 5m labels: severity: warning @@ -114,6 +115,8 @@ groups: summary: Nats JetStream accounts exceeded (instance {{ $labels.instance }}) description: "JetStream has more than 100 active accounts\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Only enable this alert if your deployment requires leaf node connections. + # This will fire spuriously if leaf nodes are not configured. 
- alert: NatsLeafNodeConnectionIssue expr: 'gnatsd_varz_leafnodes == 0' for: 5m diff --git a/dist/rules/nomad/embedded-exporter.yml b/dist/rules/nomad/embedded-exporter.yml index b8b4059..902e134 100644 --- a/dist/rules/nomad/embedded-exporter.yml +++ b/dist/rules/nomad/embedded-exporter.yml @@ -12,7 +12,7 @@ groups: severity: warning annotations: summary: Nomad job failed (instance {{ $labels.instance }}) - description: "Nomad job failed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Nomad job {{ $labels.job }} has {{ $value }} failed allocations.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: NomadJobLost expr: 'nomad_nomad_job_summary_lost > 0' @@ -21,7 +21,7 @@ groups: severity: warning annotations: summary: Nomad job lost (instance {{ $labels.instance }}) - description: "Nomad job lost\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Nomad job {{ $labels.job }} has {{ $value }} lost allocations.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: NomadJobQueued expr: 'nomad_nomad_job_summary_queued > 0' @@ -30,7 +30,7 @@ groups: severity: warning annotations: summary: Nomad job queued (instance {{ $labels.instance }}) - description: "Nomad job queued\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Nomad job {{ $labels.job }} has {{ $value }} queued allocations.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: NomadBlockedEvaluation expr: 'nomad_nomad_blocked_evals_total_blocked > 0' @@ -39,4 +39,4 @@ groups: severity: warning annotations: summary: Nomad blocked evaluation (instance {{ $labels.instance }}) - description: "Nomad blocked evaluation\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Nomad has {{ $value }} blocked evaluations. 
The cluster may lack resources to place allocations.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/dist/rules/openstack/openstack-exporter.yml b/dist/rules/openstack/openstack-exporter.yml index a75ed7a..e1b6223 100644 --- a/dist/rules/openstack/openstack-exporter.yml +++ b/dist/rules/openstack/openstack-exporter.yml @@ -5,6 +5,7 @@ groups: rules: + # Adjust the job label regex to match the actual job name in your Prometheus scrape config. - alert: OpenstackExporterDown expr: 'up{job=~".*openstack.*"} == 0' for: 2m diff --git a/dist/rules/opentelemetry-collector/embedded-exporter.yml b/dist/rules/opentelemetry-collector/embedded-exporter.yml index 4936fe0..fda97bb 100644 --- a/dist/rules/opentelemetry-collector/embedded-exporter.yml +++ b/dist/rules/opentelemetry-collector/embedded-exporter.yml @@ -8,6 +8,7 @@ groups: rules: + # Adjust the job label regex to match the actual job name in your Prometheus scrape config. - alert: OpentelemetryCollectorDown expr: 'up{job=~".*otel.*collector.*"} == 0' for: 1m @@ -17,8 +18,9 @@ groups: summary: OpenTelemetry Collector down (instance {{ $labels.instance }}) description: "OpenTelemetry Collector instance has disappeared or is not being scraped\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Threshold of 0.05/s avoids firing on transient single-event spikes. - alert: OpentelemetryCollectorReceiverRefusedSpans - expr: 'rate(otelcol_receiver_refused_spans[5m]) > 0' + expr: 'rate(otelcol_receiver_refused_spans[5m]) > 0.05' for: 5m labels: severity: critical @@ -26,8 +28,9 @@ groups: summary: OpenTelemetry Collector receiver refused spans (instance {{ $labels.instance }}) description: "OpenTelemetry Collector is refusing {{ $value | humanize }}/s spans on {{ $labels.receiver }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Threshold of 0.05/s avoids firing on transient single-event spikes. 
- alert: OpentelemetryCollectorReceiverRefusedMetricPoints - expr: 'rate(otelcol_receiver_refused_metric_points[5m]) > 0' + expr: 'rate(otelcol_receiver_refused_metric_points[5m]) > 0.05' for: 5m labels: severity: critical @@ -35,8 +38,9 @@ groups: summary: OpenTelemetry Collector receiver refused metric points (instance {{ $labels.instance }}) description: "OpenTelemetry Collector is refusing {{ $value | humanize }}/s metric points on {{ $labels.receiver }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Threshold of 0.05/s avoids firing on transient single-event spikes. - alert: OpentelemetryCollectorReceiverRefusedLogRecords - expr: 'rate(otelcol_receiver_refused_log_records[5m]) > 0' + expr: 'rate(otelcol_receiver_refused_log_records[5m]) > 0.05' for: 5m labels: severity: critical @@ -84,6 +88,7 @@ groups: description: "OpenTelemetry Collector exporter {{ $labels.exporter }} queue is over 80% full\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Threshold of 0.05/s avoids firing on transient single-event spikes. + # These processor metrics are deprecated since collector v0.110.0. - alert: OpentelemetryCollectorProcessorRefusedSpans expr: 'rate(otelcol_processor_refused_spans[5m]) > 0.05' for: 5m @@ -94,6 +99,7 @@ groups: description: "OpenTelemetry Collector processor {{ $labels.processor }} is refusing spans ({{ $value | humanize }}/s), likely due to backpressure.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Threshold of 0.05/s avoids firing on transient single-event spikes. + # These processor metrics are deprecated since collector v0.110.0. 
- alert: OpentelemetryCollectorProcessorRefusedMetricPoints expr: 'rate(otelcol_processor_refused_metric_points[5m]) > 0.05' for: 5m @@ -104,7 +110,7 @@ groups: description: "OpenTelemetry Collector processor {{ $labels.processor }} is refusing metric points ({{ $value | humanize }}/s), likely due to backpressure.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: OpentelemetryCollectorHighMemoryUsage - expr: '(otelcol_process_runtime_heap_alloc_bytes{job=~".*otel.*collector.*"} / on(instance, job) otelcol_process_runtime_total_sys_memory_bytes{job=~".*otel.*collector.*"}) > 0.9' + expr: '(otelcol_process_runtime_heap_alloc_bytes / on(instance, job) otelcol_process_runtime_total_sys_memory_bytes) > 0.9' for: 5m labels: severity: warning diff --git a/dist/rules/postgresql/postgres-exporter.yml b/dist/rules/postgresql/postgres-exporter.yml index be02424..237811e 100644 --- a/dist/rules/postgresql/postgres-exporter.yml +++ b/dist/rules/postgresql/postgres-exporter.yml @@ -70,7 +70,7 @@ groups: description: "PostgreSQL instance should have more connections (> 5)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PostgresqlDeadLocks - expr: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5' + expr: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres",datid!="0"}[1m]) > 5' for: 0m labels: severity: warning @@ -79,7 +79,7 @@ groups: description: "PostgreSQL has dead-locks ({{ $value }} in the last minute)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PostgresqlHighRollbackRate - expr: 'sum by (namespace,datname) ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) / ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) + (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m])))) > 0.02' + expr: 'sum by (namespace,datname,instance) 
(rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) / (sum by (namespace,datname,instance) (rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) + sum by (namespace,datname,instance) (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m]))) > 0.02 and (sum by (namespace,datname,instance) (rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) + sum by (namespace,datname,instance) (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m]))) > 0' for: 0m labels: severity: warning @@ -96,6 +96,7 @@ groups: summary: Postgresql commit rate low (instance {{ $labels.instance }}) description: "Postgresql seems to be processing very few transactions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # pg_txid_current is not a default postgres_exporter metric. You need to define a custom query. See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737 - alert: PostgresqlLowXidConsumption expr: 'rate(pg_txid_current[1m]) < 5' for: 2m @@ -132,6 +133,7 @@ groups: summary: Postgresql configuration changed (instance {{ $labels.instance }}) description: "Postgres Database configuration change has occurred\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # pg_stat_ssl_compression is not a default postgres_exporter metric and is only available on PostgreSQL 9.5-13 (removed in PG 14). 
See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737 - alert: PostgresqlSslCompressionActive expr: 'sum by (instance) (pg_stat_ssl_compression) > 0' for: 0m diff --git a/dist/rules/prometheus-self-monitoring/embedded-exporter.yml b/dist/rules/prometheus-self-monitoring/embedded-exporter.yml index 2e8cd42..a63e745 100644 --- a/dist/rules/prometheus-self-monitoring/embedded-exporter.yml +++ b/dist/rules/prometheus-self-monitoring/embedded-exporter.yml @@ -143,7 +143,7 @@ groups: description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PrometheusAlertmanagerNotificationFailing - expr: 'rate(alertmanager_notifications_failed_total[1m]) > 0' + expr: 'rate(alertmanager_notifications_failed_total[3m]) > 0.05' for: 0m labels: severity: critical diff --git a/dist/rules/promtail/embedded-exporter.yml b/dist/rules/promtail/embedded-exporter.yml index f8e0a46..7ba03c7 100644 --- a/dist/rules/promtail/embedded-exporter.yml +++ b/dist/rules/promtail/embedded-exporter.yml @@ -15,7 +15,7 @@ groups: description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}% errors.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PromtailRequestLatency - expr: 'histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[5m])) by (le)) > 1' + expr: 'histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[5m])) by (namespace, job, route, le)) > 1' for: 5m labels: severity: critical diff --git a/dist/rules/pulsar/embedded-exporter.yml b/dist/rules/pulsar/embedded-exporter.yml index 8885369..60bf38f 100644 --- a/dist/rules/pulsar/embedded-exporter.yml +++ b/dist/rules/pulsar/embedded-exporter.yml @@ -41,24 +41,30 @@ groups: summary: Pulsar topic very large backlog storage size (instance {{ $labels.instance }}) description: "The topic backlog storage size is over 20 GB\n VALUE = {{ $value }}\n LABELS 
= {{ $labels }}" + # pulsar_storage_write_latency_le_overflow is the overflow bucket of Pulsar's non-standard histogram. + # It counts write operations exceeding all defined latency bounds (> 1000ms). - alert: PulsarHighWriteLatency - expr: 'sum(pulsar_storage_write_latency_overflow > 0) by (topic)' + expr: 'sum(pulsar_storage_write_latency_le_overflow > 0) by (topic)' for: 1h labels: severity: critical annotations: summary: Pulsar high write latency (instance {{ $labels.instance }}) - description: "Messages cannot be written in a timely fashion\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Pulsar topic {{ $labels.topic }} has {{ $value }} storage write operations exceeding the maximum latency bucket (> 1000ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # pulsar_entry_size_le_overflow is the overflow bucket of Pulsar's non-standard histogram. + # It counts message entries exceeding all defined size bounds. - alert: PulsarLargeMessagePayload - expr: 'sum(pulsar_entry_size_overflow > 0) by (topic)' + expr: 'sum(pulsar_entry_size_le_overflow > 0) by (topic)' for: 1h labels: severity: warning annotations: summary: Pulsar large message payload (instance {{ $labels.instance }}) - description: "Observing large message payload (> 1MB)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Pulsar topic {{ $labels.topic }} has {{ $value }} message entries exceeding the maximum size bucket (> 1MB)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # This metric name is path-dependent and may differ based on your BookKeeper data directory configuration. + # Adjust the metric name to match your actual ledger directory path. 
- alert: PulsarHighLedgerDiskUsage expr: 'sum(bookie_ledger_dir__pulsar_data_bookkeeper_ledgers_usage) by (kubernetes_pod_name) > 75' for: 1h @@ -84,7 +90,7 @@ groups: severity: critical annotations: summary: Pulsar high number of function errors (instance {{ $labels.instance }}) - description: "Observing more than 10 Function errors per minute\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Pulsar function {{ $labels.name }} has more than 10 errors per second ({{ $value | printf \"%.2f\" }}/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PulsarHighNumberOfSinkErrors expr: 'sum(rate(pulsar_sink_sink_exceptions_total[1m])) by (name) > 10' @@ -93,4 +99,4 @@ groups: severity: critical annotations: summary: Pulsar high number of sink errors (instance {{ $labels.instance }}) - description: "Observing more than 10 Sink errors per minute\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Pulsar sink {{ $labels.name }} has more than 10 errors per second ({{ $value | printf \"%.2f\" }}/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/dist/rules/python/python-exporter.yml b/dist/rules/python/python-exporter.yml index 1da8228..c126d7b 100644 --- a/dist/rules/python/python-exporter.yml +++ b/dist/rules/python/python-exporter.yml @@ -6,13 +6,13 @@ groups: rules: - alert: PythonGcObjectsUncollectable - expr: 'increase(python_gc_objects_uncollectable_total[5m]) > 0' + expr: 'increase(python_gc_objects_uncollectable_total[5m]) > 1' for: 5m labels: severity: warning annotations: summary: Python GC objects uncollectable (instance {{ $labels.instance }}) - description: "Python has uncollectable objects, potential memory leak via reference cycles\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Python has uncollectable objects ({{ $value }}), potential memory leak via reference cycles\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PythonGcCollectionsHigh expr: 
'rate(python_gc_objects_collected_total[5m]) > 10000' diff --git a/dist/rules/rabbitmq/kbudde-rabbitmq-exporter.yml b/dist/rules/rabbitmq/kbudde-rabbitmq-exporter.yml index d8cdce8..f80783c 100644 --- a/dist/rules/rabbitmq/kbudde-rabbitmq-exporter.yml +++ b/dist/rules/rabbitmq/kbudde-rabbitmq-exporter.yml @@ -32,7 +32,7 @@ groups: severity: critical annotations: summary: RabbitMQ cluster partition (instance {{ $labels.instance }}) - description: "Cluster partition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "RabbitMQ cluster has a network partition ({{ $value }} partitions detected). Messages may be lost or duplicated.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: RabbitmqOutOfMemory expr: 'rabbitmq_node_mem_used / rabbitmq_node_mem_limit * 100 > 90 and rabbitmq_node_mem_limit > 0' @@ -44,7 +44,7 @@ groups: description: "Memory available for RabbitMQ is low (< 10%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: RabbitmqInstanceTooManyConnections - expr: 'rabbitmq_connectionsTotal > 1000' + expr: 'rabbitmq_connections > 1000' for: 2m labels: severity: warning diff --git a/dist/rules/rabbitmq/rabbitmq-exporter.yml b/dist/rules/rabbitmq/rabbitmq-exporter.yml index 4eec1e4..af7844b 100644 --- a/dist/rules/rabbitmq/rabbitmq-exporter.yml +++ b/dist/rules/rabbitmq/rabbitmq-exporter.yml @@ -23,7 +23,7 @@ groups: severity: critical annotations: summary: RabbitMQ node not distributed (instance {{ $labels.instance }}) - description: "Distribution link state is not 'up'\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Distribution link to peer {{ $labels.peer }} is not 'up' (state {{ $value }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: RabbitmqInstancesDifferentVersions expr: 'count(count(rabbitmq_build_info) by (rabbitmq_version)) > 1' @@ -59,7 +59,7 @@ groups: severity: warning annotations: summary: RabbitMQ too many ready messages (instance {{ $labels.instance }}) - description: "RabbitMQ too many 
ready messages on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "RabbitMQ too many ready messages on queue {{ $labels.queue }} ({{ $value }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: RabbitmqTooManyUnackMessages expr: 'sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000' @@ -68,7 +68,7 @@ groups: severity: warning annotations: summary: RabbitMQ too many unack messages (instance {{ $labels.instance }}) - description: "Too many unacknowledged messages\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Too many unacknowledged messages on queue {{ $labels.queue }} ({{ $value }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: RabbitmqTooManyConnections expr: 'rabbitmq_connections > 1000' @@ -88,11 +88,12 @@ groups: summary: RabbitMQ no queue consumer (instance {{ $labels.instance }}) description: "A queue has less than 1 consumer\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Threshold of 3 avoids noise from occasional misroutes. Adjust based on your expected traffic patterns. 
- alert: RabbitmqUnroutableMessages - expr: 'increase(rabbitmq_channel_messages_unroutable_returned_total[1m]) > 0 or increase(rabbitmq_channel_messages_unroutable_dropped_total[1m]) > 0' + expr: 'increase(rabbitmq_channel_messages_unroutable_returned_total[5m]) > 3 or increase(rabbitmq_channel_messages_unroutable_dropped_total[5m]) > 3' for: 2m labels: severity: warning annotations: summary: RabbitMQ unroutable messages (instance {{ $labels.instance }}) - description: "A queue has unroutable messages ({{ $value }} in the last 1m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "A queue has unroutable messages ({{ $value }} in the last 5m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/dist/rules/ruby/ruby-exporter.yml b/dist/rules/ruby/ruby-exporter.yml index 1ac782e..9b677eb 100644 --- a/dist/rules/ruby/ruby-exporter.yml +++ b/dist/rules/ruby/ruby-exporter.yml @@ -24,9 +24,9 @@ groups: summary: Ruby heap free slots high (instance {{ $labels.instance }}) description: "Ruby heap has too many free slots (> 500k), memory fragmentation after large allocations\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - # Major GC rate > 5/s is extremely high. Consider lowering to > 1 or > 2 for earlier detection. + # Major GC rate > 5/s only fires if the app is essentially non-functional. Threshold of 2/s provides earlier detection. 
- alert: RubyMajorGcRateHigh - expr: 'rate(ruby_major_gc_ops_total[5m]) > 5' + expr: 'rate(ruby_major_gc_ops_total[5m]) > 2' for: 5m labels: severity: warning diff --git a/dist/rules/s.m.a.r.t-device-monitoring/smartctl-exporter.yml b/dist/rules/s.m.a.r.t-device-monitoring/smartctl-exporter.yml index 6b0c48e..8db4a2c 100644 --- a/dist/rules/s.m.a.r.t-device-monitoring/smartctl-exporter.yml +++ b/dist/rules/s.m.a.r.t-device-monitoring/smartctl-exporter.yml @@ -30,7 +30,7 @@ groups: severity: critical annotations: summary: SMART device temperature over trip value (instance {{ $labels.instance }}) - description: "Device temperature over trip value on {{ $labels.instance }} drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Device temperature over trip value on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: SmartDeviceTemperatureNearingTripValue expr: 'max_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) >= on(device, instance) (smartctl_device_temperature{temperature_type="drive_trip"} * .80)' @@ -39,7 +39,7 @@ groups: severity: warning annotations: summary: SMART device temperature nearing trip value (instance {{ $labels.instance }}) - description: "Device temperature at 80% of trip value on {{ $labels.instance }} drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Device temperature at 80% of trip value on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: SmartStatus expr: 'smartctl_device_smart_status != 1' @@ -48,7 +48,7 @@ groups: severity: critical annotations: summary: SMART status (instance {{ $labels.instance }}) - description: "Device has a SMART status failure on {{ $labels.instance }} drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Device has a SMART status failure on {{ $labels.instance }} 
drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: SmartCriticalWarning expr: 'smartctl_device_critical_warning > 0' @@ -57,7 +57,7 @@ groups: severity: critical annotations: summary: SMART critical warning (instance {{ $labels.instance }}) - description: "Disk controller has critical warning on {{ $labels.instance }} drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Disk controller has critical warning on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: SmartMediaErrors expr: 'smartctl_device_media_errors > 0' @@ -66,7 +66,7 @@ groups: severity: critical annotations: summary: SMART media errors (instance {{ $labels.instance }}) - description: "Disk controller detected media errors on {{ $labels.instance }} drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Disk controller detected media errors on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: SmartWearoutIndicator expr: 'smartctl_device_available_spare < smartctl_device_available_spare_threshold' @@ -75,4 +75,4 @@ groups: severity: critical annotations: summary: SMART Wearout Indicator (instance {{ $labels.instance }}) - description: "Device is wearing out on {{ $labels.instance }} drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Device is wearing out on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/dist/rules/sidekiq/strech-sidekiq-exporter.yml b/dist/rules/sidekiq/strech-sidekiq-exporter.yml index 7da4969..e5685dc 100644 --- a/dist/rules/sidekiq/strech-sidekiq-exporter.yml +++ b/dist/rules/sidekiq/strech-sidekiq-exporter.yml @@ -6,16 +6,16 @@ groups: rules: - alert: SidekiqQueueSize - expr: 'sidekiq_queue_size > 100' + expr: 'sidekiq_queue_enqueued_jobs > 100' for: 1m 
labels: severity: warning annotations: summary: Sidekiq queue size (instance {{ $labels.instance }}) - description: "Sidekiq queue {{ $labels.name }} is growing\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Sidekiq queue {{ $labels.name }} is growing ({{ $value }} enqueued jobs)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: SidekiqSchedulingLatencyTooHigh - expr: 'max(sidekiq_queue_latency) > 60' + expr: 'max(sidekiq_queue_latency_seconds) > 60' for: 0m labels: severity: critical diff --git a/dist/rules/snmp/snmp-exporter.yml b/dist/rules/snmp/snmp-exporter.yml index 920f714..ca4b48b 100644 --- a/dist/rules/snmp/snmp-exporter.yml +++ b/dist/rules/snmp/snmp-exporter.yml @@ -7,7 +7,7 @@ groups: rules: - # From the official snmp-mixin. + # Rename job=~"snmp.*" to match the actual job name in your Prometheus scrape config. - alert: SnmpTargetDown expr: 'up{job=~"snmp.*"} == 0' for: 5m diff --git a/dist/rules/spinnaker/embedded-exporter.yml b/dist/rules/spinnaker/embedded-exporter.yml index dac2885..5cdeee9 100644 --- a/dist/rules/spinnaker/embedded-exporter.yml +++ b/dist/rules/spinnaker/embedded-exporter.yml @@ -36,24 +36,24 @@ groups: description: "Orca queue message lag is {{ $value }}s. Pipeline stages are waiting too long before being processed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: SpinnakerDeadMessages - expr: 'rate(queue_dead_messages_total[5m]) > 0' + expr: 'rate(queue_dead_messages_total[5m]) > 0.05' for: 2m labels: severity: critical annotations: summary: Spinnaker dead messages (instance {{ $labels.instance }}) - description: "Orca is producing dead-lettered messages ({{ $value }} per second). These are tasks that exhausted all retries and will not be executed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Orca is producing dead-lettered messages ({{ $value | humanize }}/s). 
These are tasks that exhausted all retries and will not be executed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Zombies are pipeline executions that are running but have lost their queue entry. # See https://spinnaker.io/docs/guides/runbooks/orca-zombie-executions/ - alert: SpinnakerZombieExecutions - expr: 'rate(queue_zombies_total[5m]) > 0' + expr: 'rate(queue_zombies_total[5m]) > 0.05' for: 5m labels: severity: warning annotations: summary: Spinnaker zombie executions (instance {{ $labels.instance }}) - description: "{{ $value }} zombie pipeline executions detected. These are executions with no corresponding queue messages.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Zombie pipeline executions rate is {{ $value | humanize }}/s. These are executions with no corresponding queue messages.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: SpinnakerThreadPoolExhaustion expr: 'threadpool_blockingQueueSize > 0' @@ -76,7 +76,7 @@ groups: description: "Igor polling monitor {{ $labels.monitor }} for {{ $labels.partition }} has exceeded its item threshold, preventing pipeline triggers.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: SpinnakerPollingMonitorFailures - expr: 'rate(pollingMonitor_failed_total[5m]) > 0' + expr: 'rate(pollingMonitor_failed_total[5m]) > 0.05' for: 5m labels: severity: warning @@ -95,7 +95,7 @@ groups: description: "Spinnaker API 5xx error rate is {{ $value | humanizePercentage }} on {{ $labels.instance }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: SpinnakerApiRateLimitThrottling - expr: 'rate(rateLimitThrottling_total[5m]) > 0' + expr: 'rate(rateLimitThrottling_total[5m]) > 0.05' for: 2m labels: severity: warning diff --git a/dist/rules/ssl/tls/ribbybibby-ssl-exporter.yml b/dist/rules/ssl/tls/ribbybibby-ssl-exporter.yml index dcc1ce9..2a0ed53 100644 --- a/dist/rules/ssl/tls/ribbybibby-ssl-exporter.yml +++ b/dist/rules/ssl/tls/ribbybibby-ssl-exporter.yml @@ -7,21 +7,21 @@ groups: - 
alert: SslCertificateProbeFailed expr: 'ssl_probe_success == 0' - for: 0m + for: 1m labels: severity: critical annotations: summary: SSL certificate probe failed (instance {{ $labels.instance }}) description: "Failed to fetch SSL information {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: SslCertificateOscpStatusUnknown + - alert: SslCertificateOcspStatusUnknown expr: 'ssl_ocsp_response_status == 2' for: 0m labels: severity: warning annotations: - summary: SSL certificate OSCP status unknown (instance {{ $labels.instance }}) - description: "Failed to get the OSCP status {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: SSL certificate OCSP status unknown (instance {{ $labels.instance }}) + description: "Failed to get the OCSP status for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: SslCertificateRevoked expr: 'ssl_ocsp_response_status == 1' diff --git a/dist/rules/systemd/systemd-exporter.yml b/dist/rules/systemd/systemd-exporter.yml index b9f2adb..4b159ae 100644 --- a/dist/rules/systemd/systemd-exporter.yml +++ b/dist/rules/systemd/systemd-exporter.yml @@ -42,8 +42,9 @@ groups: summary: Systemd unit tasks near limit (instance {{ $labels.instance }}) description: "Systemd unit {{ $labels.name }} is using {{ $value | humanizePercentage }} of its task limit. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # systemd_socket_refused_connections_total is declared as Gauge by the exporter despite the _total suffix, so delta() is used instead of increase(). 
- alert: SystemdSocketRefusedConnections - expr: 'increase(systemd_socket_refused_connections_total[5m]) > 0' + expr: 'delta(systemd_socket_refused_connections_total[5m]) > 3' for: 2m labels: severity: warning diff --git a/dist/rules/thanos/thanos-bucket-replicate.yml b/dist/rules/thanos/thanos-bucket-replicate.yml index f1c44fe..b64ffa4 100644 --- a/dist/rules/thanos/thanos-bucket-replicate.yml +++ b/dist/rules/thanos/thanos-bucket-replicate.yml @@ -6,7 +6,7 @@ groups: rules: - alert: ThanosBucketReplicateErrorRate - expr: '(sum by (job) (rate(thanos_replicate_replication_runs_total{result="error", job=~".*thanos-bucket-replicate.*"}[5m])) / on (job) group_left sum by (job) (rate(thanos_replicate_replication_runs_total{job=~".*thanos-bucket-replicate.*"}[5m]))) * 100 >= 10 and sum by (job) (rate(thanos_replicate_replication_runs_total{job=~".*thanos-bucket-replicate.*"}[5m])) > 0' + expr: '(sum by (job) (rate(thanos_replicate_replication_runs_total{result="error"}[5m])) / on (job) group_left sum by (job) (rate(thanos_replicate_replication_runs_total[5m]))) * 100 >= 10 and sum by (job) (rate(thanos_replicate_replication_runs_total[5m])) > 0' for: 5m labels: severity: critical @@ -15,7 +15,7 @@ groups: description: "Thanos Replicate is failing to run, {{$value | humanize}}% of attempts failed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosBucketReplicateRunLatency - expr: '(histogram_quantile(0.99, sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m]))) > 20 and sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m])) > 0)' + expr: '(histogram_quantile(0.99, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket[5m]))) > 20 and sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_count[5m])) > 0)' for: 5m labels: severity: critical diff --git a/dist/rules/thanos/thanos-compactor.yml 
b/dist/rules/thanos/thanos-compactor.yml index 3c88a33..f67a53f 100644 --- a/dist/rules/thanos/thanos-compactor.yml +++ b/dist/rules/thanos/thanos-compactor.yml @@ -15,7 +15,7 @@ groups: description: "No more than one Thanos Compact instance should be running at once. There are {{$value}} instances running.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosCompactorHalted - expr: 'thanos_compact_halted{job=~".*thanos-compact.*"} == 1' + expr: 'thanos_compact_halted == 1' for: 5m labels: severity: warning @@ -24,7 +24,7 @@ groups: description: "Thanos Compact {{$labels.job}} has failed to run and now is halted.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosCompactorHighCompactionFailures - expr: '(sum by (job) (rate(thanos_compact_group_compactions_failures_total{job=~".*thanos-compact.*"}[5m])) / sum by (job) (rate(thanos_compact_group_compactions_total{job=~".*thanos-compact.*"}[5m])) * 100 > 5) and sum by (job) (rate(thanos_compact_group_compactions_total{job=~".*thanos-compact.*"}[5m])) > 0' + expr: '(sum by (job) (rate(thanos_compact_group_compactions_failures_total[5m])) / sum by (job) (rate(thanos_compact_group_compactions_total[5m])) * 100 > 5) and sum by (job) (rate(thanos_compact_group_compactions_total[5m])) > 0' for: 15m labels: severity: warning diff --git a/dist/rules/thanos/thanos-query.yml b/dist/rules/thanos/thanos-query.yml index fc0b4c1..f8d5adf 100644 --- a/dist/rules/thanos/thanos-query.yml +++ b/dist/rules/thanos/thanos-query.yml @@ -32,8 +32,9 @@ groups: summary: Thanos Query Grpc Server Error Rate (instance {{ $labels.instance }}) description: "Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Filters to actual error codes only. grpc_code!="OK" would include benign codes like NotFound, AlreadyExists, and Cancelled. 
- alert: ThanosQueryGrpcClientErrorRate - expr: '(sum by (job) (rate(grpc_client_handled_total{grpc_code!="OK", job=~".*thanos-query.*"}[5m])) / sum by (job) (rate(grpc_client_started_total{job=~".*thanos-query.*"}[5m]))) * 100 > 5 and sum by (job) (rate(grpc_client_started_total{job=~".*thanos-query.*"}[5m])) > 0' + expr: '(sum by (job) (rate(grpc_client_handled_total{grpc_code=~"Unknown|Internal|Unavailable|DataLoss|DeadlineExceeded|ResourceExhausted", job=~".*thanos-query.*"}[5m])) / sum by (job) (rate(grpc_client_started_total{job=~".*thanos-query.*"}[5m]))) * 100 > 5 and sum by (job) (rate(grpc_client_started_total{job=~".*thanos-query.*"}[5m])) > 0' for: 5m labels: severity: warning @@ -42,7 +43,7 @@ groups: description: "Thanos Query {{$labels.job}} is failing to send {{$value | humanize}}% of requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosQueryHighDNSFailures - expr: '(sum by (job) (rate(thanos_query_store_apis_dns_failures_total{job=~".*thanos-query.*"}[5m])) / sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~".*thanos-query.*"}[5m]))) * 100 > 1 and sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~".*thanos-query.*"}[5m])) > 0' + expr: '(sum by (job) (rate(thanos_query_store_apis_dns_failures_total[5m])) / sum by (job) (rate(thanos_query_store_apis_dns_lookups_total[5m]))) * 100 > 1 and sum by (job) (rate(thanos_query_store_apis_dns_lookups_total[5m])) > 0' for: 15m labels: severity: warning @@ -51,7 +52,7 @@ groups: description: "Thanos Query {{$labels.job}} have {{$value | humanize}}% of failing DNS queries for store endpoints.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosQueryInstantLatencyHigh - expr: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m]))) > 40 and sum by (job) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m])) > 0)' + expr: 
'(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m]))) > 40 and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-query.*", handler="query"}[5m])) > 0)' for: 10m labels: severity: critical diff --git a/dist/rules/thanos/thanos-receiver.yml b/dist/rules/thanos/thanos-receiver.yml index 8cc54f8..101b063 100644 --- a/dist/rules/thanos/thanos-receiver.yml +++ b/dist/rules/thanos/thanos-receiver.yml @@ -24,7 +24,7 @@ groups: description: "Thanos Receive {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosReceiveHighReplicationFailures - expr: 'thanos_receive_replication_factor > 1 and ((sum by (job) (rate(thanos_receive_replications_total{result="error", job=~".*thanos-receive.*"}[5m])) / sum by (job) (rate(thanos_receive_replications_total{job=~".*thanos-receive.*"}[5m]))) > (max by (job) (floor((thanos_receive_replication_factor{job=~".*thanos-receive.*"}+1)/ 2)) / max by (job) (thanos_receive_hashring_nodes{job=~".*thanos-receive.*"}))) * 100' + expr: 'thanos_receive_replication_factor > 1 and ((sum by (job) (rate(thanos_receive_replications_total{result="error"}[5m])) / sum by (job) (rate(thanos_receive_replications_total[5m]))) > (max by (job) (floor((thanos_receive_replication_factor+1)/ 2)) / max by (job) (thanos_receive_hashring_nodes))) * 100' for: 5m labels: severity: warning @@ -33,7 +33,7 @@ groups: description: "Thanos Receive {{$labels.job}} is failing to replicate {{$value | humanize}}% of requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosReceiveHighForwardRequestFailures - expr: '(sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~".*thanos-receive.*"}[5m]))/ sum by (job) (rate(thanos_receive_forward_requests_total{job=~".*thanos-receive.*"}[5m]))) * 100 > 20 and sum by (job) 
(rate(thanos_receive_forward_requests_total{job=~".*thanos-receive.*"}[5m])) > 0' + expr: '(sum by (job) (rate(thanos_receive_forward_requests_total{result="error"}[5m]))/ sum by (job) (rate(thanos_receive_forward_requests_total[5m]))) * 100 > 20 and sum by (job) (rate(thanos_receive_forward_requests_total[5m])) > 0' for: 5m labels: severity: info @@ -42,7 +42,7 @@ groups: description: "Thanos Receive {{$labels.job}} is failing to forward {{$value | humanize}}% of requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosReceiveHighHashringFileRefreshFailures - expr: '(sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~".*thanos-receive.*"}[5m])) / sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total{job=~".*thanos-receive.*"}[5m])) > 0) and sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total{job=~".*thanos-receive.*"}[5m])) > 0' + expr: '(sum by (job) (rate(thanos_receive_hashrings_file_errors_total[5m])) / sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total[5m])) > 0) and sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total[5m])) > 0' for: 15m labels: severity: warning @@ -51,7 +51,7 @@ groups: description: "Thanos Receive {{$labels.job}} is failing to refresh hashring file, {{$value | humanize}} of attempts failed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosReceiveConfigReloadFailure - expr: 'avg by (job) (thanos_receive_config_last_reload_successful{job=~".*thanos-receive.*"}) != 1' + expr: 'avg by (job) (thanos_receive_config_last_reload_successful) != 1' for: 5m labels: severity: warning diff --git a/dist/rules/thanos/thanos-ruler.yml b/dist/rules/thanos/thanos-ruler.yml index 16d9cc1..8c04895 100644 --- a/dist/rules/thanos/thanos-ruler.yml +++ b/dist/rules/thanos/thanos-ruler.yml @@ -6,7 +6,7 @@ groups: rules: - alert: ThanosRuleQueueIsDroppingAlerts - expr: 'sum by (job, instance) (rate(thanos_alert_queue_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) 
> 0' + expr: 'sum by (job, instance) (rate(thanos_alert_queue_alerts_dropped_total[5m])) > 0' for: 5m labels: severity: critical @@ -15,7 +15,7 @@ groups: description: "Thanos Rule {{$labels.instance}} is failing to queue alerts ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosRuleSenderIsFailingAlerts - expr: 'sum by (job, instance) (rate(thanos_alert_sender_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0' + expr: 'sum by (job, instance) (rate(thanos_alert_sender_alerts_dropped_total[5m])) > 0' for: 5m labels: severity: critical @@ -34,7 +34,7 @@ groups: # Threshold of 0.05/s avoids firing on transient single-event spikes. - alert: ThanosRuleHighRuleEvaluationWarnings - expr: 'sum by (job, instance) (rate(thanos_rule_evaluation_with_warnings_total{job=~".*thanos-rule.*"}[5m])) > 0.05' + expr: 'sum by (job, instance) (rate(thanos_rule_evaluation_with_warnings_total[5m])) > 0.05' for: 15m labels: severity: info @@ -61,7 +61,7 @@ groups: description: "Thanos Rule {{$labels.job}} is failing to handle {{$value | humanize}}% of requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosRuleConfigReloadFailure - expr: 'avg by (job, instance) (thanos_rule_config_last_reload_successful{job=~".*thanos-rule.*"}) != 1' + expr: 'avg by (job, instance) (thanos_rule_config_last_reload_successful) != 1' for: 5m labels: severity: info @@ -70,7 +70,7 @@ groups: description: "Thanos Rule {{$labels.job}} has not been able to reload its configuration.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosRuleQueryHighDNSFailures - expr: '(sum by (job, instance) (rate(thanos_rule_query_apis_dns_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) * 100 > 1) and sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) > 0' + expr: '(sum by (job, instance) 
(rate(thanos_rule_query_apis_dns_failures_total[5m])) / sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total[5m])) * 100 > 1) and sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total[5m])) > 0' for: 15m labels: severity: warning @@ -79,7 +79,7 @@ groups: description: "Thanos Rule {{$labels.job}} has {{$value | humanize}}% of failing DNS queries for query endpoints.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosRuleAlertmanagerHighDNSFailures - expr: '(sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) * 100 > 1) and sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) > 0' + expr: '(sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_failures_total[5m])) / sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total[5m])) * 100 > 1) and sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total[5m])) > 0' for: 15m labels: severity: warning @@ -97,7 +97,7 @@ groups: description: "Thanos Rule {{$labels.job}} has rule groups that did not evaluate for at least 10x of their expected interval.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosNoRuleEvaluations - expr: 'sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) <= 0 and sum by (job, instance) (thanos_rule_loaded_rules{job=~".*thanos-rule.*"}) > 0' + expr: 'sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) <= 0 and sum by (job, instance) (thanos_rule_loaded_rules) > 0' for: 5m labels: severity: critical diff --git a/dist/rules/thanos/thanos-sidecar.yml b/dist/rules/thanos/thanos-sidecar.yml index c0c668d..3a19c43 100644 --- a/dist/rules/thanos/thanos-sidecar.yml +++ b/dist/rules/thanos/thanos-sidecar.yml @@ -16,7 +16,7 @@ groups: 
description: "Thanos Sidecar {{$labels.instance}} bucket operations are failing ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosSidecarNoConnectionToStartedPrometheus - expr: 'thanos_sidecar_prometheus_up{job=~".*thanos-sidecar.*"} == 0 and on (namespace, pod)prometheus_tsdb_data_replay_duration_seconds != 0' + expr: 'thanos_sidecar_prometheus_up == 0 and on (namespace, pod) prometheus_tsdb_data_replay_duration_seconds != 0' for: 5m labels: severity: critical diff --git a/dist/rules/thanos/thanos-store.yml b/dist/rules/thanos/thanos-store.yml index 289e0dd..075b0f7 100644 --- a/dist/rules/thanos/thanos-store.yml +++ b/dist/rules/thanos/thanos-store.yml @@ -15,7 +15,7 @@ groups: description: "Thanos Store {{$labels.job}} is failing to handle {{$value | humanize}}% of requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosStoreSeriesGateLatencyHigh - expr: '(histogram_quantile(0.99, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{job=~".*thanos-store.*"}[5m]))) > 2 and sum by (job) (rate(thanos_bucket_store_series_gate_duration_seconds_count{job=~".*thanos-store.*"}[5m])) > 0)' + expr: '(histogram_quantile(0.99, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket[5m]))) > 2 and sum by (job) (rate(thanos_bucket_store_series_gate_duration_seconds_count[5m])) > 0)' for: 10m labels: severity: warning diff --git a/dist/rules/zfs/zfs_exporter.yml b/dist/rules/zfs/zfs_exporter.yml index 78c4d05..c195c4d 100644 --- a/dist/rules/zfs/zfs_exporter.yml +++ b/dist/rules/zfs/zfs_exporter.yml @@ -6,13 +6,13 @@ groups: rules: - alert: ZfsPoolOutOfSpace - expr: 'zfs_pool_free_bytes * 100 / zfs_pool_size_bytes < 10 and ON (instance, device, mountpoint) zfs_pool_readonly == 0 and zfs_pool_size_bytes > 0' + expr: 'zfs_pool_free_bytes * 100 / zfs_pool_size_bytes < 10 and zfs_pool_readonly == 0 and zfs_pool_size_bytes > 0' for: 0m labels: severity: warning annotations: 
summary: ZFS pool out of space (instance {{ $labels.instance }}) - description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "ZFS pool {{ $labels.pool }} is almost full (< 10% left).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # 0: ONLINE # 1: DEGRADED diff --git a/dist/rules/zookeeper/dabealu-zookeeper-exporter.yml b/dist/rules/zookeeper/dabealu-zookeeper-exporter.yml index 69e29e5..e0873a2 100644 --- a/dist/rules/zookeeper/dabealu-zookeeper-exporter.yml +++ b/dist/rules/zookeeper/dabealu-zookeeper-exporter.yml @@ -31,7 +31,7 @@ groups: severity: critical annotations: summary: Zookeeper Too Many Leaders (instance {{ $labels.instance }}) - description: "Zookeeper cluster has too many nodes marked as leader\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Zookeeper cluster has {{ $value }} nodes marked as leader (expected 1), indicating a split-brain\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ZookeeperNotOk expr: 'zk_ruok == 0' @@ -40,4 +40,4 @@ groups: severity: warning annotations: summary: Zookeeper Not Ok (instance {{ $labels.instance }}) - description: "Zookeeper instance is not ok\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Zookeeper instance {{ $labels.instance }} is not ok (ruok check failed)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"