This commit is contained in:
samber 2026-04-06 18:38:45 +00:00
parent 2258835c30
commit ed1515015a
65 changed files with 311 additions and 241 deletions

View file

@ -35,7 +35,7 @@ groups:
# A single restart may be normal during deployments. Adjust threshold based on restart tolerance.
- alert: FlinkJobRestartIncreasing
expr: 'increase(flink_jobmanager_job_numRestarts[5m]) > 1'
expr: 'delta(flink_jobmanager_job_numRestarts[5m]) > 1'
for: 5m
labels:
severity: warning
@ -44,7 +44,7 @@ groups:
description: "Flink job {{ $labels.job_name }} has restarted {{ $value }} times in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: FlinkCheckpointFailures
expr: 'increase(flink_jobmanager_job_numberOfFailedCheckpoints[10m]) > 1'
expr: 'delta(flink_jobmanager_job_numberOfFailedCheckpoints[10m]) > 1'
for: 5m
labels:
severity: warning
@ -82,8 +82,9 @@ groups:
summary: Flink task high backpressure time (instance {{ $labels.instance }})
description: "Flink task {{ $labels.task_name }} is spending {{ $value | humanize }}ms/sec in backpressure.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Flink TaskManagers manage their own memory pool. High JVM heap usage (outside managed memory) may indicate memory leaks or misconfiguration.
- alert: FlinkTaskmanagerHeapMemoryHigh
expr: 'flink_taskmanager_Status_JVM_Memory_Heap_Used / flink_taskmanager_Status_JVM_Memory_Heap_Max > 0.9'
expr: 'flink_taskmanager_Status_JVM_Memory_Heap_Used / flink_taskmanager_Status_JVM_Memory_Heap_Max > 0.9 and flink_taskmanager_Status_JVM_Memory_Heap_Max > 0'
for: 5m
labels:
severity: warning
@ -92,7 +93,7 @@ groups:
description: "Flink TaskManager {{ $labels.instance }} heap memory usage is above 90%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: FlinkJobmanagerHeapMemoryHigh
expr: 'flink_jobmanager_Status_JVM_Memory_Heap_Used / flink_jobmanager_Status_JVM_Memory_Heap_Max > 0.9'
expr: 'flink_jobmanager_Status_JVM_Memory_Heap_Used / flink_jobmanager_Status_JVM_Memory_Heap_Max > 0.9 and flink_jobmanager_Status_JVM_Memory_Heap_Max > 0'
for: 5m
labels:
severity: warning
@ -100,9 +101,10 @@ groups:
summary: Flink JobManager heap memory high (instance {{ $labels.instance }})
description: "Flink JobManager {{ $labels.instance }} heap memory usage is above 90%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Flink exposes GC time as a gauge (cumulative milliseconds), so deriv() is used instead of rate().
# Threshold: more than 100ms/sec of GC time (10% of wall clock). Adjust based on your workload.
- alert: FlinkTaskmanagerGcTimeHigh
expr: 'rate(flink_taskmanager_Status_JVM_GarbageCollector_All_Time[5m]) > 100'
expr: 'deriv(flink_taskmanager_Status_JVM_GarbageCollector_All_Time[5m]) > 100'
for: 5m
labels:
severity: warning
@ -112,7 +114,7 @@ groups:
# Only fires for tasks that have previously received records, to avoid false positives during startup.
- alert: FlinkNoRecordsProcessed
expr: 'rate(flink_taskmanager_job_task_numRecordsIn[5m]) == 0 and flink_taskmanager_job_task_numRecordsIn > 0'
expr: 'delta(flink_taskmanager_job_task_numRecordsIn[5m]) == 0 and flink_taskmanager_job_task_numRecordsIn > 0'
for: 5m
labels:
severity: warning

View file

@ -27,7 +27,7 @@ groups:
expr: 'apache_uptime_seconds_total / 60 < 1'
for: 0m
labels:
severity: warning
severity: info
annotations:
summary: Apache restart (instance {{ $labels.instance }})
description: "Apache has just been restarted.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -33,7 +33,7 @@ groups:
description: "UPS now running on battery (since {{$value | humanizeDuration}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ApcUpsLowBatteryVoltage
expr: '(apcupsd_battery_volts / apcupsd_battery_nominal_volts) < 0.95'
expr: '(apcupsd_battery_volts / apcupsd_battery_nominal_volts) < 0.95 and apcupsd_battery_nominal_volts > 0'
for: 0m
labels:
severity: warning

View file

@ -7,7 +7,7 @@ groups:
- alert: BlackboxProbeFailed
expr: 'probe_success == 0'
for: 0m
for: 1m
labels:
severity: critical
annotations:
@ -34,7 +34,7 @@ groups:
- alert: BlackboxProbeHttpFailure
expr: 'probe_http_status_code <= 199 OR probe_http_status_code >= 400'
for: 0m
for: 1m
labels:
severity: critical
annotations:

View file

@ -6,13 +6,13 @@ groups:
rules:
- alert: CaddyReverseProxyDown
expr: 'count(caddy_reverse_proxy_upstreams_healthy) by (upstream) == 0'
expr: 'caddy_reverse_proxy_upstreams_healthy == 0'
for: 0m
labels:
severity: critical
annotations:
summary: Caddy Reverse Proxy Down (instance {{ $labels.instance }})
description: "All Caddy reverse proxies are down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Caddy reverse proxy upstream {{ $labels.upstream }} is unhealthy\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CaddyHighHttp4xxErrorRateService
expr: 'sum(rate(caddy_http_request_duration_seconds_count{code=~"4.."}[3m])) by (instance) / sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) * 100 > 5 and sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) > 0'

View file

@ -33,7 +33,7 @@ groups:
description: "High viewwrite latency on {{ $labels.instance }} cassandra node\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraAuthenticationFailures
expr: 'rate(cassandra_stats{name="org:apache:cassandra:metrics:client:authfailure:count"}[1m]) > 5'
expr: 'delta(cassandra_stats{name="org:apache:cassandra:metrics:client:authfailure:count"}[1m]) > 5'
for: 2m
labels:
severity: warning
@ -97,7 +97,7 @@ groups:
description: "Some Cassandra repair tasks are blocked\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraConnectionTimeoutsTotal(criteo)
expr: 'rate(cassandra_stats{name="org:apache:cassandra:metrics:connection:totaltimeouts:count"}[1m]) > 5'
expr: 'delta(cassandra_stats{name="org:apache:cassandra:metrics:connection:totaltimeouts:count"}[1m]) > 5'
for: 2m
labels:
severity: critical
@ -142,7 +142,7 @@ groups:
description: "Read failures have occurred because too many nodes are unavailable\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraClientRequestWriteFailure(criteo)
expr: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:write:failures:oneminuterate"} > 0'
expr: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:write:failures:oneminuterate"} > 0.05'
for: 0m
labels:
severity: critical
@ -151,7 +151,7 @@ groups:
description: "A lot of write failures encountered. A write failure is a non-timeout exception encountered during a write request. Examine the reason map to find to the root cause. The most common cause for this type of error is when batch sizes are too large.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraClientRequestReadFailure(criteo)
expr: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:read:failures:oneminuterate"} > 0'
expr: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:read:failures:oneminuterate"} > 0.05'
for: 0m
labels:
severity: critical
@ -159,11 +159,12 @@ groups:
summary: Cassandra client request read failure (Criteo) (instance {{ $labels.instance }})
description: "A lot of read failures encountered. A read failure is a non-timeout exception encountered during a read request. Examine the reason map to find to the root cause. The most common cause for this type of error is when batch sizes are too large.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# A low key cache hit rate increases disk I/O. Threshold is workload-dependent — adjust based on your data access patterns.
- alert: CassandraCacheHitRateKeyCache
expr: 'cassandra_stats{name="org:apache:cassandra:metrics:cache:keycache:hitrate:value"} < .85'
for: 2m
labels:
severity: critical
severity: warning
annotations:
summary: Cassandra cache hit rate key cache (instance {{ $labels.instance }})
description: "Key cache hit rate is below 85%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -97,7 +97,7 @@ groups:
description: "Some Cassandra client requests are unavailable to read - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraClientRequestWriteFailure(instaclustr)
expr: 'increase(cassandra_client_request_failures_total{operation="write"}[1m]) > 0'
expr: 'increase(cassandra_client_request_failures_total{operation="write"}[1m]) > 5'
for: 2m
labels:
severity: critical
@ -106,7 +106,7 @@ groups:
description: "Write failures have occurred, ensure there are not too many unavailable nodes - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraClientRequestReadFailure(instaclustr)
expr: 'increase(cassandra_client_request_failures_total{operation="read"}[1m]) > 0'
expr: 'increase(cassandra_client_request_failures_total{operation="read"}[1m]) > 5'
for: 2m
labels:
severity: critical

View file

@ -5,9 +5,11 @@ groups:
rules:
# ceph_health_status: 0=HEALTH_OK, 1=HEALTH_WARN, 2=HEALTH_ERR.
# This rule fires on any non-OK state. Split into separate warning/critical rules by using ==1 and ==2 thresholds if needed.
- alert: CephState
expr: 'ceph_health_status != 0'
for: 0m
for: 1m
labels:
severity: critical
annotations:
@ -34,15 +36,16 @@ groups:
- alert: CephOsdDown
expr: 'ceph_osd_up == 0'
for: 0m
for: 1m
labels:
severity: critical
annotations:
summary: Ceph OSD Down (instance {{ $labels.instance }})
description: "Ceph Object Storage Daemon Down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 5000ms (5 seconds). Adjust based on your expected OSD performance.
- alert: CephHighOsdLatency
expr: 'ceph_osd_perf_apply_latency_seconds > 5'
expr: 'ceph_osd_apply_latency_ms > 5000'
for: 1m
labels:
severity: warning
@ -50,14 +53,16 @@ groups:
summary: Ceph high OSD latency (instance {{ $labels.instance }})
description: "Ceph Object Storage Daemon latency is high. Please check if it doesn't stuck in weird state.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CephOsdLowSpace
expr: 'ceph_osd_utilization > 90'
for: 2m
# Ceph internally triggers OSD_NEARFULL based on the nearfull_ratio (default 85%).
# ceph_health_detail can also be used for more granular OSD space alerts.
- alert: CephOsdNearFull
expr: 'ceph_health_detail{name="OSD_NEARFULL"} == 1'
for: 5m
labels:
severity: warning
annotations:
summary: Ceph OSD low space (instance {{ $labels.instance }})
description: "Ceph Object Storage Daemon is going out of space. Please add more disks.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Ceph OSD near full (instance {{ $labels.instance }})
description: "A Ceph OSD is dangerously full. Please add more disks.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CephOsdReweighted
expr: 'ceph_osd_weight < 1'
@ -115,7 +120,7 @@ groups:
- alert: CephPgUnavailable
expr: 'ceph_pg_total - ceph_pg_active > 0'
for: 0m
for: 1m
labels:
severity: critical
annotations:

View file

@ -33,9 +33,10 @@ groups:
summary: Cert-Manager certificate not ready (instance {{ $labels.instance }})
description: "The certificate {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is not ready to serve traffic.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# In cert-manager 1.19+, the metric was renamed (dropped http_ prefix). Verify metric name against your version.
# Metric renamed in cert-manager v1.19+ (dropped the http_ prefix): certmanager_acme_client_request_count.
# For cert-manager < v1.19, use: certmanager_http_acme_client_request_count.
- alert: Cert-managerHittingAcmeRateLimits
expr: 'sum by (host) (rate(certmanager_http_acme_client_request_count{status="429"}[5m])) > 0'
expr: 'sum by (host) (rate(certmanager_acme_client_request_count{status="429"}[5m])) > 0'
for: 5m
labels:
severity: critical

View file

@ -45,7 +45,7 @@ groups:
description: "Cilium agent {{ $labels.pod }} has {{ $value }} endpoint(s) in invalid state.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CiliumAgentEndpointRegenerationFailures
expr: 'sum(rate(cilium_endpoint_regenerations_total{outcome="fail"}[5m])) by (pod) > 0'
expr: 'sum(rate(cilium_endpoint_regenerations_total{outcome="fail"}[5m])) by (pod) > 0.05'
for: 5m
labels:
severity: warning
@ -54,7 +54,7 @@ groups:
description: "Cilium agent {{ $labels.pod }} is failing to regenerate endpoints. Network policy enforcement may be stale.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CiliumAgentEndpointUpdateFailure
expr: 'sum(rate(cilium_k8s_client_api_calls_total{method=~"(PUT|POST|PATCH)", endpoint="endpoint", return_code!~"2[0-9][0-9]"}[5m])) by (pod, method, return_code) > 0'
expr: 'sum(rate(cilium_k8s_client_api_calls_total{method=~"(PUT|POST|PATCH)", endpoint="endpoint", return_code!~"2[0-9][0-9]"}[5m])) by (pod, method, return_code) > 0.05'
for: 5m
labels:
severity: warning
@ -63,7 +63,7 @@ groups:
description: "Cilium agent {{ $labels.pod }} is failing K8s endpoint update API calls ({{ $labels.method }} {{ $labels.return_code }}).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CiliumAgentEndpointCreateFailure
expr: 'sum(rate(cilium_api_limiter_processed_requests_total{api_call=~"endpoint-create", outcome="fail"}[1m])) by (pod, api_call) > 0'
expr: 'sum(rate(cilium_api_limiter_processed_requests_total{api_call=~"endpoint-create", outcome="fail"}[1m])) by (pod, api_call) > 0.05'
for: 5m
labels:
severity: info
@ -72,7 +72,7 @@ groups:
description: "Cilium agent {{ $labels.pod }} is failing CNI endpoint-create calls. New pods may fail to get networking.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CiliumAgentMapOperationFailures
expr: 'sum(rate(cilium_bpf_map_ops_total{outcome="fail"}[5m])) by (map_name, pod) > 0'
expr: 'sum(rate(cilium_bpf_map_ops_total{outcome="fail"}[5m])) by (map_name, pod) > 0.05'
for: 5m
labels:
severity: warning
@ -100,7 +100,7 @@ groups:
description: "Cilium agent {{ $labels.pod }} conntrack table is full, causing packet drops. Increase CT map size or investigate connection leaks.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CiliumAgentConntrackFailedGarbageCollection
expr: 'sum(rate(cilium_datapath_conntrack_gc_runs_total{status="uncompleted"}[5m])) by (pod) > 0'
expr: 'sum(rate(cilium_datapath_conntrack_gc_runs_total{status="uncompleted"}[5m])) by (pod) > 0.05'
for: 5m
labels:
severity: warning
@ -128,7 +128,7 @@ groups:
description: "Cilium agent {{ $labels.pod }} is dropping packets due to policy denial. Verify network policies are correct.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CiliumAgentHighDropRate
expr: 'sum(rate(cilium_drop_count_total{reason!~"Policy denied"}[5m])) by (pod, reason) > 0'
expr: 'sum(rate(cilium_drop_count_total{reason!~"Policy denied"}[5m])) by (pod, reason) > 0.05'
for: 5m
labels:
severity: warning
@ -146,7 +146,7 @@ groups:
description: "Cilium agent {{ $labels.pod }} policy BPF map is above 90% utilization. New policies may fail to apply.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CiliumAgentPolicyImportErrors
expr: 'sum(rate(cilium_policy_change_total{outcome="fail"}[5m])) by (pod) > 0'
expr: 'sum(rate(cilium_policy_change_total{outcome="fail"}[5m])) by (pod) > 0.05'
for: 5m
labels:
severity: warning
@ -156,7 +156,7 @@ groups:
# Threshold of 60s is a rough default. Adjust based on cluster size and policy complexity.
- alert: CiliumAgentPolicyImplementationDelay
expr: 'histogram_quantile(0.99, sum(rate(cilium_policy_implementation_delay[5m])) by (le, pod)) > 60'
expr: 'histogram_quantile(0.99, sum(rate(cilium_policy_implementation_delay_bucket[5m])) by (le, pod)) > 60'
for: 5m
labels:
severity: warning
@ -203,7 +203,7 @@ groups:
# Some Cilium versions may not have a status label on this metric. Verify against your Cilium version.
- alert: CiliumOperatorIpamInterfaceCreationFailures
expr: 'sum(rate(cilium_operator_ipam_interface_creation_ops{status!="success"}[5m])) by () > 0'
expr: 'sum(rate(cilium_operator_ipam_interface_creation_ops{status!="success"}[5m])) by () > 0.05'
for: 10m
labels:
severity: warning
@ -212,7 +212,7 @@ groups:
description: "Cilium operator is failing to create IPAM network interfaces. IP allocation may be impacted.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CiliumAgentApiErrors
expr: 'sum(rate(cilium_agent_api_process_time_seconds_count{return_code=~"5[0-9][0-9]"}[5m])) by (pod, return_code) > 0'
expr: 'sum(rate(cilium_agent_api_process_time_seconds_count{return_code=~"5[0-9][0-9]"}[5m])) by (pod, return_code) > 0.05'
for: 5m
labels:
severity: warning
@ -221,7 +221,7 @@ groups:
description: "Cilium agent {{ $labels.pod }} API is returning 5xx errors ({{ $labels.return_code }}). Agent may be unhealthy.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CiliumAgentKubernetesClientErrors
expr: 'sum(rate(cilium_k8s_client_api_calls_total{endpoint!="metrics", return_code!~"2[0-9][0-9]"}[5m])) by (pod, endpoint, return_code) > 0'
expr: 'sum(rate(cilium_k8s_client_api_calls_total{endpoint!="metrics", return_code!~"2[0-9][0-9]"}[5m])) by (pod, endpoint, return_code) > 0.05'
for: 5m
labels:
severity: info
@ -239,13 +239,13 @@ groups:
description: "Cilium ClusterMesh remote cluster {{ $labels.target_cluster }} is not ready from {{ $labels.source_cluster }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CiliumClustermeshRemoteClusterFailing
expr: 'sum(rate(cilium_clustermesh_remote_cluster_failures[5m])) by (source_cluster, target_cluster) > 0'
expr: 'sum(cilium_clustermesh_remote_cluster_failures) by (source_cluster, target_cluster) > 0'
for: 5m
labels:
severity: critical
annotations:
summary: Cilium ClusterMesh remote cluster failing (instance {{ $labels.instance }})
description: "Cilium ClusterMesh connectivity to remote cluster {{ $labels.target_cluster }} from {{ $labels.source_cluster }} is failing.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Cilium ClusterMesh connectivity to remote cluster {{ $labels.target_cluster }} from {{ $labels.source_cluster }} is failing ({{ $value }} failures).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CiliumKvstoremeshRemoteClusterNotReady
expr: 'count(cilium_kvstoremesh_remote_cluster_readiness_status < 1) by (source_cluster, target_cluster) > 0'
@ -257,16 +257,16 @@ groups:
description: "Cilium KVStoreMesh remote cluster {{ $labels.target_cluster }} is not ready from {{ $labels.source_cluster }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CiliumKvstoremeshRemoteClusterFailing
expr: 'sum(rate(cilium_kvstoremesh_remote_cluster_failures[5m])) by (source_cluster, target_cluster) > 0'
expr: 'sum(cilium_kvstoremesh_remote_cluster_failures) by (source_cluster, target_cluster) > 0'
for: 5m
labels:
severity: critical
annotations:
summary: Cilium KVStoreMesh remote cluster failing (instance {{ $labels.instance }})
description: "Cilium KVStoreMesh remote cluster {{ $labels.target_cluster }} from {{ $labels.source_cluster }} is experiencing failures.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Cilium KVStoreMesh remote cluster {{ $labels.target_cluster }} from {{ $labels.source_cluster }} is experiencing failures ({{ $value }} failures).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CiliumKvstoremeshSyncErrors
expr: 'sum(rate(cilium_kvstoremesh_kvstore_sync_errors_total[5m])) by (source_cluster) > 0'
expr: 'sum(rate(cilium_kvstoremesh_kvstore_sync_errors_total[5m])) by (source_cluster) > 0.05'
for: 5m
labels:
severity: critical
@ -275,7 +275,7 @@ groups:
description: "Cilium KVStoreMesh from {{ $labels.source_cluster }} is experiencing kvstore sync errors.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CiliumHubbleLostEvents
expr: 'sum(rate(hubble_lost_events_total[5m])) by (pod) > 0'
expr: 'sum(rate(hubble_lost_events_total[5m])) by (pod) > 0.05'
for: 5m
labels:
severity: warning

View file

@ -135,7 +135,7 @@ groups:
description: "Access denied errors have been logged, which could indicate permission issues or unauthorized access attempts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ClickhouseRejectedInsertQueries
expr: 'increase(ClickHouseProfileEvents_RejectedInserts[1m]) > 0'
expr: 'increase(ClickHouseProfileEvents_RejectedInserts[1m]) > 2'
for: 1m
labels:
severity: warning
@ -144,7 +144,7 @@ groups:
description: "INSERTs rejected due to too many active data parts. Reduce insert frequency.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ClickhouseDelayedInsertQueries
expr: 'increase(ClickHouseProfileEvents_DelayedInserts[5m]) > 0'
expr: 'increase(ClickHouseProfileEvents_DelayedInserts[5m]) > 10'
for: 2m
labels:
severity: warning
@ -172,7 +172,7 @@ groups:
description: "High network usage. ClickHouse network usage exceeds 100MB/s.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ClickhouseDistributedRejectedInserts
expr: 'increase(ClickHouseProfileEvents_DistributedRejectedInserts[5m]) > 0'
expr: 'increase(ClickHouseProfileEvents_DistributedRejectedInserts[5m]) > 3'
for: 2m
labels:
severity: critical

View file

@ -24,23 +24,23 @@ groups:
description: "Cortex not connected to Alertmanager (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 0.05/s avoids firing on transient single-event spikes.
- alert: CortexNotificationAreBeingDropped
- alert: CortexNotificationsAreBeingDropped
expr: 'rate(cortex_prometheus_notifications_dropped_total[5m]) > 0.05'
for: 0m
labels:
severity: critical
annotations:
summary: Cortex notification are being dropped (instance {{ $labels.instance }})
description: "Cortex notification are being dropped due to errors (instance {{ $labels.instance }}, {{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Cortex notifications are being dropped (instance {{ $labels.instance }})
description: "Cortex notifications are being dropped due to errors (instance {{ $labels.instance }}, {{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 0.05/s avoids firing on transient single-event spikes.
- alert: CortexNotificationError
- alert: CortexNotificationErrors
expr: 'rate(cortex_prometheus_notifications_errors_total[5m]) > 0.05'
for: 0m
labels:
severity: critical
annotations:
summary: Cortex notification error (instance {{ $labels.instance }})
summary: Cortex notification errors (instance {{ $labels.instance }})
description: "Cortex is failing when sending alert notifications (instance {{ $labels.instance }}, {{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CortexIngesterUnhealthy

View file

@ -23,6 +23,7 @@ groups:
summary: CouchDB atom memory usage critical (instance {{ $labels.instance }})
description: "Atom memory usage is above 90% of limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# CouchDB's default max_dbs_open is 500, but this threshold assumes a limit of 1000 (0.9 * 1000). Replace 1000 with your actual max_dbs_open setting.
- alert: CouchdbOpenDatabasesCritical
expr: 'couchdb_httpd_open_databases > 0.9 * 1000'
for: 5m
@ -32,6 +33,7 @@ groups:
summary: CouchDB open databases critical (instance {{ $labels.instance }})
description: "Number of open databases exceeds 90% of node capacity\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Adjust 65535 to match your system's file descriptor limit (ulimit -n).
- alert: CouchdbOpenOsFilesCritical
expr: 'couchdb_httpd_open_os_files > 0.9 * 65535'
for: 5m
@ -159,7 +161,7 @@ groups:
description: "CouchDB process has restarted recently\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CouchdbCriticalLogEntries
expr: 'increase(couchdb_server_couch_log{level=~"error|critical"}[5m]) > 0'
expr: 'increase(couchdb_server_couch_log{level=~"error|critical"}[5m]) > 5'
for: 1m
labels:
severity: critical

View file

@ -78,7 +78,7 @@ groups:
description: "DigitalOcean platform has {{ $value }} active incident(s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: DigitaloceanExporterCollectionErrors
expr: 'increase(digitalocean_errors_total[5m]) > 0'
expr: 'increase(digitalocean_errors_total[5m]) > 3'
for: 5m
labels:
severity: warning

View file

@ -73,7 +73,7 @@ groups:
description: "This alert rule monitors the absolute change in CPU usage within a time window and triggers an alert when the change exceeds 25%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ContainerLowCpuUtilization
expr: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) < 20'
expr: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) < 20 and sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) > 0'
for: 7d
labels:
severity: info

View file

@ -16,7 +16,7 @@ groups:
description: "eBPF program {{ $labels.id }} failed to attach. The program is not collecting data. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: EbpfExporterDecoderErrors
expr: 'rate(ebpf_exporter_decoder_errors_total[5m]) > 0'
expr: 'rate(ebpf_exporter_decoder_errors_total[5m]) > 0.05'
for: 5m
labels:
severity: warning

View file

@ -142,8 +142,9 @@ groups:
summary: Elasticsearch no new documents (instance {{ $labels.instance }})
description: "No new documents for 10 min!\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 10ms (0.01s) per indexing operation is a rough default. Adjust based on your document size and cluster performance.
- alert: ElasticsearchHighIndexingLatency
expr: 'rate(elasticsearch_indices_indexing_index_time_seconds_total[1m]) / rate(elasticsearch_indices_indexing_index_total[1m]) > 0.0005 and rate(elasticsearch_indices_indexing_index_total[1m]) > 0'
expr: 'rate(elasticsearch_indices_indexing_index_time_seconds_total[5m]) / rate(elasticsearch_indices_indexing_index_total[5m]) > 0.01 and rate(elasticsearch_indices_indexing_index_total[5m]) > 0'
for: 10m
labels:
severity: warning
@ -151,6 +152,7 @@ groups:
summary: Elasticsearch High Indexing Latency (instance {{ $labels.instance }})
description: "The indexing latency on Elasticsearch cluster is higher than the threshold (current value: {{ $value }}s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 10000 ops/s is a rough default. Adjust based on your cluster capacity and expected workload.
- alert: ElasticsearchHighIndexingRate
expr: 'sum(rate(elasticsearch_indices_indexing_index_total[1m]))> 10000'
for: 5m
@ -160,6 +162,7 @@ groups:
summary: Elasticsearch High Indexing Rate (instance {{ $labels.instance }})
description: "The indexing rate on Elasticsearch cluster is higher than the threshold.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 100 queries/s is very low for most production clusters. Adjust based on your expected query volume.
- alert: ElasticsearchHighQueryRate
expr: 'sum(rate(elasticsearch_indices_search_query_total[1m])) > 100'
for: 5m

View file

@ -66,7 +66,7 @@ groups:
severity: warning
annotations:
summary: Envoy cluster membership degraded (instance {{ $labels.instance }})
description: "More than 25% of members in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} are unhealthy\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Only {{ $value | printf \"%.1f\" }}% of members in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} are healthy (threshold: 75%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: EnvoyHighClusterUpstreamConnectionFailures
expr: 'increase(envoy_cluster_upstream_cx_connect_fail[5m]) > 10'
@ -159,7 +159,7 @@ groups:
description: "Circuit breaker is open for cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: EnvoyNoHealthyUpstream
expr: 'increase(envoy_cluster_upstream_cx_none_healthy[5m]) > 0'
expr: 'increase(envoy_cluster_upstream_cx_none_healthy[5m]) > 3'
for: 0m
labels:
severity: critical

View file

@ -61,6 +61,7 @@ groups:
summary: Etcd GRPC requests slow (instance {{ $labels.instance }})
description: "GRPC requests slowing down, 99th percentile is over 0.15s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# These etcd_http_* metrics are from the etcd v2 API and do not exist in etcd 3.x. Remove these rules if running etcd 3.x.
- alert: EtcdHighNumberOfFailedHttpRequestsWarning
expr: 'sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.01 and sum(rate(etcd_http_received_total[1m])) BY (method) > 0'
for: 2m
@ -70,6 +71,7 @@ groups:
summary: Etcd high number of failed HTTP requests warning (instance {{ $labels.instance }})
description: "More than 1% HTTP failure detected in Etcd\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# These etcd_http_* metrics are from the etcd v2 API and do not exist in etcd 3.x. Remove these rules if running etcd 3.x.
- alert: EtcdHighNumberOfFailedHttpRequestsCritical
expr: 'sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.05 and sum(rate(etcd_http_received_total[1m])) BY (method) > 0'
for: 2m
@ -79,6 +81,7 @@ groups:
summary: Etcd high number of failed HTTP requests critical (instance {{ $labels.instance }})
description: "More than 5% HTTP failure detected in Etcd\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# This etcd_http_* metric is from the etcd v2 API and does not exist in etcd 3.x. Remove this rule if running etcd 3.x.
- alert: EtcdHttpRequestsSlow
expr: 'histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[1m])) > 0.15'
for: 2m
@ -89,7 +92,7 @@ groups:
description: "HTTP requests slowing down, 99th percentile is over 0.15s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: EtcdMemberCommunicationSlow
expr: 'histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) > 0.15'
expr: 'histogram_quantile(0.99, sum(rate(etcd_network_peer_round_trip_time_seconds_bucket[5m])) by (instance, le)) > 0.15'
for: 2m
labels:
severity: warning
@ -107,7 +110,7 @@ groups:
description: "Etcd server got {{ $value }} failed proposals in the past hour\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: EtcdHighFsyncDurations
expr: 'histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) > 0.5'
expr: 'histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) by (instance, le)) > 0.5'
for: 2m
labels:
severity: warning
@ -116,7 +119,7 @@ groups:
description: "Etcd WAL fsync duration increasing, 99th percentile is over 0.5s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: EtcdHighCommitDurations
expr: 'histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[1m])) > 0.25'
expr: 'histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) by (instance, le)) > 0.25'
for: 2m
labels:
severity: warning

View file

@ -7,12 +7,12 @@ groups:
- alert: FreeswitchDown
expr: 'freeswitch_up == 0'
for: 0m
for: 1m
labels:
severity: critical
annotations:
summary: Freeswitch down (instance {{ $labels.instance }})
description: "Freeswitch is unresponsive\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Freeswitch {{ $labels.instance }} is unresponsive.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: FreeswitchSessionsWarning
expr: '(freeswitch_session_active * 100 / freeswitch_session_limit) > 80 and freeswitch_session_limit > 0'

View file

@ -5,9 +5,9 @@ groups:
rules:
# grpc_code!="OK" includes non-error codes like NotFound, AlreadyExists. Consider filtering to specific error codes for less noise.
# Filters to actual error codes. grpc_code!="OK" includes benign codes like NotFound, AlreadyExists, and Cancelled.
- alert: GitlabGitalyHighGrpcErrorRate
expr: 'sum(rate(grpc_server_handled_total{job="gitaly",grpc_code!="OK"}[5m])) / sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) * 100 > 5 and sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) > 0'
expr: 'sum(rate(grpc_server_handled_total{job="gitaly",grpc_code=~"Internal|Unavailable|DeadlineExceeded|ResourceExhausted|Aborted|Unknown|DataLoss"}[5m])) / sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) * 100 > 5 and sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) > 0'
for: 5m
labels:
severity: warning
@ -17,7 +17,6 @@ groups:
# ResourceExhausted errors from Gitaly mean Git operations are being rejected due to
# concurrency limits. This directly impacts users trying to push, pull, or clone.
# This alert is derived from the GitLab Omnibus default rules.
- alert: GitlabGitalyResourceExhausted
expr: 'sum(rate(grpc_server_handled_total{job="gitaly",grpc_code="ResourceExhausted"}[5m])) / sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) * 100 > 1 and sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) > 0'
for: 5m
@ -36,8 +35,9 @@ groups:
summary: GitLab Gitaly high RPC latency (instance {{ $labels.instance }})
description: "Gitaly on {{ $labels.instance }} p95 unary RPC latency exceeds 1 second ({{ $value }}s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Brief throttling spikes are normal. Threshold of 0.1s/s (10% of CPU time throttled) filters out transient noise.
- alert: GitlabGitalyCpuThrottled
expr: 'rate(gitaly_cgroup_cpu_cfs_throttled_seconds_total[5m]) > 0'
expr: 'rate(gitaly_cgroup_cpu_cfs_throttled_seconds_total[5m]) > 0.1'
for: 5m
labels:
severity: warning
@ -46,7 +46,7 @@ groups:
description: "Gitaly processes on {{ $labels.instance }} are being CPU throttled by cgroups.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: GitlabGitalyAuthenticationFailures
expr: 'increase(gitaly_authentications_total{status="failed"}[5m]) > 0'
expr: 'increase(gitaly_authentications_total{status="failed"}[5m]) > 3'
for: 0m
labels:
severity: warning

View file

@ -138,7 +138,7 @@ groups:
# This metric may not exist in all GitLab versions. Verify against your GitLab installation.
- alert: GitlabCiPipelineFailuresIncreasing
expr: 'rate(gitlab_ci_pipeline_failure_reasons[5m]) > 0'
expr: 'deriv(gitlab_ci_pipeline_failure_reasons[5m]) > 0.05'
for: 10m
labels:
severity: warning
@ -179,7 +179,7 @@ groups:
description: "GitLab Ruby heap fragmentation on {{ $labels.instance }} is {{ $value }}. High fragmentation wastes memory.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: GitlabRackUncaughtErrors
expr: 'rate(rack_uncaught_errors_total[5m]) > 0'
expr: 'rate(rack_uncaught_errors_total[5m]) > 0.05'
for: 5m
labels:
severity: warning

View file

@ -57,10 +57,10 @@ groups:
summary: Go heap objects count high (instance {{ $labels.instance }})
description: "Go heap has too many live objects (> 10M), high GC pressure\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# go_memstats_gc_cpu_fraction is deprecated since Go 1.20 and may return 0 in newer versions.
# Consider using runtime/metrics-based alternatives if running Go >= 1.20.
# rate(go_gc_duration_seconds_sum) approximates the fraction of wall-clock time spent in GC.
# This replaces go_memstats_gc_cpu_fraction which was removed in client_golang v1.12+.
- alert: GoGcCpuFractionHigh
expr: 'go_memstats_gc_cpu_fraction > 0.05'
expr: 'rate(go_gc_duration_seconds_sum[5m]) > 0.05'
for: 5m
labels:
severity: warning
@ -68,23 +68,27 @@ groups:
summary: Go GC CPU fraction high (instance {{ $labels.instance }})
description: "Go GC is consuming too much CPU (> 5%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# A threshold of 100/s only catches catastrophic leaks (30k goroutines in 5m). 10/s catches gradual leaks (~3k in 5m).
# Adjust based on your application's expected concurrency patterns.
- alert: GoGoroutineSpike
expr: 'deriv(go_goroutines[5m]) > 100'
expr: 'deriv(go_goroutines[5m]) > 10'
for: 5m
labels:
severity: warning
annotations:
summary: Go goroutine spike (instance {{ $labels.instance }})
description: "Go goroutine count is growing rapidly\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Go goroutine count is growing rapidly ({{ $value | printf \"%.0f\" }} goroutines/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: GoHeapFragmentation
expr: 'go_memstats_heap_idle_bytes / go_memstats_heap_sys_bytes > 0.9'
for: 5m
# Alerts when the heap in-use trend, estimated by linear regression over the last 10 minutes, exceeds 10MB/s.
# Adjust threshold based on your workload.
- alert: GoHeapIn-useGrowing
expr: 'deriv(go_memstats_heap_inuse_bytes[10m]) > 1e7'
for: 0m
labels:
severity: warning
annotations:
summary: Go heap fragmentation (instance {{ $labels.instance }})
description: "Go heap has high idle ratio (> 90%), indicating memory fragmentation\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Go heap in-use growing (instance {{ $labels.instance }})
description: "Go heap in-use memory is growing steadily, potential memory leak or under-sized heap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: GoMemoryLeak
expr: 'rate(go_memstats_alloc_bytes_total[5m]) > 1e9'

View file

@ -178,8 +178,9 @@ groups:
summary: Mimir distributor inflight requests high (instance {{ $labels.instance }})
description: "Mimir distributor {{ $labels.instance }} is using {{ printf \"%.0f\" $value }}% of its inflight push requests limit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 0.05/s avoids firing on transient single-event spikes.
- alert: MimirIngesterTsdbHeadCompactionFailed
expr: 'rate(cortex_ingester_tsdb_compactions_failed_total[5m]) > 0'
expr: 'rate(cortex_ingester_tsdb_compactions_failed_total[5m]) > 0.05'
for: 15m
labels:
severity: critical
@ -187,26 +188,29 @@ groups:
summary: Mimir ingester TSDB head compaction failed (instance {{ $labels.instance }})
description: "Mimir ingester {{ $labels.instance }} is failing to compact TSDB head ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 0.05/s avoids firing on transient single-event spikes.
- alert: MimirIngesterTsdbHeadTruncationFailed
expr: 'rate(cortex_ingester_tsdb_head_truncations_failed_total[5m]) > 0'
for: 0m
expr: 'rate(cortex_ingester_tsdb_head_truncations_failed_total[5m]) > 0.05'
for: 15m
labels:
severity: critical
annotations:
summary: Mimir ingester TSDB head truncation failed (instance {{ $labels.instance }})
description: "Mimir ingester {{ $labels.instance }} is failing to truncate TSDB head ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 0.05/s avoids firing on transient single-event spikes.
- alert: MimirIngesterTsdbCheckpointCreationFailed
expr: 'rate(cortex_ingester_tsdb_checkpoint_creations_failed_total[5m]) > 0'
for: 0m
expr: 'rate(cortex_ingester_tsdb_checkpoint_creations_failed_total[5m]) > 0.05'
for: 15m
labels:
severity: critical
annotations:
summary: Mimir ingester TSDB checkpoint creation failed (instance {{ $labels.instance }})
description: "Mimir ingester {{ $labels.instance }} is failing to create TSDB checkpoints ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 0.05/s avoids firing on transient single-event spikes.
- alert: MimirIngesterTsdbCheckpointDeletionFailed
expr: 'rate(cortex_ingester_tsdb_checkpoint_deletions_failed_total[5m]) > 0'
expr: 'rate(cortex_ingester_tsdb_checkpoint_deletions_failed_total[5m]) > 0.05'
for: 0m
labels:
severity: critical
@ -214,8 +218,9 @@ groups:
summary: Mimir ingester TSDB checkpoint deletion failed (instance {{ $labels.instance }})
description: "Mimir ingester {{ $labels.instance }} is failing to delete TSDB checkpoints ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 0.05/s avoids firing on transient single-event spikes.
- alert: MimirIngesterTsdbWalTruncationFailed
expr: 'rate(cortex_ingester_tsdb_wal_truncations_failed_total[5m]) > 0'
expr: 'rate(cortex_ingester_tsdb_wal_truncations_failed_total[5m]) > 0.05'
for: 0m
labels:
severity: warning
@ -223,8 +228,9 @@ groups:
summary: Mimir ingester TSDB WAL truncation failed (instance {{ $labels.instance }})
description: "Mimir ingester {{ $labels.instance }} is failing to truncate TSDB WAL ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 0.05/s avoids firing on transient single-event spikes.
- alert: MimirIngesterTsdbWalWritesFailed
expr: 'rate(cortex_ingester_tsdb_wal_writes_failed_total[1m]) > 0'
expr: 'rate(cortex_ingester_tsdb_wal_writes_failed_total[1m]) > 0.05'
for: 3m
labels:
severity: critical
@ -232,7 +238,7 @@ groups:
summary: Mimir ingester TSDB WAL writes failed (instance {{ $labels.instance }})
description: "Mimir ingester {{ $labels.instance }} is failing to write to TSDB WAL ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold aligned with official Mimir mixin (30 minutes).
# Threshold of 30 minutes. Adjust based on your sync interval.
- alert: MimirStoreGatewayHasNotSyncedBucket
expr: '(time() - cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 1800) and cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 0'
for: 5m
@ -240,7 +246,7 @@ groups:
severity: critical
annotations:
summary: Mimir store gateway has not synced bucket (instance {{ $labels.instance }})
description: "Mimir store-gateway {{ $labels.instance }} has not synced the bucket for more than 10 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Mimir store-gateway {{ $labels.instance }} has not synced the bucket for more than 30 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MimirStoreGatewayNoSyncedTenants
expr: '(min by (instance, job) (cortex_bucket_stores_tenants_synced{component="store-gateway"}) == 0) and on (instance) (cortex_bucket_stores_tenants_synced{component="store-gateway"} offset 1h > 0)'
@ -287,8 +293,9 @@ groups:
summary: Mimir compactor has consecutive failures (instance {{ $labels.instance }})
description: "Mimir compactor {{ $labels.instance }} has had {{ $value }} compaction failures in the last 2 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# cortex_compactor_disk_out_of_space_errors_total is declared as gauge by Mimir despite the _total suffix, so delta() is used instead of increase().
- alert: MimirCompactorHasRunOutOfDiskSpace
expr: 'increase(cortex_compactor_disk_out_of_space_errors_total[24h]) >= 1'
expr: 'delta(cortex_compactor_disk_out_of_space_errors_total[24h]) >= 1'
for: 0m
labels:
severity: critical
@ -305,7 +312,7 @@ groups:
summary: Mimir compactor has not uploaded blocks (instance {{ $labels.instance }})
description: "Mimir compactor {{ $labels.instance }} has not uploaded any block in the last 24 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Using 24h window per official mixin — compaction skips are rare events.
# Using a 24h window as compaction skips are rare events.
- alert: MimirCompactorSkippedBlocks
expr: 'increase(cortex_compactor_blocks_marked_for_no_compaction_total[24h]) > 0'
for: 5m
@ -352,8 +359,9 @@ groups:
summary: Mimir ruler failed ring check (instance {{ $labels.instance }})
description: "Mimir ruler {{ $labels.job }} is failing ring checks ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 0.05/s avoids firing on transient single-event spikes.
- alert: MimirAlertmanagerSyncConfigsFailing
expr: 'rate(cortex_alertmanager_sync_configs_failed_total[5m]) > 0'
expr: 'rate(cortex_alertmanager_sync_configs_failed_total[5m]) > 0.05'
for: 30m
labels:
severity: critical
@ -361,8 +369,9 @@ groups:
summary: Mimir alertmanager sync configs failing (instance {{ $labels.instance }})
description: "Mimir alertmanager {{ $labels.job }} is failing to sync configs ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 0.05/s avoids firing on transient single-event spikes.
- alert: MimirAlertmanagerRingCheckFailing
expr: 'rate(cortex_alertmanager_ring_check_errors_total[5m]) > 0'
expr: 'rate(cortex_alertmanager_ring_check_errors_total[5m]) > 0.05'
for: 10m
labels:
severity: critical
@ -370,8 +379,9 @@ groups:
summary: Mimir alertmanager ring check failing (instance {{ $labels.instance }})
description: "Mimir alertmanager {{ $labels.job }} is failing ring checks ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 0.05/s avoids firing on transient single-event spikes.
- alert: MimirAlertmanagerStateMergeFailing
expr: 'rate(cortex_alertmanager_partial_state_merges_failed_total[5m]) > 0'
expr: 'rate(cortex_alertmanager_partial_state_merges_failed_total[5m]) > 0.05'
for: 10m
labels:
severity: critical
@ -379,8 +389,9 @@ groups:
summary: Mimir alertmanager state merge failing (instance {{ $labels.instance }})
description: "Mimir alertmanager {{ $labels.job }} is failing to merge state updates ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 0.05/s avoids firing on transient single-event spikes.
- alert: MimirAlertmanagerReplicationFailing
expr: 'rate(cortex_alertmanager_state_replication_failed_total[5m]) > 0'
expr: 'rate(cortex_alertmanager_state_replication_failed_total[5m]) > 0.05'
for: 10m
labels:
severity: critical
@ -388,8 +399,9 @@ groups:
summary: Mimir alertmanager replication failing (instance {{ $labels.instance }})
description: "Mimir alertmanager {{ $labels.job }} is failing to replicate state ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 0.05/s avoids firing on transient single-event spikes.
- alert: MimirAlertmanagerPersistStateFailing
expr: 'rate(cortex_alertmanager_state_persist_failed_total[15m]) > 0'
expr: 'rate(cortex_alertmanager_state_persist_failed_total[15m]) > 0.05'
for: 1h
labels:
severity: critical

View file

@ -117,7 +117,7 @@ groups:
summary: Tempo compaction too many outstanding blocks warning (instance {{ $labels.instance }})
description: "There are too many outstanding compaction blocks for {{ $labels.instance }}. Consider increasing compactor resources.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Official Tempo mixin normalizes by backend-worker count. Adjust threshold based on your compactor configuration.
# Threshold of 250 blocks per compactor instance. Normalize by backend-worker count if needed. Adjust based on your environment.
- alert: TempoCompactionTooManyOutstandingBlocksCritical
expr: 'sum by (instance) (tempodb_compaction_outstanding_blocks) > 250'
for: 24h
@ -127,8 +127,9 @@ groups:
summary: Tempo compaction too many outstanding blocks critical (instance {{ $labels.instance }})
description: "There are too many outstanding compaction blocks for {{ $labels.instance }}. Increase compactor resources immediately.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 0.05/s avoids firing on transient single-event spikes.
- alert: TempoDistributorUsageTrackerErrors
expr: 'sum by (job, reason) (rate(tempo_distributor_usage_tracker_errors_total[5m])) > 0'
expr: 'sum by (job, reason) (rate(tempo_distributor_usage_tracker_errors_total[5m])) > 0.05'
for: 30m
labels:
severity: critical
@ -137,7 +138,7 @@ groups:
description: "Tempo distributor usage tracker errors for {{ $labels.job }} at {{ $value | humanize }}/s (reason {{ $labels.reason }}).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: TempoMetricsGeneratorProcessorUpdatesFailing
expr: 'sum by (job) (increase(tempo_metrics_generator_active_processors_update_failed_total[5m])) > 0'
expr: 'sum by (job) (increase(tempo_metrics_generator_active_processors_update_failed_total[5m])) > 2'
for: 15m
labels:
severity: critical
@ -146,7 +147,7 @@ groups:
description: "Tempo metrics generator processor updates are failing for {{ $labels.job }} ({{ $value }} failures in 5m).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: TempoMetricsGeneratorServiceGraphsDroppingSpans
expr: '100 * sum by (job) (rate(tempo_metrics_generator_processor_service_graphs_dropped_spans[5m])) / sum by (job) (rate(tempo_metrics_generator_spans_received_total[5m])) > 0.5 and sum by (job) (rate(tempo_metrics_generator_spans_received_total[5m])) > 0'
expr: '100 * sum by (job) (rate(tempo_metrics_generator_processor_service_graphs_dropped_spans_total[5m])) / sum by (job) (rate(tempo_metrics_generator_spans_received_total[5m])) > 0.5 and sum by (job) (rate(tempo_metrics_generator_spans_received_total[5m])) > 0'
for: 15m
labels:
severity: warning

View file

@ -41,6 +41,7 @@ groups:
summary: Provider failed because get genesis timeout (instance {{ $labels.instance }})
description: "Timeout to get genesis for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 10ms. Adjust based on your expected database latency.
- alert: StoreConnectionSlow
expr: 'store_connection_wait_time_ms > 10'
for: 0m
@ -50,6 +51,7 @@ groups:
summary: Store connection slow (instance {{ $labels.instance }})
description: "Store connection is too slow to `{{$labels.pool}}` pool, `{{$labels.shard}}` shard in Graph node `{{$labels.instance}}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 20ms. Adjust based on your expected database latency.
- alert: StoreConnectionVerySlow
expr: 'store_connection_wait_time_ms > 20'
for: 0m

View file

@ -5,6 +5,9 @@ groups:
rules:
# When targets are managed via service discovery, a disappeared target goes stale rather than reporting up==0,
# so this alert may not fire. Prefer application-level availability metrics if available.
# Rename job="hadoop-namenode" to match the actual job name in your Prometheus scrape config.
- alert: HadoopNameNodeDown
expr: 'up{job="hadoop-namenode"} == 0'
for: 5m
@ -14,6 +17,9 @@ groups:
summary: Hadoop Name Node Down (instance {{ $labels.instance }})
description: "The Hadoop NameNode service is unavailable.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# When targets are managed via service discovery, a disappeared target goes stale rather than reporting up==0,
# so this alert may not fire. Prefer application-level availability metrics if available.
# Rename job="hadoop-resourcemanager" to match the actual job name in your Prometheus scrape config.
- alert: HadoopResourceManagerDown
expr: 'up{job="hadoop-resourcemanager"} == 0'
for: 5m
@ -51,7 +57,7 @@ groups:
description: "There is an unusually high number of MapReduce task failures.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HadoopResourceManagerMemoryHigh
expr: 'hadoop_resourcemanager_memory_bytes / hadoop_resourcemanager_memory_max_bytes > 0.8'
expr: 'hadoop_resourcemanager_memory_bytes / hadoop_resourcemanager_memory_max_bytes > 0.8 and hadoop_resourcemanager_memory_max_bytes > 0'
for: 15m
labels:
severity: warning
@ -78,7 +84,7 @@ groups:
description: "The HBase cluster has an unusually high number of regions.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HadoopHbaseRegionServerHeapLow
expr: 'hadoop_hbase_region_server_heap_bytes / hadoop_hbase_region_server_max_heap_bytes > 0.8'
expr: 'hadoop_hbase_region_server_heap_bytes / hadoop_hbase_region_server_max_heap_bytes > 0.8 and hadoop_hbase_region_server_max_heap_bytes > 0'
for: 10m
labels:
severity: warning

View file

@ -12,7 +12,7 @@ groups:
severity: critical
annotations:
summary: HAProxy high HTTP 4xx error rate backend (instance {{ $labels.instance }})
description: "Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.proxy }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyHighHttp5xxErrorRateBackend
expr: '((sum by (proxy) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 and sum by (proxy) (rate(haproxy_server_http_responses_total[1m])) > 0'
@ -21,7 +21,7 @@ groups:
severity: critical
annotations:
summary: HAProxy high HTTP 5xx error rate backend (instance {{ $labels.instance }})
description: "Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.proxy }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyHighHttp4xxErrorRateServer
expr: '((sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0'
@ -57,7 +57,7 @@ groups:
severity: critical
annotations:
summary: HAProxy backend connection errors (instance {{ $labels.instance }})
description: "Too many connection errors to {{ $labels.fqdn }}/{{ $labels.backend }} backend (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Too many connection errors to {{ $labels.proxy }} backend (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyServerConnectionErrors
expr: '(sum by (proxy) (rate(haproxy_server_connection_errors_total[1m]))) > 100'
@ -66,10 +66,10 @@ groups:
severity: critical
annotations:
summary: HAProxy server connection errors (instance {{ $labels.instance }})
description: "Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Too many connection errors to {{ $labels.proxy }} (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyBackendMaxActiveSession>80%
expr: '((haproxy_backend_current_sessions >0) * 100) / (haproxy_backend_limit_sessions > 0) > 80'
expr: '(haproxy_backend_current_sessions / haproxy_backend_limit_sessions * 100) > 80 and haproxy_backend_limit_sessions > 0'
for: 2m
labels:
severity: warning
@ -94,7 +94,7 @@ groups:
severity: warning
annotations:
summary: HAProxy HTTP slowing down (instance {{ $labels.instance }})
description: "Average request time is increasing - {{ $value | printf \"%.2f\"}}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "HAProxy backend max total time is above 1s on {{ $labels.proxy }} - {{ $value | printf \"%.2f\"}}s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyRetryHigh
expr: 'sum by (proxy) (rate(haproxy_backend_retry_warnings_total[1m])) > 10'
@ -124,8 +124,8 @@ groups:
description: "HAProxy is blocking requests for security reason\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyServerHealthcheckFailure
expr: 'increase(haproxy_server_check_failures_total[1m]) > 0'
for: 1m
expr: 'increase(haproxy_server_check_failures_total[1m]) > 2'
for: 0m
labels:
severity: warning
annotations:

View file

@ -15,22 +15,22 @@ groups:
description: "HAProxy down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyHighHttp4xxErrorRateBackend(v1)
expr: 'sum by (backend) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 0'
expr: 'sum by (backend) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) * 100 > 5 and sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 0'
for: 1m
labels:
severity: critical
annotations:
summary: HAProxy high HTTP 4xx error rate backend (v1) (instance {{ $labels.instance }})
description: "Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.backend }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyHighHttp5xxErrorRateBackend(v1)
expr: 'sum by (backend) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 0'
expr: 'sum by (backend) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) * 100 > 5 and sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 0'
for: 1m
labels:
severity: critical
annotations:
summary: HAProxy high HTTP 5xx error rate backend (v1) (instance {{ $labels.instance }})
description: "Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.backend }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyHighHttp4xxErrorRateServer(v1)
expr: 'sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0'
@ -66,7 +66,7 @@ groups:
severity: critical
annotations:
summary: HAProxy backend connection errors (v1) (instance {{ $labels.instance }})
description: "Too many connection errors to {{ $labels.fqdn }}/{{ $labels.backend }} backend (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Too many connection errors to {{ $labels.backend }} backend (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyServerConnectionErrors(v1)
expr: 'sum by (server) (rate(haproxy_server_connection_errors_total[1m])) > 100'
@ -84,7 +84,7 @@ groups:
severity: warning
annotations:
summary: HAProxy backend max active session (instance {{ $labels.instance }})
description: "HAproxy backend {{ $labels.fqdn }}/{{ $labels.backend }} is reaching session limit (> 80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "HAProxy backend {{ $labels.backend }} is reaching session limit (> 80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyPendingRequests(v1)
expr: 'sum by (backend) (haproxy_backend_current_queue) > 0'
@ -93,7 +93,7 @@ groups:
severity: warning
annotations:
summary: HAProxy pending requests (v1) (instance {{ $labels.instance }})
description: "Some HAProxy requests are pending on {{ $labels.fqdn }}/{{ $labels.backend }} backend\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Some HAProxy requests are pending on {{ $labels.backend }} backend\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyHttpSlowingDown(v1)
expr: 'avg by (backend) (haproxy_backend_http_total_time_average_seconds) > 1'
@ -111,7 +111,7 @@ groups:
severity: warning
annotations:
summary: HAProxy retry high (v1) (instance {{ $labels.instance }})
description: "High rate of retry on {{ $labels.fqdn }}/{{ $labels.backend }} backend\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "High rate of retry on {{ $labels.backend }} backend\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyBackendDown
expr: 'haproxy_backend_up == 0'
@ -141,8 +141,8 @@ groups:
description: "HAProxy is blocking requests for security reason\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyServerHealthcheckFailure(v1)
expr: 'increase(haproxy_server_check_failures_total[1m]) > 0'
for: 1m
expr: 'increase(haproxy_server_check_failures_total[1m]) > 2'
for: 0m
labels:
severity: warning
annotations:

View file

@ -7,7 +7,7 @@ groups:
- alert: VaultSealed
expr: 'vault_core_unsealed == 0'
for: 0m
for: 1m
labels:
severity: critical
annotations:
@ -21,7 +21,7 @@ groups:
severity: warning
annotations:
summary: Vault too many pending tokens (instance {{ $labels.instance }})
description: "Too many pending tokens {{ $labels.instance }}: {{ $value | printf \"%.2f\"}}%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Too many pending tokens on {{ $labels.instance }}: {{ $value }} tokens created but not yet stored.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: VaultTooManyInfinityTokens
expr: 'vault_token_count_by_ttl{creation_ttl="+Inf"} > 3'
@ -30,13 +30,13 @@ groups:
severity: warning
annotations:
summary: Vault too many infinity tokens (instance {{ $labels.instance }})
description: "Too many infinity tokens {{ $labels.instance }}: {{ $value | printf \"%.2f\"}}%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Too many non-expiring tokens on {{ $labels.instance }}: {{ $value }} tokens with infinite TTL.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: VaultClusterHealth
expr: 'sum(vault_core_active) / count(vault_core_active) <= 0.5'
expr: 'sum(vault_core_active) / count(vault_core_active) <= 0.5 and count(vault_core_active) > 0'
for: 0m
labels:
severity: critical
annotations:
summary: Vault cluster health (instance {{ $labels.instance }})
description: "Vault cluster is not healthy {{ $labels.instance }}: {{ $value | printf \"%.2f\"}}%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Vault cluster is not healthy: only {{ $value | humanizePercentage }} of nodes are active.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -14,8 +14,9 @@ groups:
summary: Host out of memory (instance {{ $labels.instance }})
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# node_vmstat_pgmajfault is exposed as untyped/gauge by node_exporter (from /proc/vmstat), so deriv() is used instead of rate().
- alert: HostMemoryUnderMemoryPressure
expr: '(rate(node_vmstat_pgmajfault[5m]) > 1000)'
expr: '(deriv(node_vmstat_pgmajfault[5m]) > 1000)'
for: 0m
labels:
severity: warning
@ -173,13 +174,13 @@ groups:
severity: warning
annotations:
summary: Host unusual disk IO (instance {{ $labels.instance }})
description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# x2 context switches is an arbitrary number.
# The alert threshold depends on the nature of the application.
# Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
- alert: HostContextSwitchingHigh
expr: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2'
expr: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2 and rate(node_context_switches_total[1d]) > 0'
for: 0m
labels:
severity: warning
@ -223,7 +224,7 @@ groups:
summary: Host node overtemperature alarm (instance {{ $labels.instance }})
description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Uses ignoring(state) to handle additional labels on node_md_disks. Matches the official node-exporter mixin.
# Uses ignoring(state) to handle additional labels on node_md_disks.
- alert: HostSoftwareRaidInsufficientDrives
expr: '((node_md_disks_required - ignoring(state) node_md_disks{state="active"}) > 0)'
for: 0m
@ -253,7 +254,7 @@ groups:
# When a machine runs out of memory, the node exporter can become unresponsive for several minutes. Even if the system takes 15-20 minutes to recover, the alert should still trigger.
- alert: HostOomKillDetected
expr: '(increase(node_vmstat_oom_kill[30m]) > 0)'
expr: '(delta(node_vmstat_oom_kill[30m]) > 0)'
for: 0m
labels:
severity: warning
@ -268,7 +269,7 @@ groups:
severity: info
annotations:
summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 1 minute.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostEdacUncorrectableErrorsDetected
expr: '(node_edac_uncorrectable_errors_total > 0)'
@ -277,7 +278,7 @@ groups:
severity: warning
annotations:
summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkReceiveErrors
expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) and rate(node_network_receive_packets_total[2m]) > 0'

View file

@ -12,17 +12,18 @@ groups:
severity: warning
annotations:
summary: Istio Kubernetes gateway availability drop (instance {{ $labels.instance }})
description: "Gateway pods have dropped. Inbound traffic will likely be affected.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Istio ingress gateway has only {{ $value }} available pod(s). Inbound traffic will likely be affected.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: IstioPilotHighTotalRequestRate
- alert: IstioPilotHighPushErrorRate
expr: 'sum(rate(pilot_xds_push_errors[1m])) / sum(rate(pilot_xds_pushes[1m])) * 100 > 5 and sum(rate(pilot_xds_pushes[1m])) > 0'
for: 1m
labels:
severity: warning
annotations:
summary: Istio Pilot high total request rate (instance {{ $labels.instance }})
summary: Istio Pilot high push error rate (instance {{ $labels.instance }})
description: "Number of Istio Pilot push errors is too high (> 5%). Envoy sidecars might have outdated configuration.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Mixer was deprecated in Istio 1.5 and removed in Istio 1.8+. This alert only applies to Istio < 1.8.
- alert: IstioMixerPrometheusDispatchesLow
expr: 'sum(rate(mixer_runtime_dispatches_total{adapter=~"prometheus"}[1m])) < 180'
for: 1m
@ -32,6 +33,7 @@ groups:
summary: Istio Mixer Prometheus dispatches low (instance {{ $labels.instance }})
description: "Number of Mixer dispatches to Prometheus is too low. Istio metrics might not be being exported properly.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 1000 req/s is a rough default. Adjust to your expected peak traffic.
- alert: IstioHighTotalRequestRate
expr: 'sum(rate(istio_requests_total{reporter="destination"}[5m])) > 1000'
for: 2m
@ -39,8 +41,9 @@ groups:
severity: warning
annotations:
summary: Istio high total request rate (instance {{ $labels.instance }})
description: "Global request rate in the service mesh is unusually high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Global request rate in the service mesh is unusually high ({{ $value | printf \"%.2f\" }} req/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 100 req/s is a rough default. Adjust to your expected baseline traffic. This alert may fire on startup or low-traffic environments.
- alert: IstioLowTotalRequestRate
expr: 'sum(rate(istio_requests_total{reporter="destination"}[5m])) < 100'
for: 2m
@ -48,7 +51,7 @@ groups:
severity: warning
annotations:
summary: Istio low total request rate (instance {{ $labels.instance }})
description: "Global request rate in the service mesh is unusually low.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Global request rate in the service mesh is unusually low ({{ $value | printf \"%.2f\" }} req/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: IstioHigh4xxErrorRate
expr: 'sum(rate(istio_requests_total{reporter="destination", response_code=~"4.*"}[5m])) / sum(rate(istio_requests_total{reporter="destination"}[5m])) * 100 > 5 and sum(rate(istio_requests_total{reporter="destination"}[5m])) > 0'
@ -57,7 +60,7 @@ groups:
severity: warning
annotations:
summary: Istio high 4xx error rate (instance {{ $labels.instance }})
description: "High percentage of HTTP 4xx responses in Istio (> 5%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "High percentage of HTTP 4xx responses in Istio ({{ $value | printf \"%.1f\" }}% > 5%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: IstioHigh5xxErrorRate
expr: 'sum(rate(istio_requests_total{reporter="destination", response_code=~"5.*"}[5m])) / sum(rate(istio_requests_total{reporter="destination"}[5m])) * 100 > 5 and sum(rate(istio_requests_total{reporter="destination"}[5m])) > 0'
@ -66,7 +69,7 @@ groups:
severity: warning
annotations:
summary: Istio high 5xx error rate (instance {{ $labels.instance }})
description: "High percentage of HTTP 5xx responses in Istio (> 5%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "High percentage of HTTP 5xx responses in Istio ({{ $value | printf \"%.1f\" }}% > 5%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: IstioHighRequestLatency
expr: 'rate(istio_request_duration_milliseconds_sum{reporter="destination"}[1m]) / rate(istio_request_duration_milliseconds_count{reporter="destination"}[1m]) > 100 and rate(istio_request_duration_milliseconds_count{reporter="destination"}[1m]) > 0'
@ -75,22 +78,22 @@ groups:
severity: warning
annotations:
summary: Istio high request latency (instance {{ $labels.instance }})
description: "Istio average requests execution is longer than 100ms.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Istio average request duration is {{ $value }}ms (> 100ms).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: IstioLatency99Percentile
expr: 'histogram_quantile(0.99, sum(rate(istio_request_duration_milliseconds_bucket[1m])) by (destination_canonical_service, destination_workload_namespace, source_canonical_service, source_workload_namespace, le)) > 1000'
expr: 'histogram_quantile(0.99, sum(rate(istio_request_duration_milliseconds_bucket[1m])) by (destination_canonical_service, destination_workload_namespace, le)) > 1000'
for: 1m
labels:
severity: warning
annotations:
summary: Istio latency 99 percentile (instance {{ $labels.instance }})
description: "Istio 1% slowest requests are longer than 1000ms.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Istio p99 request latency is {{ $value }}ms (threshold: 1000ms).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: IstioPilotDuplicateEntry
expr: 'sum(rate(pilot_duplicate_envoy_clusters{}[5m])) > 0'
expr: 'sum(pilot_duplicate_envoy_clusters{}) > 0'
for: 0m
labels:
severity: critical
annotations:
summary: Istio Pilot Duplicate Entry (instance {{ $labels.instance }})
description: "Istio pilot duplicate entry error.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Istio Pilot has detected {{ $value }} duplicate Envoy cluster(s), indicating misconfigured DestinationRules or ServiceEntries.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -51,7 +51,7 @@ groups:
description: "Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: JenkinsRunFailureTotal
expr: 'delta(jenkins_runs_failure_total[1h]) > 100'
expr: 'increase(jenkins_runs_failure_total[1h]) > 100'
for: 0m
labels:
severity: warning

View file

@ -12,7 +12,7 @@ groups:
severity: critical
annotations:
summary: Kafka topics replicas (instance {{ $labels.instance }})
description: "Kafka topic in-sync partition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Kafka topic {{ $labels.topic }} has fewer than 3 in-sync replicas ({{ $value }}), data durability is at risk.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KafkaConsumerGroupLag
expr: 'sum(kafka_consumergroup_lag) by (consumergroup) > 10000'

View file

@ -134,7 +134,7 @@ groups:
description: "Volume under {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to fill up within four days. Currently {{ $value | humanize }}% is available.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesPersistentvolumeError
expr: 'kube_persistentvolume_status_phase{phase=~"Failed|Pending", job="kube-state-metrics"} > 0'
expr: 'kube_persistentvolume_status_phase{phase=~"Failed|Pending"} > 0'
for: 0m
labels:
severity: critical

View file

@ -24,19 +24,19 @@ groups:
description: "The {{ $labels.job }} and {{ $labels.route }} are experiencing {{ printf \"%.2f\" $value }}% errors.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: LokiRequestPanic
expr: 'sum(increase(loki_panic_total[10m])) by (namespace, job) > 0'
for: 5m
expr: 'sum(increase(loki_panic_total[5m])) by (namespace, job) > 0'
for: 0m
labels:
severity: critical
annotations:
summary: Loki request panic (instance {{ $labels.instance }})
description: "The {{ $labels.job }} is experiencing {{ printf \"%.2f\" $value }}% increase of panics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "{{ $labels.job }} is experiencing {{ $value | humanize }} panic(s) in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: LokiRequestLatency
expr: '(histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (le))) > 1'
expr: 'histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (namespace, job, route, le)) > 1'
for: 5m
labels:
severity: critical
annotations:
summary: Loki request latency (instance {{ $labels.instance }})
description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -34,13 +34,13 @@ groups:
description: "Memcached connection usage is above 95% on {{ $labels.instance }} (current value: {{ $value }}%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MemcachedOutOfMemoryErrors
expr: 'sum without (slab) (rate(memcached_slab_items_outofmemory_total[5m])) > 0'
expr: 'sum without (slab) (rate(memcached_slab_items_outofmemory_total[5m])) > 0.05'
for: 5m
labels:
severity: warning
annotations:
summary: Memcached out of memory errors (instance {{ $labels.instance }})
description: "Memcached is returning out-of-memory errors on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Memcached is returning out-of-memory errors on {{ $labels.instance }} ({{ $value }} errors/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# High memory usage is expected if the cache is well-utilized. This alert fires when it approaches the configured limit, which may cause evictions.
- alert: MemcachedMemoryUsageHigh(>90%)
@ -73,7 +73,7 @@ groups:
description: "Memcached cache hit rate is below 80% on {{ $labels.instance }} (current value: {{ $value }}%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MemcachedConnectionsRejected
expr: 'increase(memcached_connections_rejected_total[5m]) > 0'
expr: 'increase(memcached_connections_rejected_total[5m]) > 3'
for: 5m
labels:
severity: warning
@ -82,7 +82,7 @@ groups:
description: "Memcached is rejecting connections on {{ $labels.instance }} ({{ $value }} rejections in the last 5m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MemcachedItemsTooLarge
expr: 'increase(memcached_item_too_large_total[5m]) > 0'
expr: 'increase(memcached_item_too_large_total[5m]) > 3'
for: 5m
labels:
severity: info

View file

@ -71,17 +71,19 @@ groups:
summary: MySQL Slave replication lag (instance {{ $labels.instance }})
description: "MySQL replication lag on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# mysqld_exporter exposes SHOW GLOBAL STATUS variables as untyped/gauge, so delta() is used instead of increase().
- alert: MysqlSlowQueries
expr: 'increase(mysql_global_status_slow_queries[1m]) > 0'
expr: 'delta(mysql_global_status_slow_queries[1m]) > 0'
for: 2m
labels:
severity: warning
annotations:
summary: MySQL slow queries (instance {{ $labels.instance }})
description: "MySQL server mysql has some new slow query ({{ $value }} in the last minute).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "MySQL server has some new slow queries ({{ $value }} in the last minute).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# mysqld_exporter exposes SHOW GLOBAL STATUS variables as untyped/gauge, so deriv() is used instead of rate().
- alert: MysqlInnodbLogWaits
expr: 'rate(mysql_global_status_innodb_log_waits[15m]) > 10'
expr: 'deriv(mysql_global_status_innodb_log_waits[15m]) > 10'
for: 0m
labels:
severity: warning
@ -98,8 +100,9 @@ groups:
summary: MySQL restarted (instance {{ $labels.instance }})
description: "MySQL has just been restarted, less than one minute ago on {{ $labels.instance }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# mysqld_exporter exposes SHOW GLOBAL STATUS variables as untyped/gauge, so deriv() is used instead of irate().
- alert: MysqlHighQps
expr: 'irate(mysql_global_status_questions[1m]) > 10000'
expr: 'deriv(mysql_global_status_questions[1m]) > 10000'
for: 2m
labels:
severity: info

View file

@ -32,6 +32,7 @@ groups:
summary: Nats slow consumers (instance {{ $labels.instance }})
description: "There are slow consumers in NATS for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Replace job="nats" with the actual job name in your Prometheus configuration.
- alert: NatsServerDown
expr: 'absent(up{job="nats"})'
for: 5m
@ -79,7 +80,7 @@ groups:
description: "JetStream memory usage is over 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsHighNumberOfSubscriptions
expr: 'gnatsd_connz_subscriptions > 1000'
expr: 'gnatsd_varz_subscriptions > 1000'
for: 5m
labels:
severity: warning
@ -97,7 +98,7 @@ groups:
description: "NATS server has more than 100,000 pending bytes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsTooManyErrors
expr: 'increase(gnatsd_varz_jetstream_stats_api_errors[5m]) > 0'
expr: 'increase(gnatsd_varz_jetstream_stats_api_errors[5m]) > 5'
for: 5m
labels:
severity: warning
@ -114,6 +115,8 @@ groups:
summary: Nats JetStream accounts exceeded (instance {{ $labels.instance }})
description: "JetStream has more than 100 active accounts\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Only enable this alert if your deployment requires leaf node connections.
# This will fire spuriously if leaf nodes are not configured.
- alert: NatsLeafNodeConnectionIssue
expr: 'gnatsd_varz_leafnodes == 0'
for: 5m

View file

@ -12,7 +12,7 @@ groups:
severity: warning
annotations:
summary: Nomad job failed (instance {{ $labels.instance }})
description: "Nomad job failed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Nomad job {{ $labels.job }} has {{ $value }} failed allocations.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NomadJobLost
expr: 'nomad_nomad_job_summary_lost > 0'
@ -21,7 +21,7 @@ groups:
severity: warning
annotations:
summary: Nomad job lost (instance {{ $labels.instance }})
description: "Nomad job lost\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Nomad job {{ $labels.job }} has {{ $value }} lost allocations.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NomadJobQueued
expr: 'nomad_nomad_job_summary_queued > 0'
@ -30,7 +30,7 @@ groups:
severity: warning
annotations:
summary: Nomad job queued (instance {{ $labels.instance }})
description: "Nomad job queued\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Nomad job {{ $labels.job }} has {{ $value }} queued allocations.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NomadBlockedEvaluation
expr: 'nomad_nomad_blocked_evals_total_blocked > 0'
@ -39,4 +39,4 @@ groups:
severity: warning
annotations:
summary: Nomad blocked evaluation (instance {{ $labels.instance }})
description: "Nomad blocked evaluation\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Nomad has {{ $value }} blocked evaluations. The cluster may lack resources to place allocations.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -5,6 +5,7 @@ groups:
rules:
# Adjust the job label regex to match the actual job name in your Prometheus scrape config.
- alert: OpenstackExporterDown
expr: 'up{job=~".*openstack.*"} == 0'
for: 2m

View file

@ -8,6 +8,7 @@ groups:
rules:
# Adjust the job label regex to match the actual job name in your Prometheus scrape config.
- alert: OpentelemetryCollectorDown
expr: 'up{job=~".*otel.*collector.*"} == 0'
for: 1m
@ -17,8 +18,9 @@ groups:
summary: OpenTelemetry Collector down (instance {{ $labels.instance }})
description: "OpenTelemetry Collector instance has disappeared or is not being scraped\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 0.05/s avoids firing on transient single-event spikes.
- alert: OpentelemetryCollectorReceiverRefusedSpans
expr: 'rate(otelcol_receiver_refused_spans[5m]) > 0'
expr: 'rate(otelcol_receiver_refused_spans[5m]) > 0.05'
for: 5m
labels:
severity: critical
@ -26,8 +28,9 @@ groups:
summary: OpenTelemetry Collector receiver refused spans (instance {{ $labels.instance }})
description: "OpenTelemetry Collector is refusing {{ $value | humanize }}/s spans on {{ $labels.receiver }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 0.05/s avoids firing on transient single-event spikes.
- alert: OpentelemetryCollectorReceiverRefusedMetricPoints
expr: 'rate(otelcol_receiver_refused_metric_points[5m]) > 0'
expr: 'rate(otelcol_receiver_refused_metric_points[5m]) > 0.05'
for: 5m
labels:
severity: critical
@ -35,8 +38,9 @@ groups:
summary: OpenTelemetry Collector receiver refused metric points (instance {{ $labels.instance }})
description: "OpenTelemetry Collector is refusing {{ $value | humanize }}/s metric points on {{ $labels.receiver }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 0.05/s avoids firing on transient single-event spikes.
- alert: OpentelemetryCollectorReceiverRefusedLogRecords
expr: 'rate(otelcol_receiver_refused_log_records[5m]) > 0'
expr: 'rate(otelcol_receiver_refused_log_records[5m]) > 0.05'
for: 5m
labels:
severity: critical
@ -84,6 +88,7 @@ groups:
description: "OpenTelemetry Collector exporter {{ $labels.exporter }} queue is over 80% full\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 0.05/s avoids firing on transient single-event spikes.
# These processor metrics are deprecated since collector v0.110.0.
- alert: OpentelemetryCollectorProcessorRefusedSpans
expr: 'rate(otelcol_processor_refused_spans[5m]) > 0.05'
for: 5m
@ -94,6 +99,7 @@ groups:
description: "OpenTelemetry Collector processor {{ $labels.processor }} is refusing spans ({{ $value | humanize }}/s), likely due to backpressure.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 0.05/s avoids firing on transient single-event spikes.
# These processor metrics are deprecated since collector v0.110.0.
- alert: OpentelemetryCollectorProcessorRefusedMetricPoints
expr: 'rate(otelcol_processor_refused_metric_points[5m]) > 0.05'
for: 5m
@ -104,7 +110,7 @@ groups:
description: "OpenTelemetry Collector processor {{ $labels.processor }} is refusing metric points ({{ $value | humanize }}/s), likely due to backpressure.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: OpentelemetryCollectorHighMemoryUsage
expr: '(otelcol_process_runtime_heap_alloc_bytes{job=~".*otel.*collector.*"} / on(instance, job) otelcol_process_runtime_total_sys_memory_bytes{job=~".*otel.*collector.*"}) > 0.9'
expr: '(otelcol_process_runtime_heap_alloc_bytes / on(instance, job) otelcol_process_runtime_total_sys_memory_bytes) > 0.9'
for: 5m
labels:
severity: warning

View file

@ -70,7 +70,7 @@ groups:
description: "PostgreSQL instance should have more connections (> 5)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlDeadLocks
expr: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5'
expr: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres",datid!="0"}[1m]) > 5'
for: 0m
labels:
severity: warning
@ -79,7 +79,7 @@ groups:
description: "PostgreSQL has dead-locks ({{ $value }} in the last minute)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlHighRollbackRate
expr: 'sum by (namespace,datname) ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) / ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) + (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m])))) > 0.02'
expr: 'sum by (namespace,datname,instance) (rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) / (sum by (namespace,datname,instance) (rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) + sum by (namespace,datname,instance) (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m]))) > 0.02 and (sum by (namespace,datname,instance) (rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) + sum by (namespace,datname,instance) (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m]))) > 0'
for: 0m
labels:
severity: warning
@ -96,6 +96,7 @@ groups:
summary: Postgresql commit rate low (instance {{ $labels.instance }})
description: "Postgresql seems to be processing very few transactions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# pg_txid_current is not a default postgres_exporter metric. You need to define a custom query. See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
- alert: PostgresqlLowXidConsumption
expr: 'rate(pg_txid_current[1m]) < 5'
for: 2m
@ -132,6 +133,7 @@ groups:
summary: Postgresql configuration changed (instance {{ $labels.instance }})
description: "Postgres Database configuration change has occurred\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# pg_stat_ssl_compression is not a default postgres_exporter metric and is only available on PostgreSQL 9.5-13 (removed in PG 14). See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
- alert: PostgresqlSslCompressionActive
expr: 'sum by (instance) (pg_stat_ssl_compression) > 0'
for: 0m

View file

@ -143,7 +143,7 @@ groups:
description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusAlertmanagerNotificationFailing
expr: 'rate(alertmanager_notifications_failed_total[1m]) > 0'
expr: 'rate(alertmanager_notifications_failed_total[3m]) > 0.05'
for: 0m
labels:
severity: critical

View file

@ -15,7 +15,7 @@ groups:
description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}% errors.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PromtailRequestLatency
expr: 'histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[5m])) by (le)) > 1'
expr: 'histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[5m])) by (namespace, job, route, le)) > 1'
for: 5m
labels:
severity: critical

View file

@ -41,24 +41,30 @@ groups:
summary: Pulsar topic very large backlog storage size (instance {{ $labels.instance }})
description: "The topic backlog storage size is over 20 GB\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# pulsar_storage_write_latency_le_overflow is the overflow bucket of Pulsar's non-standard histogram.
# It counts write operations exceeding all defined latency bounds (> 1000ms).
- alert: PulsarHighWriteLatency
expr: 'sum(pulsar_storage_write_latency_overflow > 0) by (topic)'
expr: 'sum(pulsar_storage_write_latency_le_overflow > 0) by (topic)'
for: 1h
labels:
severity: critical
annotations:
summary: Pulsar high write latency (instance {{ $labels.instance }})
description: "Messages cannot be written in a timely fashion\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Pulsar topic {{ $labels.topic }} has {{ $value }} storage write operations exceeding the maximum latency bucket (> 1000ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# pulsar_entry_size_le_overflow is the overflow bucket of Pulsar's non-standard histogram.
# It counts message entries exceeding all defined size bounds.
- alert: PulsarLargeMessagePayload
expr: 'sum(pulsar_entry_size_overflow > 0) by (topic)'
expr: 'sum(pulsar_entry_size_le_overflow > 0) by (topic)'
for: 1h
labels:
severity: warning
annotations:
summary: Pulsar large message payload (instance {{ $labels.instance }})
description: "Observing large message payload (> 1MB)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Pulsar topic {{ $labels.topic }} has {{ $value }} message entries exceeding the maximum size bucket (> 1MB)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# This metric name is path-dependent and may differ based on your BookKeeper data directory configuration.
# Adjust the metric name to match your actual ledger directory path.
- alert: PulsarHighLedgerDiskUsage
expr: 'sum(bookie_ledger_dir__pulsar_data_bookkeeper_ledgers_usage) by (kubernetes_pod_name) > 75'
for: 1h
@ -84,7 +90,7 @@ groups:
severity: critical
annotations:
summary: Pulsar high number of function errors (instance {{ $labels.instance }})
description: "Observing more than 10 Function errors per minute\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Pulsar function {{ $labels.name }} has more than 10 errors per second ({{ $value | printf \"%.2f\" }}/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PulsarHighNumberOfSinkErrors
expr: 'sum(rate(pulsar_sink_sink_exceptions_total[1m])) by (name) > 10'
@ -93,4 +99,4 @@ groups:
severity: critical
annotations:
summary: Pulsar high number of sink errors (instance {{ $labels.instance }})
description: "Observing more than 10 Sink errors per minute\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Pulsar sink {{ $labels.name }} has more than 10 errors per second ({{ $value | printf \"%.2f\" }}/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -6,13 +6,13 @@ groups:
rules:
- alert: PythonGcObjectsUncollectable
expr: 'increase(python_gc_objects_uncollectable_total[5m]) > 0'
expr: 'increase(python_gc_objects_uncollectable_total[5m]) > 1'
for: 5m
labels:
severity: warning
annotations:
summary: Python GC objects uncollectable (instance {{ $labels.instance }})
description: "Python has uncollectable objects, potential memory leak via reference cycles\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Python has uncollectable objects ({{ $value }}), potential memory leak via reference cycles\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PythonGcCollectionsHigh
expr: 'rate(python_gc_objects_collected_total[5m]) > 10000'

View file

@ -32,7 +32,7 @@ groups:
severity: critical
annotations:
summary: RabbitMQ cluster partition (instance {{ $labels.instance }})
description: "Cluster partition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "RabbitMQ cluster has a network partition ({{ $value }} partitions detected). Messages may be lost or duplicated.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RabbitmqOutOfMemory
expr: 'rabbitmq_node_mem_used / rabbitmq_node_mem_limit * 100 > 90 and rabbitmq_node_mem_limit > 0'
@ -44,7 +44,7 @@ groups:
description: "Memory available for RabbitMQ is low (< 10%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RabbitmqInstanceTooManyConnections
expr: 'rabbitmq_connectionsTotal > 1000'
expr: 'rabbitmq_connections > 1000'
for: 2m
labels:
severity: warning

View file

@ -23,7 +23,7 @@ groups:
severity: critical
annotations:
summary: RabbitMQ node not distributed (instance {{ $labels.instance }})
description: "Distribution link state is not 'up'\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Distribution link to peer {{ $labels.peer }} is not 'up' (state {{ $value }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RabbitmqInstancesDifferentVersions
expr: 'count(count(rabbitmq_build_info) by (rabbitmq_version)) > 1'
@ -59,7 +59,7 @@ groups:
severity: warning
annotations:
summary: RabbitMQ too many ready messages (instance {{ $labels.instance }})
description: "RabbitMQ too many ready messages on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "RabbitMQ too many ready messages on queue {{ $labels.queue }} ({{ $value }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RabbitmqTooManyUnackMessages
expr: 'sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000'
@ -68,7 +68,7 @@ groups:
severity: warning
annotations:
summary: RabbitMQ too many unack messages (instance {{ $labels.instance }})
description: "Too many unacknowledged messages\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Too many unacknowledged messages on queue {{ $labels.queue }} ({{ $value }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RabbitmqTooManyConnections
expr: 'rabbitmq_connections > 1000'
@ -88,11 +88,12 @@ groups:
summary: RabbitMQ no queue consumer (instance {{ $labels.instance }})
description: "A queue has less than 1 consumer\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 3 avoids noise from occasional misroutes. Adjust based on your expected traffic patterns.
- alert: RabbitmqUnroutableMessages
expr: 'increase(rabbitmq_channel_messages_unroutable_returned_total[1m]) > 0 or increase(rabbitmq_channel_messages_unroutable_dropped_total[1m]) > 0'
expr: 'increase(rabbitmq_channel_messages_unroutable_returned_total[5m]) > 3 or increase(rabbitmq_channel_messages_unroutable_dropped_total[5m]) > 3'
for: 2m
labels:
severity: warning
annotations:
summary: RabbitMQ unroutable messages (instance {{ $labels.instance }})
description: "A queue has unroutable messages ({{ $value }} in the last 1m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "A queue has unroutable messages ({{ $value }} in the last 5m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -24,9 +24,9 @@ groups:
summary: Ruby heap free slots high (instance {{ $labels.instance }})
description: "Ruby heap has too many free slots (> 500k), memory fragmentation after large allocations\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Major GC rate > 5/s is extremely high. Consider lowering to > 1 or > 2 for earlier detection.
# Major GC rate > 5/s only fires if the app is essentially non-functional. Threshold of 2/s provides earlier detection.
- alert: RubyMajorGcRateHigh
expr: 'rate(ruby_major_gc_ops_total[5m]) > 5'
expr: 'rate(ruby_major_gc_ops_total[5m]) > 2'
for: 5m
labels:
severity: warning

View file

@ -30,7 +30,7 @@ groups:
severity: critical
annotations:
summary: SMART device temperature over trip value (instance {{ $labels.instance }})
description: "Device temperature over trip value on {{ $labels.instance }} drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Device temperature over trip value on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SmartDeviceTemperatureNearingTripValue
expr: 'max_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) >= on(device, instance) (smartctl_device_temperature{temperature_type="drive_trip"} * .80)'
@ -39,7 +39,7 @@ groups:
severity: warning
annotations:
summary: SMART device temperature nearing trip value (instance {{ $labels.instance }})
description: "Device temperature at 80% of trip value on {{ $labels.instance }} drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Device temperature at 80% of trip value on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SmartStatus
expr: 'smartctl_device_smart_status != 1'
@ -48,7 +48,7 @@ groups:
severity: critical
annotations:
summary: SMART status (instance {{ $labels.instance }})
description: "Device has a SMART status failure on {{ $labels.instance }} drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Device has a SMART status failure on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SmartCriticalWarning
expr: 'smartctl_device_critical_warning > 0'
@ -57,7 +57,7 @@ groups:
severity: critical
annotations:
summary: SMART critical warning (instance {{ $labels.instance }})
description: "Disk controller has critical warning on {{ $labels.instance }} drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Disk controller has critical warning on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SmartMediaErrors
expr: 'smartctl_device_media_errors > 0'
@ -66,7 +66,7 @@ groups:
severity: critical
annotations:
summary: SMART media errors (instance {{ $labels.instance }})
description: "Disk controller detected media errors on {{ $labels.instance }} drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Disk controller detected media errors on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SmartWearoutIndicator
expr: 'smartctl_device_available_spare < smartctl_device_available_spare_threshold'
@ -75,4 +75,4 @@ groups:
severity: critical
annotations:
summary: SMART Wearout Indicator (instance {{ $labels.instance }})
description: "Device is wearing out on {{ $labels.instance }} drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Device is wearing out on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -6,16 +6,16 @@ groups:
rules:
- alert: SidekiqQueueSize
expr: 'sidekiq_queue_size > 100'
expr: 'sidekiq_queue_enqueued_jobs > 100'
for: 1m
labels:
severity: warning
annotations:
summary: Sidekiq queue size (instance {{ $labels.instance }})
description: "Sidekiq queue {{ $labels.name }} is growing\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Sidekiq queue {{ $labels.name }} is growing ({{ $value }} enqueued jobs)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SidekiqSchedulingLatencyTooHigh
expr: 'max(sidekiq_queue_latency) > 60'
expr: 'max(sidekiq_queue_latency_seconds) > 60'
for: 0m
labels:
severity: critical

View file

@ -7,7 +7,7 @@ groups:
rules:
# From the official snmp-mixin.
# Adjust the job=~"snmp.*" matcher to match the actual job name in your Prometheus scrape config.
- alert: SnmpTargetDown
expr: 'up{job=~"snmp.*"} == 0'
for: 5m

View file

@ -36,24 +36,24 @@ groups:
description: "Orca queue message lag is {{ $value }}s. Pipeline stages are waiting too long before being processed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SpinnakerDeadMessages
expr: 'rate(queue_dead_messages_total[5m]) > 0'
expr: 'rate(queue_dead_messages_total[5m]) > 0.05'
for: 2m
labels:
severity: critical
annotations:
summary: Spinnaker dead messages (instance {{ $labels.instance }})
description: "Orca is producing dead-lettered messages ({{ $value }} per second). These are tasks that exhausted all retries and will not be executed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Orca is producing dead-lettered messages ({{ $value | humanize }}/s). These are tasks that exhausted all retries and will not be executed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Zombies are pipeline executions that are running but have lost their queue entry.
# See https://spinnaker.io/docs/guides/runbooks/orca-zombie-executions/
- alert: SpinnakerZombieExecutions
expr: 'rate(queue_zombies_total[5m]) > 0'
expr: 'rate(queue_zombies_total[5m]) > 0.05'
for: 5m
labels:
severity: warning
annotations:
summary: Spinnaker zombie executions (instance {{ $labels.instance }})
description: "{{ $value }} zombie pipeline executions detected. These are executions with no corresponding queue messages.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Zombie pipeline executions rate is {{ $value | humanize }}/s. These are executions with no corresponding queue messages.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SpinnakerThreadPoolExhaustion
expr: 'threadpool_blockingQueueSize > 0'
@ -76,7 +76,7 @@ groups:
description: "Igor polling monitor {{ $labels.monitor }} for {{ $labels.partition }} has exceeded its item threshold, preventing pipeline triggers.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SpinnakerPollingMonitorFailures
expr: 'rate(pollingMonitor_failed_total[5m]) > 0'
expr: 'rate(pollingMonitor_failed_total[5m]) > 0.05'
for: 5m
labels:
severity: warning
@ -95,7 +95,7 @@ groups:
description: "Spinnaker API 5xx error rate is {{ $value | humanizePercentage }} on {{ $labels.instance }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SpinnakerApiRateLimitThrottling
expr: 'rate(rateLimitThrottling_total[5m]) > 0'
expr: 'rate(rateLimitThrottling_total[5m]) > 0.05'
for: 2m
labels:
severity: warning

View file

@ -7,21 +7,21 @@ groups:
- alert: SslCertificateProbeFailed
expr: 'ssl_probe_success == 0'
for: 0m
for: 1m
labels:
severity: critical
annotations:
summary: SSL certificate probe failed (instance {{ $labels.instance }})
description: "Failed to fetch SSL information for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SslCertificateOscpStatusUnknown
- alert: SslCertificateOcspStatusUnknown
expr: 'ssl_ocsp_response_status == 2'
for: 0m
labels:
severity: warning
annotations:
summary: SSL certificate OSCP status unknown (instance {{ $labels.instance }})
description: "Failed to get the OSCP status {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: SSL certificate OCSP status unknown (instance {{ $labels.instance }})
description: "Failed to get the OCSP status for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SslCertificateRevoked
expr: 'ssl_ocsp_response_status == 1'

View file

@ -42,8 +42,9 @@ groups:
summary: Systemd unit tasks near limit (instance {{ $labels.instance }})
description: "Systemd unit {{ $labels.name }} is using {{ $value | humanizePercentage }} of its task limit. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# systemd_socket_refused_connections_total is declared as Gauge by the exporter despite the _total suffix, so delta() is used instead of increase().
- alert: SystemdSocketRefusedConnections
expr: 'increase(systemd_socket_refused_connections_total[5m]) > 0'
expr: 'delta(systemd_socket_refused_connections_total[5m]) > 3'
for: 2m
labels:
severity: warning

View file

@ -6,7 +6,7 @@ groups:
rules:
- alert: ThanosBucketReplicateErrorRate
expr: '(sum by (job) (rate(thanos_replicate_replication_runs_total{result="error", job=~".*thanos-bucket-replicate.*"}[5m])) / on (job) group_left sum by (job) (rate(thanos_replicate_replication_runs_total{job=~".*thanos-bucket-replicate.*"}[5m]))) * 100 >= 10 and sum by (job) (rate(thanos_replicate_replication_runs_total{job=~".*thanos-bucket-replicate.*"}[5m])) > 0'
expr: '(sum by (job) (rate(thanos_replicate_replication_runs_total{result="error"}[5m])) / on (job) group_left sum by (job) (rate(thanos_replicate_replication_runs_total[5m]))) * 100 >= 10 and sum by (job) (rate(thanos_replicate_replication_runs_total[5m])) > 0'
for: 5m
labels:
severity: critical
@ -15,7 +15,7 @@ groups:
description: "Thanos Replicate is failing to run, {{$value | humanize}}% of attempts failed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ThanosBucketReplicateRunLatency
expr: '(histogram_quantile(0.99, sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m]))) > 20 and sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m])) > 0)'
expr: '(histogram_quantile(0.99, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket[5m]))) > 20 and sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_count[5m])) > 0)'
for: 5m
labels:
severity: critical

View file

@ -15,7 +15,7 @@ groups:
description: "No more than one Thanos Compact instance should be running at once. There are {{$value}} instances running.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ThanosCompactorHalted
expr: 'thanos_compact_halted{job=~".*thanos-compact.*"} == 1'
expr: 'thanos_compact_halted == 1'
for: 5m
labels:
severity: warning
@ -24,7 +24,7 @@ groups:
description: "Thanos Compact {{$labels.job}} has failed to run and now is halted.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ThanosCompactorHighCompactionFailures
expr: '(sum by (job) (rate(thanos_compact_group_compactions_failures_total{job=~".*thanos-compact.*"}[5m])) / sum by (job) (rate(thanos_compact_group_compactions_total{job=~".*thanos-compact.*"}[5m])) * 100 > 5) and sum by (job) (rate(thanos_compact_group_compactions_total{job=~".*thanos-compact.*"}[5m])) > 0'
expr: '(sum by (job) (rate(thanos_compact_group_compactions_failures_total[5m])) / sum by (job) (rate(thanos_compact_group_compactions_total[5m])) * 100 > 5) and sum by (job) (rate(thanos_compact_group_compactions_total[5m])) > 0'
for: 15m
labels:
severity: warning

View file

@ -32,8 +32,9 @@ groups:
summary: Thanos Query Grpc Server Error Rate (instance {{ $labels.instance }})
description: "Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Filters to actual error codes only. grpc_code!="OK" would include benign codes like NotFound, AlreadyExists, and Cancelled.
- alert: ThanosQueryGrpcClientErrorRate
expr: '(sum by (job) (rate(grpc_client_handled_total{grpc_code!="OK", job=~".*thanos-query.*"}[5m])) / sum by (job) (rate(grpc_client_started_total{job=~".*thanos-query.*"}[5m]))) * 100 > 5 and sum by (job) (rate(grpc_client_started_total{job=~".*thanos-query.*"}[5m])) > 0'
expr: '(sum by (job) (rate(grpc_client_handled_total{grpc_code=~"Unknown|Internal|Unavailable|DataLoss|DeadlineExceeded|ResourceExhausted", job=~".*thanos-query.*"}[5m])) / sum by (job) (rate(grpc_client_started_total{job=~".*thanos-query.*"}[5m]))) * 100 > 5 and sum by (job) (rate(grpc_client_started_total{job=~".*thanos-query.*"}[5m])) > 0'
for: 5m
labels:
severity: warning
@ -42,7 +43,7 @@ groups:
description: "Thanos Query {{$labels.job}} is failing to send {{$value | humanize}}% of requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ThanosQueryHighDNSFailures
expr: '(sum by (job) (rate(thanos_query_store_apis_dns_failures_total{job=~".*thanos-query.*"}[5m])) / sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~".*thanos-query.*"}[5m]))) * 100 > 1 and sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~".*thanos-query.*"}[5m])) > 0'
expr: '(sum by (job) (rate(thanos_query_store_apis_dns_failures_total[5m])) / sum by (job) (rate(thanos_query_store_apis_dns_lookups_total[5m]))) * 100 > 1 and sum by (job) (rate(thanos_query_store_apis_dns_lookups_total[5m])) > 0'
for: 15m
labels:
severity: warning
@ -51,7 +52,7 @@ groups:
description: "Thanos Query {{$labels.job}} has {{$value | humanize}}% of failing DNS queries for store endpoints.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ThanosQueryInstantLatencyHigh
expr: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m]))) > 40 and sum by (job) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m])) > 0)'
expr: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m]))) > 40 and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-query.*", handler="query"}[5m])) > 0)'
for: 10m
labels:
severity: critical

View file

@ -24,7 +24,7 @@ groups:
description: "Thanos Receive {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ThanosReceiveHighReplicationFailures
expr: 'thanos_receive_replication_factor > 1 and ((sum by (job) (rate(thanos_receive_replications_total{result="error", job=~".*thanos-receive.*"}[5m])) / sum by (job) (rate(thanos_receive_replications_total{job=~".*thanos-receive.*"}[5m]))) > (max by (job) (floor((thanos_receive_replication_factor{job=~".*thanos-receive.*"}+1)/ 2)) / max by (job) (thanos_receive_hashring_nodes{job=~".*thanos-receive.*"}))) * 100'
expr: 'thanos_receive_replication_factor > 1 and ((sum by (job) (rate(thanos_receive_replications_total{result="error"}[5m])) / sum by (job) (rate(thanos_receive_replications_total[5m]))) > (max by (job) (floor((thanos_receive_replication_factor+1)/ 2)) / max by (job) (thanos_receive_hashring_nodes))) * 100'
for: 5m
labels:
severity: warning
@ -33,7 +33,7 @@ groups:
description: "Thanos Receive {{$labels.job}} is failing to replicate {{$value | humanize}}% of requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ThanosReceiveHighForwardRequestFailures
expr: '(sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~".*thanos-receive.*"}[5m]))/ sum by (job) (rate(thanos_receive_forward_requests_total{job=~".*thanos-receive.*"}[5m]))) * 100 > 20 and sum by (job) (rate(thanos_receive_forward_requests_total{job=~".*thanos-receive.*"}[5m])) > 0'
expr: '(sum by (job) (rate(thanos_receive_forward_requests_total{result="error"}[5m]))/ sum by (job) (rate(thanos_receive_forward_requests_total[5m]))) * 100 > 20 and sum by (job) (rate(thanos_receive_forward_requests_total[5m])) > 0'
for: 5m
labels:
severity: info
@ -42,7 +42,7 @@ groups:
description: "Thanos Receive {{$labels.job}} is failing to forward {{$value | humanize}}% of requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ThanosReceiveHighHashringFileRefreshFailures
expr: '(sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~".*thanos-receive.*"}[5m])) / sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total{job=~".*thanos-receive.*"}[5m])) > 0) and sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total{job=~".*thanos-receive.*"}[5m])) > 0'
expr: '(sum by (job) (rate(thanos_receive_hashrings_file_errors_total[5m])) / sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total[5m])) > 0) and sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total[5m])) > 0'
for: 15m
labels:
severity: warning
@ -51,7 +51,7 @@ groups:
description: "Thanos Receive {{$labels.job}} is failing to refresh hashring file, {{$value | humanize}} of attempts failed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ThanosReceiveConfigReloadFailure
expr: 'avg by (job) (thanos_receive_config_last_reload_successful{job=~".*thanos-receive.*"}) != 1'
expr: 'avg by (job) (thanos_receive_config_last_reload_successful) != 1'
for: 5m
labels:
severity: warning

View file

@ -6,7 +6,7 @@ groups:
rules:
- alert: ThanosRuleQueueIsDroppingAlerts
expr: 'sum by (job, instance) (rate(thanos_alert_queue_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0'
expr: 'sum by (job, instance) (rate(thanos_alert_queue_alerts_dropped_total[5m])) > 0'
for: 5m
labels:
severity: critical
@ -15,7 +15,7 @@ groups:
description: "Thanos Rule {{$labels.instance}} is failing to queue alerts ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ThanosRuleSenderIsFailingAlerts
expr: 'sum by (job, instance) (rate(thanos_alert_sender_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0'
expr: 'sum by (job, instance) (rate(thanos_alert_sender_alerts_dropped_total[5m])) > 0'
for: 5m
labels:
severity: critical
@ -34,7 +34,7 @@ groups:
# Threshold of 0.05/s avoids firing on transient single-event spikes.
- alert: ThanosRuleHighRuleEvaluationWarnings
expr: 'sum by (job, instance) (rate(thanos_rule_evaluation_with_warnings_total{job=~".*thanos-rule.*"}[5m])) > 0.05'
expr: 'sum by (job, instance) (rate(thanos_rule_evaluation_with_warnings_total[5m])) > 0.05'
for: 15m
labels:
severity: info
@ -61,7 +61,7 @@ groups:
description: "Thanos Rule {{$labels.job}} is failing to handle {{$value | humanize}}% of requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ThanosRuleConfigReloadFailure
expr: 'avg by (job, instance) (thanos_rule_config_last_reload_successful{job=~".*thanos-rule.*"}) != 1'
expr: 'avg by (job, instance) (thanos_rule_config_last_reload_successful) != 1'
for: 5m
labels:
severity: info
@ -70,7 +70,7 @@ groups:
description: "Thanos Rule {{$labels.job}} has not been able to reload its configuration.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ThanosRuleQueryHighDNSFailures
expr: '(sum by (job, instance) (rate(thanos_rule_query_apis_dns_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) * 100 > 1) and sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) > 0'
expr: '(sum by (job, instance) (rate(thanos_rule_query_apis_dns_failures_total[5m])) / sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total[5m])) * 100 > 1) and sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total[5m])) > 0'
for: 15m
labels:
severity: warning
@ -79,7 +79,7 @@ groups:
description: "Thanos Rule {{$labels.job}} has {{$value | humanize}}% of failing DNS queries for query endpoints.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ThanosRuleAlertmanagerHighDNSFailures
expr: '(sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) * 100 > 1) and sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) > 0'
expr: '(sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_failures_total[5m])) / sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total[5m])) * 100 > 1) and sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total[5m])) > 0'
for: 15m
labels:
severity: warning
@ -97,7 +97,7 @@ groups:
description: "Thanos Rule {{$labels.job}} has rule groups that did not evaluate for at least 10x of their expected interval.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ThanosNoRuleEvaluations
expr: 'sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) <= 0 and sum by (job, instance) (thanos_rule_loaded_rules{job=~".*thanos-rule.*"}) > 0'
expr: 'sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) <= 0 and sum by (job, instance) (thanos_rule_loaded_rules) > 0'
for: 5m
labels:
severity: critical

View file

@ -16,7 +16,7 @@ groups:
description: "Thanos Sidecar {{$labels.instance}} bucket operations are failing ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ThanosSidecarNoConnectionToStartedPrometheus
expr: 'thanos_sidecar_prometheus_up{job=~".*thanos-sidecar.*"} == 0 and on (namespace, pod)prometheus_tsdb_data_replay_duration_seconds != 0'
expr: 'thanos_sidecar_prometheus_up == 0 and on (namespace, pod) prometheus_tsdb_data_replay_duration_seconds != 0'
for: 5m
labels:
severity: critical

View file

@ -15,7 +15,7 @@ groups:
description: "Thanos Store {{$labels.job}} is failing to handle {{$value | humanize}}% of requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ThanosStoreSeriesGateLatencyHigh
expr: '(histogram_quantile(0.99, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{job=~".*thanos-store.*"}[5m]))) > 2 and sum by (job) (rate(thanos_bucket_store_series_gate_duration_seconds_count{job=~".*thanos-store.*"}[5m])) > 0)'
expr: '(histogram_quantile(0.99, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket[5m]))) > 2 and sum by (job) (rate(thanos_bucket_store_series_gate_duration_seconds_count[5m])) > 0)'
for: 10m
labels:
severity: warning

View file

@ -6,13 +6,13 @@ groups:
rules:
- alert: ZfsPoolOutOfSpace
expr: 'zfs_pool_free_bytes * 100 / zfs_pool_size_bytes < 10 and ON (instance, device, mountpoint) zfs_pool_readonly == 0 and zfs_pool_size_bytes > 0'
expr: 'zfs_pool_free_bytes * 100 / zfs_pool_size_bytes < 10 and zfs_pool_readonly == 0 and zfs_pool_size_bytes > 0'
for: 0m
labels:
severity: warning
annotations:
summary: ZFS pool out of space (instance {{ $labels.instance }})
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "ZFS pool {{ $labels.pool }} is almost full (< 10% left).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# 0: ONLINE
# 1: DEGRADED

View file

@ -31,7 +31,7 @@ groups:
severity: critical
annotations:
summary: Zookeeper Too Many Leaders (instance {{ $labels.instance }})
description: "Zookeeper cluster has too many nodes marked as leader\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Zookeeper cluster has {{ $value }} nodes marked as leader (expected 1), indicating a split-brain\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ZookeeperNotOk
expr: 'zk_ruok == 0'
@ -40,4 +40,4 @@ groups:
severity: warning
annotations:
summary: Zookeeper Not Ok (instance {{ $labels.instance }})
description: "Zookeeper instance is not ok\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Zookeeper instance {{ $labels.instance }} is not ok (ruok check failed)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"