This commit is contained in:
samber 2026-03-18 20:41:01 +00:00
parent e3a7165a65
commit af2f277830
32 changed files with 226 additions and 217 deletions

View file

@ -24,7 +24,7 @@ groups:
description: "Blackbox configuration reload failure\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxSlowProbe
expr: 'avg_over_time(probe_duration_seconds[1m]) > 1'
expr: 'probe_duration_seconds > 1'
for: 1m
labels:
severity: warning
@ -73,7 +73,7 @@ groups:
description: "SSL certificate has expired already\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxProbeSlowHttp
expr: 'avg_over_time(probe_http_duration_seconds[1m]) > 1'
expr: 'probe_http_duration_seconds > 1'
for: 1m
labels:
severity: warning
@ -82,7 +82,7 @@ groups:
description: "HTTP request took more than 1s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxProbeSlowPing
expr: 'avg_over_time(probe_icmp_duration_seconds[1m]) > 1'
expr: 'probe_icmp_duration_seconds > 1'
for: 1m
labels:
severity: warning

View file

@ -51,31 +51,31 @@ groups:
summary: Cassandra node down (instance {{ $labels.instance }})
description: "Cassandra node down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraCommitlogPendingTasks
- alert: CassandraCommitlogPendingTasks(criteo)
expr: 'cassandra_stats{name="org:apache:cassandra:metrics:commitlog:pendingtasks:value"} > 15'
for: 2m
labels:
severity: warning
annotations:
summary: Cassandra commitlog pending tasks (instance {{ $labels.instance }})
summary: Cassandra commitlog pending tasks (Criteo) (instance {{ $labels.instance }})
description: "Unexpected number of Cassandra commitlog pending tasks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraCompactionExecutorBlockedTasks
- alert: CassandraCompactionExecutorBlockedTasks(criteo)
expr: 'cassandra_stats{name="org:apache:cassandra:metrics:threadpools:internal:compactionexecutor:currentlyblockedtasks:count"} > 0'
for: 2m
labels:
severity: warning
annotations:
summary: Cassandra compaction executor blocked tasks (instance {{ $labels.instance }})
summary: Cassandra compaction executor blocked tasks (Criteo) (instance {{ $labels.instance }})
description: "Some Cassandra compaction executor tasks are blocked\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraFlushWriterBlockedTasks
- alert: CassandraFlushWriterBlockedTasks(criteo)
expr: 'cassandra_stats{name="org:apache:cassandra:metrics:threadpools:internal:memtableflushwriter:currentlyblockedtasks:count"} > 0'
for: 2m
labels:
severity: warning
annotations:
summary: Cassandra flush writer blocked tasks (instance {{ $labels.instance }})
summary: Cassandra flush writer blocked tasks (Criteo) (instance {{ $labels.instance }})
description: "Some Cassandra flush writer tasks are blocked\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraRepairPendingTasks
@ -96,67 +96,67 @@ groups:
summary: Cassandra repair blocked tasks (instance {{ $labels.instance }})
description: "Some Cassandra repair tasks are blocked\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraConnectionTimeoutsTotal
- alert: CassandraConnectionTimeoutsTotal(criteo)
expr: 'rate(cassandra_stats{name="org:apache:cassandra:metrics:connection:totaltimeouts:count"}[1m]) > 5'
for: 2m
labels:
severity: critical
annotations:
summary: Cassandra connection timeouts total (instance {{ $labels.instance }})
summary: Cassandra connection timeouts total (Criteo) (instance {{ $labels.instance }})
description: "Some connection between nodes are ending in timeout\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraStorageExceptions
- alert: CassandraStorageExceptions(criteo)
expr: 'changes(cassandra_stats{name="org:apache:cassandra:metrics:storage:exceptions:count"}[1m]) > 1'
for: 0m
labels:
severity: critical
annotations:
summary: Cassandra storage exceptions (instance {{ $labels.instance }})
summary: Cassandra storage exceptions (Criteo) (instance {{ $labels.instance }})
description: "Something is going wrong with cassandra storage\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraTombstoneDump
- alert: CassandraTombstoneDump(criteo)
expr: 'cassandra_stats{name="org:apache:cassandra:metrics:table:tombstonescannedhistogram:99thpercentile"} > 1000'
for: 0m
labels:
severity: critical
annotations:
summary: Cassandra tombstone dump (instance {{ $labels.instance }})
summary: Cassandra tombstone dump (Criteo) (instance {{ $labels.instance }})
description: "Too much tombstones scanned in queries\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraClientRequestUnavailableWrite
- alert: CassandraClientRequestUnavailableWrite(criteo)
expr: 'changes(cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:write:unavailables:count"}[1m]) > 0'
for: 0m
labels:
severity: critical
annotations:
summary: Cassandra client request unavailable write (instance {{ $labels.instance }})
summary: Cassandra client request unavailable write (Criteo) (instance {{ $labels.instance }})
description: "Write failures have occurred because too many nodes are unavailable\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraClientRequestUnavailableRead
- alert: CassandraClientRequestUnavailableRead(criteo)
expr: 'changes(cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:read:unavailables:count"}[1m]) > 0'
for: 0m
labels:
severity: critical
annotations:
summary: Cassandra client request unavailable read (instance {{ $labels.instance }})
summary: Cassandra client request unavailable read (Criteo) (instance {{ $labels.instance }})
description: "Read failures have occurred because too many nodes are unavailable\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraClientRequestWriteFailure
- alert: CassandraClientRequestWriteFailure(criteo)
expr: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:write:failures:oneminuterate"} > 0'
for: 0m
labels:
severity: critical
annotations:
summary: Cassandra client request write failure (instance {{ $labels.instance }})
summary: Cassandra client request write failure (Criteo) (instance {{ $labels.instance }})
description: "A lot of write failures encountered. A write failure is a non-timeout exception encountered during a write request. Examine the reason map to find to the root cause. The most common cause for this type of error is when batch sizes are too large.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraClientRequestReadFailure
- alert: CassandraClientRequestReadFailure(criteo)
expr: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:read:failures:oneminuterate"} > 0'
for: 0m
labels:
severity: critical
annotations:
summary: Cassandra client request read failure (instance {{ $labels.instance }})
summary: Cassandra client request read failure (Criteo) (instance {{ $labels.instance }})
description: "A lot of read failures encountered. A read failure is a non-timeout exception encountered during a read request. Examine the reason map to find to the root cause. The most common cause for this type of error is when batch sizes are too large.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraCacheHitRateKeyCache

View file

@ -24,92 +24,92 @@ groups:
summary: Cassandra many compaction tasks are pending (instance {{ $labels.instance }})
description: "Many Cassandra compaction tasks are pending - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraCommitlogPendingTasks
- alert: CassandraCommitlogPendingTasks(instaclustr)
expr: 'cassandra_commit_log_pending_tasks > 15'
for: 2m
labels:
severity: warning
annotations:
summary: Cassandra commitlog pending tasks (instance {{ $labels.instance }})
summary: Cassandra commitlog pending tasks (Instaclustr) (instance {{ $labels.instance }})
description: "Cassandra commitlog pending tasks - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraCompactionExecutorBlockedTasks
- alert: CassandraCompactionExecutorBlockedTasks(instaclustr)
expr: 'cassandra_thread_pool_blocked_tasks{pool="CompactionExecutor"} > 15'
for: 2m
labels:
severity: warning
annotations:
summary: Cassandra compaction executor blocked tasks (instance {{ $labels.instance }})
summary: Cassandra compaction executor blocked tasks (Instaclustr) (instance {{ $labels.instance }})
description: "Some Cassandra compaction executor tasks are blocked - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraFlushWriterBlockedTasks
- alert: CassandraFlushWriterBlockedTasks(instaclustr)
expr: 'cassandra_thread_pool_blocked_tasks{pool="MemtableFlushWriter"} > 15'
for: 2m
labels:
severity: warning
annotations:
summary: Cassandra flush writer blocked tasks (instance {{ $labels.instance }})
summary: Cassandra flush writer blocked tasks (Instaclustr) (instance {{ $labels.instance }})
description: "Some Cassandra flush writer tasks are blocked - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraConnectionTimeoutsTotal
- alert: CassandraConnectionTimeoutsTotal(instaclustr)
expr: 'sum by (cassandra_cluster,instance) (rate(cassandra_client_request_timeouts_total[5m])) > 5'
for: 2m
labels:
severity: critical
annotations:
summary: Cassandra connection timeouts total (instance {{ $labels.instance }})
summary: Cassandra connection timeouts total (Instaclustr) (instance {{ $labels.instance }})
description: "Some connection between nodes are ending in timeout - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraStorageExceptions
- alert: CassandraStorageExceptions(instaclustr)
expr: 'changes(cassandra_storage_exceptions_total[1m]) > 1'
for: 0m
labels:
severity: critical
annotations:
summary: Cassandra storage exceptions (instance {{ $labels.instance }})
summary: Cassandra storage exceptions (Instaclustr) (instance {{ $labels.instance }})
description: "Something is going wrong with cassandra storage - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraTombstoneDump
- alert: CassandraTombstoneDump(instaclustr)
expr: 'avg(cassandra_table_tombstones_scanned{quantile="0.99"}) by (instance,cassandra_cluster,keyspace) > 100'
for: 2m
labels:
severity: critical
annotations:
summary: Cassandra tombstone dump (instance {{ $labels.instance }})
summary: Cassandra tombstone dump (Instaclustr) (instance {{ $labels.instance }})
description: "Cassandra tombstone dump - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraClientRequestUnavailableWrite
- alert: CassandraClientRequestUnavailableWrite(instaclustr)
expr: 'changes(cassandra_client_request_unavailable_exceptions_total{operation="write"}[1m]) > 0'
for: 2m
labels:
severity: critical
annotations:
summary: Cassandra client request unavailable write (instance {{ $labels.instance }})
summary: Cassandra client request unavailable write (Instaclustr) (instance {{ $labels.instance }})
description: "Some Cassandra client requests are unavailable to write - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraClientRequestUnavailableRead
- alert: CassandraClientRequestUnavailableRead(instaclustr)
expr: 'changes(cassandra_client_request_unavailable_exceptions_total{operation="read"}[1m]) > 0'
for: 2m
labels:
severity: critical
annotations:
summary: Cassandra client request unavailable read (instance {{ $labels.instance }})
summary: Cassandra client request unavailable read (Instaclustr) (instance {{ $labels.instance }})
description: "Some Cassandra client requests are unavailable to read - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraClientRequestWriteFailure
- alert: CassandraClientRequestWriteFailure(instaclustr)
expr: 'increase(cassandra_client_request_failures_total{operation="write"}[1m]) > 0'
for: 2m
labels:
severity: critical
annotations:
summary: Cassandra client request write failure (instance {{ $labels.instance }})
summary: Cassandra client request write failure (Instaclustr) (instance {{ $labels.instance }})
description: "Write failures have occurred, ensure there are not too many unavailable nodes - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraClientRequestReadFailure
- alert: CassandraClientRequestReadFailure(instaclustr)
expr: 'increase(cassandra_client_request_failures_total{operation="read"}[1m]) > 0'
for: 2m
labels:
severity: critical
annotations:
summary: Cassandra client request read failure (instance {{ $labels.instance }})
summary: Cassandra client request read failure (Instaclustr) (instance {{ $labels.instance }})
description: "Read failures have occurred, ensure there are not too many unavailable nodes - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -117,7 +117,7 @@ groups:
description: "ClickHouse is experiencing issues with ZooKeeper connections, which may affect cluster state and coordination.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ClickhouseAuthenticationFailures
expr: 'increase(ClickHouseErrorMetric_AUTHENTICATION_FAILED[5m]) > 0'
expr: 'increase(ClickHouseErrorMetric_AUTHENTICATION_FAILED[5m]) > 3'
for: 0m
labels:
severity: info
@ -126,7 +126,7 @@ groups:
description: "Authentication failures detected, indicating potential security issues or misconfiguration.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ClickhouseAccessDeniedErrors
expr: 'increase(ClickHouseErrorMetric_RESOURCE_ACCESS_DENIED[5m]) > 0'
expr: 'increase(ClickHouseErrorMetric_RESOURCE_ACCESS_DENIED[5m]) > 3'
for: 0m
labels:
severity: info

View file

@ -31,7 +31,7 @@ groups:
severity: critical
annotations:
summary: Cortex notification are being dropped (instance {{ $labels.instance }})
description: "Cortex notification are being dropped due to errors (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Cortex notification are being dropped due to errors (instance {{ $labels.instance }}, {{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 0.05/s avoids firing on transient single-event spikes.
- alert: CortexNotificationError
@ -41,7 +41,7 @@ groups:
severity: critical
annotations:
summary: Cortex notification error (instance {{ $labels.instance }})
description: "Cortex is failing when sending alert notifications (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Cortex is failing when sending alert notifications (instance {{ $labels.instance }}, {{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CortexIngesterUnhealthy
expr: 'cortex_ring_members{state="Unhealthy", name="ingester"} > 0'

View file

@ -33,7 +33,7 @@ groups:
severity: warning
annotations:
summary: Container High CPU utilization (instance {{ $labels.instance }})
description: "Container CPU utilization is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Container CPU utilization is above 80% (current: {{ $value | printf \"%.2f\" }}%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# See https://medium.com/faun/how-much-is-too-much-the-linux-oomkiller-and-used-memory-d32186f29c9d
- alert: ContainerHighMemoryUsage
@ -55,13 +55,13 @@ groups:
description: "Container Volume usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ContainerHighThrottleRate
expr: 'sum(increase(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) by (container, pod, namespace) / sum(increase(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > ( 25 / 100 ) and sum(increase(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > 0'
expr: 'sum(rate(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) by (container, pod, namespace) / sum(rate(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > ( 25 / 100 ) and sum(rate(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > 0'
for: 5m
labels:
severity: warning
annotations:
summary: Container high throttle rate (instance {{ $labels.instance }})
description: "Container is being throttled\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Container is being throttled ({{ $value | humanizePercentage }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ContainerHighLowChangeCpuUsage
expr: '(abs((sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[1m])) * 100) - (sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[1m] offset 1m)) * 100)) or abs((sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[1m])) * 100) - (sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[5m] offset 1m)) * 100))) > 25'
@ -79,7 +79,7 @@ groups:
severity: info
annotations:
summary: Container Low CPU utilization (instance {{ $labels.instance }})
description: "Container CPU utilization is under 20% for 1 week. Consider reducing the allocated CPU.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Container CPU utilization is under 20% for 1 week. Consider reducing the allocated CPU. (current: {{ $value | printf \"%.2f\" }}%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ContainerLowMemoryUsage
expr: '(sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) < 20'

View file

@ -143,13 +143,13 @@ groups:
description: "No new documents for 10 min!\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ElasticsearchHighIndexingLatency
expr: 'increase(elasticsearch_indices_indexing_index_time_seconds_total[1m]) / increase(elasticsearch_indices_indexing_index_total[1m]) > 0.0005 and increase(elasticsearch_indices_indexing_index_total[1m]) > 0'
expr: 'rate(elasticsearch_indices_indexing_index_time_seconds_total[1m]) / rate(elasticsearch_indices_indexing_index_total[1m]) > 0.0005 and rate(elasticsearch_indices_indexing_index_total[1m]) > 0'
for: 10m
labels:
severity: warning
annotations:
summary: Elasticsearch High Indexing Latency (instance {{ $labels.instance }})
description: "The indexing latency on Elasticsearch cluster is higher than the threshold.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "The indexing latency on Elasticsearch cluster is higher than the threshold (current value: {{ $value }}s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ElasticsearchHighIndexingRate
expr: 'sum(rate(elasticsearch_indices_indexing_index_total[1m]))> 10000'
@ -170,10 +170,10 @@ groups:
description: "The query rate on Elasticsearch cluster is higher than the threshold.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ElasticsearchHighQueryLatency
expr: 'increase(elasticsearch_indices_search_query_time_seconds[1m]) / increase(elasticsearch_indices_search_query_total[1m]) > 1 and increase(elasticsearch_indices_search_query_total[1m]) > 0'
expr: 'rate(elasticsearch_indices_search_query_time_seconds[1m]) / rate(elasticsearch_indices_search_query_total[1m]) > 1 and rate(elasticsearch_indices_search_query_total[1m]) > 0'
for: 5m
labels:
severity: warning
annotations:
summary: Elasticsearch High Query Latency (instance {{ $labels.instance }})
description: "The query latency on Elasticsearch cluster is higher than the threshold.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "The query latency on Elasticsearch cluster is higher than the threshold (current value: {{ $value }}s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -42,13 +42,13 @@ groups:
description: "More than 10% of downstream HTTP responses are 4xx on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: EnvoyDownstreamConnectionsOverflowing
expr: 'increase(envoy_listener_downstream_cx_overflow[5m]) > 0'
expr: 'increase(envoy_listener_downstream_cx_overflow[5m]) > 5'
for: 0m
labels:
severity: warning
annotations:
summary: Envoy downstream connections overflowing (instance {{ $labels.instance }})
description: "Downstream connections are being rejected due to listener overflow on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Downstream connections are being rejected due to listener overflow on {{ $labels.instance }} ({{ $value }} in the last 5m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: EnvoyClusterMembershipEmpty
expr: 'envoy_cluster_membership_healthy == 0'
@ -75,10 +75,10 @@ groups:
severity: warning
annotations:
summary: Envoy high cluster upstream connection failures (instance {{ $labels.instance }})
description: "High rate of upstream connection failures in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "High rate of upstream connection failures in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} ({{ $value }} in the last 5m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: EnvoyHighClusterUpstreamRequestTimeoutRate
expr: 'increase(envoy_cluster_upstream_rq_timeout[5m]) / increase(envoy_cluster_upstream_rq_completed[5m]) * 100 > 5 and increase(envoy_cluster_upstream_rq_completed[5m]) > 0'
expr: 'rate(envoy_cluster_upstream_rq_timeout[5m]) / rate(envoy_cluster_upstream_rq_completed[5m]) * 100 > 5 and rate(envoy_cluster_upstream_rq_completed[5m]) > 0'
for: 5m
labels:
severity: warning
@ -87,7 +87,7 @@ groups:
description: "More than 5% of upstream requests are timing out in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: EnvoyHighClusterUpstream5xxErrorRate
expr: 'increase(envoy_cluster_upstream_rq_xx{envoy_response_code_class="5"}[5m]) / increase(envoy_cluster_upstream_rq_completed[5m]) * 100 > 5 and increase(envoy_cluster_upstream_rq_completed[5m]) > 0'
expr: 'rate(envoy_cluster_upstream_rq_xx{envoy_response_code_class="5"}[5m]) / rate(envoy_cluster_upstream_rq_completed[5m]) * 100 > 5 and rate(envoy_cluster_upstream_rq_completed[5m]) > 0'
for: 1m
labels:
severity: critical
@ -102,7 +102,7 @@ groups:
severity: warning
annotations:
summary: Envoy cluster health check failures (instance {{ $labels.instance }})
description: "Health checks are consistently failing in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Health checks are consistently failing in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} ({{ $value }} in the last 5m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: EnvoyClusterOutlierDetectionEjectionsActive
expr: 'envoy_cluster_outlier_detection_ejections_active > 0'
@ -114,22 +114,22 @@ groups:
description: "There are active outlier detection ejections in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: EnvoyListenerSslConnectionErrors
expr: 'increase(envoy_listener_ssl_connection_error[5m]) > 0'
expr: 'increase(envoy_listener_ssl_connection_error[5m]) > 5'
for: 0m
labels:
severity: warning
annotations:
summary: Envoy listener SSL connection errors (instance {{ $labels.instance }})
description: "Envoy listener is experiencing SSL/TLS connection errors on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Envoy listener is experiencing SSL/TLS connection errors on {{ $labels.instance }} ({{ $value }} in the last 5m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: EnvoyGlobalDownstreamConnectionsOverflowing
expr: 'increase(envoy_listener_downstream_global_cx_overflow[5m]) > 0'
expr: 'increase(envoy_listener_downstream_global_cx_overflow[5m]) > 5'
for: 0m
labels:
severity: critical
annotations:
summary: Envoy global downstream connections overflowing (instance {{ $labels.instance }})
description: "Downstream connections are being rejected due to global connection limit on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Downstream connections are being rejected due to global connection limit on {{ $labels.instance }} ({{ $value }} in the last 5m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: EnvoySslCertificateExpiringSoon
expr: 'envoy_server_days_until_first_cert_expiring < 7'
@ -165,7 +165,7 @@ groups:
severity: critical
annotations:
summary: Envoy no healthy upstream (instance {{ $labels.instance }})
description: "Upstream connection attempts failed because no healthy upstream was available in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Upstream connection attempts failed because no healthy upstream was available in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} ({{ $value }} in the last 5m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: EnvoyHighDownstreamRequestTimeoutRate
expr: 'increase(envoy_http_downstream_rq_timeout[5m]) > 5'
@ -174,4 +174,4 @@ groups:
severity: warning
annotations:
summary: Envoy high downstream request timeout rate (instance {{ $labels.instance }})
description: "Downstream requests are timing out on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Downstream requests are timing out on {{ $labels.instance }} ({{ $value }} in the last 5m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -30,26 +30,26 @@ groups:
severity: warning
annotations:
summary: Etcd high number of leader changes (instance {{ $labels.instance }})
description: "Etcd leader changed more than 2 times during 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Etcd leader changed {{ $value }} times during 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Filters to actual error codes. grpc_code!="OK" includes benign codes like NotFound, AlreadyExists, and Cancelled.
- alert: EtcdHighNumberOfFailedGrpcRequests
- alert: EtcdHighNumberOfFailedGrpcRequestsWarning
expr: 'sum(rate(grpc_server_handled_total{grpc_code=~"Internal|Unavailable|DeadlineExceeded|ResourceExhausted|Aborted|Unknown"}[1m])) BY (grpc_service, grpc_method) / sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0.01 and sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0'
for: 2m
labels:
severity: warning
annotations:
summary: Etcd high number of failed GRPC requests (instance {{ $labels.instance }})
summary: Etcd high number of failed GRPC requests warning (instance {{ $labels.instance }})
description: "More than 1% GRPC request failure detected in Etcd\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Filters to actual error codes. grpc_code!="OK" includes benign codes like NotFound, AlreadyExists, and Cancelled.
- alert: EtcdHighNumberOfFailedGrpcRequests
- alert: EtcdHighNumberOfFailedGrpcRequestsCritical
expr: 'sum(rate(grpc_server_handled_total{grpc_code=~"Internal|Unavailable|DeadlineExceeded|ResourceExhausted|Aborted|Unknown"}[1m])) BY (grpc_service, grpc_method) / sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0.05 and sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0'
for: 2m
labels:
severity: critical
annotations:
summary: Etcd high number of failed GRPC requests (instance {{ $labels.instance }})
summary: Etcd high number of failed GRPC requests critical (instance {{ $labels.instance }})
description: "More than 5% GRPC request failure detected in Etcd\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: EtcdGrpcRequestsSlow
@ -61,22 +61,22 @@ groups:
summary: Etcd GRPC requests slow (instance {{ $labels.instance }})
description: "GRPC requests slowing down, 99th percentile is over 0.15s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: EtcdHighNumberOfFailedHttpRequests
- alert: EtcdHighNumberOfFailedHttpRequestsWarning
expr: 'sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.01 and sum(rate(etcd_http_received_total[1m])) BY (method) > 0'
for: 2m
labels:
severity: warning
annotations:
summary: Etcd high number of failed HTTP requests (instance {{ $labels.instance }})
summary: Etcd high number of failed HTTP requests warning (instance {{ $labels.instance }})
description: "More than 1% HTTP failure detected in Etcd\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: EtcdHighNumberOfFailedHttpRequests
- alert: EtcdHighNumberOfFailedHttpRequestsCritical
expr: 'sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.05 and sum(rate(etcd_http_received_total[1m])) BY (method) > 0'
for: 2m
labels:
severity: critical
annotations:
summary: Etcd high number of failed HTTP requests (instance {{ $labels.instance }})
summary: Etcd high number of failed HTTP requests critical (instance {{ $labels.instance }})
description: "More than 5% HTTP failure detected in Etcd\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: EtcdHttpRequestsSlow
@ -104,7 +104,7 @@ groups:
severity: warning
annotations:
summary: Etcd high number of failed proposals (instance {{ $labels.instance }})
description: "Etcd server got more than 5 failed proposals past hour\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Etcd server got {{ $value }} failed proposals in the past hour\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: EtcdHighFsyncDurations
expr: 'histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) > 0.5'

View file

@ -8,7 +8,7 @@ groups:
# Queued connections indicate Puma workers are saturated.
# Consider increasing puma['worker_processes'] or puma['max_threads'] in gitlab.rb.
- alert: GitlabPumaHighQueuedConnections
expr: 'avg_over_time(puma_queued_connections[5m]) > 5'
expr: 'puma_queued_connections > 5'
for: 5m
labels:
severity: warning
@ -85,7 +85,7 @@ groups:
severity: warning
annotations:
summary: GitLab Sidekiq high job completion time (instance {{ $labels.instance }})
description: "GitLab Sidekiq job average completion time on {{ $labels.instance }} is above 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "GitLab Sidekiq job p95 completion time on {{ $labels.instance }} is above 5 minutes ({{ $value | humanizeDuration }}).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# This metric requires the emit_sidekiq_histogram_metrics feature flag to be enabled.
# High queue latency means jobs are stuck waiting. Check Sidekiq concurrency and queue sizes.

View file

@ -105,14 +105,15 @@ groups:
summary: Mimir ingested data too far in the future (instance {{ $labels.instance }})
description: "Mimir ingester {{ $labels.job }} has ingested samples with timestamps more than 1 hour in the future.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 0.05/s avoids firing on transient single-event spikes.
- alert: MimirStoreGatewayTooManyFailedOperations
expr: 'sum by (job) (rate(thanos_objstore_bucket_operation_failures_total[5m])) > 0'
expr: 'sum by (job) (rate(thanos_objstore_bucket_operation_failures_total[5m])) > 0.05'
for: 5m
labels:
severity: warning
annotations:
summary: Mimir store gateway too many failed operations (instance {{ $labels.instance }})
description: "Mimir store-gateway {{ $labels.job }} bucket operations are failing.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Mimir store-gateway {{ $labels.job }} bucket operations are failing ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MimirRingMembersMismatch
expr: 'max by (name, job) (sum by (name, job, instance) (cortex_ring_members)) != min by (name, job) (sum by (name, job, instance) (cortex_ring_members))'
@ -184,7 +185,7 @@ groups:
severity: critical
annotations:
summary: Mimir ingester TSDB head compaction failed (instance {{ $labels.instance }})
description: "Mimir ingester {{ $labels.instance }} is failing to compact TSDB head.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Mimir ingester {{ $labels.instance }} is failing to compact TSDB head ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MimirIngesterTsdbHeadTruncationFailed
expr: 'rate(cortex_ingester_tsdb_head_truncations_failed_total[5m]) > 0'
@ -193,7 +194,7 @@ groups:
severity: critical
annotations:
summary: Mimir ingester TSDB head truncation failed (instance {{ $labels.instance }})
description: "Mimir ingester {{ $labels.instance }} is failing to truncate TSDB head.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Mimir ingester {{ $labels.instance }} is failing to truncate TSDB head ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MimirIngesterTsdbCheckpointCreationFailed
expr: 'rate(cortex_ingester_tsdb_checkpoint_creations_failed_total[5m]) > 0'
@ -202,7 +203,7 @@ groups:
severity: critical
annotations:
summary: Mimir ingester TSDB checkpoint creation failed (instance {{ $labels.instance }})
description: "Mimir ingester {{ $labels.instance }} is failing to create TSDB checkpoints.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Mimir ingester {{ $labels.instance }} is failing to create TSDB checkpoints ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MimirIngesterTsdbCheckpointDeletionFailed
expr: 'rate(cortex_ingester_tsdb_checkpoint_deletions_failed_total[5m]) > 0'
@ -211,7 +212,7 @@ groups:
severity: critical
annotations:
summary: Mimir ingester TSDB checkpoint deletion failed (instance {{ $labels.instance }})
description: "Mimir ingester {{ $labels.instance }} is failing to delete TSDB checkpoints.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Mimir ingester {{ $labels.instance }} is failing to delete TSDB checkpoints ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MimirIngesterTsdbWalTruncationFailed
expr: 'rate(cortex_ingester_tsdb_wal_truncations_failed_total[5m]) > 0'
@ -220,7 +221,7 @@ groups:
severity: warning
annotations:
summary: Mimir ingester TSDB WAL truncation failed (instance {{ $labels.instance }})
description: "Mimir ingester {{ $labels.instance }} is failing to truncate TSDB WAL.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Mimir ingester {{ $labels.instance }} is failing to truncate TSDB WAL ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MimirIngesterTsdbWalWritesFailed
expr: 'rate(cortex_ingester_tsdb_wal_writes_failed_total[1m]) > 0'
@ -229,7 +230,7 @@ groups:
severity: critical
annotations:
summary: Mimir ingester TSDB WAL writes failed (instance {{ $labels.instance }})
description: "Mimir ingester {{ $labels.instance }} is failing to write to TSDB WAL.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Mimir ingester {{ $labels.instance }} is failing to write to TSDB WAL ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold aligned with official Mimir mixin (30 minutes).
- alert: MimirStoreGatewayHasNotSyncedBucket
@ -284,7 +285,7 @@ groups:
severity: critical
annotations:
summary: Mimir compactor has consecutive failures (instance {{ $labels.instance }})
description: "Mimir compactor {{ $labels.instance }} has had 2+ compaction failures in the last 2 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Mimir compactor {{ $labels.instance }} has had {{ $value }} compaction failures in the last 2 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MimirCompactorHasRunOutOfDiskSpace
expr: 'increase(cortex_compactor_disk_out_of_space_errors_total[24h]) >= 1'
@ -312,7 +313,7 @@ groups:
severity: warning
annotations:
summary: Mimir compactor skipped blocks (instance {{ $labels.instance }})
description: "Mimir compactor has found blocks that cannot be compacted (reason {{ $labels.reason }}).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Mimir compactor has found {{ $value }} blocks that cannot be compacted (reason {{ $labels.reason }}).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MimirRulerTooManyFailedPushes
expr: '100 * sum by (instance, job) (rate(cortex_ruler_write_requests_failed_total[5m])) / sum by (instance, job) (rate(cortex_ruler_write_requests_total[5m])) > 1 and sum by (instance, job) (rate(cortex_ruler_write_requests_total[5m])) > 0'
@ -341,14 +342,15 @@ groups:
summary: Mimir ruler missed evaluations (instance {{ $labels.instance }})
description: "Mimir ruler {{ $labels.instance }} is missing {{ printf \"%.2f\" $value }}% of rule group evaluations.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 0.05/s avoids firing on transient single-event spikes.
- alert: MimirRulerFailedRingCheck
expr: 'sum by (job) (rate(cortex_ruler_ring_check_errors_total[5m])) > 0'
expr: 'sum by (job) (rate(cortex_ruler_ring_check_errors_total[5m])) > 0.05'
for: 5m
labels:
severity: critical
annotations:
summary: Mimir ruler failed ring check (instance {{ $labels.instance }})
description: "Mimir ruler {{ $labels.job }} is failing ring checks.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Mimir ruler {{ $labels.job }} is failing ring checks ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MimirAlertmanagerSyncConfigsFailing
expr: 'rate(cortex_alertmanager_sync_configs_failed_total[5m]) > 0'
@ -357,7 +359,7 @@ groups:
severity: critical
annotations:
summary: Mimir alertmanager sync configs failing (instance {{ $labels.instance }})
description: "Mimir alertmanager {{ $labels.job }} is failing to sync configs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Mimir alertmanager {{ $labels.job }} is failing to sync configs ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MimirAlertmanagerRingCheckFailing
expr: 'rate(cortex_alertmanager_ring_check_errors_total[5m]) > 0'
@ -366,7 +368,7 @@ groups:
severity: critical
annotations:
summary: Mimir alertmanager ring check failing (instance {{ $labels.instance }})
description: "Mimir alertmanager {{ $labels.job }} is failing ring checks.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Mimir alertmanager {{ $labels.job }} is failing ring checks ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MimirAlertmanagerStateMergeFailing
expr: 'rate(cortex_alertmanager_partial_state_merges_failed_total[5m]) > 0'
@ -375,7 +377,7 @@ groups:
severity: critical
annotations:
summary: Mimir alertmanager state merge failing (instance {{ $labels.instance }})
description: "Mimir alertmanager {{ $labels.job }} is failing to merge state updates.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Mimir alertmanager {{ $labels.job }} is failing to merge state updates ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MimirAlertmanagerReplicationFailing
expr: 'rate(cortex_alertmanager_state_replication_failed_total[5m]) > 0'
@ -384,7 +386,7 @@ groups:
severity: critical
annotations:
summary: Mimir alertmanager replication failing (instance {{ $labels.instance }})
description: "Mimir alertmanager {{ $labels.job }} is failing to replicate state.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Mimir alertmanager {{ $labels.job }} is failing to replicate state ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MimirAlertmanagerPersistStateFailing
expr: 'rate(cortex_alertmanager_state_persist_failed_total[15m]) > 0'
@ -393,7 +395,7 @@ groups:
severity: critical
annotations:
summary: Mimir alertmanager persist state failing (instance {{ $labels.instance }})
description: "Mimir alertmanager {{ $labels.job }} is failing to persist state.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Mimir alertmanager {{ $labels.job }} is failing to persist state ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MimirAlertmanagerInitialSyncFailed
expr: 'increase(cortex_alertmanager_state_initial_sync_completed_total{outcome="failed"}[1m]) > 0'

View file

@ -40,7 +40,7 @@ groups:
severity: critical
annotations:
summary: Tempo compactions failing (instance {{ $labels.instance }})
description: "Greater than 2 compactions have failed in the past hour.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "{{ $value }} compactions have failed in the past hour.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: TempoPollsFailing
expr: 'sum by (job) (increase(tempodb_blocklist_poll_errors_total[1h])) > 2 and sum by (job) (increase(tempodb_blocklist_poll_errors_total[5m])) > 0'
@ -49,7 +49,7 @@ groups:
severity: critical
annotations:
summary: Tempo polls failing (instance {{ $labels.instance }})
description: "Greater than 2 blocklist polls have failed in the past hour.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "{{ $value }} blocklist polls have failed in the past hour.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: TempoTenantIndexFailures
expr: 'sum by (job) (increase(tempodb_blocklist_tenant_index_errors_total[1h])) > 2 and sum by (job) (increase(tempodb_blocklist_tenant_index_errors_total[5m])) > 0'
@ -58,7 +58,7 @@ groups:
severity: critical
annotations:
summary: Tempo tenant index failures (instance {{ $labels.instance }})
description: "Greater than 2 tenant index failures in the past hour.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "{{ $value }} tenant index failures in the past hour.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: TempoNoTenantIndexBuilders
expr: 'sum by (tenant) (tempodb_blocklist_tenant_index_builder) == 0 and on() max(tempodb_blocklist_length) > 0'
@ -105,7 +105,7 @@ groups:
severity: critical
annotations:
summary: Tempo user configurable overrides reload failing (instance {{ $labels.instance }})
description: "Greater than 5 user-configurable overrides reloads have failed in the past hour.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "{{ $value }} user-configurable overrides reloads have failed in the past hour.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 100 blocks per compactor instance. Adjust based on your environment.
- alert: TempoCompactionTooManyOutstandingBlocksWarning
@ -134,7 +134,7 @@ groups:
severity: critical
annotations:
summary: Tempo distributor usage tracker errors (instance {{ $labels.instance }})
description: "Tempo distributor usage tracker errors for {{ $labels.job }} (reason {{ $labels.reason }}).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Tempo distributor usage tracker errors for {{ $labels.job }} at {{ $value | humanize }}/s (reason {{ $labels.reason }}).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: TempoMetricsGeneratorProcessorUpdatesFailing
expr: 'sum by (job) (increase(tempo_metrics_generator_active_processors_update_failed_total[5m])) > 0'
@ -143,7 +143,7 @@ groups:
severity: critical
annotations:
summary: Tempo metrics generator processor updates failing (instance {{ $labels.instance }})
description: "Tempo metrics generator processor updates are failing for {{ $labels.job }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Tempo metrics generator processor updates are failing for {{ $labels.job }} ({{ $value }} failures in 5m).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: TempoMetricsGeneratorServiceGraphsDroppingSpans
expr: '100 * sum by (job) (rate(tempo_metrics_generator_processor_service_graphs_dropped_spans[5m])) / sum by (job) (rate(tempo_metrics_generator_spans_received_total[5m])) > 0.5 and sum by (job) (rate(tempo_metrics_generator_spans_received_total[5m])) > 0'
@ -161,7 +161,7 @@ groups:
severity: critical
annotations:
summary: Tempo metrics generator collections failing (instance {{ $labels.instance }})
description: "Tempo metrics generator collections are failing for {{ $labels.job }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Tempo metrics generator collections are failing for {{ $labels.job }} ({{ $value }} failures in 5m).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when the memcached error rate exceeds 20%. Only relevant if Tempo is configured with memcached caching.
- alert: TempoMemcachedErrorsElevated

View file

@ -130,4 +130,4 @@ groups:
severity: warning
annotations:
summary: HAProxy server healthcheck failure (instance {{ $labels.instance }})
description: "Some server healthcheck are failing on {{ $labels.server }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Some server healthcheck are failing on {{ $labels.server }} ({{ $value }} in the last 1m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -14,71 +14,71 @@ groups:
summary: HAProxy down (instance {{ $labels.instance }})
description: "HAProxy down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyHighHttp4xxErrorRateBackend
- alert: HaproxyHighHttp4xxErrorRateBackend(v1)
# Percentage of 4xx responses per backend over the last minute. The 4xx rate is
# scaled by 100 so that the `> 5` threshold means 5%, as the description states
# (an unscaled ratio is at most 1 and could never exceed 5). The trailing `and`
# clause only lets the alert fire while the backend is actually serving responses.
expr: 'sum by (backend) (rate(haproxy_server_http_responses_total{code="4xx"}[1m]) * 100) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 0'
for: 1m
labels:
severity: critical
annotations:
summary: HAProxy high HTTP 4xx error rate backend (instance {{ $labels.instance }})
summary: HAProxy high HTTP 4xx error rate backend (v1) (instance {{ $labels.instance }})
description: "Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyHighHttp5xxErrorRateBackend
- alert: HaproxyHighHttp5xxErrorRateBackend(v1)
# Percentage of 5xx responses per backend over the last minute. The 5xx rate is
# scaled by 100 so that the `> 5` threshold means 5%, as the description states
# (an unscaled ratio is at most 1 and could never exceed 5). The trailing `and`
# clause only lets the alert fire while the backend is actually serving responses.
expr: 'sum by (backend) (rate(haproxy_server_http_responses_total{code="5xx"}[1m]) * 100) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 0'
for: 1m
labels:
severity: critical
annotations:
summary: HAProxy high HTTP 5xx error rate backend (instance {{ $labels.instance }})
summary: HAProxy high HTTP 5xx error rate backend (v1) (instance {{ $labels.instance }})
description: "Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyHighHttp4xxErrorRateServer
- alert: HaproxyHighHttp4xxErrorRateServer(v1)
expr: 'sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0'
for: 1m
labels:
severity: critical
annotations:
summary: HAProxy high HTTP 4xx error rate server (instance {{ $labels.instance }})
summary: HAProxy high HTTP 4xx error rate server (v1) (instance {{ $labels.instance }})
description: "Too many HTTP requests with status 4xx (> 5%) on server {{ $labels.server }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyHighHttp5xxErrorRateServer
- alert: HaproxyHighHttp5xxErrorRateServer(v1)
expr: 'sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0'
for: 1m
labels:
severity: critical
annotations:
summary: HAProxy high HTTP 5xx error rate server (instance {{ $labels.instance }})
summary: HAProxy high HTTP 5xx error rate server (v1) (instance {{ $labels.instance }})
description: "Too many HTTP requests with status 5xx (> 5%) on server {{ $labels.server }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyServerResponseErrors
- alert: HaproxyServerResponseErrors(v1)
expr: 'sum by (server) (rate(haproxy_server_response_errors_total[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0'
for: 1m
labels:
severity: critical
annotations:
summary: HAProxy server response errors (instance {{ $labels.instance }})
summary: HAProxy server response errors (v1) (instance {{ $labels.instance }})
description: "Too many response errors to {{ $labels.server }} server (> 5%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyBackendConnectionErrors
- alert: HaproxyBackendConnectionErrors(v1)
expr: 'sum by (backend) (rate(haproxy_backend_connection_errors_total[1m])) > 100'
for: 1m
labels:
severity: critical
annotations:
summary: HAProxy backend connection errors (instance {{ $labels.instance }})
summary: HAProxy backend connection errors (v1) (instance {{ $labels.instance }})
description: "Too many connection errors to {{ $labels.fqdn }}/{{ $labels.backend }} backend (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyServerConnectionErrors
- alert: HaproxyServerConnectionErrors(v1)
expr: 'sum by (server) (rate(haproxy_server_connection_errors_total[1m])) > 100'
for: 0m
labels:
severity: critical
annotations:
summary: HAProxy server connection errors (instance {{ $labels.instance }})
summary: HAProxy server connection errors (v1) (instance {{ $labels.instance }})
description: "Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyBackendMaxActiveSession
expr: '((sum by (backend) (avg_over_time(haproxy_backend_current_sessions[2m]) * 100) / sum by (backend) (avg_over_time(haproxy_backend_limit_sessions[2m])))) > 80 and sum by (backend) (avg_over_time(haproxy_backend_limit_sessions[2m])) > 0'
expr: '((sum by (backend) (haproxy_backend_current_sessions * 100) / sum by (backend) (haproxy_backend_limit_sessions))) > 80 and sum by (backend) (haproxy_backend_limit_sessions) > 0'
for: 2m
labels:
severity: warning
@ -86,31 +86,31 @@ groups:
summary: HAProxy backend max active session (instance {{ $labels.instance }})
description: "HAproxy backend {{ $labels.fqdn }}/{{ $labels.backend }} is reaching session limit (> 80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyPendingRequests
- alert: HaproxyPendingRequests(v1)
expr: 'sum by (backend) (haproxy_backend_current_queue) > 0'
for: 2m
labels:
severity: warning
annotations:
summary: HAProxy pending requests (instance {{ $labels.instance }})
summary: HAProxy pending requests (v1) (instance {{ $labels.instance }})
description: "Some HAProxy requests are pending on {{ $labels.fqdn }}/{{ $labels.backend }} backend\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyHttpSlowingDown
- alert: HaproxyHttpSlowingDown(v1)
expr: 'avg by (backend) (haproxy_backend_http_total_time_average_seconds) > 1'
for: 1m
labels:
severity: warning
annotations:
summary: HAProxy HTTP slowing down (instance {{ $labels.instance }})
summary: HAProxy HTTP slowing down (v1) (instance {{ $labels.instance }})
description: "Average request time is increasing\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyRetryHigh
- alert: HaproxyRetryHigh(v1)
expr: 'sum by (backend) (rate(haproxy_backend_retry_warnings_total[1m])) > 10'
for: 2m
labels:
severity: warning
annotations:
summary: HAProxy retry high (instance {{ $labels.instance }})
summary: HAProxy retry high (v1) (instance {{ $labels.instance }})
description: "High rate of retry on {{ $labels.fqdn }}/{{ $labels.backend }} backend\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyBackendDown
@ -131,20 +131,20 @@ groups:
summary: HAProxy server down (instance {{ $labels.instance }})
description: "HAProxy server is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyFrontendSecurityBlockedRequests
- alert: HaproxyFrontendSecurityBlockedRequests(v1)
expr: 'sum by (frontend) (rate(haproxy_frontend_requests_denied_total[2m])) > 10'
for: 2m
labels:
severity: warning
annotations:
summary: HAProxy frontend security blocked requests (instance {{ $labels.instance }})
summary: HAProxy frontend security blocked requests (v1) (instance {{ $labels.instance }})
description: "HAProxy is blocking requests for security reason\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyServerHealthcheckFailure
- alert: HaproxyServerHealthcheckFailure(v1)
expr: 'increase(haproxy_server_check_failures_total[1m]) > 0'
for: 1m
labels:
severity: warning
annotations:
summary: HAProxy server healthcheck failure (instance {{ $labels.instance }})
description: "Some server healthcheck are failing on {{ $labels.server }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: HAProxy server healthcheck failure (v1) (instance {{ $labels.instance }})
description: "Some server healthcheck are failing on {{ $labels.server }} ({{ $value }} in the last 1m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -21,7 +21,7 @@ groups:
severity: warning
annotations:
summary: Host memory under memory pressure (instance {{ $labels.instance }})
description: "The node is under heavy memory pressure. High rate of loading memory pages from disk.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "The node is under heavy memory pressure. High rate of major page faults ({{ $value }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
- alert: HostMemoryIsUnderutilized

View file

@ -14,20 +14,20 @@ groups:
summary: Juniper switch down (instance {{ $labels.instance }})
description: "The switch appears to be down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: JuniperHighBandwidthUsage1gib
- alert: JuniperCriticalBandwidthUsage1gib
# The per-second rate of the transmit byte counter is multiplied by 8 to get
# bits per second and compared with 90% of 1e+9 bit/s, so the threshold is
# 0.90 Gbit/s (not gibibytes per second).
expr: 'rate(junos_interface_transmit_bytes[1m]) * 8 > 1e+9 * 0.90'
for: 1m
labels:
severity: critical
annotations:
summary: Juniper high Bandwidth Usage 1GiB (instance {{ $labels.instance }})
summary: Juniper critical Bandwidth Usage 1GiB (instance {{ $labels.instance }})
description: "Interface is highly saturated. (> 0.90 Gbit/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: JuniperHighBandwidthUsage1gib
- alert: JuniperWarningBandwidthUsage1gib
# The per-second rate of the transmit byte counter is multiplied by 8 to get
# bits per second and compared with 80% of 1e+9 bit/s, so the threshold is
# 0.80 Gbit/s (not gibibytes per second).
expr: 'rate(junos_interface_transmit_bytes[1m]) * 8 > 1e+9 * 0.80'
for: 1m
labels:
severity: warning
annotations:
summary: Juniper high Bandwidth Usage 1GiB (instance {{ $labels.instance }})
summary: Juniper warning Bandwidth Usage 1GiB (instance {{ $labels.instance }})
description: "Interface is getting saturated. (> 0.80 Gbit/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -11,7 +11,7 @@ groups:
labels:
severity: critical
annotations:
summary: Kubernetes Node ready (node {{ $labels.node }})
summary: Kubernetes Node not ready (instance {{ $labels.instance }})
description: "Node {{ $labels.node }} has been unready for a long time\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Kubernetes Node with disabled schedules are fine.
@ -22,7 +22,7 @@ groups:
labels:
severity: warning
annotations:
summary: Kubernetes node scheduling disabled (node {{ $labels.node }})
summary: Kubernetes Node scheduling disabled (instance {{ $labels.instance }})
description: "Node {{ $labels.node }} has been marked as unschedulable for more than 30 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesNodeMemoryPressure
@ -31,7 +31,7 @@ groups:
labels:
severity: critical
annotations:
summary: Kubernetes memory pressure (node {{ $labels.node }})
summary: Kubernetes Node memory pressure (instance {{ $labels.instance }})
description: "Node {{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesNodeDiskPressure
@ -40,7 +40,7 @@ groups:
labels:
severity: critical
annotations:
summary: Kubernetes disk pressure (node {{ $labels.node }})
summary: Kubernetes Node disk pressure (instance {{ $labels.instance }})
description: "Node {{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesNodeNetworkUnavailable
@ -67,7 +67,7 @@ groups:
labels:
severity: warning
annotations:
summary: Kubernetes container oom killer ({{ $labels.namespace }}/{{ $labels.pod }}:{{ $labels.container }})
summary: Kubernetes Container oom killer (instance {{ $labels.instance }})
description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesJobFailed
@ -76,7 +76,7 @@ groups:
labels:
severity: warning
annotations:
summary: Kubernetes Job failed ({{ $labels.namespace }}/{{ $labels.job_name }})
summary: Kubernetes Job failed (instance {{ $labels.instance }})
description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesJobNotStarting
@ -85,7 +85,7 @@ groups:
labels:
severity: warning
annotations:
summary: Kubernetes Job not starting ({{ $labels.namespace }}/{{ $labels.job_name }})
summary: Kubernetes Job not starting (instance {{ $labels.instance }})
description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} did not start for 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesCronjobFailing
@ -94,7 +94,7 @@ groups:
labels:
severity: critical
annotations:
summary: Kubernetes CronJob failing ({{ $labels.namespace }}/{{ $labels.cronjob }})
summary: Kubernetes CronJob failing (instance {{ $labels.instance }})
description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is failing\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesCronjobSuspended
@ -103,7 +103,7 @@ groups:
labels:
severity: warning
annotations:
summary: Kubernetes CronJob suspended ({{ $labels.namespace }}/{{ $labels.cronjob }})
summary: Kubernetes CronJob suspended (instance {{ $labels.instance }})
description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is suspended\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesPersistentvolumeclaimPending
@ -112,7 +112,7 @@ groups:
labels:
severity: warning
annotations:
summary: Kubernetes PersistentVolumeClaim pending ({{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }})
summary: Kubernetes PersistentVolumeClaim pending (instance {{ $labels.instance }})
description: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pending\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesVolumeOutOfDiskSpace
@ -139,7 +139,7 @@ groups:
labels:
severity: critical
annotations:
summary: Kubernetes PersistentVolumeClaim pending ({{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }})
summary: Kubernetes PersistentVolume error (instance {{ $labels.instance }})
description: "Persistent volume {{ $labels.persistentvolume }} is in bad state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesStatefulsetDown
@ -148,7 +148,7 @@ groups:
labels:
severity: critical
annotations:
summary: Kubernetes StatefulSet down ({{ $labels.namespace }}/{{ $labels.statefulset }})
summary: Kubernetes StatefulSet down (instance {{ $labels.instance }})
description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} went down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesHpaScaleInability
@ -193,7 +193,7 @@ groups:
labels:
severity: critical
annotations:
summary: Kubernetes Pod not healthy ({{ $labels.namespace }}/{{ $labels.pod }})
summary: Kubernetes Pod not healthy (instance {{ $labels.instance }})
description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-running state for longer than 15 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesPodCrashLooping
@ -202,7 +202,7 @@ groups:
labels:
severity: warning
annotations:
summary: Kubernetes pod crash looping ({{ $labels.namespace }}/{{ $labels.pod }})
summary: Kubernetes pod crash looping (instance {{ $labels.instance }})
description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesReplicasetReplicasMismatch
@ -211,7 +211,7 @@ groups:
labels:
severity: warning
annotations:
summary: Kubernetes ReplicasSet mismatch ({{ $labels.namespace }}/{{ $labels.replicaset }})
summary: Kubernetes ReplicaSet replicas mismatch (instance {{ $labels.instance }})
description: "ReplicaSet {{ $labels.namespace }}/{{ $labels.replicaset }} replicas mismatch\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesDeploymentReplicasMismatch
@ -220,7 +220,7 @@ groups:
labels:
severity: warning
annotations:
summary: Kubernetes Deployment replicas mismatch ({{ $labels.namespace }}/{{ $labels.deployment }})
summary: Kubernetes Deployment replicas mismatch (instance {{ $labels.instance }})
description: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replicas mismatch\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesStatefulsetReplicasMismatch
@ -238,7 +238,7 @@ groups:
labels:
severity: critical
annotations:
summary: Kubernetes Deployment generation mismatch ({{ $labels.namespace }}/{{ $labels.deployment }})
summary: Kubernetes Deployment generation mismatch (instance {{ $labels.instance }})
description: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesStatefulsetGenerationMismatch
@ -247,7 +247,7 @@ groups:
labels:
severity: critical
annotations:
summary: Kubernetes StatefulSet generation mismatch ({{ $labels.namespace }}/{{ $labels.statefulset }})
summary: Kubernetes StatefulSet generation mismatch (instance {{ $labels.instance }})
description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesStatefulsetUpdateNotRolledOut
@ -256,7 +256,7 @@ groups:
labels:
severity: warning
annotations:
summary: Kubernetes StatefulSet update not rolled out ({{ $labels.namespace }}/{{ $labels.statefulset }})
summary: Kubernetes StatefulSet update not rolled out (instance {{ $labels.instance }})
description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesDaemonsetRolloutStuck
@ -265,7 +265,7 @@ groups:
labels:
severity: warning
annotations:
summary: Kubernetes DaemonSet rollout stuck ({{ $labels.namespace }}/{{ $labels.daemonset }})
summary: Kubernetes DaemonSet rollout stuck (instance {{ $labels.instance }})
description: "Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled or not ready\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesDaemonsetMisscheduled
@ -274,7 +274,7 @@ groups:
labels:
severity: critical
annotations:
summary: Kubernetes DaemonSet misscheduled ({{ $labels.namespace }}/{{ $labels.daemonset }})
summary: Kubernetes DaemonSet misscheduled (instance {{ $labels.instance }})
description: "Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold should be customized for each cronjob name.
@ -284,7 +284,7 @@ groups:
labels:
severity: warning
annotations:
summary: Kubernetes CronJob too long ({{ $labels.namespace }}/{{ $labels.cronjob }})
summary: Kubernetes CronJob too long (instance {{ $labels.instance }})
description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesJobSlowCompletion
@ -293,7 +293,7 @@ groups:
labels:
severity: critical
annotations:
summary: Kubernetes job slow completion ({{ $labels.namespace }}/{{ $labels.job_name }})
summary: Kubernetes Job slow completion (instance {{ $labels.instance }})
description: "Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} did not complete in time.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesApiServerErrors
@ -303,7 +303,7 @@ groups:
severity: critical
annotations:
summary: Kubernetes API server errors (instance {{ $labels.instance }})
description: "Kubernetes API server is experiencing high error rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Kubernetes API server is experiencing {{ $value | humanize }}% error rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesApiClientErrors
expr: '(sum(rate(rest_client_requests_total{code=~"(4|5).."}[1m])) by (instance, job) / sum(rate(rest_client_requests_total[1m])) by (instance, job)) * 100 > 1 and sum(rate(rest_client_requests_total[1m])) by (instance, job) > 0'
@ -312,7 +312,7 @@ groups:
severity: critical
annotations:
summary: Kubernetes API client errors (instance {{ $labels.instance }})
description: "Kubernetes API client is experiencing high error rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Kubernetes API client is experiencing {{ $value | humanize }}% error rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesClientCertificateExpiresNextWeek
expr: 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60'

View file

@ -21,7 +21,7 @@ groups:
severity: critical
annotations:
summary: Loki request errors (instance {{ $labels.instance }})
description: "The {{ $labels.job }} and {{ $labels.route }} are experiencing errors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "The {{ $labels.job }} and {{ $labels.route }} are experiencing {{ printf \"%.2f\" $value }}% errors.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: LokiRequestPanic
expr: 'sum(increase(loki_panic_total[10m])) by (namespace, job) > 0'

View file

@ -5,13 +5,13 @@ groups:
rules:
- alert: MongodbReplicationLag
- alert: MongodbReplicationLag(dcu)
expr: 'avg(mongodb_replset_member_optime_date{state="PRIMARY"}) - avg(mongodb_replset_member_optime_date{state="SECONDARY"}) > 10'
for: 0m
labels:
severity: critical
annotations:
summary: MongoDB replication lag (instance {{ $labels.instance }})
summary: MongoDB replication lag (DCU) (instance {{ $labels.instance }})
description: "Mongodb replication lag is more than 10s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MongodbReplicationStatus3
@ -59,29 +59,29 @@ groups:
summary: MongoDB replication Status 10 (instance {{ $labels.instance }})
description: "MongoDB Replication set member was once in a replica set but was subsequently removed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MongodbNumberCursorsOpen
- alert: MongodbNumberCursorsOpen(dcu)
expr: 'mongodb_metrics_cursor_open{state="total_open"} > 10000'
for: 2m
labels:
severity: warning
annotations:
summary: MongoDB number cursors open (instance {{ $labels.instance }})
summary: MongoDB number cursors open (DCU) (instance {{ $labels.instance }})
description: "Too many cursors opened by MongoDB for clients (> 10k)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MongodbCursorsTimeouts
- alert: MongodbCursorsTimeouts(dcu)
expr: 'increase(mongodb_metrics_cursor_timed_out_total[1m]) > 100'
for: 2m
labels:
severity: warning
annotations:
summary: MongoDB cursors timeouts (instance {{ $labels.instance }})
description: "Too many cursors are timing out\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: MongoDB cursors timeouts (DCU) (instance {{ $labels.instance }})
description: "Too many cursors are timing out ({{ $value }} in the last minute)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MongodbTooManyConnections
- alert: MongodbTooManyConnections(dcu)
expr: 'mongodb_connections{state="current"} / (mongodb_connections{state="current"} + mongodb_connections{state="available"}) * 100 > 80 and (mongodb_connections{state="current"} + mongodb_connections{state="available"}) > 0'
for: 2m
labels:
severity: warning
annotations:
summary: MongoDB too many connections (instance {{ $labels.instance }})
summary: MongoDB too many connections (DCU) (instance {{ $labels.instance }})
description: "Too many connections (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -25,13 +25,13 @@ groups:
summary: Mongodb replica member unhealthy (instance {{ $labels.instance }})
description: "MongoDB replica member is not healthy\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MongodbReplicationLag
- alert: MongodbReplicationLag(percona)
expr: '(mongodb_rs_members_optimeDate{member_state="PRIMARY"} - on (set) group_right mongodb_rs_members_optimeDate{member_state="SECONDARY"}) / 1000 > 10'
for: 0m
labels:
severity: critical
annotations:
summary: MongoDB replication lag (instance {{ $labels.instance }})
summary: MongoDB replication lag (Percona) (instance {{ $labels.instance }})
description: "Mongodb replication lag is more than 10s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# This query mixes old (mongodb_mongod_*) and new (mongodb_rs_*) metric names. It requires the Percona exporter to run with --compatible-mode to expose both.
@ -44,29 +44,29 @@ groups:
summary: MongoDB replication headroom (instance {{ $labels.instance }})
description: "MongoDB replication headroom is <= 0\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MongodbNumberCursorsOpen
- alert: MongodbNumberCursorsOpen(percona)
expr: 'mongodb_ss_metrics_cursor_open{csr_type="total"} > 10 * 1000'
for: 2m
labels:
severity: warning
annotations:
summary: MongoDB number cursors open (instance {{ $labels.instance }})
summary: MongoDB number cursors open (Percona) (instance {{ $labels.instance }})
description: "Too many cursors opened by MongoDB for clients (> 10k)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MongodbCursorsTimeouts
- alert: MongodbCursorsTimeouts(percona)
expr: 'increase(mongodb_ss_metrics_cursor_timedOut[1m]) > 100'
for: 2m
labels:
severity: warning
annotations:
summary: MongoDB cursors timeouts (instance {{ $labels.instance }})
description: "Too many cursors are timing out\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: MongoDB cursors timeouts (Percona) (instance {{ $labels.instance }})
description: "Too many cursors are timing out ({{ $value }} in the last minute)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MongodbTooManyConnections
- alert: MongodbTooManyConnections(percona)
expr: 'mongodb_ss_connections{conn_type="current"} / (mongodb_ss_connections{conn_type="current"} + mongodb_ss_connections{conn_type="available"}) * 100 > 80 and (mongodb_ss_connections{conn_type="current"} + mongodb_ss_connections{conn_type="available"}) > 0'
for: 2m
labels:
severity: warning
annotations:
summary: MongoDB too many connections (instance {{ $labels.instance }})
summary: MongoDB too many connections (Percona) (instance {{ $labels.instance }})
description: "Too many connections (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -78,7 +78,7 @@ groups:
severity: warning
annotations:
summary: MySQL slow queries (instance {{ $labels.instance }})
description: "MySQL server mysql has some new slow query.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "MySQL server mysql has some new slow query ({{ $value }} in the last minute).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MysqlInnodbLogWaits
expr: 'rate(mysql_global_status_innodb_log_waits[15m]) > 10'
@ -87,7 +87,7 @@ groups:
severity: warning
annotations:
summary: MySQL InnoDB log waits (instance {{ $labels.instance }})
description: "MySQL innodb log writes stalling\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "MySQL innodb log writes stalling ({{ $value }} waits/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MysqlRestarted
expr: 'mysql_global_status_uptime < 60'

View file

@ -103,7 +103,7 @@ groups:
severity: warning
annotations:
summary: Nats too many errors (instance {{ $labels.instance }})
description: "NATS server has encountered errors in the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "NATS server has encountered {{ $value }} JetStream API errors in the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsJetstreamAccountsExceeded
expr: 'sum(gnatsd_varz_jetstream_stats_accounts) > 100'

View file

@ -15,13 +15,13 @@ groups:
summary: Netdata high cpu usage (instance {{ $labels.instance }})
description: "Netdata high CPU usage (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostCpuStealNoisyNeighbor
- alert: NetdataCpuStealNoisyNeighbor
expr: 'netdata_cpu_cpu_percentage_average{dimension="steal"} > 10'
for: 5m
labels:
severity: warning
annotations:
summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
summary: Netdata CPU steal noisy neighbor (instance {{ $labels.instance }})
description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NetdataHighMemoryUsage
@ -67,7 +67,7 @@ groups:
severity: info
annotations:
summary: Netdata disk reallocated sectors (instance {{ $labels.instance }})
description: "Reallocated sectors on disk\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Disk reallocated sectors detected ({{ $value }} sectors)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NetdataDiskCurrentPendingSector
expr: 'netdata_smartd_log_current_pending_sector_count_sectors_average > 0'
@ -85,4 +85,4 @@ groups:
severity: warning
annotations:
summary: Netdata reported uncorrectable disk sectors (instance {{ $labels.instance }})
description: "Reported uncorrectable disk sectors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Reported uncorrectable disk sectors ({{ $value }} sectors)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -24,7 +24,7 @@ groups:
severity: critical
annotations:
summary: OpenTelemetry Collector receiver refused spans (instance {{ $labels.instance }})
description: "OpenTelemetry Collector is refusing spans on {{ $labels.receiver }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "OpenTelemetry Collector is refusing {{ $value | humanize }}/s spans on {{ $labels.receiver }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: OpentelemetryCollectorReceiverRefusedMetricPoints
expr: 'rate(otelcol_receiver_refused_metric_points[5m]) > 0'
@ -33,7 +33,7 @@ groups:
severity: critical
annotations:
summary: OpenTelemetry Collector receiver refused metric points (instance {{ $labels.instance }})
description: "OpenTelemetry Collector is refusing metric points on {{ $labels.receiver }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "OpenTelemetry Collector is refusing {{ $value | humanize }}/s metric points on {{ $labels.receiver }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: OpentelemetryCollectorReceiverRefusedLogRecords
expr: 'rate(otelcol_receiver_refused_log_records[5m]) > 0'
@ -42,34 +42,37 @@ groups:
severity: critical
annotations:
summary: OpenTelemetry Collector receiver refused log records (instance {{ $labels.instance }})
description: "OpenTelemetry Collector is refusing log records on {{ $labels.receiver }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "OpenTelemetry Collector is refusing {{ $value | humanize }}/s log records on {{ $labels.receiver }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 0.05/s avoids firing on transient single-event spikes.
- alert: OpentelemetryCollectorExporterFailedSpans
expr: 'rate(otelcol_exporter_send_failed_spans[5m]) > 0'
expr: 'rate(otelcol_exporter_send_failed_spans[5m]) > 0.05'
for: 5m
labels:
severity: warning
annotations:
summary: OpenTelemetry Collector exporter failed spans (instance {{ $labels.instance }})
description: "OpenTelemetry Collector failing to send spans via {{ $labels.exporter }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "OpenTelemetry Collector failing to send {{ $value | humanize }}/s spans via {{ $labels.exporter }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 0.05/s avoids firing on transient single-event spikes.
- alert: OpentelemetryCollectorExporterFailedMetricPoints
expr: 'rate(otelcol_exporter_send_failed_metric_points[5m]) > 0'
expr: 'rate(otelcol_exporter_send_failed_metric_points[5m]) > 0.05'
for: 5m
labels:
severity: warning
annotations:
summary: OpenTelemetry Collector exporter failed metric points (instance {{ $labels.instance }})
description: "OpenTelemetry Collector failing to send metric points via {{ $labels.exporter }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "OpenTelemetry Collector failing to send {{ $value | humanize }}/s metric points via {{ $labels.exporter }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 0.05/s avoids firing on transient single-event spikes.
- alert: OpentelemetryCollectorExporterFailedLogRecords
expr: 'rate(otelcol_exporter_send_failed_log_records[5m]) > 0'
expr: 'rate(otelcol_exporter_send_failed_log_records[5m]) > 0.05'
for: 5m
labels:
severity: warning
annotations:
summary: OpenTelemetry Collector exporter failed log records (instance {{ $labels.instance }})
description: "OpenTelemetry Collector failing to send log records via {{ $labels.exporter }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "OpenTelemetry Collector failing to send {{ $value | humanize }}/s log records via {{ $labels.exporter }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: OpentelemetryCollectorExporterQueueNearlyFull
expr: '(otelcol_exporter_queue_size / on(instance, job, exporter) otelcol_exporter_queue_capacity) > 0.8 and otelcol_exporter_queue_capacity > 0'
@ -80,23 +83,25 @@ groups:
summary: OpenTelemetry Collector exporter queue nearly full (instance {{ $labels.instance }})
description: "OpenTelemetry Collector exporter {{ $labels.exporter }} queue is over 80% full\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 0.05/s avoids firing on transient single-event spikes.
- alert: OpentelemetryCollectorProcessorRefusedSpans
expr: 'rate(otelcol_processor_refused_spans[5m]) > 0'
expr: 'rate(otelcol_processor_refused_spans[5m]) > 0.05'
for: 5m
labels:
severity: warning
annotations:
summary: OpenTelemetry Collector processor refused spans (instance {{ $labels.instance }})
description: "OpenTelemetry Collector processor {{ $labels.processor }} is refusing spans, likely due to backpressure\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "OpenTelemetry Collector processor {{ $labels.processor }} is refusing spans ({{ $value | humanize }}/s), likely due to backpressure.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 0.05/s avoids firing on transient single-event spikes.
- alert: OpentelemetryCollectorProcessorRefusedMetricPoints
expr: 'rate(otelcol_processor_refused_metric_points[5m]) > 0'
expr: 'rate(otelcol_processor_refused_metric_points[5m]) > 0.05'
for: 5m
labels:
severity: warning
annotations:
summary: OpenTelemetry Collector processor refused metric points (instance {{ $labels.instance }})
description: "OpenTelemetry Collector processor {{ $labels.processor }} is refusing metric points, likely due to backpressure\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "OpenTelemetry Collector processor {{ $labels.processor }} is refusing metric points ({{ $value | humanize }}/s), likely due to backpressure.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: OpentelemetryCollectorHighMemoryUsage
expr: '(otelcol_process_runtime_heap_alloc_bytes{job=~".*otel.*collector.*"} / on(instance, job) otelcol_process_runtime_total_sys_memory_bytes{job=~".*otel.*collector.*"}) > 0.9'

View file

@ -6,10 +6,10 @@ groups:
rules:
- alert: Php-fpmMax-childrenReached
expr: 'sum(increase(phpfpm_max_children_reached_total[5m])) by (instance) > 0'
expr: 'sum(increase(phpfpm_max_children_reached_total[5m])) by (instance) > 3'
for: 0m
labels:
severity: warning
annotations:
summary: PHP-FPM max-children reached (instance {{ $labels.instance }})
description: "PHP-FPM reached max children - {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "PHP-FPM reached max children on {{ $labels.instance }} ({{ $value }} times in the last 5m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -76,7 +76,7 @@ groups:
severity: warning
annotations:
summary: Postgresql dead locks (instance {{ $labels.instance }})
description: "PostgreSQL has dead-locks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "PostgreSQL has dead-locks ({{ $value }} in the last minute)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlHighRollbackRate
expr: 'sum by (namespace,datname) ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) / ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) + (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m])))) > 0.02'

View file

@ -149,7 +149,7 @@ groups:
severity: critical
annotations:
summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})
description: "Alertmanager is failing sending notifications\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Alertmanager is failing sending notifications ({{ $value }} notifications/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusTargetEmpty
expr: 'prometheus_sd_discovered_targets == 0'
@ -176,16 +176,16 @@ groups:
severity: warning
annotations:
summary: Prometheus large scrape (instance {{ $labels.instance }})
description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Prometheus has many scrapes that exceed the sample limit ({{ $value }} scrapes)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusTargetScrapeDuplicate
expr: 'increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0'
expr: 'increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 3'
for: 0m
labels:
severity: warning
annotations:
summary: Prometheus target scrape duplicate (instance {{ $labels.instance }})
description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Prometheus has many samples rejected due to duplicate timestamps but different values ({{ $value }} samples)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusTsdbCheckpointCreationFailures
expr: 'increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0'

View file

@ -43,13 +43,13 @@ groups:
summary: RabbitMQ out of memory (instance {{ $labels.instance }})
description: "Memory available for RabbitMQ is low (< 10%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RabbitmqTooManyConnections
- alert: RabbitmqInstanceTooManyConnections
expr: 'rabbitmq_connectionsTotal > 1000'
for: 2m
labels:
severity: warning
annotations:
summary: RabbitMQ too many connections (instance {{ $labels.instance }})
summary: RabbitMQ instance too many connections (instance {{ $labels.instance }})
description: "RabbitMQ instance has too many connections (> 1000)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Indicate the queue name in dedicated label.

View file

@ -95,4 +95,4 @@ groups:
severity: warning
annotations:
summary: RabbitMQ unroutable messages (instance {{ $labels.instance }})
description: "A queue has unroutable messages\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "A queue has unroutable messages ({{ $value }} in the last 1m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -49,7 +49,7 @@ groups:
severity: warning
annotations:
summary: Systemd socket refused connections (instance {{ $labels.instance }})
description: "Systemd socket {{ $labels.name }} is refusing connections. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Systemd socket {{ $labels.name }} is refusing connections. ({{ $value }} refused in last 5m, instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 100 connections is arbitrary. Adjust to your workload.
- alert: SystemdSocketHighConnections

View file

@ -12,7 +12,7 @@ groups:
severity: critical
annotations:
summary: Thanos Rule Queue Is Dropping Alerts (instance {{ $labels.instance }})
description: "Thanos Rule {{$labels.instance}} is failing to queue alerts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Thanos Rule {{$labels.instance}} is failing to queue alerts ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ThanosRuleSenderIsFailingAlerts
expr: 'sum by (job, instance) (rate(thanos_alert_sender_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0'
@ -21,7 +21,7 @@ groups:
severity: critical
annotations:
summary: Thanos Rule Sender Is Failing Alerts (instance {{ $labels.instance }})
description: "Thanos Rule {{$labels.instance}} is failing to send alerts to alertmanager.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Thanos Rule {{$labels.instance}} is failing to send alerts to alertmanager ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ThanosRuleHighRuleEvaluationFailures
expr: '(sum by (job, instance) (rate(prometheus_rule_evaluation_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) * 100 > 5) and sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) > 0'
@ -30,16 +30,17 @@ groups:
severity: critical
annotations:
summary: Thanos Rule High Rule Evaluation Failures (instance {{ $labels.instance }})
description: "Thanos Rule {{$labels.instance}} is failing to evaluate rules.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Thanos Rule {{$labels.instance}} is failing to evaluate {{$value | humanize}}% of rules.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 0.05/s avoids firing on transient single-event spikes.
- alert: ThanosRuleHighRuleEvaluationWarnings
expr: 'sum by (job, instance) (rate(thanos_rule_evaluation_with_warnings_total{job=~".*thanos-rule.*"}[5m])) > 0'
expr: 'sum by (job, instance) (rate(thanos_rule_evaluation_with_warnings_total{job=~".*thanos-rule.*"}[5m])) > 0.05'
for: 15m
labels:
severity: info
annotations:
summary: Thanos Rule High Rule Evaluation Warnings (instance {{ $labels.instance }})
description: "Thanos Rule {{$labels.instance}} has high number of evaluation warnings.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Thanos Rule {{$labels.instance}} has high number of evaluation warnings ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ThanosRuleRuleEvaluationLatencyHigh
expr: '(sum by (job, instance, rule_group) (prometheus_rule_group_last_duration_seconds{job=~".*thanos-rule.*"}) > sum by (job, instance, rule_group) (prometheus_rule_group_interval_seconds{job=~".*thanos-rule.*"}))'

View file

@ -5,14 +5,15 @@ groups:
rules:
# Threshold of 0.05/s avoids firing on transient single-event spikes.
- alert: ThanosSidecarBucketOperationsFailed
expr: 'sum by (job, instance) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-sidecar.*"}[5m])) > 0'
expr: 'sum by (job, instance) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-sidecar.*"}[5m])) > 0.05'
for: 5m
labels:
severity: critical
annotations:
summary: Thanos Sidecar Bucket Operations Failed (instance {{ $labels.instance }})
description: "Thanos Sidecar {{$labels.instance}} bucket operations are failing\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Thanos Sidecar {{$labels.instance}} bucket operations are failing ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ThanosSidecarNoConnectionToStartedPrometheus
expr: 'thanos_sidecar_prometheus_up{job=~".*thanos-sidecar.*"} == 0 and on (namespace, pod)prometheus_tsdb_data_replay_duration_seconds != 0'