diff --git a/dist/rules/blackbox/blackbox-exporter.yml b/dist/rules/blackbox/blackbox-exporter.yml index 7eb0a74..48f69b0 100644 --- a/dist/rules/blackbox/blackbox-exporter.yml +++ b/dist/rules/blackbox/blackbox-exporter.yml @@ -24,7 +24,7 @@ groups: description: "Blackbox configuration reload failure\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: BlackboxSlowProbe - expr: 'avg_over_time(probe_duration_seconds[1m]) > 1' + expr: 'probe_duration_seconds > 1' for: 1m labels: severity: warning @@ -73,7 +73,7 @@ groups: description: "SSL certificate has expired already\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: BlackboxProbeSlowHttp - expr: 'avg_over_time(probe_http_duration_seconds[1m]) > 1' + expr: 'probe_http_duration_seconds > 1' for: 1m labels: severity: warning @@ -82,7 +82,7 @@ groups: description: "HTTP request took more than 1s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: BlackboxProbeSlowPing - expr: 'avg_over_time(probe_icmp_duration_seconds[1m]) > 1' + expr: 'probe_icmp_duration_seconds > 1' for: 1m labels: severity: warning diff --git a/dist/rules/cassandra/criteo-cassandra-exporter.yml b/dist/rules/cassandra/criteo-cassandra-exporter.yml index 82ab06c..02434b1 100644 --- a/dist/rules/cassandra/criteo-cassandra-exporter.yml +++ b/dist/rules/cassandra/criteo-cassandra-exporter.yml @@ -51,31 +51,31 @@ groups: summary: Cassandra node down (instance {{ $labels.instance }}) description: "Cassandra node down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: CassandraCommitlogPendingTasks + - alert: CassandraCommitlogPendingTasks(criteo) expr: 'cassandra_stats{name="org:apache:cassandra:metrics:commitlog:pendingtasks:value"} > 15' for: 2m labels: severity: warning annotations: - summary: Cassandra commitlog pending tasks (instance {{ $labels.instance }}) + summary: Cassandra commitlog pending tasks (Criteo) (instance {{ $labels.instance }}) description: "Unexpected number of Cassandra commitlog pending tasks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: CassandraCompactionExecutorBlockedTasks + - alert: CassandraCompactionExecutorBlockedTasks(criteo) expr: 'cassandra_stats{name="org:apache:cassandra:metrics:threadpools:internal:compactionexecutor:currentlyblockedtasks:count"} > 0' for: 2m labels: severity: warning annotations: - summary: Cassandra compaction executor blocked tasks (instance {{ $labels.instance }}) + summary: Cassandra compaction executor blocked tasks (Criteo) (instance {{ $labels.instance }}) description: "Some Cassandra compaction executor tasks are blocked\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: CassandraFlushWriterBlockedTasks + - alert: CassandraFlushWriterBlockedTasks(criteo) expr: 'cassandra_stats{name="org:apache:cassandra:metrics:threadpools:internal:memtableflushwriter:currentlyblockedtasks:count"} > 0' for: 2m labels: severity: warning annotations: - summary: Cassandra flush writer blocked tasks (instance {{ $labels.instance }}) + summary: Cassandra flush writer blocked tasks (Criteo) (instance {{ $labels.instance }}) description: "Some Cassandra flush writer tasks are blocked\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CassandraRepairPendingTasks @@ -96,67 +96,67 @@ groups: summary: Cassandra repair blocked tasks (instance {{ $labels.instance }}) description: "Some Cassandra repair tasks are blocked\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: CassandraConnectionTimeoutsTotal + - alert: CassandraConnectionTimeoutsTotal(criteo) expr: 'rate(cassandra_stats{name="org:apache:cassandra:metrics:connection:totaltimeouts:count"}[1m]) > 5' for: 2m labels: severity: critical annotations: - summary: Cassandra connection timeouts total (instance {{ $labels.instance }}) + summary: Cassandra connection timeouts total (Criteo) (instance {{ $labels.instance }}) description: "Some connection between nodes are ending in timeout\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: CassandraStorageExceptions + - alert: CassandraStorageExceptions(criteo) expr: 'changes(cassandra_stats{name="org:apache:cassandra:metrics:storage:exceptions:count"}[1m]) > 1' for: 0m labels: severity: critical annotations: - summary: Cassandra storage exceptions (instance {{ $labels.instance }}) + summary: Cassandra storage exceptions (Criteo) (instance {{ $labels.instance }}) description: "Something is going wrong with cassandra storage\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: CassandraTombstoneDump + - alert: CassandraTombstoneDump(criteo) expr: 'cassandra_stats{name="org:apache:cassandra:metrics:table:tombstonescannedhistogram:99thpercentile"} > 1000' for: 0m labels: severity: critical annotations: - summary: Cassandra tombstone dump (instance {{ $labels.instance }}) + summary: Cassandra tombstone dump (Criteo) (instance {{ $labels.instance }}) description: "Too much tombstones scanned in queries\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: CassandraClientRequestUnavailableWrite + - alert: CassandraClientRequestUnavailableWrite(criteo) expr: 'changes(cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:write:unavailables:count"}[1m]) > 0' for: 0m labels: severity: critical annotations: - summary: Cassandra client request unavailable write (instance {{ $labels.instance }}) + summary: Cassandra client request unavailable write (Criteo) (instance {{ $labels.instance }}) description: "Write failures have occurred because too many nodes are unavailable\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: CassandraClientRequestUnavailableRead + - alert: CassandraClientRequestUnavailableRead(criteo) expr: 'changes(cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:read:unavailables:count"}[1m]) > 0' for: 0m labels: severity: critical annotations: - summary: Cassandra client request unavailable read (instance {{ $labels.instance }}) + summary: Cassandra client request unavailable read (Criteo) (instance {{ $labels.instance }}) description: "Read failures have occurred because too many nodes are unavailable\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: CassandraClientRequestWriteFailure + - alert: CassandraClientRequestWriteFailure(criteo) expr: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:write:failures:oneminuterate"} > 0' for: 0m labels: severity: critical annotations: - summary: Cassandra client request write failure (instance {{ $labels.instance }}) + summary: Cassandra client request write failure (Criteo) (instance {{ $labels.instance }}) description: "A lot of write failures encountered. A write failure is a non-timeout exception encountered during a write request. Examine the reason map to find to the root cause. The most common cause for this type of error is when batch sizes are too large.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: CassandraClientRequestReadFailure + - alert: CassandraClientRequestReadFailure(criteo) expr: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:read:failures:oneminuterate"} > 0' for: 0m labels: severity: critical annotations: - summary: Cassandra client request read failure (instance {{ $labels.instance }}) + summary: Cassandra client request read failure (Criteo) (instance {{ $labels.instance }}) description: "A lot of read failures encountered. A read failure is a non-timeout exception encountered during a read request. Examine the reason map to find to the root cause. The most common cause for this type of error is when batch sizes are too large.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CassandraCacheHitRateKeyCache diff --git a/dist/rules/cassandra/instaclustr-cassandra-exporter.yml b/dist/rules/cassandra/instaclustr-cassandra-exporter.yml index 7369835..1ed8fa6 100644 --- a/dist/rules/cassandra/instaclustr-cassandra-exporter.yml +++ b/dist/rules/cassandra/instaclustr-cassandra-exporter.yml @@ -24,92 +24,92 @@ groups: summary: Cassandra many compaction tasks are pending (instance {{ $labels.instance }}) description: "Many Cassandra compaction tasks are pending - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: CassandraCommitlogPendingTasks + - alert: CassandraCommitlogPendingTasks(instaclustr) expr: 'cassandra_commit_log_pending_tasks > 15' for: 2m labels: severity: warning annotations: - summary: Cassandra commitlog pending tasks (instance {{ $labels.instance }}) + summary: Cassandra commitlog pending tasks (Instaclustr) (instance {{ $labels.instance }}) description: "Cassandra commitlog pending tasks - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: CassandraCompactionExecutorBlockedTasks + - alert: CassandraCompactionExecutorBlockedTasks(instaclustr) expr: 'cassandra_thread_pool_blocked_tasks{pool="CompactionExecutor"} > 15' for: 2m labels: severity: warning annotations: - summary: Cassandra compaction executor blocked tasks (instance {{ $labels.instance }}) + summary: Cassandra compaction executor blocked tasks (Instaclustr) (instance {{ $labels.instance }}) description: "Some Cassandra compaction executor tasks are blocked - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: CassandraFlushWriterBlockedTasks + - alert: CassandraFlushWriterBlockedTasks(instaclustr) expr: 'cassandra_thread_pool_blocked_tasks{pool="MemtableFlushWriter"} > 15' for: 2m labels: severity: warning annotations: - summary: Cassandra flush writer blocked tasks (instance {{ $labels.instance }}) + summary: Cassandra flush writer blocked tasks (Instaclustr) (instance {{ $labels.instance }}) description: "Some Cassandra flush writer tasks are blocked - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: CassandraConnectionTimeoutsTotal + - alert: CassandraConnectionTimeoutsTotal(instaclustr) expr: 'sum by (cassandra_cluster,instance) (rate(cassandra_client_request_timeouts_total[5m])) > 5' for: 2m labels: severity: critical annotations: - summary: Cassandra connection timeouts total (instance {{ $labels.instance }}) + summary: Cassandra connection timeouts total (Instaclustr) (instance {{ $labels.instance }}) description: "Some connection between nodes are ending in timeout - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: CassandraStorageExceptions + - alert: CassandraStorageExceptions(instaclustr) expr: 'changes(cassandra_storage_exceptions_total[1m]) > 1' for: 0m labels: severity: critical annotations: - summary: Cassandra storage exceptions (instance {{ $labels.instance }}) + summary: Cassandra storage exceptions (Instaclustr) (instance {{ $labels.instance }}) description: "Something is going wrong with cassandra storage - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: CassandraTombstoneDump + - alert: CassandraTombstoneDump(instaclustr) expr: 'avg(cassandra_table_tombstones_scanned{quantile="0.99"}) by (instance,cassandra_cluster,keyspace) > 100' for: 2m labels: severity: critical annotations: - summary: Cassandra tombstone dump (instance {{ $labels.instance }}) + summary: Cassandra tombstone dump (Instaclustr) (instance {{ $labels.instance }}) description: "Cassandra tombstone dump - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: CassandraClientRequestUnavailableWrite + - alert: CassandraClientRequestUnavailableWrite(instaclustr) expr: 'changes(cassandra_client_request_unavailable_exceptions_total{operation="write"}[1m]) > 0' for: 2m labels: severity: critical annotations: - summary: Cassandra client request unavailable write (instance {{ $labels.instance }}) + summary: Cassandra client request unavailable write (Instaclustr) (instance {{ $labels.instance }}) description: "Some Cassandra client requests are unavailable to write - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: CassandraClientRequestUnavailableRead + - alert: CassandraClientRequestUnavailableRead(instaclustr) expr: 'changes(cassandra_client_request_unavailable_exceptions_total{operation="read"}[1m]) > 0' for: 2m labels: severity: critical annotations: - summary: Cassandra client request unavailable read (instance {{ $labels.instance }}) + summary: Cassandra client request unavailable read (Instaclustr) (instance {{ $labels.instance }}) description: "Some Cassandra client requests are unavailable to read - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: CassandraClientRequestWriteFailure + - alert: CassandraClientRequestWriteFailure(instaclustr) expr: 'increase(cassandra_client_request_failures_total{operation="write"}[1m]) > 0' for: 2m labels: severity: critical annotations: - summary: Cassandra client request write failure (instance {{ $labels.instance }}) + summary: Cassandra client request write failure (Instaclustr) (instance {{ $labels.instance }}) description: "Write failures have occurred, ensure there are not too many unavailable nodes - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: CassandraClientRequestReadFailure + - alert: CassandraClientRequestReadFailure(instaclustr) expr: 'increase(cassandra_client_request_failures_total{operation="read"}[1m]) > 0' for: 2m labels: severity: critical annotations: - summary: Cassandra client request read failure (instance {{ $labels.instance }}) + summary: Cassandra client request read failure (Instaclustr) (instance {{ $labels.instance }}) description: "Read failures have occurred, ensure there are not too many unavailable nodes - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/dist/rules/clickhouse/embedded-exporter.yml b/dist/rules/clickhouse/embedded-exporter.yml index 07ca4a1..7883b36 100644 --- a/dist/rules/clickhouse/embedded-exporter.yml +++ b/dist/rules/clickhouse/embedded-exporter.yml @@ -117,7 +117,7 @@ groups: description: "ClickHouse is experiencing issues with ZooKeeper connections, which may affect cluster state and coordination.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ClickhouseAuthenticationFailures - expr: 'increase(ClickHouseErrorMetric_AUTHENTICATION_FAILED[5m]) > 0' + expr: 'increase(ClickHouseErrorMetric_AUTHENTICATION_FAILED[5m]) > 3' for: 0m labels: severity: info @@ -126,7 +126,7 @@ groups: description: "Authentication failures detected, indicating potential security issues or misconfiguration.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ClickhouseAccessDeniedErrors - expr: 'increase(ClickHouseErrorMetric_RESOURCE_ACCESS_DENIED[5m]) > 0' + expr: 'increase(ClickHouseErrorMetric_RESOURCE_ACCESS_DENIED[5m]) > 3' for: 0m labels: severity: info diff --git a/dist/rules/cortex/embedded-exporter.yml b/dist/rules/cortex/embedded-exporter.yml index 82aee46..ebaaf88 100644 --- a/dist/rules/cortex/embedded-exporter.yml +++ b/dist/rules/cortex/embedded-exporter.yml @@ -31,7 +31,7 @@ groups: severity: critical annotations: summary: Cortex notification are being dropped (instance {{ $labels.instance }}) - description: "Cortex notification are being dropped due to errors (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Cortex notification are being dropped due to errors (instance {{ $labels.instance }}, {{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Threshold of 0.05/s avoids firing on transient single-event spikes. - alert: CortexNotificationError @@ -41,7 +41,7 @@ groups: severity: critical annotations: summary: Cortex notification error (instance {{ $labels.instance }}) - description: "Cortex is failing when sending alert notifications (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Cortex is failing when sending alert notifications (instance {{ $labels.instance }}, {{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CortexIngesterUnhealthy expr: 'cortex_ring_members{state="Unhealthy", name="ingester"} > 0' diff --git a/dist/rules/docker-containers/google-cadvisor.yml b/dist/rules/docker-containers/google-cadvisor.yml index dcf5588..ecd9d24 100644 --- a/dist/rules/docker-containers/google-cadvisor.yml +++ b/dist/rules/docker-containers/google-cadvisor.yml @@ -33,7 +33,7 @@ groups: severity: warning annotations: summary: Container High CPU utilization (instance {{ $labels.instance }}) - description: "Container CPU utilization is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Container CPU utilization is above 80% (current: {{ $value | printf \"%.2f\" }}%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # See https://medium.com/faun/how-much-is-too-much-the-linux-oomkiller-and-used-memory-d32186f29c9d - alert: ContainerHighMemoryUsage @@ -55,13 +55,13 @@ groups: description: "Container Volume usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ContainerHighThrottleRate - expr: 'sum(increase(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) by (container, pod, namespace) / sum(increase(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > ( 25 / 100 ) and sum(increase(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > 0' + expr: 'sum(rate(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) by (container, pod, namespace) / sum(rate(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > ( 25 / 100 ) and sum(rate(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > 0' for: 5m labels: severity: warning annotations: summary: Container high throttle rate (instance {{ $labels.instance }}) - description: "Container is being throttled\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Container is being throttled ({{ $value | humanizePercentage }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ContainerHighLowChangeCpuUsage expr: '(abs((sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[1m])) * 100) - (sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[1m] offset 1m)) * 100)) or abs((sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[1m])) * 100) - (sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[5m] offset 1m)) * 100))) > 25' @@ -79,7 +79,7 @@ groups: severity: info annotations: summary: Container Low CPU utilization (instance {{ $labels.instance }}) - description: "Container CPU utilization is under 20% for 1 week. Consider reducing the allocated CPU.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Container CPU utilization is under 20% for 1 week. Consider reducing the allocated CPU. (current: {{ $value | printf \"%.2f\" }}%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ContainerLowMemoryUsage expr: '(sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) < 20' diff --git a/dist/rules/elasticsearch/prometheus-community-elasticsearch-exporter.yml b/dist/rules/elasticsearch/prometheus-community-elasticsearch-exporter.yml index ae69b92..77f322f 100644 --- a/dist/rules/elasticsearch/prometheus-community-elasticsearch-exporter.yml +++ b/dist/rules/elasticsearch/prometheus-community-elasticsearch-exporter.yml @@ -143,13 +143,13 @@ groups: description: "No new documents for 10 min!\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ElasticsearchHighIndexingLatency - expr: 'increase(elasticsearch_indices_indexing_index_time_seconds_total[1m]) / increase(elasticsearch_indices_indexing_index_total[1m]) > 0.0005 and increase(elasticsearch_indices_indexing_index_total[1m]) > 0' + expr: 'rate(elasticsearch_indices_indexing_index_time_seconds_total[1m]) / rate(elasticsearch_indices_indexing_index_total[1m]) > 0.0005 and rate(elasticsearch_indices_indexing_index_total[1m]) > 0' for: 10m labels: severity: warning annotations: summary: Elasticsearch High Indexing Latency (instance {{ $labels.instance }}) - description: "The indexing latency on Elasticsearch cluster is higher than the threshold.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "The indexing latency on Elasticsearch cluster is higher than the threshold (current value: {{ $value }}s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ElasticsearchHighIndexingRate expr: 'sum(rate(elasticsearch_indices_indexing_index_total[1m]))> 10000' @@ -170,10 +170,10 @@ groups: description: "The query rate on Elasticsearch cluster is higher than the threshold.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ElasticsearchHighQueryLatency - expr: 'increase(elasticsearch_indices_search_query_time_seconds[1m]) / increase(elasticsearch_indices_search_query_total[1m]) > 1 and increase(elasticsearch_indices_search_query_total[1m]) > 0' + expr: 'rate(elasticsearch_indices_search_query_time_seconds[1m]) / rate(elasticsearch_indices_search_query_total[1m]) > 1 and rate(elasticsearch_indices_search_query_total[1m]) > 0' for: 5m labels: severity: warning annotations: summary: Elasticsearch High Query Latency (instance {{ $labels.instance }}) - description: "The query latency on Elasticsearch cluster is higher than the threshold.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "The query latency on Elasticsearch cluster is higher than the threshold (current value: {{ $value }}s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/dist/rules/envoy/embedded-exporter.yml b/dist/rules/envoy/embedded-exporter.yml index 3eb43de..07adfad 100644 --- a/dist/rules/envoy/embedded-exporter.yml +++ b/dist/rules/envoy/embedded-exporter.yml @@ -42,13 +42,13 @@ groups: description: "More than 10% of downstream HTTP responses are 4xx on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: EnvoyDownstreamConnectionsOverflowing - expr: 'increase(envoy_listener_downstream_cx_overflow[5m]) > 0' + expr: 'increase(envoy_listener_downstream_cx_overflow[5m]) > 5' for: 0m labels: severity: warning annotations: summary: Envoy downstream connections overflowing (instance {{ $labels.instance }}) - description: "Downstream connections are being rejected due to listener overflow on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Downstream connections are being rejected due to listener overflow on {{ $labels.instance }} ({{ $value }} in the last 5m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: EnvoyClusterMembershipEmpty expr: 'envoy_cluster_membership_healthy == 0' @@ -75,10 +75,10 @@ groups: severity: warning annotations: summary: Envoy high cluster upstream connection failures (instance {{ $labels.instance }}) - description: "High rate of upstream connection failures in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "High rate of upstream connection failures in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} ({{ $value }} in the last 5m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: EnvoyHighClusterUpstreamRequestTimeoutRate - expr: 'increase(envoy_cluster_upstream_rq_timeout[5m]) / increase(envoy_cluster_upstream_rq_completed[5m]) * 100 > 5 and increase(envoy_cluster_upstream_rq_completed[5m]) > 0' + expr: 'rate(envoy_cluster_upstream_rq_timeout[5m]) / rate(envoy_cluster_upstream_rq_completed[5m]) * 100 > 5 and rate(envoy_cluster_upstream_rq_completed[5m]) > 0' for: 5m labels: severity: warning @@ -87,7 +87,7 @@ groups: description: "More than 5% of upstream requests are timing out in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: EnvoyHighClusterUpstream5xxErrorRate - expr: 'increase(envoy_cluster_upstream_rq_xx{envoy_response_code_class="5"}[5m]) / increase(envoy_cluster_upstream_rq_completed[5m]) * 100 > 5 and increase(envoy_cluster_upstream_rq_completed[5m]) > 0' + expr: 'rate(envoy_cluster_upstream_rq_xx{envoy_response_code_class="5"}[5m]) / rate(envoy_cluster_upstream_rq_completed[5m]) * 100 > 5 and rate(envoy_cluster_upstream_rq_completed[5m]) > 0' for: 1m labels: severity: critical @@ -102,7 +102,7 @@ groups: severity: warning annotations: summary: Envoy cluster health check failures (instance {{ $labels.instance }}) - description: "Health checks are consistently failing in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Health checks are consistently failing in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} ({{ $value }} in the last 5m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: EnvoyClusterOutlierDetectionEjectionsActive expr: 'envoy_cluster_outlier_detection_ejections_active > 0' @@ -114,22 +114,22 @@ groups: description: "There are active outlier detection ejections in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: EnvoyListenerSslConnectionErrors - expr: 'increase(envoy_listener_ssl_connection_error[5m]) > 0' + expr: 'increase(envoy_listener_ssl_connection_error[5m]) > 5' for: 0m labels: severity: warning annotations: summary: Envoy listener SSL connection errors (instance {{ $labels.instance }}) - description: "Envoy listener is experiencing SSL/TLS connection errors on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Envoy listener is experiencing SSL/TLS connection errors on {{ $labels.instance }} ({{ $value }} in the last 5m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: EnvoyGlobalDownstreamConnectionsOverflowing - expr: 'increase(envoy_listener_downstream_global_cx_overflow[5m]) > 0' + expr: 'increase(envoy_listener_downstream_global_cx_overflow[5m]) > 5' for: 0m labels: severity: critical annotations: summary: Envoy global downstream connections overflowing (instance {{ $labels.instance }}) - description: "Downstream connections are being rejected due to global connection limit on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Downstream connections are being rejected due to global connection limit on {{ $labels.instance }} ({{ $value }} in the last 5m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: EnvoySslCertificateExpiringSoon expr: 'envoy_server_days_until_first_cert_expiring < 7' @@ -165,7 +165,7 @@ groups: severity: critical annotations: summary: Envoy no healthy upstream (instance {{ $labels.instance }}) - description: "Upstream connection attempts failed because no healthy upstream was available in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Upstream connection attempts failed because no healthy upstream was available in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} ({{ $value }} in the last 5m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: EnvoyHighDownstreamRequestTimeoutRate expr: 'increase(envoy_http_downstream_rq_timeout[5m]) > 5' @@ -174,4 +174,4 @@ groups: severity: warning annotations: summary: Envoy high downstream request timeout rate (instance {{ $labels.instance }}) - description: "Downstream requests are timing out on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Downstream requests are timing out on {{ $labels.instance }} ({{ $value }} in the last 5m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/dist/rules/etcd/embedded-exporter.yml b/dist/rules/etcd/embedded-exporter.yml index 7a10a9d..28c4ace 100644 --- a/dist/rules/etcd/embedded-exporter.yml +++ b/dist/rules/etcd/embedded-exporter.yml @@ -30,26 +30,26 @@ groups: severity: warning annotations: summary: Etcd high number of leader changes (instance {{ $labels.instance }}) - description: "Etcd leader changed more than 2 times during 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Etcd leader changed {{ $value }} times during 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Filters to actual error codes. grpc_code!="OK" includes benign codes like NotFound, AlreadyExists, and Cancelled. - - alert: EtcdHighNumberOfFailedGrpcRequests + - alert: EtcdHighNumberOfFailedGrpcRequestsWarning expr: 'sum(rate(grpc_server_handled_total{grpc_code=~"Internal|Unavailable|DeadlineExceeded|ResourceExhausted|Aborted|Unknown"}[1m])) BY (grpc_service, grpc_method) / sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0.01 and sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0' for: 2m labels: severity: warning annotations: - summary: Etcd high number of failed GRPC requests (instance {{ $labels.instance }}) + summary: Etcd high number of failed GRPC requests warning (instance {{ $labels.instance }}) description: "More than 1% GRPC request failure detected in Etcd\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Filters to actual error codes. grpc_code!="OK" includes benign codes like NotFound, AlreadyExists, and Cancelled. - - alert: EtcdHighNumberOfFailedGrpcRequests + - alert: EtcdHighNumberOfFailedGrpcRequestsCritical expr: 'sum(rate(grpc_server_handled_total{grpc_code=~"Internal|Unavailable|DeadlineExceeded|ResourceExhausted|Aborted|Unknown"}[1m])) BY (grpc_service, grpc_method) / sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0.05 and sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0' for: 2m labels: severity: critical annotations: - summary: Etcd high number of failed GRPC requests (instance {{ $labels.instance }}) + summary: Etcd high number of failed GRPC requests critical (instance {{ $labels.instance }}) description: "More than 5% GRPC request failure detected in Etcd\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: EtcdGrpcRequestsSlow @@ -61,22 +61,22 @@ groups: summary: Etcd GRPC requests slow (instance {{ $labels.instance }}) description: "GRPC requests slowing down, 99th percentile is over 0.15s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: EtcdHighNumberOfFailedHttpRequests + - alert: EtcdHighNumberOfFailedHttpRequestsWarning expr: 'sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.01 and sum(rate(etcd_http_received_total[1m])) BY (method) > 0' for: 2m labels: severity: warning annotations: - summary: Etcd high number of failed HTTP requests (instance {{ $labels.instance }}) + summary: Etcd high number of failed HTTP requests warning (instance {{ $labels.instance }}) description: "More than 1% HTTP failure detected in Etcd\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: EtcdHighNumberOfFailedHttpRequests + - alert: EtcdHighNumberOfFailedHttpRequestsCritical expr: 'sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.05 and sum(rate(etcd_http_received_total[1m])) BY (method) > 0' for: 2m labels: severity: critical annotations: - summary: Etcd high number of failed HTTP requests (instance {{ $labels.instance }}) + summary: Etcd high number of failed HTTP requests critical (instance {{ $labels.instance }}) description: "More than 5% HTTP failure detected in Etcd\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: EtcdHttpRequestsSlow @@ -104,7 +104,7 @@ groups: severity: warning annotations: summary: Etcd high number of failed proposals (instance {{ $labels.instance }}) - description: "Etcd server got more than 5 failed proposals past hour\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Etcd server got {{ $value }} failed proposals in the past hour\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: EtcdHighFsyncDurations expr: 'histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) > 0.5' diff --git a/dist/rules/gitlab-ci/gitlab-built-in-exporter.yml b/dist/rules/gitlab-ci/gitlab-built-in-exporter.yml index c9e2b6e..a0d3f12 100644 --- a/dist/rules/gitlab-ci/gitlab-built-in-exporter.yml +++ b/dist/rules/gitlab-ci/gitlab-built-in-exporter.yml @@ -8,7 +8,7 @@ groups: # Queued connections indicate Puma workers are saturated. # Consider increasing puma['worker_processes'] or puma['max_threads'] in gitlab.rb. - alert: GitlabPumaHighQueuedConnections - expr: 'avg_over_time(puma_queued_connections[5m]) > 5' + expr: 'puma_queued_connections > 5' for: 5m labels: severity: warning @@ -85,7 +85,7 @@ groups: severity: warning annotations: summary: GitLab Sidekiq high job completion time (instance {{ $labels.instance }}) - description: "GitLab Sidekiq job average completion time on {{ $labels.instance }} is above 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "GitLab Sidekiq job p95 completion time on {{ $labels.instance }} is above 5 minutes ({{ $value | humanizeDuration }}).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # This metric requires the emit_sidekiq_histogram_metrics feature flag to be enabled. # High queue latency means jobs are stuck waiting. Check Sidekiq concurrency and queue sizes. diff --git a/dist/rules/grafana-mimir/embedded-exporter.yml b/dist/rules/grafana-mimir/embedded-exporter.yml index a9ed278..d401079 100644 --- a/dist/rules/grafana-mimir/embedded-exporter.yml +++ b/dist/rules/grafana-mimir/embedded-exporter.yml @@ -105,14 +105,15 @@ groups: summary: Mimir ingested data too far in the future (instance {{ $labels.instance }}) description: "Mimir ingester {{ $labels.job }} has ingested samples with timestamps more than 1 hour in the future.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Threshold of 0.05/s avoids firing on transient single-event spikes. - alert: MimirStoreGatewayTooManyFailedOperations - expr: 'sum by (job) (rate(thanos_objstore_bucket_operation_failures_total[5m])) > 0' + expr: 'sum by (job) (rate(thanos_objstore_bucket_operation_failures_total[5m])) > 0.05' for: 5m labels: severity: warning annotations: summary: Mimir store gateway too many failed operations (instance {{ $labels.instance }}) - description: "Mimir store-gateway {{ $labels.job }} bucket operations are failing.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Mimir store-gateway {{ $labels.job }} bucket operations are failing ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirRingMembersMismatch expr: 'max by (name, job) (sum by (name, job, instance) (cortex_ring_members)) != min by (name, job) (sum by (name, job, instance) (cortex_ring_members))' @@ -184,7 +185,7 @@ groups: severity: critical annotations: summary: Mimir ingester TSDB head compaction failed (instance {{ $labels.instance }}) - description: "Mimir ingester {{ $labels.instance }} is failing to compact TSDB head.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Mimir ingester {{ $labels.instance }} is failing to compact TSDB head ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirIngesterTsdbHeadTruncationFailed expr: 'rate(cortex_ingester_tsdb_head_truncations_failed_total[5m]) > 0' @@ -193,7 +194,7 @@ groups: severity: critical annotations: summary: Mimir ingester TSDB head truncation failed (instance {{ $labels.instance }}) - description: "Mimir ingester {{ $labels.instance }} is failing to truncate TSDB head.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Mimir ingester {{ $labels.instance }} is failing to truncate TSDB head ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirIngesterTsdbCheckpointCreationFailed expr: 'rate(cortex_ingester_tsdb_checkpoint_creations_failed_total[5m]) > 0' @@ -202,7 +203,7 @@ groups: severity: critical annotations: summary: Mimir ingester TSDB checkpoint creation failed (instance {{ $labels.instance }}) - description: "Mimir ingester {{ $labels.instance }} is failing to create TSDB checkpoints.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Mimir ingester {{ $labels.instance }} is failing to create TSDB checkpoints ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirIngesterTsdbCheckpointDeletionFailed expr: 'rate(cortex_ingester_tsdb_checkpoint_deletions_failed_total[5m]) > 0' @@ -211,7 +212,7 @@ groups: severity: critical annotations: summary: Mimir ingester TSDB checkpoint deletion failed (instance {{ $labels.instance }}) - description: "Mimir ingester {{ $labels.instance }} is failing to delete TSDB checkpoints.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Mimir ingester {{ $labels.instance }} is failing to delete TSDB checkpoints ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirIngesterTsdbWalTruncationFailed expr: 'rate(cortex_ingester_tsdb_wal_truncations_failed_total[5m]) > 0' @@ -220,7 +221,7 @@ groups: severity: warning annotations: summary: Mimir ingester TSDB WAL truncation failed (instance {{ $labels.instance }}) - description: "Mimir ingester {{ $labels.instance }} is failing to truncate TSDB WAL.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Mimir ingester {{ $labels.instance }} is failing to truncate TSDB WAL ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirIngesterTsdbWalWritesFailed expr: 'rate(cortex_ingester_tsdb_wal_writes_failed_total[1m]) > 0' @@ -229,7 +230,7 @@ groups: severity: critical annotations: summary: Mimir ingester TSDB WAL writes failed (instance {{ $labels.instance }}) - description: "Mimir ingester {{ $labels.instance }} is failing to write to TSDB WAL.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Mimir ingester {{ $labels.instance }} is failing to write to TSDB WAL ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Threshold aligned with official Mimir mixin (30 minutes). - alert: MimirStoreGatewayHasNotSyncedBucket @@ -284,7 +285,7 @@ groups: severity: critical annotations: summary: Mimir compactor has consecutive failures (instance {{ $labels.instance }}) - description: "Mimir compactor {{ $labels.instance }} has had 2+ compaction failures in the last 2 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Mimir compactor {{ $labels.instance }} has had {{ $value }} compaction failures in the last 2 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirCompactorHasRunOutOfDiskSpace expr: 'increase(cortex_compactor_disk_out_of_space_errors_total[24h]) >= 1' @@ -312,7 +313,7 @@ groups: severity: warning annotations: summary: Mimir compactor skipped blocks (instance {{ $labels.instance }}) - description: "Mimir compactor has found blocks that cannot be compacted (reason {{ $labels.reason }}).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Mimir compactor has found {{ $value }} blocks that cannot be compacted (reason {{ $labels.reason }}).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirRulerTooManyFailedPushes expr: '100 * sum by (instance, job) (rate(cortex_ruler_write_requests_failed_total[5m])) / sum by (instance, job) (rate(cortex_ruler_write_requests_total[5m])) > 1 and sum by (instance, job) (rate(cortex_ruler_write_requests_total[5m])) > 0' @@ -341,14 +342,15 @@ groups: summary: Mimir ruler missed evaluations (instance {{ $labels.instance }}) description: "Mimir ruler {{ $labels.instance }} is missing {{ printf \"%.2f\" $value }}% of rule group evaluations.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Threshold of 0.05/s avoids firing on transient single-event spikes. - alert: MimirRulerFailedRingCheck - expr: 'sum by (job) (rate(cortex_ruler_ring_check_errors_total[5m])) > 0' + expr: 'sum by (job) (rate(cortex_ruler_ring_check_errors_total[5m])) > 0.05' for: 5m labels: severity: critical annotations: summary: Mimir ruler failed ring check (instance {{ $labels.instance }}) - description: "Mimir ruler {{ $labels.job }} is failing ring checks.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Mimir ruler {{ $labels.job }} is failing ring checks ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirAlertmanagerSyncConfigsFailing expr: 'rate(cortex_alertmanager_sync_configs_failed_total[5m]) > 0' @@ -357,7 +359,7 @@ groups: severity: critical annotations: summary: Mimir alertmanager sync configs failing (instance {{ $labels.instance }}) - description: "Mimir alertmanager {{ $labels.job }} is failing to sync configs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Mimir alertmanager {{ $labels.job }} is failing to sync configs ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirAlertmanagerRingCheckFailing expr: 'rate(cortex_alertmanager_ring_check_errors_total[5m]) > 0' @@ -366,7 +368,7 @@ groups: severity: critical annotations: summary: Mimir alertmanager ring check failing (instance {{ $labels.instance }}) - description: "Mimir alertmanager {{ $labels.job }} is failing ring checks.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Mimir alertmanager {{ $labels.job }} is failing ring checks ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirAlertmanagerStateMergeFailing expr: 'rate(cortex_alertmanager_partial_state_merges_failed_total[5m]) > 0' @@ -375,7 +377,7 @@ groups: severity: critical annotations: summary: Mimir alertmanager state merge failing (instance {{ $labels.instance }}) - description: "Mimir alertmanager {{ $labels.job }} is failing to merge state updates.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Mimir alertmanager {{ $labels.job }} is failing to merge state updates ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirAlertmanagerReplicationFailing expr: 'rate(cortex_alertmanager_state_replication_failed_total[5m]) > 0' @@ -384,7 +386,7 @@ groups: severity: critical annotations: summary: Mimir alertmanager replication failing (instance {{ $labels.instance }}) - description: "Mimir alertmanager {{ $labels.job }} is failing to replicate state.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Mimir alertmanager {{ $labels.job }} is failing to replicate state ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirAlertmanagerPersistStateFailing expr: 'rate(cortex_alertmanager_state_persist_failed_total[15m]) > 0' @@ -393,7 +395,7 @@ groups: severity: critical annotations: summary: Mimir alertmanager persist state failing (instance {{ $labels.instance }}) - description: "Mimir alertmanager {{ $labels.job }} is failing to persist state.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Mimir alertmanager {{ $labels.job }} is failing to persist state ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirAlertmanagerInitialSyncFailed expr: 'increase(cortex_alertmanager_state_initial_sync_completed_total{outcome="failed"}[1m]) > 0' diff --git a/dist/rules/grafana-tempo/embedded-exporter.yml b/dist/rules/grafana-tempo/embedded-exporter.yml index 42d1994..256a044 100644 --- a/dist/rules/grafana-tempo/embedded-exporter.yml +++ b/dist/rules/grafana-tempo/embedded-exporter.yml @@ -40,7 +40,7 @@ groups: severity: critical annotations: summary: Tempo compactions failing (instance {{ $labels.instance }}) - description: "Greater than 2 compactions have failed in the past hour.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "{{ $value }} compactions have failed in the past hour.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: TempoPollsFailing expr: 'sum by (job) (increase(tempodb_blocklist_poll_errors_total[1h])) > 2 and sum by (job) (increase(tempodb_blocklist_poll_errors_total[5m])) > 0' @@ -49,7 +49,7 @@ groups: severity: critical annotations: summary: Tempo polls failing (instance {{ $labels.instance }}) - description: "Greater than 2 blocklist polls have failed in the past hour.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "{{ $value }} blocklist polls have failed in the past hour.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: TempoTenantIndexFailures expr: 'sum by (job) (increase(tempodb_blocklist_tenant_index_errors_total[1h])) > 2 and sum by (job) (increase(tempodb_blocklist_tenant_index_errors_total[5m])) > 0' @@ -58,7 +58,7 @@ groups: severity: critical annotations: summary: Tempo tenant index failures (instance {{ $labels.instance }}) - description: "Greater than 2 tenant index failures in the past hour.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "{{ $value }} tenant index failures in the past hour.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: TempoNoTenantIndexBuilders expr: 'sum by (tenant) (tempodb_blocklist_tenant_index_builder) == 0 and on() max(tempodb_blocklist_length) > 0' @@ -105,7 +105,7 @@ groups: severity: critical annotations: summary: Tempo user configurable overrides reload failing (instance {{ $labels.instance }}) - description: "Greater than 5 user-configurable overrides reloads have failed in the past hour.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "{{ $value }} user-configurable overrides reloads have failed in the past hour.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Threshold of 100 blocks per compactor instance. Adjust based on your environment. - alert: TempoCompactionTooManyOutstandingBlocksWarning @@ -134,7 +134,7 @@ groups: severity: critical annotations: summary: Tempo distributor usage tracker errors (instance {{ $labels.instance }}) - description: "Tempo distributor usage tracker errors for {{ $labels.job }} (reason {{ $labels.reason }}).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Tempo distributor usage tracker errors for {{ $labels.job }} at {{ $value | humanize }}/s (reason {{ $labels.reason }}).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: TempoMetricsGeneratorProcessorUpdatesFailing expr: 'sum by (job) (increase(tempo_metrics_generator_active_processors_update_failed_total[5m])) > 0' @@ -143,7 +143,7 @@ groups: severity: critical annotations: summary: Tempo metrics generator processor updates failing (instance {{ $labels.instance }}) - description: "Tempo metrics generator processor updates are failing for {{ $labels.job }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Tempo metrics generator processor updates are failing for {{ $labels.job }} ({{ $value }} failures in 5m).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: TempoMetricsGeneratorServiceGraphsDroppingSpans expr: '100 * sum by (job) (rate(tempo_metrics_generator_processor_service_graphs_dropped_spans[5m])) / sum by (job) (rate(tempo_metrics_generator_spans_received_total[5m])) > 0.5 and sum by (job) (rate(tempo_metrics_generator_spans_received_total[5m])) > 0' @@ -161,7 +161,7 @@ groups: severity: critical annotations: summary: Tempo metrics generator collections failing (instance {{ $labels.instance }}) - description: "Tempo metrics generator collections are failing for {{ $labels.job }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Tempo metrics generator collections are failing for {{ $labels.job }} ({{ $value }} failures in 5m).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Fires when the memcached error rate exceeds 20%. Only relevant if Tempo is configured with memcached caching. - alert: TempoMemcachedErrorsElevated diff --git a/dist/rules/haproxy/embedded-exporter-v2.yml b/dist/rules/haproxy/embedded-exporter-v2.yml index 4863a3d..d6b2bbf 100644 --- a/dist/rules/haproxy/embedded-exporter-v2.yml +++ b/dist/rules/haproxy/embedded-exporter-v2.yml @@ -130,4 +130,4 @@ groups: severity: warning annotations: summary: HAProxy server healthcheck failure (instance {{ $labels.instance }}) - description: "Some server healthcheck are failing on {{ $labels.server }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Some server healthcheck are failing on {{ $labels.server }} ({{ $value }} in the last 1m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/dist/rules/haproxy/haproxy-exporter-v1.yml b/dist/rules/haproxy/haproxy-exporter-v1.yml index 82bf561..0285929 100644 --- a/dist/rules/haproxy/haproxy-exporter-v1.yml +++ b/dist/rules/haproxy/haproxy-exporter-v1.yml @@ -14,71 +14,71 @@ groups: summary: HAProxy down (instance {{ $labels.instance }}) description: "HAProxy down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: HaproxyHighHttp4xxErrorRateBackend + - alert: HaproxyHighHttp4xxErrorRateBackend(v1) expr: 'sum by (backend) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 0' for: 1m labels: severity: critical annotations: - summary: HAProxy high HTTP 4xx error rate backend (instance {{ $labels.instance }}) + summary: HAProxy high HTTP 4xx error rate backend (v1) (instance {{ $labels.instance }}) description: "Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: HaproxyHighHttp5xxErrorRateBackend + - alert: HaproxyHighHttp5xxErrorRateBackend(v1) expr: 'sum by (backend) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 0' for: 1m labels: severity: critical annotations: - summary: HAProxy high HTTP 5xx error rate backend (instance {{ $labels.instance }}) + summary: HAProxy high HTTP 5xx error rate backend (v1) (instance {{ $labels.instance }}) description: "Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: HaproxyHighHttp4xxErrorRateServer + - alert: HaproxyHighHttp4xxErrorRateServer(v1) expr: 'sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0' for: 1m labels: severity: critical annotations: - summary: HAProxy high HTTP 4xx error rate server (instance {{ $labels.instance }}) + summary: HAProxy high HTTP 4xx error rate server (v1) (instance {{ $labels.instance }}) description: "Too many HTTP requests with status 4xx (> 5%) on server {{ $labels.server }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: HaproxyHighHttp5xxErrorRateServer + - alert: HaproxyHighHttp5xxErrorRateServer(v1) expr: 'sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0' for: 1m labels: severity: critical annotations: - summary: HAProxy high HTTP 5xx error rate server (instance {{ $labels.instance }}) + summary: HAProxy high HTTP 5xx error rate server (v1) (instance {{ $labels.instance }}) description: "Too many HTTP requests with status 5xx (> 5%) on server {{ $labels.server }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: HaproxyServerResponseErrors + - alert: HaproxyServerResponseErrors(v1) expr: 'sum by (server) (rate(haproxy_server_response_errors_total[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0' for: 1m labels: severity: critical annotations: - summary: HAProxy server response errors (instance {{ $labels.instance }}) + summary: HAProxy server response errors (v1) (instance {{ $labels.instance }}) description: "Too many response errors to {{ $labels.server }} server (> 5%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: HaproxyBackendConnectionErrors + - alert: HaproxyBackendConnectionErrors(v1) expr: 'sum by (backend) (rate(haproxy_backend_connection_errors_total[1m])) > 100' for: 1m labels: severity: critical annotations: - summary: HAProxy backend connection errors (instance {{ $labels.instance }}) + summary: HAProxy backend connection errors (v1) (instance {{ $labels.instance }}) description: "Too many connection errors to {{ $labels.fqdn }}/{{ $labels.backend }} backend (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: HaproxyServerConnectionErrors + - alert: HaproxyServerConnectionErrors(v1) expr: 'sum by (server) (rate(haproxy_server_connection_errors_total[1m])) > 100' for: 0m labels: severity: critical annotations: - summary: HAProxy server connection errors (instance {{ $labels.instance }}) + summary: HAProxy server connection errors (v1) (instance {{ $labels.instance }}) description: "Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HaproxyBackendMaxActiveSession - expr: '((sum by (backend) (avg_over_time(haproxy_backend_current_sessions[2m]) * 100) / sum by (backend) (avg_over_time(haproxy_backend_limit_sessions[2m])))) > 80 and sum by (backend) (avg_over_time(haproxy_backend_limit_sessions[2m])) > 0' + expr: '((sum by (backend) (haproxy_backend_current_sessions * 100) / sum by (backend) (haproxy_backend_limit_sessions))) > 80 and sum by (backend) (haproxy_backend_limit_sessions) > 0' for: 2m labels: severity: warning @@ -86,31 +86,31 @@ groups: summary: HAProxy backend max active session (instance {{ $labels.instance }}) description: "HAproxy backend {{ $labels.fqdn }}/{{ $labels.backend }} is reaching session limit (> 80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: HaproxyPendingRequests + - alert: HaproxyPendingRequests(v1) expr: 'sum by (backend) (haproxy_backend_current_queue) > 0' for: 2m labels: severity: warning annotations: - summary: HAProxy pending requests (instance {{ $labels.instance }}) + summary: HAProxy pending requests (v1) (instance {{ $labels.instance }}) description: "Some HAProxy requests are pending on {{ $labels.fqdn }}/{{ $labels.backend }} backend\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: HaproxyHttpSlowingDown + - alert: HaproxyHttpSlowingDown(v1) expr: 'avg by (backend) (haproxy_backend_http_total_time_average_seconds) > 1' for: 1m labels: severity: warning annotations: - summary: HAProxy HTTP slowing down (instance {{ $labels.instance }}) + summary: HAProxy HTTP slowing down (v1) (instance {{ $labels.instance }}) description: "Average request time is increasing\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: HaproxyRetryHigh + - alert: HaproxyRetryHigh(v1) expr: 'sum by (backend) (rate(haproxy_backend_retry_warnings_total[1m])) > 10' for: 2m labels: severity: warning annotations: - summary: HAProxy retry high (instance {{ $labels.instance }}) + summary: HAProxy retry high (v1) (instance {{ $labels.instance }}) description: "High rate of retry on {{ $labels.fqdn }}/{{ $labels.backend }} backend\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HaproxyBackendDown @@ -131,20 +131,20 @@ groups: summary: HAProxy server down (instance {{ $labels.instance }}) description: "HAProxy server is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: HaproxyFrontendSecurityBlockedRequests + - alert: HaproxyFrontendSecurityBlockedRequests(v1) expr: 'sum by (frontend) (rate(haproxy_frontend_requests_denied_total[2m])) > 10' for: 2m labels: severity: warning annotations: - summary: HAProxy frontend security blocked requests (instance {{ $labels.instance }}) + summary: HAProxy frontend security blocked requests (v1) (instance {{ $labels.instance }}) description: "HAProxy is blocking requests for security reason\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: HaproxyServerHealthcheckFailure + - alert: HaproxyServerHealthcheckFailure(v1) expr: 'increase(haproxy_server_check_failures_total[1m]) > 0' for: 1m labels: severity: warning annotations: - summary: HAProxy server healthcheck failure (instance {{ $labels.instance }}) - description: "Some server healthcheck are failing on {{ $labels.server }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: HAProxy server healthcheck failure (v1) (instance {{ $labels.instance }}) + description: "Some server healthcheck are failing on {{ $labels.server }} ({{ $value }} in the last 1m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/dist/rules/host-and-hardware/node-exporter.yml b/dist/rules/host-and-hardware/node-exporter.yml index 0ef89c4..9cb7584 100644 --- a/dist/rules/host-and-hardware/node-exporter.yml +++ b/dist/rules/host-and-hardware/node-exporter.yml @@ -21,7 +21,7 @@ groups: severity: warning annotations: summary: Host memory under memory pressure (instance {{ $labels.instance }}) - description: "The node is under heavy memory pressure. High rate of loading memory pages from disk.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "The node is under heavy memory pressure. High rate of major page faults ({{ $value }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly - alert: HostMemoryIsUnderutilized diff --git a/dist/rules/juniper/czerwonk-junos-exporter.yml b/dist/rules/juniper/czerwonk-junos-exporter.yml index e224732..31b49ac 100644 --- a/dist/rules/juniper/czerwonk-junos-exporter.yml +++ b/dist/rules/juniper/czerwonk-junos-exporter.yml @@ -14,20 +14,20 @@ groups: summary: Juniper switch down (instance {{ $labels.instance }}) description: "The switch appears to be down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: JuniperHighBandwidthUsage1gib + - alert: JuniperCriticalBandwidthUsage1gib expr: 'rate(junos_interface_transmit_bytes[1m]) * 8 > 1e+9 * 0.90' for: 1m labels: severity: critical annotations: - summary: Juniper high Bandwidth Usage 1GiB (instance {{ $labels.instance }}) + summary: Juniper critical Bandwidth Usage 1GiB (instance {{ $labels.instance }}) description: "Interface is highly saturated. (> 0.90GiB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: JuniperHighBandwidthUsage1gib + - alert: JuniperWarningBandwidthUsage1gib expr: 'rate(junos_interface_transmit_bytes[1m]) * 8 > 1e+9 * 0.80' for: 1m labels: severity: warning annotations: - summary: Juniper high Bandwidth Usage 1GiB (instance {{ $labels.instance }}) + summary: Juniper warning Bandwidth Usage 1GiB (instance {{ $labels.instance }}) description: "Interface is getting saturated. (> 0.80GiB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/dist/rules/kubernetes/kubestate-exporter.yml b/dist/rules/kubernetes/kubestate-exporter.yml index 3ff0d75..34c96bd 100644 --- a/dist/rules/kubernetes/kubestate-exporter.yml +++ b/dist/rules/kubernetes/kubestate-exporter.yml @@ -11,7 +11,7 @@ groups: labels: severity: critical annotations: - summary: Kubernetes Node ready (node {{ $labels.node }}) + summary: Kubernetes Node not ready (instance {{ $labels.instance }}) description: "Node {{ $labels.node }} has been unready for a long time\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Kubernetes Node with disabled schedules are fine. @@ -22,7 +22,7 @@ groups: labels: severity: warning annotations: - summary: Kubernetes node scheduling disabled (node {{ $labels.node }}) + summary: Kubernetes Node scheduling disabled (instance {{ $labels.instance }}) description: "Node {{ $labels.node }} has been marked as unschedulable for more than 30 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesNodeMemoryPressure @@ -31,7 +31,7 @@ groups: labels: severity: critical annotations: - summary: Kubernetes memory pressure (node {{ $labels.node }}) + summary: Kubernetes Node memory pressure (instance {{ $labels.instance }}) description: "Node {{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesNodeDiskPressure @@ -40,7 +40,7 @@ groups: labels: severity: critical annotations: - summary: Kubernetes disk pressure (node {{ $labels.node }}) + summary: Kubernetes Node disk pressure (instance {{ $labels.instance }}) description: "Node {{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesNodeNetworkUnavailable @@ -67,7 +67,7 @@ groups: labels: severity: warning annotations: - summary: Kubernetes container oom killer ({{ $labels.namespace }}/{{ $labels.pod }}:{{ $labels.container }}) + summary: Kubernetes Container oom killer (instance {{ $labels.instance }}) description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesJobFailed @@ -76,7 +76,7 @@ groups: labels: severity: warning annotations: - summary: Kubernetes Job failed ({{ $labels.namespace }}/{{ $labels.job_name }}) + summary: Kubernetes Job failed (instance {{ $labels.instance }}) description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesJobNotStarting @@ -85,7 +85,7 @@ groups: labels: severity: warning annotations: - summary: Kubernetes Job not starting ({{ $labels.namespace }}/{{ $labels.job_name }}) + summary: Kubernetes Job not starting (instance {{ $labels.instance }}) description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} did not start for 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesCronjobFailing @@ -94,7 +94,7 @@ groups: labels: severity: critical annotations: - summary: Kubernetes CronJob failing ({{ $labels.namespace }}/{{ $labels.cronjob }}) + summary: Kubernetes CronJob failing (instance {{ $labels.instance }}) description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is failing\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesCronjobSuspended @@ -103,7 +103,7 @@ groups: labels: severity: warning annotations: - summary: Kubernetes CronJob suspended ({{ $labels.namespace }}/{{ $labels.cronjob }}) + summary: Kubernetes CronJob suspended (instance {{ $labels.instance }}) description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is suspended\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesPersistentvolumeclaimPending @@ -112,7 +112,7 @@ groups: labels: severity: warning annotations: - summary: Kubernetes PersistentVolumeClaim pending ({{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }}) + summary: Kubernetes PersistentVolumeClaim pending (instance {{ $labels.instance }}) description: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pending\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesVolumeOutOfDiskSpace @@ -139,7 +139,7 @@ groups: labels: severity: critical annotations: - summary: Kubernetes PersistentVolumeClaim pending ({{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }}) + summary: Kubernetes PersistentVolume error (instance {{ $labels.instance }}) description: "Persistent volume {{ $labels.persistentvolume }} is in bad state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesStatefulsetDown @@ -148,7 +148,7 @@ groups: labels: severity: critical annotations: - summary: Kubernetes StatefulSet down ({{ $labels.namespace }}/{{ $labels.statefulset }}) + summary: Kubernetes StatefulSet down (instance {{ $labels.instance }}) description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} went down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesHpaScaleInability @@ -193,7 +193,7 @@ groups: labels: severity: critical annotations: - summary: Kubernetes Pod not healthy ({{ $labels.namespace }}/{{ $labels.pod }}) + summary: Kubernetes Pod not healthy (instance {{ $labels.instance }}) description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-running state for longer than 15 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesPodCrashLooping @@ -202,7 +202,7 @@ groups: labels: severity: warning annotations: - summary: Kubernetes pod crash looping ({{ $labels.namespace }}/{{ $labels.pod }}) + summary: Kubernetes pod crash looping (instance {{ $labels.instance }}) description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesReplicasetReplicasMismatch @@ -211,7 +211,7 @@ groups: labels: severity: warning annotations: - summary: Kubernetes ReplicasSet mismatch ({{ $labels.namespace }}/{{ $labels.replicaset }}) + summary: Kubernetes ReplicaSet replicas mismatch (instance {{ $labels.instance }}) description: "ReplicaSet {{ $labels.namespace }}/{{ $labels.replicaset }} replicas mismatch\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesDeploymentReplicasMismatch @@ -220,7 +220,7 @@ groups: labels: severity: warning annotations: - summary: Kubernetes Deployment replicas mismatch ({{ $labels.namespace }}/{{ $labels.deployment }}) + summary: Kubernetes Deployment replicas mismatch (instance {{ $labels.instance }}) description: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replicas mismatch\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesStatefulsetReplicasMismatch @@ -238,7 +238,7 @@ groups: labels: severity: critical annotations: - summary: Kubernetes Deployment generation mismatch ({{ $labels.namespace }}/{{ $labels.deployment }}) + summary: Kubernetes Deployment generation mismatch (instance {{ $labels.instance }}) description: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesStatefulsetGenerationMismatch @@ -247,7 +247,7 @@ groups: labels: severity: critical annotations: - summary: Kubernetes StatefulSet generation mismatch ({{ $labels.namespace }}/{{ $labels.statefulset }}) + summary: Kubernetes StatefulSet generation mismatch (instance {{ $labels.instance }}) description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesStatefulsetUpdateNotRolledOut @@ -256,7 +256,7 @@ groups: labels: severity: warning annotations: - summary: Kubernetes StatefulSet update not rolled out ({{ $labels.namespace }}/{{ $labels.statefulset }}) + summary: Kubernetes StatefulSet update not rolled out (instance {{ $labels.instance }}) description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesDaemonsetRolloutStuck @@ -265,7 +265,7 @@ groups: labels: severity: warning annotations: - summary: Kubernetes DaemonSet rollout stuck ({{ $labels.namespace }}/{{ $labels.daemonset }}) + summary: Kubernetes DaemonSet rollout stuck (instance {{ $labels.instance }}) description: "Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled or not ready\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesDaemonsetMisscheduled @@ -274,7 +274,7 @@ groups: labels: severity: critical annotations: - summary: Kubernetes DaemonSet misscheduled ({{ $labels.namespace }}/{{ $labels.daemonset }}) + summary: Kubernetes DaemonSet misscheduled (instance {{ $labels.instance }}) description: "Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Threshold should be customized for each cronjob name. @@ -284,7 +284,7 @@ groups: labels: severity: warning annotations: - summary: Kubernetes CronJob too long ({{ $labels.namespace }}/{{ $labels.cronjob }}) + summary: Kubernetes CronJob too long (instance {{ $labels.instance }}) description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesJobSlowCompletion @@ -293,7 +293,7 @@ groups: labels: severity: critical annotations: - summary: Kubernetes job slow completion ({{ $labels.namespace }}/{{ $labels.job_name }}) + summary: Kubernetes Job slow completion (instance {{ $labels.instance }}) description: "Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} did not complete in time.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesApiServerErrors @@ -303,7 +303,7 @@ groups: severity: critical annotations: summary: Kubernetes API server errors (instance {{ $labels.instance }}) - description: "Kubernetes API server is experiencing high error rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Kubernetes API server is experiencing {{ $value | humanize }}% error rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesApiClientErrors expr: '(sum(rate(rest_client_requests_total{code=~"(4|5).."}[1m])) by (instance, job) / sum(rate(rest_client_requests_total[1m])) by (instance, job)) * 100 > 1 and sum(rate(rest_client_requests_total[1m])) by (instance, job) > 0' @@ -312,7 +312,7 @@ groups: severity: critical annotations: summary: Kubernetes API client errors (instance {{ $labels.instance }}) - description: "Kubernetes API client is experiencing high error rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Kubernetes API client is experiencing {{ $value | humanize }}% error rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesClientCertificateExpiresNextWeek expr: 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60' diff --git a/dist/rules/loki/embedded-exporter.yml b/dist/rules/loki/embedded-exporter.yml index f7d0974..166cfa2 100644 --- a/dist/rules/loki/embedded-exporter.yml +++ b/dist/rules/loki/embedded-exporter.yml @@ -21,7 +21,7 @@ groups: severity: critical annotations: summary: Loki request errors (instance {{ $labels.instance }}) - description: "The {{ $labels.job }} and {{ $labels.route }} are experiencing errors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "The {{ $labels.job }} and {{ $labels.route }} are experiencing {{ printf \"%.2f\" $value }}% errors.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: LokiRequestPanic expr: 'sum(increase(loki_panic_total[10m])) by (namespace, job) > 0' diff --git a/dist/rules/mongodb/dcu-mongodb-exporter.yml b/dist/rules/mongodb/dcu-mongodb-exporter.yml index ef7835d..a45617c 100644 --- a/dist/rules/mongodb/dcu-mongodb-exporter.yml +++ b/dist/rules/mongodb/dcu-mongodb-exporter.yml @@ -5,13 +5,13 @@ groups: rules: - - alert: MongodbReplicationLag + - alert: MongodbReplicationLag(dcu) expr: 'avg(mongodb_replset_member_optime_date{state="PRIMARY"}) - avg(mongodb_replset_member_optime_date{state="SECONDARY"}) > 10' for: 0m labels: severity: critical annotations: - summary: MongoDB replication lag (instance {{ $labels.instance }}) + summary: MongoDB replication lag (DCU) (instance {{ $labels.instance }}) description: "Mongodb replication lag is more than 10s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MongodbReplicationStatus3 @@ -59,29 +59,29 @@ groups: summary: MongoDB replication Status 10 (instance {{ $labels.instance }}) description: "MongoDB Replication set member was once in a replica set but was subsequently removed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: MongodbNumberCursorsOpen + - alert: MongodbNumberCursorsOpen(dcu) expr: 'mongodb_metrics_cursor_open{state="total_open"} > 10000' for: 2m labels: severity: warning annotations: - summary: MongoDB number cursors open (instance {{ $labels.instance }}) + summary: MongoDB number cursors open (DCU) (instance {{ $labels.instance }}) description: "Too many cursors opened by MongoDB for clients (> 10k)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: MongodbCursorsTimeouts + - alert: MongodbCursorsTimeouts(dcu) expr: 'increase(mongodb_metrics_cursor_timed_out_total[1m]) > 100' for: 2m labels: severity: warning annotations: - summary: MongoDB cursors timeouts (instance {{ $labels.instance }}) - description: "Too many cursors are timing out\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: MongoDB cursors timeouts (DCU) (instance {{ $labels.instance }}) + description: "Too many cursors are timing out ({{ $value }} in the last minute)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: MongodbTooManyConnections + - alert: MongodbTooManyConnections(dcu) expr: 'mongodb_connections{state="current"} / (mongodb_connections{state="current"} + mongodb_connections{state="available"}) * 100 > 80 and (mongodb_connections{state="current"} + mongodb_connections{state="available"}) > 0' for: 2m labels: severity: warning annotations: - summary: MongoDB too many connections (instance {{ $labels.instance }}) + summary: MongoDB too many connections (DCU) (instance {{ $labels.instance }}) description: "Too many connections (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/dist/rules/mongodb/percona-mongodb-exporter.yml b/dist/rules/mongodb/percona-mongodb-exporter.yml index a257c9d..d3159e0 100644 --- a/dist/rules/mongodb/percona-mongodb-exporter.yml +++ b/dist/rules/mongodb/percona-mongodb-exporter.yml @@ -25,13 +25,13 @@ groups: summary: Mongodb replica member unhealthy (instance {{ $labels.instance }}) description: "MongoDB replica member is not healthy\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: MongodbReplicationLag + - alert: MongodbReplicationLag(percona) expr: '(mongodb_rs_members_optimeDate{member_state="PRIMARY"} - on (set) group_right mongodb_rs_members_optimeDate{member_state="SECONDARY"}) / 1000 > 10' for: 0m labels: severity: critical annotations: - summary: MongoDB replication lag (instance {{ $labels.instance }}) + summary: MongoDB replication lag (Percona) (instance {{ $labels.instance }}) description: "Mongodb replication lag is more than 10s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # This query mixes old (mongodb_mongod_*) and new (mongodb_rs_*) metric names. It requires the Percona exporter to run with --compatible-mode to expose both. @@ -44,29 +44,29 @@ groups: summary: MongoDB replication headroom (instance {{ $labels.instance }}) description: "MongoDB replication headroom is <= 0\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: MongodbNumberCursorsOpen + - alert: MongodbNumberCursorsOpen(percona) expr: 'mongodb_ss_metrics_cursor_open{csr_type="total"} > 10 * 1000' for: 2m labels: severity: warning annotations: - summary: MongoDB number cursors open (instance {{ $labels.instance }}) + summary: MongoDB number cursors open (Percona) (instance {{ $labels.instance }}) description: "Too many cursors opened by MongoDB for clients (> 10k)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: MongodbCursorsTimeouts + - alert: MongodbCursorsTimeouts(percona) expr: 'increase(mongodb_ss_metrics_cursor_timedOut[1m]) > 100' for: 2m labels: severity: warning annotations: - summary: MongoDB cursors timeouts (instance {{ $labels.instance }}) - description: "Too many cursors are timing out\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: MongoDB cursors timeouts (Percona) (instance {{ $labels.instance }}) + description: "Too many cursors are timing out ({{ $value }} in the last minute)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: MongodbTooManyConnections + - alert: MongodbTooManyConnections(percona) expr: 'mongodb_ss_connections{conn_type="current"} / (mongodb_ss_connections{conn_type="current"} + mongodb_ss_connections{conn_type="available"}) * 100 > 80 and (mongodb_ss_connections{conn_type="current"} + mongodb_ss_connections{conn_type="available"}) > 0' for: 2m labels: severity: warning annotations: - summary: MongoDB too many connections (instance {{ $labels.instance }}) + summary: MongoDB too many connections (Percona) (instance {{ $labels.instance }}) description: "Too many connections (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/dist/rules/mysql/mysqld-exporter.yml b/dist/rules/mysql/mysqld-exporter.yml index 582324e..89f9eea 100644 --- a/dist/rules/mysql/mysqld-exporter.yml +++ b/dist/rules/mysql/mysqld-exporter.yml @@ -78,7 +78,7 @@ groups: severity: warning annotations: summary: MySQL slow queries (instance {{ $labels.instance }}) - description: "MySQL server mysql has some new slow query.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "MySQL server mysql has some new slow query ({{ $value }} in the last minute).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MysqlInnodbLogWaits expr: 'rate(mysql_global_status_innodb_log_waits[15m]) > 10' @@ -87,7 +87,7 @@ groups: severity: warning annotations: summary: MySQL InnoDB log waits (instance {{ $labels.instance }}) - description: "MySQL innodb log writes stalling\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "MySQL innodb log writes stalling ({{ $value }} waits/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MysqlRestarted expr: 'mysql_global_status_uptime < 60' diff --git a/dist/rules/nats/nats-exporter.yml b/dist/rules/nats/nats-exporter.yml index ee29e1c..e077089 100644 --- a/dist/rules/nats/nats-exporter.yml +++ b/dist/rules/nats/nats-exporter.yml @@ -103,7 +103,7 @@ groups: severity: warning annotations: summary: Nats too many errors (instance {{ $labels.instance }}) - description: "NATS server has encountered errors in the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "NATS server has encountered {{ $value }} JetStream API errors in the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: NatsJetstreamAccountsExceeded expr: 'sum(gnatsd_varz_jetstream_stats_accounts) > 100' diff --git a/dist/rules/netdata/embedded-exporter.yml b/dist/rules/netdata/embedded-exporter.yml index 6d4f1e0..8033ea4 100644 --- a/dist/rules/netdata/embedded-exporter.yml +++ b/dist/rules/netdata/embedded-exporter.yml @@ -15,13 +15,13 @@ groups: summary: Netdata high cpu usage (instance {{ $labels.instance }}) description: "Netdata high CPU usage (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: HostCpuStealNoisyNeighbor + - alert: NetdataCpuStealNoisyNeighbor expr: 'netdata_cpu_cpu_percentage_average{dimension="steal"} > 10' for: 5m labels: severity: warning annotations: - summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }}) + summary: Netdata CPU steal noisy neighbor (instance {{ $labels.instance }}) description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: NetdataHighMemoryUsage @@ -67,7 +67,7 @@ groups: severity: info annotations: summary: Netdata disk reallocated sectors (instance {{ $labels.instance }}) - description: "Reallocated sectors on disk\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Disk reallocated sectors detected ({{ $value }} sectors)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: NetdataDiskCurrentPendingSector expr: 'netdata_smartd_log_current_pending_sector_count_sectors_average > 0' @@ -85,4 +85,4 @@ groups: severity: warning annotations: summary: Netdata reported uncorrectable disk sectors (instance {{ $labels.instance }}) - description: "Reported uncorrectable disk sectors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Reported uncorrectable disk sectors ({{ $value }} sectors)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/dist/rules/opentelemetry-collector/embedded-exporter.yml b/dist/rules/opentelemetry-collector/embedded-exporter.yml index 2ab4217..4936fe0 100644 --- a/dist/rules/opentelemetry-collector/embedded-exporter.yml +++ b/dist/rules/opentelemetry-collector/embedded-exporter.yml @@ -24,7 +24,7 @@ groups: severity: critical annotations: summary: OpenTelemetry Collector receiver refused spans (instance {{ $labels.instance }}) - description: "OpenTelemetry Collector is refusing spans on {{ $labels.receiver }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "OpenTelemetry Collector is refusing {{ $value | humanize }}/s spans on {{ $labels.receiver }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: OpentelemetryCollectorReceiverRefusedMetricPoints expr: 'rate(otelcol_receiver_refused_metric_points[5m]) > 0' @@ -33,7 +33,7 @@ groups: severity: critical annotations: summary: OpenTelemetry Collector receiver refused metric points (instance {{ $labels.instance }}) - description: "OpenTelemetry Collector is refusing metric points on {{ $labels.receiver }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "OpenTelemetry Collector is refusing {{ $value | humanize }}/s metric points on {{ $labels.receiver }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: OpentelemetryCollectorReceiverRefusedLogRecords expr: 'rate(otelcol_receiver_refused_log_records[5m]) > 0' @@ -42,34 +42,37 @@ groups: severity: critical annotations: summary: OpenTelemetry Collector receiver refused log records (instance {{ $labels.instance }}) - description: "OpenTelemetry Collector is refusing log records on {{ $labels.receiver }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "OpenTelemetry Collector is refusing {{ $value | humanize }}/s log records on {{ $labels.receiver }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Threshold of 0.05/s avoids firing on transient single-event spikes. - alert: OpentelemetryCollectorExporterFailedSpans - expr: 'rate(otelcol_exporter_send_failed_spans[5m]) > 0' + expr: 'rate(otelcol_exporter_send_failed_spans[5m]) > 0.05' for: 5m labels: severity: warning annotations: summary: OpenTelemetry Collector exporter failed spans (instance {{ $labels.instance }}) - description: "OpenTelemetry Collector failing to send spans via {{ $labels.exporter }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "OpenTelemetry Collector failing to send {{ $value | humanize }}/s spans via {{ $labels.exporter }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Threshold of 0.05/s avoids firing on transient single-event spikes. - alert: OpentelemetryCollectorExporterFailedMetricPoints - expr: 'rate(otelcol_exporter_send_failed_metric_points[5m]) > 0' + expr: 'rate(otelcol_exporter_send_failed_metric_points[5m]) > 0.05' for: 5m labels: severity: warning annotations: summary: OpenTelemetry Collector exporter failed metric points (instance {{ $labels.instance }}) - description: "OpenTelemetry Collector failing to send metric points via {{ $labels.exporter }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "OpenTelemetry Collector failing to send {{ $value | humanize }}/s metric points via {{ $labels.exporter }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Threshold of 0.05/s avoids firing on transient single-event spikes. - alert: OpentelemetryCollectorExporterFailedLogRecords - expr: 'rate(otelcol_exporter_send_failed_log_records[5m]) > 0' + expr: 'rate(otelcol_exporter_send_failed_log_records[5m]) > 0.05' for: 5m labels: severity: warning annotations: summary: OpenTelemetry Collector exporter failed log records (instance {{ $labels.instance }}) - description: "OpenTelemetry Collector failing to send log records via {{ $labels.exporter }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "OpenTelemetry Collector failing to send {{ $value | humanize }}/s log records via {{ $labels.exporter }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: OpentelemetryCollectorExporterQueueNearlyFull expr: '(otelcol_exporter_queue_size / on(instance, job, exporter) otelcol_exporter_queue_capacity) > 0.8 and otelcol_exporter_queue_capacity > 0' @@ -80,23 +83,25 @@ groups: summary: OpenTelemetry Collector exporter queue nearly full (instance {{ $labels.instance }}) description: "OpenTelemetry Collector exporter {{ $labels.exporter }} queue is over 80% full\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Threshold of 0.05/s avoids firing on transient single-event spikes. - alert: OpentelemetryCollectorProcessorRefusedSpans - expr: 'rate(otelcol_processor_refused_spans[5m]) > 0' + expr: 'rate(otelcol_processor_refused_spans[5m]) > 0.05' for: 5m labels: severity: warning annotations: summary: OpenTelemetry Collector processor refused spans (instance {{ $labels.instance }}) - description: "OpenTelemetry Collector processor {{ $labels.processor }} is refusing spans, likely due to backpressure\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "OpenTelemetry Collector processor {{ $labels.processor }} is refusing spans ({{ $value | humanize }}/s), likely due to backpressure.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Threshold of 0.05/s avoids firing on transient single-event spikes. - alert: OpentelemetryCollectorProcessorRefusedMetricPoints - expr: 'rate(otelcol_processor_refused_metric_points[5m]) > 0' + expr: 'rate(otelcol_processor_refused_metric_points[5m]) > 0.05' for: 5m labels: severity: warning annotations: summary: OpenTelemetry Collector processor refused metric points (instance {{ $labels.instance }}) - description: "OpenTelemetry Collector processor {{ $labels.processor }} is refusing metric points, likely due to backpressure\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "OpenTelemetry Collector processor {{ $labels.processor }} is refusing metric points ({{ $value | humanize }}/s), likely due to backpressure.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: OpentelemetryCollectorHighMemoryUsage expr: '(otelcol_process_runtime_heap_alloc_bytes{job=~".*otel.*collector.*"} / on(instance, job) otelcol_process_runtime_total_sys_memory_bytes{job=~".*otel.*collector.*"}) > 0.9' diff --git a/dist/rules/php-fpm/bakins-fpm-exporter.yml b/dist/rules/php-fpm/bakins-fpm-exporter.yml index 379fea7..bc297e4 100644 --- a/dist/rules/php-fpm/bakins-fpm-exporter.yml +++ b/dist/rules/php-fpm/bakins-fpm-exporter.yml @@ -6,10 +6,10 @@ groups: rules: - alert: Php-fpmMax-childrenReached - expr: 'sum(increase(phpfpm_max_children_reached_total[5m])) by (instance) > 0' + expr: 'sum(increase(phpfpm_max_children_reached_total[5m])) by (instance) > 3' for: 0m labels: severity: warning annotations: summary: PHP-FPM max-children reached (instance {{ $labels.instance }}) - description: "PHP-FPM reached max children - {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "PHP-FPM reached max children on {{ $labels.instance }} ({{ $value }} times in the last 5m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/dist/rules/postgresql/postgres-exporter.yml b/dist/rules/postgresql/postgres-exporter.yml index b0565aa..be02424 100644 --- a/dist/rules/postgresql/postgres-exporter.yml +++ b/dist/rules/postgresql/postgres-exporter.yml @@ -76,7 +76,7 @@ groups: severity: warning annotations: summary: Postgresql dead locks (instance {{ $labels.instance }}) - description: "PostgreSQL has dead-locks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "PostgreSQL has dead-locks ({{ $value }} in the last minute)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PostgresqlHighRollbackRate expr: 'sum by (namespace,datname) ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) / ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) + (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m])))) > 0.02' diff --git a/dist/rules/prometheus-self-monitoring/embedded-exporter.yml b/dist/rules/prometheus-self-monitoring/embedded-exporter.yml index 09fe333..2e8cd42 100644 --- a/dist/rules/prometheus-self-monitoring/embedded-exporter.yml +++ b/dist/rules/prometheus-self-monitoring/embedded-exporter.yml @@ -149,7 +149,7 @@ groups: severity: critical annotations: summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }}) - description: "Alertmanager is failing sending notifications\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Alertmanager is failing sending notifications ({{ $value }} notifications/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PrometheusTargetEmpty expr: 'prometheus_sd_discovered_targets == 0' @@ -176,16 +176,16 @@ groups: severity: warning annotations: summary: Prometheus large scrape (instance {{ $labels.instance }}) - description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Prometheus has many scrapes that exceed the sample limit ({{ $value }} scrapes)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PrometheusTargetScrapeDuplicate - expr: 'increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0' + expr: 'increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 3' for: 0m labels: severity: warning annotations: summary: Prometheus target scrape duplicate (instance {{ $labels.instance }}) - description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Prometheus has many samples rejected due to duplicate timestamps but different values ({{ $value }} samples)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PrometheusTsdbCheckpointCreationFailures expr: 'increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0' diff --git a/dist/rules/rabbitmq/kbudde-rabbitmq-exporter.yml b/dist/rules/rabbitmq/kbudde-rabbitmq-exporter.yml index fd83621..d8cdce8 100644 --- a/dist/rules/rabbitmq/kbudde-rabbitmq-exporter.yml +++ b/dist/rules/rabbitmq/kbudde-rabbitmq-exporter.yml @@ -43,13 +43,13 @@ groups: summary: RabbitMQ out of memory (instance {{ $labels.instance }}) description: "Memory available for RabbitMQ is low (< 10%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: RabbitmqTooManyConnections + - alert: RabbitmqInstanceTooManyConnections expr: 'rabbitmq_connectionsTotal > 1000' for: 2m labels: severity: warning annotations: - summary: RabbitMQ too many connections (instance {{ $labels.instance }}) + summary: RabbitMQ instance too many connections (instance {{ $labels.instance }}) description: "RabbitMQ instance has too many connections (> 1000)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Indicate the queue name in dedicated label. diff --git a/dist/rules/rabbitmq/rabbitmq-exporter.yml b/dist/rules/rabbitmq/rabbitmq-exporter.yml index 5ddcc95..4eec1e4 100644 --- a/dist/rules/rabbitmq/rabbitmq-exporter.yml +++ b/dist/rules/rabbitmq/rabbitmq-exporter.yml @@ -95,4 +95,4 @@ groups: severity: warning annotations: summary: RabbitMQ unroutable messages (instance {{ $labels.instance }}) - description: "A queue has unroutable messages\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "A queue has unroutable messages ({{ $value }} in the last 1m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/dist/rules/systemd/systemd-exporter.yml b/dist/rules/systemd/systemd-exporter.yml index 8e1717e..b9f2adb 100644 --- a/dist/rules/systemd/systemd-exporter.yml +++ b/dist/rules/systemd/systemd-exporter.yml @@ -49,7 +49,7 @@ groups: severity: warning annotations: summary: Systemd socket refused connections (instance {{ $labels.instance }}) - description: "Systemd socket {{ $labels.name }} is refusing connections. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Systemd socket {{ $labels.name }} is refusing connections. ({{ $value }} refused in last 5m, instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Threshold of 100 connections is arbitrary. Adjust to your workload. - alert: SystemdSocketHighConnections diff --git a/dist/rules/thanos/thanos-ruler.yml b/dist/rules/thanos/thanos-ruler.yml index 17f11a7..16d9cc1 100644 --- a/dist/rules/thanos/thanos-ruler.yml +++ b/dist/rules/thanos/thanos-ruler.yml @@ -12,7 +12,7 @@ groups: severity: critical annotations: summary: Thanos Rule Queue Is Dropping Alerts (instance {{ $labels.instance }}) - description: "Thanos Rule {{$labels.instance}} is failing to queue alerts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Thanos Rule {{$labels.instance}} is failing to queue alerts ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosRuleSenderIsFailingAlerts expr: 'sum by (job, instance) (rate(thanos_alert_sender_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0' @@ -21,7 +21,7 @@ groups: severity: critical annotations: summary: Thanos Rule Sender Is Failing Alerts (instance {{ $labels.instance }}) - description: "Thanos Rule {{$labels.instance}} is failing to send alerts to alertmanager.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Thanos Rule {{$labels.instance}} is failing to send alerts to alertmanager ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosRuleHighRuleEvaluationFailures expr: '(sum by (job, instance) (rate(prometheus_rule_evaluation_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) * 100 > 5) and sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) > 0' @@ -30,16 +30,17 @@ groups: severity: critical annotations: summary: Thanos Rule High Rule Evaluation Failures (instance {{ $labels.instance }}) - description: "Thanos Rule {{$labels.instance}} is failing to evaluate rules.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Thanos Rule {{$labels.instance}} is failing to evaluate {{$value | humanize}}% of rules.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Threshold of 0.05/s avoids firing on transient single-event spikes. - alert: ThanosRuleHighRuleEvaluationWarnings - expr: 'sum by (job, instance) (rate(thanos_rule_evaluation_with_warnings_total{job=~".*thanos-rule.*"}[5m])) > 0' + expr: 'sum by (job, instance) (rate(thanos_rule_evaluation_with_warnings_total{job=~".*thanos-rule.*"}[5m])) > 0.05' for: 15m labels: severity: info annotations: summary: Thanos Rule High Rule Evaluation Warnings (instance {{ $labels.instance }}) - description: "Thanos Rule {{$labels.instance}} has high number of evaluation warnings.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Thanos Rule {{$labels.instance}} has high number of evaluation warnings ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosRuleRuleEvaluationLatencyHigh expr: '(sum by (job, instance, rule_group) (prometheus_rule_group_last_duration_seconds{job=~".*thanos-rule.*"}) > sum by (job, instance, rule_group) (prometheus_rule_group_interval_seconds{job=~".*thanos-rule.*"}))' diff --git a/dist/rules/thanos/thanos-sidecar.yml b/dist/rules/thanos/thanos-sidecar.yml index 82bab36..c0c668d 100644 --- a/dist/rules/thanos/thanos-sidecar.yml +++ b/dist/rules/thanos/thanos-sidecar.yml @@ -5,14 +5,15 @@ groups: rules: + # Threshold of 0.05/s avoids firing on transient single-event spikes. - alert: ThanosSidecarBucketOperationsFailed - expr: 'sum by (job, instance) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-sidecar.*"}[5m])) > 0' + expr: 'sum by (job, instance) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-sidecar.*"}[5m])) > 0.05' for: 5m labels: severity: critical annotations: summary: Thanos Sidecar Bucket Operations Failed (instance {{ $labels.instance }}) - description: "Thanos Sidecar {{$labels.instance}} bucket operations are failing\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Thanos Sidecar {{$labels.instance}} bucket operations are failing ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosSidecarNoConnectionToStartedPrometheus expr: 'thanos_sidecar_prometheus_up{job=~".*thanos-sidecar.*"} == 0 and on (namespace, pod)prometheus_tsdb_data_replay_duration_seconds != 0'