diff --git a/dist/rules/apache-flink/flink-prometheus-reporter.yml b/dist/rules/apache-flink/flink-prometheus-reporter.yml index c7cb9cd..3bd48ce 100644 --- a/dist/rules/apache-flink/flink-prometheus-reporter.yml +++ b/dist/rules/apache-flink/flink-prometheus-reporter.yml @@ -52,10 +52,10 @@ groups: summary: Flink checkpoint failures (instance {{ $labels.instance }}) description: "Flink job {{ $labels.job_name }} has {{ $value }} failed checkpoints in the last 10 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - # Value is in milliseconds. humanizeDuration expects seconds, so the template output may be misleading. + # Value is converted from milliseconds to seconds for correct humanizeDuration display. # Threshold is 60 seconds. Adjust based on your checkpoint interval and state size. - alert: FlinkCheckpointDurationHigh - expr: 'flink_jobmanager_job_lastCheckpointDuration > 60000' + expr: 'flink_jobmanager_job_lastCheckpointDuration / 1000 > 60' for: 5m labels: severity: warning diff --git a/dist/rules/apache-spark/spark-prometheus.yml b/dist/rules/apache-spark/spark-prometheus.yml index 7d7ac12..39d5739 100644 --- a/dist/rules/apache-spark/spark-prometheus.yml +++ b/dist/rules/apache-spark/spark-prometheus.yml @@ -51,7 +51,7 @@ groups: # Fires when more than 10% of executor time is spent in garbage collection. # This metric comes from the PrometheusResource endpoint (/metrics/executors/prometheus/). 
- alert: SparkExecutorHighGcTime - expr: 'metrics_executor_totalGCTime_seconds_total / (metrics_executor_totalDuration > 0) > 0.1' + expr: 'metrics_executor_totalGCTime_seconds_total / metrics_executor_totalDuration > 0.1 and metrics_executor_totalDuration > 0' for: 5m labels: severity: warning @@ -60,7 +60,7 @@ groups: description: "Spark executor {{ $labels.executor_id }} in {{ $labels.application_name }} is spending too much time in GC.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: SparkExecutorAllTasksFailing - expr: 'metrics_executor_failedTasks_total > 0 and metrics_executor_completedTasks == 0' + expr: 'metrics_executor_failedTasks_total > 0 and metrics_executor_completedTasks_total == 0' for: 5m labels: severity: critical @@ -69,7 +69,7 @@ groups: description: "Spark executor {{ $labels.executor_id }} has only failing tasks ({{ $value }} failed, 0 completed).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: SparkExecutorHighTaskFailureRate - expr: 'metrics_executor_failedTasks_total / (metrics_executor_totalTasks_total > 0) > 0.1' + expr: 'metrics_executor_failedTasks_total / metrics_executor_totalTasks_total > 0.1 and metrics_executor_totalTasks_total > 0' for: 5m labels: severity: warning diff --git a/dist/rules/apache/lusitaniae-apache-exporter.yml b/dist/rules/apache/lusitaniae-apache-exporter.yml index 5876e25..b17e11b 100644 --- a/dist/rules/apache/lusitaniae-apache-exporter.yml +++ b/dist/rules/apache/lusitaniae-apache-exporter.yml @@ -15,7 +15,7 @@ groups: description: "Apache down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ApacheWorkersLoad - expr: '(sum by (instance) (apache_workers{state="busy"}) / sum by (instance) (apache_scoreboard) ) * 100 > 80' + expr: '(sum by (instance) (apache_workers{state="busy"}) / sum by (instance) (apache_scoreboard) ) * 100 > 80 and sum by (instance) (apache_scoreboard) > 0' for: 2m labels: severity: warning diff --git a/dist/rules/azure/azure-metrics-exporter.yml 
b/dist/rules/azure/azure-metrics-exporter.yml index 741dc98..91b2954 100644 --- a/dist/rules/azure/azure-metrics-exporter.yml +++ b/dist/rules/azure/azure-metrics-exporter.yml @@ -18,7 +18,7 @@ groups: description: "Azure metrics exporter on {{ $labels.instance }} has {{ $value }} API request errors in the last 15 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: AzureExporterHighErrorRate - expr: 'sum by (instance) (rate(azurerm_stats_metric_requests{result="error"}[5m])) / sum by (instance) (rate(azurerm_stats_metric_requests[5m])) * 100 > 10' + expr: 'sum by (instance) (rate(azurerm_stats_metric_requests{result="error"}[5m])) / sum by (instance) (rate(azurerm_stats_metric_requests[5m])) * 100 > 10 and sum by (instance) (rate(azurerm_stats_metric_requests[5m])) > 0' for: 5m labels: severity: warning diff --git a/dist/rules/caddy/embedded-exporter.yml b/dist/rules/caddy/embedded-exporter.yml index e8e2635..5348c0e 100644 --- a/dist/rules/caddy/embedded-exporter.yml +++ b/dist/rules/caddy/embedded-exporter.yml @@ -15,7 +15,7 @@ groups: description: "All Caddy reverse proxies are down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CaddyHighHttp4xxErrorRateService - expr: 'sum(rate(caddy_http_request_duration_seconds_count{code=~"4.."}[3m])) by (instance) / sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) * 100 > 5' + expr: 'sum(rate(caddy_http_request_duration_seconds_count{code=~"4.."}[3m])) by (instance) / sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) * 100 > 5 and sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) > 0' for: 1m labels: severity: critical @@ -24,7 +24,7 @@ groups: description: "Caddy service 4xx error rate is above 5%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CaddyHighHttp5xxErrorRateService - expr: 'sum(rate(caddy_http_request_duration_seconds_count{code=~"5.."}[3m])) by (instance) / 
sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) * 100 > 5' + expr: 'sum(rate(caddy_http_request_duration_seconds_count{code=~"5.."}[3m])) by (instance) / sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) * 100 > 5 and sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) > 0' for: 1m labels: severity: critical diff --git a/dist/rules/clickhouse/embedded-exporter.yml b/dist/rules/clickhouse/embedded-exporter.yml index f1e983d..07ca4a1 100644 --- a/dist/rules/clickhouse/embedded-exporter.yml +++ b/dist/rules/clickhouse/embedded-exporter.yml @@ -34,7 +34,7 @@ groups: description: "Memory usage is over 80%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ClickhouseDiskSpaceLowOnDefault - expr: 'ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 20' + expr: 'ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 20 and (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) > 0' for: 2m labels: severity: warning @@ -43,7 +43,7 @@ groups: description: "Disk space on default is below 20%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ClickhouseDiskSpaceCriticalOnDefault - expr: 'ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 10' + expr: 'ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 10 and (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) > 0' for: 2m labels: severity: critical @@ -52,7 +52,7 @@ groups: description: "Disk space on default disk is critically low, below 10%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: 
ClickhouseDiskSpaceLowOnBackups - expr: 'ClickHouseAsyncMetrics_DiskAvailable_backups / (ClickHouseAsyncMetrics_DiskAvailable_backups + ClickHouseAsyncMetrics_DiskUsed_backups) * 100 < 20' + expr: 'ClickHouseAsyncMetrics_DiskAvailable_backups / (ClickHouseAsyncMetrics_DiskAvailable_backups + ClickHouseAsyncMetrics_DiskUsed_backups) * 100 < 20 and (ClickHouseAsyncMetrics_DiskAvailable_backups + ClickHouseAsyncMetrics_DiskUsed_backups) > 0' for: 2m labels: severity: warning diff --git a/dist/rules/cloudflare/lablabs-cloudflare-exporter.yml b/dist/rules/cloudflare/lablabs-cloudflare-exporter.yml index d647ebf..1e0a9f1 100644 --- a/dist/rules/cloudflare/lablabs-cloudflare-exporter.yml +++ b/dist/rules/cloudflare/lablabs-cloudflare-exporter.yml @@ -6,7 +6,7 @@ groups: rules: - alert: CloudflareHttp4xxErrorRate - expr: '(sum by(zone) (rate(cloudflare_zone_requests_status{status=~"^4.."}[15m])) / on (zone) sum by (zone) (rate(cloudflare_zone_requests_status[15m]))) * 100 > 5' + expr: '(sum by(zone) (rate(cloudflare_zone_requests_status{status=~"^4.."}[15m])) / on (zone) sum by (zone) (rate(cloudflare_zone_requests_status[15m]))) * 100 > 5 and sum by (zone) (rate(cloudflare_zone_requests_status[15m])) > 0' for: 0m labels: severity: warning @@ -15,7 +15,7 @@ groups: description: "Cloudflare high HTTP 4xx error rate (> 5% for domain {{ $labels.zone }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CloudflareHttp5xxErrorRate - expr: '(sum by (zone) (rate(cloudflare_zone_requests_status{status=~"^5.."}[5m])) / on (zone) sum by (zone) (rate(cloudflare_zone_requests_status[5m]))) * 100 > 5' + expr: '(sum by (zone) (rate(cloudflare_zone_requests_status{status=~"^5.."}[5m])) / on (zone) sum by (zone) (rate(cloudflare_zone_requests_status[5m]))) * 100 > 5 and sum by (zone) (rate(cloudflare_zone_requests_status[5m])) > 0' for: 0m labels: severity: critical diff --git a/dist/rules/cortex/embedded-exporter.yml b/dist/rules/cortex/embedded-exporter.yml index 
e711e1e..82aee46 100644 --- a/dist/rules/cortex/embedded-exporter.yml +++ b/dist/rules/cortex/embedded-exporter.yml @@ -23,8 +23,9 @@ groups: summary: Cortex not connected to Alertmanager (instance {{ $labels.instance }}) description: "Cortex not connected to Alertmanager (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Threshold of 0.05/s avoids firing on transient single-event spikes. - alert: CortexNotificationAreBeingDropped - expr: 'rate(cortex_prometheus_notifications_dropped_total[5m]) > 0' + expr: 'rate(cortex_prometheus_notifications_dropped_total[5m]) > 0.05' for: 0m labels: severity: critical @@ -32,8 +33,9 @@ groups: summary: Cortex notification are being dropped (instance {{ $labels.instance }}) description: "Cortex notification are being dropped due to errors (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Threshold of 0.05/s avoids firing on transient single-event spikes. - alert: CortexNotificationError - expr: 'rate(cortex_prometheus_notifications_errors_total[5m]) > 0' + expr: 'rate(cortex_prometheus_notifications_errors_total[5m]) > 0.05' for: 0m labels: severity: critical diff --git a/dist/rules/couchdb/gesellix-couchdb-prometheus-exporter.yml b/dist/rules/couchdb/gesellix-couchdb-prometheus-exporter.yml index 5f434c5..3ae6ed6 100644 --- a/dist/rules/couchdb/gesellix-couchdb-prometheus-exporter.yml +++ b/dist/rules/couchdb/gesellix-couchdb-prometheus-exporter.yml @@ -141,7 +141,7 @@ groups: description: "Replication changes feed has failed reads more than 5 times in 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CouchdbFileDescriptorsHigh - expr: 'process_open_fds / process_max_fds > 0.85' + expr: 'process_open_fds / process_max_fds > 0.85 and process_max_fds > 0' for: 5m labels: severity: warning diff --git a/dist/rules/digitalocean/digitalocean-exporter.yml b/dist/rules/digitalocean/digitalocean-exporter.yml index 662adf0..3502587 100644 --- 
a/dist/rules/digitalocean/digitalocean-exporter.yml +++ b/dist/rules/digitalocean/digitalocean-exporter.yml @@ -88,7 +88,7 @@ groups: # Fires when more than 80% of the account's droplet limit is in use. - alert: DigitaloceanDropletLimitApproaching - expr: '(count(digitalocean_droplet_up) / digitalocean_account_droplet_limit) * 100 > 80' + expr: '(count by (instance, job) (digitalocean_droplet_up) / digitalocean_account_droplet_limit) * 100 > 80 and digitalocean_account_droplet_limit > 0' for: 0m labels: severity: warning diff --git a/dist/rules/docker-containers/google-cadvisor.yml b/dist/rules/docker-containers/google-cadvisor.yml index 84bf65c..dcf5588 100644 --- a/dist/rules/docker-containers/google-cadvisor.yml +++ b/dist/rules/docker-containers/google-cadvisor.yml @@ -25,8 +25,9 @@ groups: summary: Container absent (instance {{ $labels.instance }}) description: "A container is absent for 5 min\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Only fires for containers with explicit CPU limits. Containers without limits have cpu_quota=0, which is filtered out by the guard. 
- alert: ContainerHighCpuUtilization - expr: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) > 80' + expr: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) > 80 and sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) > 0' for: 2m labels: severity: warning @@ -45,7 +46,7 @@ groups: description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ContainerVolumeUsage - expr: '(1 - (sum(container_fs_inodes_free{name!=""}) BY (instance) / sum(container_fs_inodes_total) BY (instance))) * 100 > 80' + expr: '(1 - (sum(container_fs_inodes_free{name!=""}) BY (instance) / sum(container_fs_inodes_total) BY (instance))) * 100 > 80 and sum(container_fs_inodes_total) BY (instance) > 0' for: 2m labels: severity: warning @@ -54,7 +55,7 @@ groups: description: "Container Volume usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ContainerHighThrottleRate - expr: 'sum(increase(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) by (container, pod, namespace) / sum(increase(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > ( 25 / 100 )' + expr: 'sum(increase(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) by (container, pod, namespace) / sum(increase(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > ( 25 / 100 ) and sum(increase(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > 0' for: 5m labels: severity: warning diff --git a/dist/rules/ebpf/ebpf-exporter.yml b/dist/rules/ebpf/ebpf-exporter.yml index 9c27343..79c8df0 100644 --- a/dist/rules/ebpf/ebpf-exporter.yml +++ 
b/dist/rules/ebpf/ebpf-exporter.yml @@ -25,7 +25,7 @@ groups: description: "eBPF exporter is experiencing decoder errors for config {{ $labels.config }}. Kernel data is not being correctly transformed into labels. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: EbpfExporterNoEnabledConfigs - expr: 'absent(ebpf_exporter_enabled_configs)' + expr: 'ebpf_exporter_enabled_configs == 0 or absent(ebpf_exporter_enabled_configs)' for: 5m labels: severity: warning diff --git a/dist/rules/elasticsearch/prometheus-community-elasticsearch-exporter.yml b/dist/rules/elasticsearch/prometheus-community-elasticsearch-exporter.yml index cf4386a..ae69b92 100644 --- a/dist/rules/elasticsearch/prometheus-community-elasticsearch-exporter.yml +++ b/dist/rules/elasticsearch/prometheus-community-elasticsearch-exporter.yml @@ -6,7 +6,7 @@ groups: rules: - alert: ElasticsearchHeapUsageTooHigh - expr: '(elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 90' + expr: '(elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 90 and elasticsearch_jvm_memory_max_bytes{area="heap"} > 0' for: 2m labels: severity: critical @@ -15,7 +15,7 @@ groups: description: "The heap usage is over 90%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ElasticsearchHeapUsageWarning - expr: '(elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 80' + expr: '(elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 80 and elasticsearch_jvm_memory_max_bytes{area="heap"} > 0' for: 2m labels: severity: warning @@ -24,7 +24,7 @@ groups: description: "The heap usage is over 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ElasticsearchDiskOutOfSpace - expr: 'elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 
100 < 10' + expr: 'elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 10 and elasticsearch_filesystem_data_size_bytes > 0' for: 0m labels: severity: critical @@ -33,7 +33,7 @@ groups: description: "The disk usage is over 90%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ElasticsearchDiskSpaceLow - expr: 'elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 20' + expr: 'elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 20 and elasticsearch_filesystem_data_size_bytes > 0' for: 2m labels: severity: warning diff --git a/dist/rules/envoy/embedded-exporter.yml b/dist/rules/envoy/embedded-exporter.yml index cb1138a..3eb43de 100644 --- a/dist/rules/envoy/embedded-exporter.yml +++ b/dist/rules/envoy/embedded-exporter.yml @@ -15,7 +15,7 @@ groups: description: "Envoy server is not live (draining or shutting down) on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: EnvoyHighMemoryUsage - expr: 'envoy_server_memory_allocated / envoy_server_memory_heap_size * 100 > 90' + expr: 'envoy_server_memory_allocated / envoy_server_memory_heap_size * 100 > 90 and envoy_server_memory_heap_size > 0' for: 5m labels: severity: warning diff --git a/dist/rules/etcd/embedded-exporter.yml b/dist/rules/etcd/embedded-exporter.yml index a934eb8..7a10a9d 100644 --- a/dist/rules/etcd/embedded-exporter.yml +++ b/dist/rules/etcd/embedded-exporter.yml @@ -32,8 +32,9 @@ groups: summary: Etcd high number of leader changes (instance {{ $labels.instance }}) description: "Etcd leader changed more than 2 times during 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Filters to actual error codes. grpc_code!="OK" includes benign codes like NotFound, AlreadyExists, and Cancelled. 
- alert: EtcdHighNumberOfFailedGrpcRequests - expr: 'sum(rate(grpc_server_handled_total{grpc_code!="OK"}[1m])) BY (grpc_service, grpc_method) / sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0.01' + expr: 'sum(rate(grpc_server_handled_total{grpc_code=~"Internal|Unavailable|DeadlineExceeded|ResourceExhausted|Aborted|Unknown"}[1m])) BY (grpc_service, grpc_method) / sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0.01 and sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0' for: 2m labels: severity: warning @@ -41,8 +42,9 @@ groups: summary: Etcd high number of failed GRPC requests (instance {{ $labels.instance }}) description: "More than 1% GRPC request failure detected in Etcd\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Filters to actual error codes. grpc_code!="OK" includes benign codes like NotFound, AlreadyExists, and Cancelled. - alert: EtcdHighNumberOfFailedGrpcRequests - expr: 'sum(rate(grpc_server_handled_total{grpc_code!="OK"}[1m])) BY (grpc_service, grpc_method) / sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0.05' + expr: 'sum(rate(grpc_server_handled_total{grpc_code=~"Internal|Unavailable|DeadlineExceeded|ResourceExhausted|Aborted|Unknown"}[1m])) BY (grpc_service, grpc_method) / sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0.05 and sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0' for: 2m labels: severity: critical @@ -60,7 +62,7 @@ groups: description: "GRPC requests slowing down, 99th percentile is over 0.15s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: EtcdHighNumberOfFailedHttpRequests - expr: 'sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.01' + expr: 'sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.01 and sum(rate(etcd_http_received_total[1m])) BY 
(method) > 0' for: 2m labels: severity: warning @@ -69,7 +71,7 @@ groups: description: "More than 1% HTTP failure detected in Etcd\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: EtcdHighNumberOfFailedHttpRequests - expr: 'sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.05' + expr: 'sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.05 and sum(rate(etcd_http_received_total[1m])) BY (method) > 0' for: 2m labels: severity: critical diff --git a/dist/rules/freeswitch/znerol-freeswitch-exporter.yml b/dist/rules/freeswitch/znerol-freeswitch-exporter.yml index 154bd63..c0b9db6 100644 --- a/dist/rules/freeswitch/znerol-freeswitch-exporter.yml +++ b/dist/rules/freeswitch/znerol-freeswitch-exporter.yml @@ -15,7 +15,7 @@ groups: description: "Freeswitch is unresponsive\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: FreeswitchSessionsWarning - expr: '(freeswitch_session_active * 100 / freeswitch_session_limit) > 80' + expr: '(freeswitch_session_active * 100 / freeswitch_session_limit) > 80 and freeswitch_session_limit > 0' for: 10m labels: severity: warning @@ -24,7 +24,7 @@ groups: description: "High sessions usage on {{ $labels.instance }}: {{ $value | printf \"%.2f\"}}%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: FreeswitchSessionsCritical - expr: '(freeswitch_session_active * 100 / freeswitch_session_limit) > 90' + expr: '(freeswitch_session_active * 100 / freeswitch_session_limit) > 90 and freeswitch_session_limit > 0' for: 5m labels: severity: critical diff --git a/dist/rules/gitlab-ci/gitlab-built-in-exporter.yml b/dist/rules/gitlab-ci/gitlab-built-in-exporter.yml index 6a8b34a..c9e2b6e 100644 --- a/dist/rules/gitlab-ci/gitlab-built-in-exporter.yml +++ b/dist/rules/gitlab-ci/gitlab-built-in-exporter.yml @@ -101,7 +101,7 @@ groups: # When the pool is near saturation, requests may block waiting for a connection. 
# Increase db_pool_size in gitlab.rb or investigate slow queries. - alert: GitlabDatabaseConnectionPoolSaturation - expr: 'gitlab_database_connection_pool_busy / gitlab_database_connection_pool_size * 100 > 90' + expr: 'gitlab_database_connection_pool_busy / gitlab_database_connection_pool_size * 100 > 90 and gitlab_database_connection_pool_size > 0' for: 5m labels: severity: warning @@ -198,7 +198,7 @@ groups: description: "Multiple GitLab versions are running across the fleet.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: GitlabHighFileDescriptorUsage - expr: 'process_open_fds{job=~".*gitlab.*"} / process_max_fds * 100 > 80' + expr: 'process_open_fds{job=~".*gitlab.*"} / process_max_fds * 100 > 80 and process_max_fds > 0' for: 5m labels: severity: warning diff --git a/dist/rules/grafana-mimir/embedded-exporter.yml b/dist/rules/grafana-mimir/embedded-exporter.yml index bc1ce60..a9ed278 100644 --- a/dist/rules/grafana-mimir/embedded-exporter.yml +++ b/dist/rules/grafana-mimir/embedded-exporter.yml @@ -16,7 +16,7 @@ groups: description: "Mimir has {{ $value }} unhealthy ingester(s) in the ring.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirRequestErrors - expr: '100 * sum by (job, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..", route!~"ready|debug_pprof"}[5m])) / sum by (job, route) (rate(cortex_request_duration_seconds_count{route!~"ready|debug_pprof"}[5m])) > 1' + expr: '100 * sum by (job, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..", route!~"ready|debug_pprof"}[5m])) / sum by (job, route) (rate(cortex_request_duration_seconds_count{route!~"ready|debug_pprof"}[5m])) > 1 and sum by (job, route) (rate(cortex_request_duration_seconds_count{route!~"ready|debug_pprof"}[5m])) > 0' for: 15m labels: severity: critical @@ -52,7 +52,7 @@ groups: description: "There are {{ $value }} queued up queries in {{ $labels.job }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: 
MimirCacheRequestErrors - expr: '(sum by (name, operation, job) (rate(thanos_cache_operation_failures_total[5m])) / sum by (name, operation, job) (rate(thanos_cache_operations_total[5m]))) * 100 > 5' + expr: '(sum by (name, operation, job) (rate(thanos_cache_operation_failures_total[5m])) / sum by (name, operation, job) (rate(thanos_cache_operations_total[5m]))) * 100 > 5 and sum by (name, operation, job) (rate(thanos_cache_operations_total[5m])) > 0' for: 5m labels: severity: warning @@ -61,7 +61,7 @@ groups: description: "Mimir cache {{ $labels.name }} is experiencing {{ printf \"%.2f\" $value }}% errors for {{ $labels.operation }} operation.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirKvStoreFailure - expr: '(sum by (job, kv_name) (rate(cortex_kv_request_duration_seconds_count{status_code!~"2.."}[5m])) / sum by (job, kv_name) (rate(cortex_kv_request_duration_seconds_count[5m]))) == 1' + expr: '(sum by (job, kv_name) (rate(cortex_kv_request_duration_seconds_count{status_code!~"2.."}[5m])) / sum by (job, kv_name) (rate(cortex_kv_request_duration_seconds_count[5m]))) == 1 and sum by (job, kv_name) (rate(cortex_kv_request_duration_seconds_count[5m])) > 0' for: 5m labels: severity: critical @@ -70,7 +70,7 @@ groups: description: "Mimir {{ $labels.job }} KV store {{ $labels.kv_name }} is failing with 100% error rate.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirMemoryMapAreasTooHigh - expr: 'process_memory_map_areas{job=~".*(ingester|cortex|mimir|store-gateway).*"} / process_memory_map_areas_limit{job=~".*(ingester|cortex|mimir|store-gateway).*"} * 100 > 80' + expr: 'process_memory_map_areas{job=~".*(ingester|cortex|mimir|store-gateway).*"} / process_memory_map_areas_limit{job=~".*(ingester|cortex|mimir|store-gateway).*"} * 100 > 80 and process_memory_map_areas_limit{job=~".*(ingester|cortex|mimir|store-gateway).*"} > 0' for: 5m labels: severity: critical @@ -315,7 +315,7 @@ groups: description: "Mimir compactor has found blocks 
that cannot be compacted (reason {{ $labels.reason }}).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirRulerTooManyFailedPushes - expr: '100 * sum by (instance, job) (rate(cortex_ruler_write_requests_failed_total[5m])) / sum by (instance, job) (rate(cortex_ruler_write_requests_total[5m])) > 1' + expr: '100 * sum by (instance, job) (rate(cortex_ruler_write_requests_failed_total[5m])) / sum by (instance, job) (rate(cortex_ruler_write_requests_total[5m])) > 1 and sum by (instance, job) (rate(cortex_ruler_write_requests_total[5m])) > 0' for: 5m labels: severity: critical @@ -324,7 +324,7 @@ groups: description: "Mimir ruler {{ $labels.instance }} is failing to push {{ printf \"%.2f\" $value }}% of write requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirRulerTooManyFailedQueries - expr: '100 * sum by (instance, job) (rate(cortex_ruler_queries_failed_total[5m])) / sum by (instance, job) (rate(cortex_ruler_queries_total[5m])) > 1' + expr: '100 * sum by (instance, job) (rate(cortex_ruler_queries_failed_total[5m])) / sum by (instance, job) (rate(cortex_ruler_queries_total[5m])) > 1 and sum by (instance, job) (rate(cortex_ruler_queries_total[5m])) > 0' for: 5m labels: severity: critical @@ -333,7 +333,7 @@ groups: description: "Mimir ruler {{ $labels.instance }} is failing {{ printf \"%.2f\" $value }}% of query evaluations.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MimirRulerMissedEvaluations - expr: '100 * sum by (instance, job) (rate(cortex_prometheus_rule_group_iterations_missed_total[5m])) / sum by (instance, job) (rate(cortex_prometheus_rule_group_iterations_total[5m])) > 1' + expr: '100 * sum by (instance, job) (rate(cortex_prometheus_rule_group_iterations_missed_total[5m])) / sum by (instance, job) (rate(cortex_prometheus_rule_group_iterations_total[5m])) > 1 and sum by (instance, job) (rate(cortex_prometheus_rule_group_iterations_total[5m])) > 0' for: 5m labels: severity: warning diff --git 
a/dist/rules/grafana-tempo/embedded-exporter.yml b/dist/rules/grafana-tempo/embedded-exporter.yml index 6e96623..42d1994 100644 --- a/dist/rules/grafana-tempo/embedded-exporter.yml +++ b/dist/rules/grafana-tempo/embedded-exporter.yml @@ -81,7 +81,7 @@ groups: # Fires when the blocklist grows more than 40% over 7 days. - alert: TempoBlockListRisingQuickly - expr: '(avg(tempodb_blocklist_length) / avg(tempodb_blocklist_length offset 7d) - 1) * 100 > 40' + expr: '(avg(tempodb_blocklist_length) / avg(tempodb_blocklist_length offset 7d) - 1) * 100 > 40 and avg(tempodb_blocklist_length offset 7d) > 0' for: 15m labels: severity: critical @@ -146,7 +146,7 @@ groups: description: "Tempo metrics generator processor updates are failing for {{ $labels.job }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: TempoMetricsGeneratorServiceGraphsDroppingSpans - expr: '100 * sum by (job) (rate(tempo_metrics_generator_processor_service_graphs_dropped_spans[5m])) / sum by (job) (rate(tempo_metrics_generator_spans_received_total[5m])) > 0.5' + expr: '100 * sum by (job) (rate(tempo_metrics_generator_processor_service_graphs_dropped_spans[5m])) / sum by (job) (rate(tempo_metrics_generator_spans_received_total[5m])) > 0.5 and sum by (job) (rate(tempo_metrics_generator_spans_received_total[5m])) > 0' for: 15m labels: severity: warning @@ -165,7 +165,7 @@ groups: # Fires when the memcached error rate exceeds 20%. Only relevant if Tempo is configured with memcached caching. 
- alert: TempoMemcachedErrorsElevated - expr: '100 * sum by (name, job) (rate(tempo_memcache_request_duration_seconds_count{status_code="500"}[5m])) / sum by (name, job) (rate(tempo_memcache_request_duration_seconds_count[5m])) > 20' + expr: '100 * sum by (name, job) (rate(tempo_memcache_request_duration_seconds_count{status_code="500"}[5m])) / sum by (name, job) (rate(tempo_memcache_request_duration_seconds_count[5m])) > 20 and sum by (name, job) (rate(tempo_memcache_request_duration_seconds_count[5m])) > 0' for: 10m labels: severity: warning diff --git a/dist/rules/hadoop/jmx_exporter.yml b/dist/rules/hadoop/jmx_exporter.yml index d9f3b8e..c483ec9 100644 --- a/dist/rules/hadoop/jmx_exporter.yml +++ b/dist/rules/hadoop/jmx_exporter.yml @@ -33,7 +33,7 @@ groups: description: "The Hadoop DataNode is not sending heartbeats.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HadoopHdfsDiskSpaceLow - expr: '(hadoop_hdfs_bytes_total - hadoop_hdfs_bytes_used) / hadoop_hdfs_bytes_total < 0.1' + expr: '(hadoop_hdfs_bytes_total - hadoop_hdfs_bytes_used) / hadoop_hdfs_bytes_total < 0.1 and hadoop_hdfs_bytes_total > 0' for: 15m labels: severity: warning diff --git a/dist/rules/haproxy/embedded-exporter-v2.yml b/dist/rules/haproxy/embedded-exporter-v2.yml index a296434..4863a3d 100644 --- a/dist/rules/haproxy/embedded-exporter-v2.yml +++ b/dist/rules/haproxy/embedded-exporter-v2.yml @@ -6,7 +6,7 @@ groups: rules: - alert: HaproxyHighHttp4xxErrorRateBackend - expr: '((sum by (proxy) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5' + expr: '((sum by (proxy) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 and sum by (proxy) (rate(haproxy_server_http_responses_total[1m])) > 0' for: 1m labels: severity: critical @@ -15,7 +15,7 @@ groups: description: "Too many HTTP requests with status 4xx (> 5%) on 
backend {{ $labels.fqdn }}/{{ $labels.backend }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HaproxyHighHttp5xxErrorRateBackend - expr: '((sum by (proxy) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5' + expr: '((sum by (proxy) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 and sum by (proxy) (rate(haproxy_server_http_responses_total[1m])) > 0' for: 1m labels: severity: critical @@ -24,7 +24,7 @@ groups: description: "Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HaproxyHighHttp4xxErrorRateServer - expr: '((sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5' + expr: '((sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0' for: 1m labels: severity: critical @@ -33,7 +33,7 @@ groups: description: "Too many HTTP requests with status 4xx (> 5%) on server {{ $labels.server }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HaproxyHighHttp5xxErrorRateServer - expr: '((sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5' + expr: '((sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0' for: 1m labels: severity: critical @@ -42,7 +42,7 @@ groups: description: "Too many HTTP requests with status 5xx (> 5%) on server {{ 
$labels.server }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HaproxyServerResponseErrors - expr: '(sum by (server) (rate(haproxy_server_response_errors_total[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100 > 5' + expr: '(sum by (server) (rate(haproxy_server_response_errors_total[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100 > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0' for: 1m labels: severity: critical @@ -77,8 +77,9 @@ groups: summary: HAProxy backend max active session > 80% (instance {{ $labels.instance }}) description: "Session limit from backend {{ $labels.proxy }} reached 80% of limit - {{ $value | printf \"%.2f\"}}%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # haproxy_backend_current_queue is a gauge (current queue depth), not a counter. - alert: HaproxyPendingRequests - expr: 'sum by (proxy) (rate(haproxy_backend_current_queue[2m])) > 0' + expr: 'sum by (proxy) (haproxy_backend_current_queue) > 0' for: 2m labels: severity: warning diff --git a/dist/rules/haproxy/haproxy-exporter-v1.yml b/dist/rules/haproxy/haproxy-exporter-v1.yml index 9af8084..82bf561 100644 --- a/dist/rules/haproxy/haproxy-exporter-v1.yml +++ b/dist/rules/haproxy/haproxy-exporter-v1.yml @@ -15,7 +15,7 @@ groups: description: "HAProxy down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HaproxyHighHttp4xxErrorRateBackend - expr: 'sum by (backend) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 5' + expr: 'sum by (backend) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 0' for: 1m labels: severity: critical @@ -24,7 +24,7 @@ groups: description: "Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.fqdn }}/{{ 
$labels.backend }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HaproxyHighHttp5xxErrorRateBackend - expr: 'sum by (backend) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 5' + expr: 'sum by (backend) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 0' for: 1m labels: severity: critical @@ -33,7 +33,7 @@ groups: description: "Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HaproxyHighHttp4xxErrorRateServer - expr: 'sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5' + expr: 'sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0' for: 1m labels: severity: critical @@ -42,7 +42,7 @@ groups: description: "Too many HTTP requests with status 4xx (> 5%) on server {{ $labels.server }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HaproxyHighHttp5xxErrorRateServer - expr: 'sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5' + expr: 'sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0' for: 1m labels: severity: critical @@ -51,7 +51,7 @@ groups: description: "Too many HTTP requests with status 5xx (> 5%) on server {{ $labels.server }}\n VALUE = {{ $value }}\n LABELS = {{ $labels 
}}" - alert: HaproxyServerResponseErrors - expr: 'sum by (server) (rate(haproxy_server_response_errors_total[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5' + expr: 'sum by (server) (rate(haproxy_server_response_errors_total[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0' for: 1m labels: severity: critical @@ -78,7 +78,7 @@ groups: description: "Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HaproxyBackendMaxActiveSession - expr: '((sum by (backend) (avg_over_time(haproxy_backend_current_sessions[2m]) * 100) / sum by (backend) (avg_over_time(haproxy_backend_limit_sessions[2m])))) > 80' + expr: '((sum by (backend) (avg_over_time(haproxy_backend_current_sessions[2m]) * 100) / sum by (backend) (avg_over_time(haproxy_backend_limit_sessions[2m])))) > 80 and sum by (backend) (avg_over_time(haproxy_backend_limit_sessions[2m])) > 0' for: 2m labels: severity: warning diff --git a/dist/rules/host-and-hardware/node-exporter.yml b/dist/rules/host-and-hardware/node-exporter.yml index 26898e6..0ef89c4 100644 --- a/dist/rules/host-and-hardware/node-exporter.yml +++ b/dist/rules/host-and-hardware/node-exporter.yml @@ -34,7 +34,7 @@ groups: description: "Node memory usage is < 20% for 1 week. Consider reducing memory space. 
(instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostUnusualNetworkThroughputIn - expr: '((rate(node_network_receive_bytes_total[5m]) / node_network_speed_bytes) > .80)' + expr: '((rate(node_network_receive_bytes_total[5m]) / node_network_speed_bytes) > .80) and node_network_speed_bytes > 0' for: 0m labels: severity: warning @@ -43,7 +43,7 @@ groups: description: "Host receive bandwidth is high (>80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostUnusualNetworkThroughputOut - expr: '((rate(node_network_transmit_bytes_total[5m]) / node_network_speed_bytes) > .80)' + expr: '((rate(node_network_transmit_bytes_total[5m]) / node_network_speed_bytes) > .80) and node_network_speed_bytes > 0' for: 0m labels: severity: warning @@ -85,7 +85,7 @@ groups: description: "Filesystem will likely run out of space within the next 24 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostOutOfInodes - expr: '(node_filesystem_files_free / node_filesystem_files < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0)' + expr: '(node_filesystem_files_free / node_filesystem_files < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) and node_filesystem_files > 0' for: 2m labels: severity: critical @@ -188,7 +188,7 @@ groups: description: "Context switching is growing on the node (twice the daily average during the last 15m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostSwapIsFillingUp - expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80)' + expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) and node_memory_SwapTotal_bytes > 0' for: 2m labels: severity: warning @@ -223,8 +223,9 @@ groups: summary: Host node overtemperature alarm (instance {{ $labels.instance }}) description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Uses ignoring(state) to 
handle additional labels on node_md_disks. Matches the official node-exporter mixin. - alert: HostSoftwareRaidInsufficientDrives - expr: '((node_md_disks_required - on(device, instance) node_md_disks{state="active"}) > 0)' + expr: '((node_md_disks_required - ignoring(state) node_md_disks{state="active"}) > 0)' for: 0m labels: severity: critical @@ -279,7 +280,7 @@ groups: description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostNetworkReceiveErrors - expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01)' + expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) and rate(node_network_receive_packets_total[2m]) > 0' for: 2m labels: severity: warning @@ -288,7 +289,7 @@ groups: description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostNetworkTransmitErrors - expr: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01)' + expr: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) and rate(node_network_transmit_packets_total[2m]) > 0' for: 2m labels: severity: warning @@ -306,7 +307,7 @@ groups: description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostConntrackLimit - expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8)' + expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) and node_nf_conntrack_entries_limit > 0' for: 5m labels: severity: warning diff --git a/dist/rules/istio/embedded-exporter.yml b/dist/rules/istio/embedded-exporter.yml 
index aef8632..cc05444 100644 --- a/dist/rules/istio/embedded-exporter.yml +++ b/dist/rules/istio/embedded-exporter.yml @@ -15,7 +15,7 @@ groups: description: "Gateway pods have dropped. Inbound traffic will likely be affected.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: IstioPilotHighTotalRequestRate - expr: 'sum(rate(pilot_xds_push_errors[1m])) / sum(rate(pilot_xds_pushes[1m])) * 100 > 5' + expr: 'sum(rate(pilot_xds_push_errors[1m])) / sum(rate(pilot_xds_pushes[1m])) * 100 > 5 and sum(rate(pilot_xds_pushes[1m])) > 0' for: 1m labels: severity: warning @@ -51,7 +51,7 @@ groups: description: "Global request rate in the service mesh is unusually low.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: IstioHigh4xxErrorRate - expr: 'sum(rate(istio_requests_total{reporter="destination", response_code=~"4.*"}[5m])) / sum(rate(istio_requests_total{reporter="destination"}[5m])) * 100 > 5' + expr: 'sum(rate(istio_requests_total{reporter="destination", response_code=~"4.*"}[5m])) / sum(rate(istio_requests_total{reporter="destination"}[5m])) * 100 > 5 and sum(rate(istio_requests_total{reporter="destination"}[5m])) > 0' for: 1m labels: severity: warning @@ -60,7 +60,7 @@ groups: description: "High percentage of HTTP 4xx responses in Istio (> 5%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: IstioHigh5xxErrorRate - expr: 'sum(rate(istio_requests_total{reporter="destination", response_code=~"5.*"}[5m])) / sum(rate(istio_requests_total{reporter="destination"}[5m])) * 100 > 5' + expr: 'sum(rate(istio_requests_total{reporter="destination", response_code=~"5.*"}[5m])) / sum(rate(istio_requests_total{reporter="destination"}[5m])) * 100 > 5 and sum(rate(istio_requests_total{reporter="destination"}[5m])) > 0' for: 1m labels: severity: warning @@ -69,7 +69,7 @@ groups: description: "High percentage of HTTP 5xx responses in Istio (> 5%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: IstioHighRequestLatency - expr: 
'rate(istio_request_duration_milliseconds_sum{reporter="destination"}[1m]) / rate(istio_request_duration_milliseconds_count{reporter="destination"}[1m]) > 100' + expr: 'rate(istio_request_duration_milliseconds_sum{reporter="destination"}[1m]) / rate(istio_request_duration_milliseconds_count{reporter="destination"}[1m]) > 100 and rate(istio_request_duration_milliseconds_count{reporter="destination"}[1m]) > 0' for: 1m labels: severity: warning diff --git a/dist/rules/jaeger/embedded-exporter.yml b/dist/rules/jaeger/embedded-exporter.yml index 3b484ae..acac4bd 100644 --- a/dist/rules/jaeger/embedded-exporter.yml +++ b/dist/rules/jaeger/embedded-exporter.yml @@ -6,7 +6,7 @@ groups: rules: - alert: JaegerAgentHttpServerErrors - expr: '100 * sum(rate(jaeger_agent_http_server_errors_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_agent_http_server_total[1m])) by (instance, job, namespace) > 1' + expr: '100 * sum(rate(jaeger_agent_http_server_errors_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_agent_http_server_total[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_agent_http_server_total[1m])) by (instance, job, namespace) > 0' for: 15m labels: severity: warning @@ -15,7 +15,7 @@ groups: description: "Jaeger agent on {{ $labels.instance }} is experiencing {{ $value | humanize }}% HTTP server errors.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: JaegerClientRpcRequestErrors - expr: '100 * sum(rate(jaeger_client_jaeger_rpc_http_requests{status_code=~"4xx|5xx"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_client_jaeger_rpc_http_requests[1m])) by (instance, job, namespace) > 1' + expr: '100 * sum(rate(jaeger_client_jaeger_rpc_http_requests{status_code=~"4xx|5xx"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_client_jaeger_rpc_http_requests[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_client_jaeger_rpc_http_requests[1m])) by (instance, job, namespace) > 0' for: 15m labels: severity: warning @@ 
-24,7 +24,7 @@ groups: description: "Jaeger client on {{ $labels.instance }} is experiencing {{ $value | humanize }}% RPC HTTP errors.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: JaegerClientSpansDropped - expr: '100 * sum(rate(jaeger_reporter_spans{result=~"dropped|err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_reporter_spans[1m])) by (instance, job, namespace) > 1' + expr: '100 * sum(rate(jaeger_reporter_spans{result=~"dropped|err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_reporter_spans[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_reporter_spans[1m])) by (instance, job, namespace) > 0' for: 15m labels: severity: warning @@ -33,7 +33,7 @@ groups: description: "Jaeger client on {{ $labels.instance }} is dropping {{ $value | humanize }}% of spans.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: JaegerAgentSpansDropped - expr: '100 * sum(rate(jaeger_agent_reporter_batches_failures_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_agent_reporter_batches_submitted_total[1m])) by (instance, job, namespace) > 1' + expr: '100 * sum(rate(jaeger_agent_reporter_batches_failures_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_agent_reporter_batches_submitted_total[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_agent_reporter_batches_submitted_total[1m])) by (instance, job, namespace) > 0' for: 15m labels: severity: warning @@ -42,7 +42,7 @@ groups: description: "Jaeger agent on {{ $labels.instance }} is dropping {{ $value | humanize }}% of span batches.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: JaegerCollectorDroppingSpans - expr: '100 * sum(rate(jaeger_collector_spans_dropped_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_collector_spans_received_total[1m])) by (instance, job, namespace) > 1' + expr: '100 * sum(rate(jaeger_collector_spans_dropped_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_collector_spans_received_total[1m])) by 
(instance, job, namespace) > 1 and sum(rate(jaeger_collector_spans_received_total[1m])) by (instance, job, namespace) > 0' for: 15m labels: severity: warning @@ -51,7 +51,7 @@ groups: description: "Jaeger collector on {{ $labels.instance }} is dropping {{ $value | humanize }}% of spans.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: JaegerSamplingUpdateFailing - expr: '100 * sum(rate(jaeger_sampler_queries{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_sampler_queries[1m])) by (instance, job, namespace) > 1' + expr: '100 * sum(rate(jaeger_sampler_queries{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_sampler_queries[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_sampler_queries[1m])) by (instance, job, namespace) > 0' for: 15m labels: severity: warning @@ -60,7 +60,7 @@ groups: description: "Jaeger on {{ $labels.instance }} is failing {{ $value | humanize }}% of sampling policy updates.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: JaegerThrottlingUpdateFailing - expr: '100 * sum(rate(jaeger_throttler_updates{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_throttler_updates[1m])) by (instance, job, namespace) > 1' + expr: '100 * sum(rate(jaeger_throttler_updates{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_throttler_updates[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_throttler_updates[1m])) by (instance, job, namespace) > 0' for: 15m labels: severity: warning @@ -69,7 +69,7 @@ groups: description: "Jaeger on {{ $labels.instance }} is failing {{ $value | humanize }}% of throttling policy updates.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: JaegerQueryRequestFailures - expr: '100 * sum(rate(jaeger_query_requests_total{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_query_requests_total[1m])) by (instance, job, namespace) > 1' + expr: '100 * sum(rate(jaeger_query_requests_total{result="err"}[1m])) by 
(instance, job, namespace) / sum(rate(jaeger_query_requests_total[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_query_requests_total[1m])) by (instance, job, namespace) > 0' for: 15m labels: severity: warning diff --git a/dist/rules/jvm/jvm-exporter.yml b/dist/rules/jvm/jvm-exporter.yml index 67f7842..d7428f6 100644 --- a/dist/rules/jvm/jvm-exporter.yml +++ b/dist/rules/jvm/jvm-exporter.yml @@ -6,7 +6,7 @@ groups: rules: - alert: JvmMemoryFillingUp - expr: '(sum by (instance)(jvm_memory_used_bytes{area="heap"}) / sum by (instance)(jvm_memory_max_bytes{area="heap"})) * 100 > 80' + expr: '(sum by (instance)(jvm_memory_used_bytes{area="heap"}) / sum by (instance)(jvm_memory_max_bytes{area="heap"})) * 100 > 80 and sum by (instance)(jvm_memory_max_bytes{area="heap"}) > 0' for: 2m labels: severity: warning @@ -73,7 +73,7 @@ groups: description: "Frequent old/major GC cycles, indicating memory pressure\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: JvmDirectBufferPoolFillingUp - expr: '(jvm_buffer_pool_used_bytes / jvm_buffer_pool_capacity_bytes) * 100 > 90' + expr: '(jvm_buffer_pool_used_bytes / jvm_buffer_pool_capacity_bytes) * 100 > 90 and jvm_buffer_pool_capacity_bytes > 0' for: 5m labels: severity: warning @@ -93,7 +93,7 @@ groups: # process_open_fds and process_max_fds are generic metrics from the Prometheus client library, not JVM-specific. # This alert will also fire for Go, Python, or any process exposing these metrics. 
- alert: JvmFileDescriptorsExhaustion - expr: '(process_open_fds / process_max_fds) * 100 > 90' + expr: '(process_open_fds / process_max_fds) * 100 > 90 and process_max_fds > 0' for: 5m labels: severity: warning diff --git a/dist/rules/kubernetes/kubestate-exporter.yml b/dist/rules/kubernetes/kubestate-exporter.yml index f432f0a..3ff0d75 100644 --- a/dist/rules/kubernetes/kubestate-exporter.yml +++ b/dist/rules/kubernetes/kubestate-exporter.yml @@ -116,7 +116,7 @@ groups: description: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pending\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesVolumeOutOfDiskSpace - expr: 'kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes * 100 < 10' + expr: 'kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes * 100 < 10 and kubelet_volume_stats_capacity_bytes > 0' for: 2m labels: severity: warning @@ -260,7 +260,7 @@ groups: description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesDaemonsetRolloutStuck - expr: 'kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100 or kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0' + expr: '(kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100 and kube_daemonset_status_desired_number_scheduled > 0) or kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0' for: 10m labels: severity: warning @@ -297,7 +297,7 @@ groups: description: "Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} did not complete in time.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesApiServerErrors - expr: 'sum(rate(apiserver_request_total{job="apiserver",code=~"(?:5..)"}[1m])) by (instance, job) / 
sum(rate(apiserver_request_total{job="apiserver"}[1m])) by (instance, job) * 100 > 3' + expr: 'sum(rate(apiserver_request_total{job="apiserver",code=~"(?:5..)"}[1m])) by (instance, job) / sum(rate(apiserver_request_total{job="apiserver"}[1m])) by (instance, job) * 100 > 3 and sum(rate(apiserver_request_total{job="apiserver"}[1m])) by (instance, job) > 0' for: 2m labels: severity: critical @@ -306,7 +306,7 @@ groups: description: "Kubernetes API server is experiencing high error rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesApiClientErrors - expr: '(sum(rate(rest_client_requests_total{code=~"(4|5).."}[1m])) by (instance, job) / sum(rate(rest_client_requests_total[1m])) by (instance, job)) * 100 > 1' + expr: '(sum(rate(rest_client_requests_total{code=~"(4|5).."}[1m])) by (instance, job) / sum(rate(rest_client_requests_total[1m])) by (instance, job)) * 100 > 1 and sum(rate(rest_client_requests_total[1m])) by (instance, job) > 0' for: 2m labels: severity: critical diff --git a/dist/rules/linkerd/embedded-exporter.yml b/dist/rules/linkerd/embedded-exporter.yml index 054e461..68bda7c 100644 --- a/dist/rules/linkerd/embedded-exporter.yml +++ b/dist/rules/linkerd/embedded-exporter.yml @@ -5,8 +5,9 @@ groups: rules: + # Linkerd does not expose request_errors_total. Errors are tracked via response_total{classification="failure"}. 
- alert: LinkerdHighErrorRate - expr: 'sum(rate(request_errors_total[1m])) by (deployment, statefulset, daemonset) / sum(rate(request_total[1m])) by (deployment, statefulset, daemonset) * 100 > 10' + expr: 'sum(rate(response_total{classification="failure"}[1m])) by (deployment, statefulset, daemonset) / sum(rate(response_total[1m])) by (deployment, statefulset, daemonset) * 100 > 10 and sum(rate(response_total[1m])) by (deployment, statefulset, daemonset) > 0' for: 1m labels: severity: warning diff --git a/dist/rules/loki/embedded-exporter.yml b/dist/rules/loki/embedded-exporter.yml index 0283bc5..f7d0974 100644 --- a/dist/rules/loki/embedded-exporter.yml +++ b/dist/rules/loki/embedded-exporter.yml @@ -15,7 +15,7 @@ groups: description: "A loki process had too many restarts (target {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: LokiRequestErrors - expr: '100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 10' + expr: '100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 10 and sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 0' for: 15m labels: severity: critical diff --git a/dist/rules/minio/embedded-exporter.yml b/dist/rules/minio/embedded-exporter.yml index e0f53bd..74375eb 100644 --- a/dist/rules/minio/embedded-exporter.yml +++ b/dist/rules/minio/embedded-exporter.yml @@ -24,7 +24,7 @@ groups: description: "Minio cluster node disk is offline\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MinioDiskSpaceUsage - expr: 'minio_cluster_capacity_raw_free_bytes / minio_cluster_capacity_raw_total_bytes * 100 < 10' + expr: 'minio_cluster_capacity_raw_free_bytes / minio_cluster_capacity_raw_total_bytes * 100 < 10 and 
minio_cluster_capacity_raw_total_bytes > 0' for: 0m labels: severity: warning diff --git a/dist/rules/mongodb/dcu-mongodb-exporter.yml b/dist/rules/mongodb/dcu-mongodb-exporter.yml index 422e224..ef7835d 100644 --- a/dist/rules/mongodb/dcu-mongodb-exporter.yml +++ b/dist/rules/mongodb/dcu-mongodb-exporter.yml @@ -78,7 +78,7 @@ groups: description: "Too many cursors are timing out\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MongodbTooManyConnections - expr: 'mongodb_connections{state="current"} / (mongodb_connections{state="current"} + mongodb_connections{state="available"}) * 100 > 80' + expr: 'mongodb_connections{state="current"} / (mongodb_connections{state="current"} + mongodb_connections{state="available"}) * 100 > 80 and (mongodb_connections{state="current"} + mongodb_connections{state="available"}) > 0' for: 2m labels: severity: warning diff --git a/dist/rules/mongodb/percona-mongodb-exporter.yml b/dist/rules/mongodb/percona-mongodb-exporter.yml index 25d7642..a257c9d 100644 --- a/dist/rules/mongodb/percona-mongodb-exporter.yml +++ b/dist/rules/mongodb/percona-mongodb-exporter.yml @@ -63,7 +63,7 @@ groups: description: "Too many cursors are timing out\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MongodbTooManyConnections - expr: 'mongodb_ss_connections{conn_type="current"} / (mongodb_ss_connections{conn_type="current"} + mongodb_ss_connections{conn_type="available"}) * 100 > 80' + expr: 'mongodb_ss_connections{conn_type="current"} / (mongodb_ss_connections{conn_type="current"} + mongodb_ss_connections{conn_type="available"}) * 100 > 80 and (mongodb_ss_connections{conn_type="current"} + mongodb_ss_connections{conn_type="available"}) > 0' for: 2m labels: severity: warning diff --git a/dist/rules/mysql/mysqld-exporter.yml b/dist/rules/mysql/mysqld-exporter.yml index b581754..582324e 100644 --- a/dist/rules/mysql/mysqld-exporter.yml +++ b/dist/rules/mysql/mysqld-exporter.yml @@ -16,7 +16,7 @@ groups: description: "MySQL instance is down 
on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MysqlTooManyConnections(>80%) - expr: 'max_over_time(mysql_global_status_threads_connected[1m]) / mysql_global_variables_max_connections * 100 > 80' + expr: 'max_over_time(mysql_global_status_threads_connected[1m]) / mysql_global_variables_max_connections * 100 > 80 and mysql_global_variables_max_connections > 0' for: 2m labels: severity: warning @@ -25,7 +25,7 @@ groups: description: "More than 80% of MySQL connections are in use on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MysqlHighPreparedStatementsUtilization(>80%) - expr: 'max_over_time(mysql_global_status_prepared_stmt_count[1m]) / mysql_global_variables_max_prepared_stmt_count * 100 > 80' + expr: 'max_over_time(mysql_global_status_prepared_stmt_count[1m]) / mysql_global_variables_max_prepared_stmt_count * 100 > 80 and mysql_global_variables_max_prepared_stmt_count > 0' for: 2m labels: severity: warning @@ -34,7 +34,7 @@ groups: description: "High utilization of prepared statements (>80%) on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MysqlHighThreadsRunning - expr: 'max_over_time(mysql_global_status_threads_running[1m]) / mysql_global_variables_max_connections * 100 > 60' + expr: 'max_over_time(mysql_global_status_threads_running[1m]) / mysql_global_variables_max_connections * 100 > 60 and mysql_global_variables_max_connections > 0' for: 2m labels: severity: warning @@ -108,7 +108,7 @@ groups: description: "MySQL is being overload with unusual QPS (> 10k QPS).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MysqlTooManyOpenFiles - expr: 'mysql_global_status_innodb_num_open_files / mysql_global_variables_open_files_limit * 100 > 75' + expr: 'mysql_global_status_innodb_num_open_files / mysql_global_variables_open_files_limit * 100 > 75 and mysql_global_variables_open_files_limit > 0' for: 2m labels: severity: warning diff --git 
a/dist/rules/netdata/embedded-exporter.yml b/dist/rules/netdata/embedded-exporter.yml index 4540a17..6d4f1e0 100644 --- a/dist/rules/netdata/embedded-exporter.yml +++ b/dist/rules/netdata/embedded-exporter.yml @@ -5,8 +5,9 @@ groups: rules: + # This is a gauge metric (not a counter). Checking idle < 20% means CPU usage > 80%. - alert: NetdataHighCpuUsage - expr: 'rate(netdata_cpu_cpu_percentage_average{dimension="idle"}[1m]) > 80' + expr: 'netdata_cpu_cpu_percentage_average{dimension="idle"} < 20' for: 5m labels: severity: warning @@ -15,7 +16,7 @@ groups: description: "Netdata high CPU usage (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostCpuStealNoisyNeighbor - expr: 'rate(netdata_cpu_cpu_percentage_average{dimension="steal"}[1m]) > 10' + expr: 'netdata_cpu_cpu_percentage_average{dimension="steal"} > 10' for: 5m labels: severity: warning @@ -24,7 +25,7 @@ groups: description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: NetdataHighMemoryUsage - expr: '100 / netdata_system_ram_MiB_average * netdata_system_ram_MiB_average{dimension=~"free|cached"} < 20' + expr: '100 / netdata_system_ram_MiB_average * netdata_system_ram_MiB_average{dimension=~"free|cached"} < 20 and netdata_system_ram_MiB_average > 0' for: 5m labels: severity: warning @@ -33,7 +34,7 @@ groups: description: "Netdata high memory usage (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: NetdataLowDiskSpace - expr: '100 / netdata_disk_space_GB_average * netdata_disk_space_GB_average{dimension=~"avail|cached"} < 20' + expr: '100 / netdata_disk_space_GB_average * netdata_disk_space_GB_average{dimension=~"avail|cached"} < 20 and netdata_disk_space_GB_average > 0' for: 5m labels: severity: warning diff --git a/dist/rules/nginx/knyar-nginx-exporter.yml b/dist/rules/nginx/knyar-nginx-exporter.yml index a7ab176..0e54171 100644 --- 
a/dist/rules/nginx/knyar-nginx-exporter.yml +++ b/dist/rules/nginx/knyar-nginx-exporter.yml @@ -6,7 +6,7 @@ groups: rules: - alert: NginxHighHttp4xxErrorRate - expr: 'sum(rate(nginx_http_requests_total{status=~"^4.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5' + expr: 'sum(rate(nginx_http_requests_total{status=~"^4.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5 and sum(rate(nginx_http_requests_total[1m])) > 0' for: 1m labels: severity: critical @@ -15,7 +15,7 @@ groups: description: "Too many HTTP requests with status 4xx (> 5%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: NginxHighHttp5xxErrorRate - expr: 'sum(rate(nginx_http_requests_total{status=~"^5.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5' + expr: 'sum(rate(nginx_http_requests_total{status=~"^5.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5 and sum(rate(nginx_http_requests_total[1m])) > 0' for: 1m labels: severity: critical diff --git a/dist/rules/php-fpm/bakins-fpm-exporter.yml b/dist/rules/php-fpm/bakins-fpm-exporter.yml index f5cce13..379fea7 100644 --- a/dist/rules/php-fpm/bakins-fpm-exporter.yml +++ b/dist/rules/php-fpm/bakins-fpm-exporter.yml @@ -6,7 +6,7 @@ groups: rules: - alert: Php-fpmMax-childrenReached - expr: 'sum(phpfpm_max_children_reached_total) by (instance) > 0' + expr: 'sum(increase(phpfpm_max_children_reached_total[5m])) by (instance) > 0' for: 0m labels: severity: warning diff --git a/dist/rules/postgresql/postgres-exporter.yml b/dist/rules/postgresql/postgres-exporter.yml index b08bef7..b0565aa 100644 --- a/dist/rules/postgresql/postgres-exporter.yml +++ b/dist/rules/postgresql/postgres-exporter.yml @@ -115,7 +115,7 @@ groups: description: "Unused Replication Slots\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PostgresqlTooManyDeadTuples - expr: '((pg_stat_user_tables_n_dead_tup > 10000) / (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)) >= 0.1' + expr: 
'((pg_stat_user_tables_n_dead_tup > 10000) / (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)) >= 0.1 and (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup) > 0' for: 2m labels: severity: warning @@ -142,7 +142,7 @@ groups: description: "Database allows connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PostgresqlTooManyLocksAcquired - expr: '((sum by (instance) (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20' + expr: '((sum by (instance) (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20 and (pg_settings_max_locks_per_transaction * pg_settings_max_connections) > 0' for: 2m labels: severity: critical diff --git a/dist/rules/prometheus-self-monitoring/embedded-exporter.yml b/dist/rules/prometheus-self-monitoring/embedded-exporter.yml index dfc4025..09fe333 100644 --- a/dist/rules/prometheus-self-monitoring/embedded-exporter.yml +++ b/dist/rules/prometheus-self-monitoring/embedded-exporter.yml @@ -18,7 +18,7 @@ groups: # If all targets are down, PrometheusJobMissing or PrometheusAllTargetsMissing will fire instead. 
- alert: PrometheusTargetMissing expr: 'up == 0 unless on(job) (sum by (job) (up) == 0)' - for: 0m + for: 1m labels: severity: critical annotations: @@ -27,7 +27,7 @@ groups: - alert: PrometheusAllTargetsMissing expr: 'sum by (job) (up) == 0' - for: 0m + for: 1m labels: severity: critical annotations: @@ -36,7 +36,7 @@ groups: - alert: PrometheusTargetMissingWithWarmupTime expr: 'sum by (instance, job) ((up == 0) * on (instance) group_left(__name__) (node_time_seconds - node_boot_time_seconds > 600))' - for: 0m + for: 1m labels: severity: critical annotations: diff --git a/dist/rules/promtail/embedded-exporter.yml b/dist/rules/promtail/embedded-exporter.yml index bd32fa5..f8e0a46 100644 --- a/dist/rules/promtail/embedded-exporter.yml +++ b/dist/rules/promtail/embedded-exporter.yml @@ -6,7 +6,7 @@ groups: rules: - alert: PromtailRequestErrors - expr: '100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance) / sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance) > 10' + expr: '100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance) / sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance) > 10 and sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance) > 0' for: 5m labels: severity: critical diff --git a/dist/rules/python/python-exporter.yml b/dist/rules/python/python-exporter.yml index d4211d5..1da8228 100644 --- a/dist/rules/python/python-exporter.yml +++ b/dist/rules/python/python-exporter.yml @@ -25,7 +25,7 @@ groups: # process_open_fds and process_max_fds are generic metrics from the Prometheus client library, not Python-specific. 
- alert: PythonFileDescriptorsExhaustion - expr: '(process_open_fds / process_max_fds) * 100 > 90' + expr: '(process_open_fds / process_max_fds) * 100 > 90 and process_max_fds > 0' for: 5m labels: severity: warning diff --git a/dist/rules/redis/oliver006-redis-exporter.yml b/dist/rules/redis/oliver006-redis-exporter.yml index db96953..ce6bbb5 100644 --- a/dist/rules/redis/oliver006-redis-exporter.yml +++ b/dist/rules/redis/oliver006-redis-exporter.yml @@ -72,7 +72,7 @@ groups: # The exporter must be started with --include-system-metrics flag or REDIS_EXPORTER_INCL_SYSTEM_METRICS=true environment variable. - alert: RedisOutOfSystemMemory - expr: 'redis_memory_used_bytes / redis_total_system_memory_bytes * 100 > 90' + expr: 'redis_memory_used_bytes / redis_total_system_memory_bytes * 100 > 90 and redis_total_system_memory_bytes > 0' for: 2m labels: severity: warning @@ -90,7 +90,7 @@ groups: description: "Redis is running out of configured maxmemory (> 90%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: RedisTooManyConnections - expr: 'redis_connected_clients / redis_config_maxclients * 100 > 90' + expr: 'redis_connected_clients / redis_config_maxclients * 100 > 90 and redis_config_maxclients > 0' for: 2m labels: severity: warning diff --git a/dist/rules/systemd/systemd-exporter.yml b/dist/rules/systemd/systemd-exporter.yml index f8765e5..8e1717e 100644 --- a/dist/rules/systemd/systemd-exporter.yml +++ b/dist/rules/systemd/systemd-exporter.yml @@ -34,7 +34,7 @@ groups: description: "Systemd service {{ $labels.name }} has restarted {{ $value }} times in the last hour. 
(instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: SystemdUnitTasksNearLimit - expr: 'systemd_unit_tasks_current / ignoring(type) systemd_unit_tasks_max > 0.9 and systemd_unit_tasks_max > 0' + expr: 'systemd_unit_tasks_current / ignoring(type) systemd_unit_tasks_max > 0.9 and ignoring(type) systemd_unit_tasks_max > 0' for: 5m labels: severity: warning diff --git a/dist/rules/thanos/thanos-bucket-replicate.yml b/dist/rules/thanos/thanos-bucket-replicate.yml index 972ed1c..f1c44fe 100644 --- a/dist/rules/thanos/thanos-bucket-replicate.yml +++ b/dist/rules/thanos/thanos-bucket-replicate.yml @@ -6,7 +6,7 @@ groups: rules: - alert: ThanosBucketReplicateErrorRate - expr: '(sum by (job) (rate(thanos_replicate_replication_runs_total{result="error", job=~".*thanos-bucket-replicate.*"}[5m]))/ on (job) group_left sum by (job) (rate(thanos_replicate_replication_runs_total{job=~".*thanos-bucket-replicate.*"}[5m]))) * 100 >= 10' + expr: '(sum by (job) (rate(thanos_replicate_replication_runs_total{result="error", job=~".*thanos-bucket-replicate.*"}[5m])) / on (job) group_left sum by (job) (rate(thanos_replicate_replication_runs_total{job=~".*thanos-bucket-replicate.*"}[5m]))) * 100 >= 10 and sum by (job) (rate(thanos_replicate_replication_runs_total{job=~".*thanos-bucket-replicate.*"}[5m])) > 0' for: 5m labels: severity: critical diff --git a/dist/rules/thanos/thanos-compactor.yml b/dist/rules/thanos/thanos-compactor.yml index 67032a9..3c88a33 100644 --- a/dist/rules/thanos/thanos-compactor.yml +++ b/dist/rules/thanos/thanos-compactor.yml @@ -24,7 +24,7 @@ groups: description: "Thanos Compact {{$labels.job}} has failed to run and now is halted.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosCompactorHighCompactionFailures - expr: '(sum by (job) (rate(thanos_compact_group_compactions_failures_total{job=~".*thanos-compact.*"}[5m])) / sum by (job) (rate(thanos_compact_group_compactions_total{job=~".*thanos-compact.*"}[5m])) * 
100 > 5)' + expr: '(sum by (job) (rate(thanos_compact_group_compactions_failures_total{job=~".*thanos-compact.*"}[5m])) / sum by (job) (rate(thanos_compact_group_compactions_total{job=~".*thanos-compact.*"}[5m])) * 100 > 5) and sum by (job) (rate(thanos_compact_group_compactions_total{job=~".*thanos-compact.*"}[5m])) > 0' for: 15m labels: severity: warning @@ -33,7 +33,7 @@ groups: description: "Thanos Compact {{$labels.job}} is failing to execute {{$value | humanize}}% of compactions.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosCompactBucketHighOperationFailures - expr: '(sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-compact.*"}[5m])) / sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-compact.*"}[5m])) * 100 > 5)' + expr: '(sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-compact.*"}[5m])) / sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-compact.*"}[5m])) * 100 > 5) and sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-compact.*"}[5m])) > 0' for: 15m labels: severity: warning diff --git a/dist/rules/thanos/thanos-query.yml b/dist/rules/thanos/thanos-query.yml index 7813b8a..fc0b4c1 100644 --- a/dist/rules/thanos/thanos-query.yml +++ b/dist/rules/thanos/thanos-query.yml @@ -6,7 +6,7 @@ groups: rules: - alert: ThanosQueryHttpRequestQueryErrorRateHigh - expr: '(sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*", handler="query"}[5m]))/ sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query"}[5m]))) * 100 > 5' + expr: '(sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*", handler="query"}[5m]))/ sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query"}[5m]))) * 100 > 5 and sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query"}[5m])) > 0' for: 5m labels: severity: critical @@ -15,7 +15,7 
@@ groups: description: "Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of \"query\" requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosQueryHttpRequestQueryRangeErrorRateHigh - expr: '(sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*", handler="query_range"}[5m]))/ sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query_range"}[5m]))) * 100 > 5' + expr: '(sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*", handler="query_range"}[5m]))/ sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query_range"}[5m]))) * 100 > 5 and sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query_range"}[5m])) > 0' for: 5m labels: severity: critical @@ -24,7 +24,7 @@ groups: description: "Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of \"query_range\" requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosQueryGrpcServerErrorRate - expr: '(sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-query.*"}[5m]))/ sum by (job) (rate(grpc_server_started_total{job=~".*thanos-query.*"}[5m])) * 100 > 5)' + expr: '(sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-query.*"}[5m]))/ sum by (job) (rate(grpc_server_started_total{job=~".*thanos-query.*"}[5m])) * 100 > 5) and sum by (job) (rate(grpc_server_started_total{job=~".*thanos-query.*"}[5m])) > 0' for: 5m labels: severity: warning @@ -33,7 +33,7 @@ groups: description: "Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosQueryGrpcClientErrorRate - expr: '(sum by (job) (rate(grpc_client_handled_total{grpc_code!="OK", job=~".*thanos-query.*"}[5m])) / 
sum by (job) (rate(grpc_client_started_total{job=~".*thanos-query.*"}[5m]))) * 100 > 5' + expr: '(sum by (job) (rate(grpc_client_handled_total{grpc_code!="OK", job=~".*thanos-query.*"}[5m])) / sum by (job) (rate(grpc_client_started_total{job=~".*thanos-query.*"}[5m]))) * 100 > 5 and sum by (job) (rate(grpc_client_started_total{job=~".*thanos-query.*"}[5m])) > 0' for: 5m labels: severity: warning @@ -42,7 +42,7 @@ groups: description: "Thanos Query {{$labels.job}} is failing to send {{$value | humanize}}% of requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosQueryHighDNSFailures - expr: '(sum by (job) (rate(thanos_query_store_apis_dns_failures_total{job=~".*thanos-query.*"}[5m])) / sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~".*thanos-query.*"}[5m]))) * 100 > 1' + expr: '(sum by (job) (rate(thanos_query_store_apis_dns_failures_total{job=~".*thanos-query.*"}[5m])) / sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~".*thanos-query.*"}[5m]))) * 100 > 1 and sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~".*thanos-query.*"}[5m])) > 0' for: 15m labels: severity: warning diff --git a/dist/rules/thanos/thanos-receiver.yml b/dist/rules/thanos/thanos-receiver.yml index 9ff00c3..8cc54f8 100644 --- a/dist/rules/thanos/thanos-receiver.yml +++ b/dist/rules/thanos/thanos-receiver.yml @@ -6,7 +6,7 @@ groups: rules: - alert: ThanosReceiveHttpRequestErrorRateHigh - expr: '(sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-receive.*", handler="receive"}[5m]))/ sum by (job) (rate(http_requests_total{job=~".*thanos-receive.*", handler="receive"}[5m]))) * 100 > 5' + expr: '(sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-receive.*", handler="receive"}[5m]))/ sum by (job) (rate(http_requests_total{job=~".*thanos-receive.*", handler="receive"}[5m]))) * 100 > 5 and sum by (job) (rate(http_requests_total{job=~".*thanos-receive.*", handler="receive"}[5m])) > 0' for: 5m labels: 
severity: critical @@ -33,7 +33,7 @@ groups: description: "Thanos Receive {{$labels.job}} is failing to replicate {{$value | humanize}}% of requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosReceiveHighForwardRequestFailures - expr: '(sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~".*thanos-receive.*"}[5m]))/ sum by (job) (rate(thanos_receive_forward_requests_total{job=~".*thanos-receive.*"}[5m]))) * 100 > 20' + expr: '(sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~".*thanos-receive.*"}[5m]))/ sum by (job) (rate(thanos_receive_forward_requests_total{job=~".*thanos-receive.*"}[5m]))) * 100 > 20 and sum by (job) (rate(thanos_receive_forward_requests_total{job=~".*thanos-receive.*"}[5m])) > 0' for: 5m labels: severity: info @@ -42,7 +42,7 @@ groups: description: "Thanos Receive {{$labels.job}} is failing to forward {{$value | humanize}}% of requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosReceiveHighHashringFileRefreshFailures - expr: '(sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~".*thanos-receive.*"}[5m])) / sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total{job=~".*thanos-receive.*"}[5m])) > 0)' + expr: '(sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~".*thanos-receive.*"}[5m])) / sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total{job=~".*thanos-receive.*"}[5m])) > 0) and sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total{job=~".*thanos-receive.*"}[5m])) > 0' for: 15m labels: severity: warning diff --git a/dist/rules/thanos/thanos-ruler.yml b/dist/rules/thanos/thanos-ruler.yml index e69e6fa..17f11a7 100644 --- a/dist/rules/thanos/thanos-ruler.yml +++ b/dist/rules/thanos/thanos-ruler.yml @@ -24,7 +24,7 @@ groups: description: "Thanos Rule {{$labels.instance}} is failing to send alerts to alertmanager.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: 
ThanosRuleHighRuleEvaluationFailures - expr: '(sum by (job, instance) (rate(prometheus_rule_evaluation_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) * 100 > 5)' + expr: '(sum by (job, instance) (rate(prometheus_rule_evaluation_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) * 100 > 5) and sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) > 0' for: 5m labels: severity: critical @@ -51,7 +51,7 @@ groups: description: "Thanos Rule {{$labels.instance}} has higher evaluation latency than interval for {{$labels.rule_group}}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosRuleGrpcErrorRate - expr: '(sum by (job, instance) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-rule.*"}[5m]))/ sum by (job, instance) (rate(grpc_server_started_total{job=~".*thanos-rule.*"}[5m])) * 100 > 5)' + expr: '(sum by (job, instance) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-rule.*"}[5m]))/ sum by (job, instance) (rate(grpc_server_started_total{job=~".*thanos-rule.*"}[5m])) * 100 > 5) and sum by (job, instance) (rate(grpc_server_started_total{job=~".*thanos-rule.*"}[5m])) > 0' for: 5m labels: severity: warning @@ -69,7 +69,7 @@ groups: description: "Thanos Rule {{$labels.job}} has not been able to reload its configuration.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosRuleQueryHighDNSFailures - expr: '(sum by (job, instance) (rate(thanos_rule_query_apis_dns_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) * 100 > 1)' + expr: '(sum by (job, instance) 
(rate(thanos_rule_query_apis_dns_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) * 100 > 1) and sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) > 0' for: 15m labels: severity: warning @@ -78,7 +78,7 @@ groups: description: "Thanos Rule {{$labels.job}} has {{$value | humanize}}% of failing DNS queries for query endpoints.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosRuleAlertmanagerHighDNSFailures - expr: '(sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) * 100 > 1)' + expr: '(sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) * 100 > 1) and sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) > 0' for: 15m labels: severity: warning diff --git a/dist/rules/thanos/thanos-store.yml b/dist/rules/thanos/thanos-store.yml index 633ba97..289e0dd 100644 --- a/dist/rules/thanos/thanos-store.yml +++ b/dist/rules/thanos/thanos-store.yml @@ -6,7 +6,7 @@ groups: rules: - alert: ThanosStoreGrpcErrorRate - expr: '(sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-store.*"}[5m]))/ sum by (job) (rate(grpc_server_started_total{job=~".*thanos-store.*"}[5m])) * 100 > 5)' + expr: '(sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-store.*"}[5m]))/ sum by (job) (rate(grpc_server_started_total{job=~".*thanos-store.*"}[5m])) * 100 > 5) and sum by (job) 
(rate(grpc_server_started_total{job=~".*thanos-store.*"}[5m])) > 0' for: 5m labels: severity: warning @@ -24,7 +24,7 @@ groups: description: "Thanos Store {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for store series gate requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ThanosStoreBucketHighOperationFailures - expr: '(sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-store.*"}[5m])) / sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-store.*"}[5m])) * 100 > 5)' + expr: '(sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-store.*"}[5m])) / sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-store.*"}[5m])) * 100 > 5) and sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-store.*"}[5m])) > 0' for: 15m labels: severity: warning diff --git a/dist/rules/traefik/embedded-exporter-v1.yml b/dist/rules/traefik/embedded-exporter-v1.yml index 0d95ac6..c0c9e59 100644 --- a/dist/rules/traefik/embedded-exporter-v1.yml +++ b/dist/rules/traefik/embedded-exporter-v1.yml @@ -15,7 +15,7 @@ groups: description: "All Traefik backends are down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: TraefikHighHttp4xxErrorRateBackend - expr: 'sum(rate(traefik_backend_requests_total{code=~"4.*"}[3m])) by (backend) / sum(rate(traefik_backend_requests_total[3m])) by (backend) * 100 > 5' + expr: 'sum(rate(traefik_backend_requests_total{code=~"4.*"}[3m])) by (backend) / sum(rate(traefik_backend_requests_total[3m])) by (backend) * 100 > 5 and sum(rate(traefik_backend_requests_total[3m])) by (backend) > 0' for: 1m labels: severity: critical @@ -24,7 +24,7 @@ groups: description: "Traefik backend 4xx error rate is above 5%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: TraefikHighHttp5xxErrorRateBackend - expr: 'sum(rate(traefik_backend_requests_total{code=~"5.*"}[3m])) by (backend) / 
sum(rate(traefik_backend_requests_total[3m])) by (backend) * 100 > 5' + expr: 'sum(rate(traefik_backend_requests_total{code=~"5.*"}[3m])) by (backend) / sum(rate(traefik_backend_requests_total[3m])) by (backend) * 100 > 5 and sum(rate(traefik_backend_requests_total[3m])) by (backend) > 0' for: 1m labels: severity: critical diff --git a/dist/rules/traefik/embedded-exporter-v2.yml b/dist/rules/traefik/embedded-exporter-v2.yml index d04519a..772b9df 100644 --- a/dist/rules/traefik/embedded-exporter-v2.yml +++ b/dist/rules/traefik/embedded-exporter-v2.yml @@ -15,7 +15,7 @@ groups: description: "All Traefik services are down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: TraefikHighHttp4xxErrorRateService - expr: 'sum(rate(traefik_service_requests_total{code=~"4.*"}[3m])) by (service) / sum(rate(traefik_service_requests_total[3m])) by (service) * 100 > 5' + expr: 'sum(rate(traefik_service_requests_total{code=~"4.*"}[3m])) by (service) / sum(rate(traefik_service_requests_total[3m])) by (service) * 100 > 5 and sum(rate(traefik_service_requests_total[3m])) by (service) > 0' for: 1m labels: severity: critical @@ -24,7 +24,7 @@ groups: description: "Traefik service 4xx error rate is above 5%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: TraefikHighHttp5xxErrorRateService - expr: 'sum(rate(traefik_service_requests_total{code=~"5.*"}[3m])) by (service) / sum(rate(traefik_service_requests_total[3m])) by (service) * 100 > 5' + expr: 'sum(rate(traefik_service_requests_total{code=~"5.*"}[3m])) by (service) / sum(rate(traefik_service_requests_total[3m])) by (service) * 100 > 5 and sum(rate(traefik_service_requests_total[3m])) by (service) > 0' for: 1m labels: severity: critical diff --git a/dist/rules/windows-server/windows-exporter.yml b/dist/rules/windows-server/windows-exporter.yml index 08ab937..7ed3d7b 100644 --- a/dist/rules/windows-server/windows-exporter.yml +++ b/dist/rules/windows-server/windows-exporter.yml @@ -42,7 +42,7 @@ groups: description: 
"Memory usage is more than 90%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: WindowsServerDiskSpaceUsage - expr: '100.0 - 100 * ((windows_logical_disk_free_bytes / 1024 / 1024 ) / (windows_logical_disk_size_bytes / 1024 / 1024)) > 80' + expr: '100 - 100 * (windows_logical_disk_free_bytes / windows_logical_disk_size_bytes) > 80 and windows_logical_disk_size_bytes > 0' for: 2m labels: severity: critical diff --git a/dist/rules/zfs/zfs_exporter.yml b/dist/rules/zfs/zfs_exporter.yml index 52abb00..78c4d05 100644 --- a/dist/rules/zfs/zfs_exporter.yml +++ b/dist/rules/zfs/zfs_exporter.yml @@ -6,7 +6,7 @@ groups: rules: - alert: ZfsPoolOutOfSpace - expr: 'zfs_pool_free_bytes * 100 / zfs_pool_size_bytes < 10 and ON (instance, device, mountpoint) zfs_pool_readonly == 0' + expr: 'zfs_pool_free_bytes * 100 / zfs_pool_size_bytes < 10 and ON (instance, device, mountpoint) zfs_pool_readonly == 0 and zfs_pool_size_bytes > 0' for: 0m labels: severity: warning