From 267c3e8e70db9f39aa3f7cb697d58aaa3d790fbe Mon Sep 17 00:00:00 2001 From: Samuel Berthe Date: Mon, 29 Apr 2024 22:35:43 +0200 Subject: [PATCH 01/32] Update rules.yml --- _data/rules.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/_data/rules.yml b/_data/rules.yml index 3dc5c15..838591e 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -199,6 +199,7 @@ groups: description: "{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem" query: "node_filesystem_device_error == 1" severity: critical + for: 2m - name: Host inodes will fill in 24 hours description: Filesystem is predicted to run out of inodes within the next 24 hours at current write rate query: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{fstype!="msdosfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{fstype!="msdosfs"} == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' From b77cb3467c1de3dd45d35e8d4e459a10df544628 Mon Sep 17 00:00:00 2001 From: samber Date: Mon, 29 Apr 2024 20:36:49 +0000 Subject: [PATCH 02/32] Publish --- dist/rules/host-and-hardware/node-exporter.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dist/rules/host-and-hardware/node-exporter.yml b/dist/rules/host-and-hardware/node-exporter.yml index de48231..6655ef7 100644 --- a/dist/rules/host-and-hardware/node-exporter.yml +++ b/dist/rules/host-and-hardware/node-exporter.yml @@ -96,7 +96,7 @@ groups: - alert: HostFilesystemDeviceError expr: 'node_filesystem_device_error == 1' - for: 0m + for: 2m labels: severity: critical annotations: From aad1c4cd959a4713cba9e614513d20ddcaf9cd93 Mon Sep 17 00:00:00 2001 From: Sergey Shtoltz Date: Thu, 2 May 2024 21:48:46 +0300 Subject: [PATCH 03/32] RedisOutOfConfiguredMaxmemory: checking if memory limit is set (#410) --- _data/rules.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_data/rules.yml b/_data/rules.yml index 838591e..109b0c8 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -821,7 +821,7 @@ groups: The exporter must be started with --include-system-metrics flag or REDIS_EXPORTER_INCL_SYSTEM_METRICS=true environment variable. - name: Redis out of configured maxmemory description: Redis is running out of configured maxmemory (> 90%) - query: "redis_memory_used_bytes / redis_memory_max_bytes * 100 > 90" + query: "redis_memory_used_bytes / redis_memory_max_bytes * 100 > 90 and on(instance) redis_memory_max_bytes > 0" severity: warning for: 2m - name: Redis too many connections From 5c0963558a1165f89bf2c0216c52512fb07602da Mon Sep 17 00:00:00 2001 From: samber Date: Thu, 2 May 2024 18:49:56 +0000 Subject: [PATCH 04/32] Publish --- dist/rules/redis/oliver006-redis-exporter.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dist/rules/redis/oliver006-redis-exporter.yml b/dist/rules/redis/oliver006-redis-exporter.yml index 08cdf23..6b4dd8d 100644 --- a/dist/rules/redis/oliver006-redis-exporter.yml +++ b/dist/rules/redis/oliver006-redis-exporter.yml @@ -77,7 +77,7 @@ groups: description: "Redis is running out of system memory (> 90%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: RedisOutOfConfiguredMaxmemory - expr: 'redis_memory_used_bytes / redis_memory_max_bytes * 100 > 90' + expr: 'redis_memory_used_bytes / redis_memory_max_bytes * 100 > 90 and on(instance) redis_memory_max_bytes > 0' for: 2m labels: severity: warning From 59e6a9165dfb5dd23a3a3eafdf75210a818982e8 Mon Sep 17 00:00:00 2001 From: enesyalinkaya <49714068+enesyalinkaya@users.noreply.github.com> Date: Mon, 6 May 2024 02:32:00 +0300 Subject: [PATCH 05/32] add new alerts for elasticsearch rules.yml (#411) This commit adds new Prometheus alert definitions to monitor indexing and query metrics in Elasticsearch clusters. These alerts are essential for detecting performance issues related to indexing and querying activities. --- _data/rules.yml | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/_data/rules.yml b/_data/rules.yml index 109b0c8..744bf10 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -1128,6 +1128,26 @@ groups: description: No new documents for 10 min! query: 'increase(elasticsearch_indices_indexing_index_total{es_data_node="true"}[10m]) < 1' severity: warning + - name: Elasticsearch High Indexing Latency + description: "The indexing latency on Elasticsearch cluster is higher than the threshold." + query: "elasticsearch_indices_indexing_index_time_seconds_total / elasticsearch_indices_indexing_index_total > 0.0005" + severity: warning + for: 10m + - name: Elasticsearch High Indexing Rate + description: "The indexing rate on Elasticsearch cluster is higher than the threshold." + query: "elasticsearch_indices_indexing_index_total > 100000" + severity: warning + for: 5m + - name: Elasticsearch High Query Rate + description: "The query rate on Elasticsearch cluster is higher than the threshold." + query: "elasticsearch_indices_search_query_total > 100000" + severity: warning + for: 5m + - name: Elasticsearch High Query Latency + description: "The query latency on Elasticsearch cluster is higher than the threshold." + query: "elasticsearch_indices_search_fetch_time_seconds / elasticsearch_indices_search_fetch_total > 1" + severity: warning + for: 5m - name: Cassandra exporters: From 515fca9c10898f728c116c9816a186c9d600a5b4 Mon Sep 17 00:00:00 2001 From: samber Date: Sun, 5 May 2024 23:33:11 +0000 Subject: [PATCH 06/32] Publish --- ...theus-community-elasticsearch-exporter.yml | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/dist/rules/elasticsearch/prometheus-community-elasticsearch-exporter.yml b/dist/rules/elasticsearch/prometheus-community-elasticsearch-exporter.yml index 4ed5660..9aeadec 100644 --- a/dist/rules/elasticsearch/prometheus-community-elasticsearch-exporter.yml +++ b/dist/rules/elasticsearch/prometheus-community-elasticsearch-exporter.yml @@ -138,3 +138,39 @@ groups: annotations: summary: Elasticsearch no new documents (instance {{ $labels.instance }}) description: "No new documents for 10 min!\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: ElasticsearchHighIndexingLatency + expr: 'elasticsearch_indices_indexing_index_time_seconds_total / elasticsearch_indices_indexing_index_total > 0.0005' + for: 10m + labels: + severity: warning + annotations: + summary: Elasticsearch High Indexing Latency (instance {{ $labels.instance }}) + description: "The indexing latency on Elasticsearch cluster is higher than the threshold.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: ElasticsearchHighIndexingRate + expr: 'elasticsearch_indices_indexing_index_total > 100000' + for: 5m + labels: + severity: warning + annotations: + summary: Elasticsearch High Indexing Rate (instance {{ $labels.instance }}) + description: "The indexing rate on Elasticsearch cluster is higher than the threshold.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: ElasticsearchHighQueryRate + expr: 'elasticsearch_indices_search_query_total > 100000' + for: 5m + labels: + severity: warning + annotations: + summary: Elasticsearch High Query Rate (instance {{ $labels.instance }}) + description: "The query rate on Elasticsearch cluster is higher than the threshold.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: ElasticsearchHighQueryLatency + expr: 'elasticsearch_indices_search_fetch_time_seconds / elasticsearch_indices_search_fetch_total > 1' + for: 5m + labels: + severity: warning + annotations: + summary: Elasticsearch High Query Latency (instance {{ $labels.instance }}) + description: "The query latency on Elasticsearch cluster is higher than the threshold.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" From 2547288c13ba21703dbd4be25e3c6e4618180255 Mon Sep 17 00:00:00 2001 From: Ali <115415312+xogoodnow@users.noreply.github.com> Date: Mon, 13 May 2024 12:02:18 +0330 Subject: [PATCH 07/32] Added Clickhouse (#412) * Added Clickhouse * Update rules.yml Added reasonable time periods for each query to avoid false positives and in some cased give the system a short window to try to solve the issue. Also changed the severity level of authentication alerts from critical to info which seems more appropriate * Modified time period for alerts embedded-exporter.yml I made a few adjustments in time periods. See if they seem reasonable or not * Replication alerts time periods were adjusted IMHO, replication alerts must be sent right away. --- _data/rules.yml | 82 ++++++++++++ dist/rules/clickhouse/embedded-exporter.yml | 131 ++++++++++++++++++++ 2 files changed, 213 insertions(+) create mode 100644 dist/rules/clickhouse/embedded-exporter.yml diff --git a/_data/rules.yml b/_data/rules.yml index 744bf10..ffb4604 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -1300,6 +1300,88 @@ groups: severity: critical for: 2m + - name: Clickhouse + exporters: + - name: Embedded Exporter + slug: embedded-exporter + doc_url: https://clickhouse.com/docs/en/operations/system-tables/metrics + rules: + - name: ClickHouse Memory Usage Critical + description: Memory usage is critically high, over 90%. + query: "ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 90" + severity: critical + for: 5m + - name: ClickHouse Memory Usage Warning + description: Memory usage is over 80%. + query: "ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 80" + severity: warning + for: 5m + - name: ClickHouse Disk Space Low on Default + description: Disk space on default is below 20%. + query: "ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 20" + severity: warning + for: 2m + - name: ClickHouse Disk Space Critical on Default + description: Disk space on default disk is critically low, below 10%. + query: "ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 10" + severity: critical + for: 2m + - name: ClickHouse Disk Space Low on Backups + description: Disk space on backups is below 20%. + query: "ClickHouseAsyncMetrics_DiskAvailable_backups / (ClickHouseAsyncMetrics_DiskAvailable_backups + ClickHouseAsyncMetrics_DiskUsed_backups) * 100 < 20" + severity: warning + for: 2m + - name: ClickHouse Replica Errors + description: Critical replica errors detected, either all replicas are stale or lost. + query: "ClickHouseErrorMetric_ALL_REPLICAS_ARE_STALE == 1 or ClickHouseErrorMetric_ALL_REPLICAS_LOST == 1" + severity: critical + for: 0m + - name: ClickHouse No Available Replicas + description: No available replicas in ClickHouse. + query: "ClickHouseErrorMetric_NO_AVAILABLE_REPLICA == 1" + severity: critical + for: 0m + - name: ClickHouse No Live Replicas + description: There are too few live replicas available, risking data loss and service disruption. + query: "ClickHouseErrorMetric_TOO_FEW_LIVE_REPLICAS == 1" + severity: critical + for: 0m + - name: ClickHouse High Network Traffic + description: Network traffic is unusually high, may affect cluster performance. + query: "ClickHouseMetrics_NetworkSend > 250 or ClickHouseMetrics_NetworkReceive > 250" + severity: warning + for: 5m + comments: | + Please replace the threshold with an appropriate value + - name: ClickHouse High TCP Connections + description: High number of TCP connections, indicating heavy client or inter-cluster communication. + query: "ClickHouseMetrics_TCPConnection > 400" + severity: warning + for: 5m + comments: | + Please replace the threshold with an appropriate value + - name: ClickHouse Interserver Connection Issues + description: An increase in interserver connections may indicate replication or distributed query handling issues. + query: "increase(ClickHouseMetrics_InterserverConnection[5m]) > 0" + severity: warning + for: 1m + - name: ClickHouse ZooKeeper Connection Issues + description: ClickHouse is experiencing issues with ZooKeeper connections, which may affect cluster state and coordination. + query: "avg(ClickHouseMetrics_ZooKeeperSession) != 1" + severity: warning + for: 3m + - name: ClickHouse Authentication Failures + description: Authentication failures detected, indicating potential security issues or misconfiguration. + query: "increase(ClickHouseErrorMetric_AUTHENTICATION_FAILED[5m]) > 0" + severity: info + for: 0m + - name: ClickHouse Access Denied Errors + description: Access denied errors have been logged, which could indicate permission issues or unauthorized access attempts. + query: "increase(ClickHouseErrorMetric_RESOURCE_ACCESS_DENIED[5m]) > 0" + severity: info + for: 0m + + - name: Zookeeper exporters: - name: cloudflare/kafka_zookeeper_exporter diff --git a/dist/rules/clickhouse/embedded-exporter.yml b/dist/rules/clickhouse/embedded-exporter.yml new file mode 100644 index 0000000..19917bb --- /dev/null +++ b/dist/rules/clickhouse/embedded-exporter.yml @@ -0,0 +1,131 @@ +groups: +- name: EmbeddedExporter + rules: + - alert: ClickHouseMemoryUsageCritical + expr: 'ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 90' + for: 5m + labels: + severity: critical + annotations: + summary: ClickHouse Memory Usage Critical (instance {{ $labels.instance }}) + description: "Memory usage is critically high, over 90%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: ClickHouseMemoryUsageWarning + expr: 'ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 80' + for: 5m + labels: + severity: warning + annotations: + summary: ClickHouse Memory Usage Warning (instance {{ $labels.instance }}) + description: "Memory usage is over 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: ClickHouseDiskSpaceLowDefault + expr: 'ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 20' + for: 2m + labels: + severity: warning + annotations: + summary: ClickHouse Disk Space Low on Default (instance {{ $labels.instance }}) + description: "Disk space on default is below 20%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: ClickHouseDiskSpaceCriticalDefault + expr: 'ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 10' + for: 2m + labels: + severity: critical + annotations: + summary: ClickHouse Disk Space Critical on Default Disk (instance {{ $labels.instance }}) + description: "Disk space on default disk is critically low, below 10%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: ClickHouseDiskSpaceLowBackups + expr: 'ClickHouseAsyncMetrics_DiskAvailable_backups / (ClickHouseAsyncMetrics_DiskAvailable_backups + ClickHouseAsyncMetrics_DiskUsed_backups) * 100 < 20' + for: 2m + labels: + severity: warning + annotations: + summary: ClickHouse Disk Space Low on Backups (instance {{ $labels.instance }}) + description: "Disk space on backups is below 20%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: ClickHouseReplicaErrors + expr: 'ClickHouseErrorMetric_ALL_REPLICAS_ARE_STALE == 1 or ClickHouseErrorMetric_ALL_REPLICAS_LOST == 1' + for: 0m + labels: + severity: critical + annotations: + summary: ClickHouse Replica Errors Detected (instance {{ $labels.instance }}) + description: "Critical replica errors detected, either all replicas are stale or lost.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: ClickHouseNoAvailableReplicas + expr: 'ClickHouseErrorMetric_NO_AVAILABLE_REPLICA == 1' + for: 0m + labels: + severity: critical + annotations: + summary: No Available Replicas in ClickHouse (instance {{ $labels.instance }}) + description: "No available replicas in ClickHouse.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: ClickHouseNoLiveReplicas + expr: 'ClickHouseErrorMetric_TOO_FEW_LIVE_REPLICAS == 1' + for: 0m + labels: + severity: critical + annotations: + summary: No Live Replicas in ClickHouse (instance {{ $labels.instance }}) + description: "There are too few live replicas available, risking data loss and service disruption.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + + - alert: ClickHouseNetworkUsageHigh + expr: 'ClickHouseMetrics_NetworkSend > 1000 or ClickHouseMetrics_NetworkReceive > 1000' + for: 5m + labels: + severity: warning + annotations: + summary: High Network Traffic in ClickHouse (instance {{ $labels.instance }}) + description: "Network traffic is unusually high, may affect cluster performance.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: ClickHouseHighTCPConnections + expr: 'ClickHouseMetrics_TCPConnection > 1500' + for: 5m + labels: + severity: warning + annotations: + summary: High TCP Connections in ClickHouse (instance {{ $labels.instance }}) + description: "High number of TCP connections, indicating heavy client or inter-cluster communication.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: ClickHouseInterserverConnectionIssues + expr: 'increase(ClickHouseMetrics_InterserverConnection[5m]) > 0' + for: 0m + labels: + severity: warning + annotations: + summary: Interserver Connection Issues in ClickHouse (instance {{ $labels.instance }}) + description: "An increase in interserver connections may indicate replication or distributed query handling issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: ClickHouseZooKeeperConnectionIssues + expr: 'avg(ClickHouseMetrics_ZooKeeperSession) != 1' + for: 5m + labels: + severity: warning + annotations: + summary: ZooKeeper Connection Issues in ClickHouse (instance {{ $labels.instance }}) + description: "ClickHouse is experiencing issues with ZooKeeper connections, which may affect cluster state and coordination.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: ClickHouseAuthenticationFailures + expr: 'increase(ClickHouseErrorMetric_AUTHENTICATION_FAILED[5m]) > 0' + for: 0m + labels: + severity: critical + annotations: + summary: Authentication Failures in ClickHouse (instance {{ $labels.instance }}) + description: "Authentication failures detected, indicating potential security issues or misconfiguration.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: ClickHouseAccessDeniedErrors + expr: 'increase(ClickHouseErrorMetric_RESOURCE_ACCESS_DENIED[5m]) > 0' + for: 1m + labels: + severity: critical + annotations: + summary: Access Denied Errors in ClickHouse (instance {{ $labels.instance }}) + description: "Access denied errors have been logged, which could indicate permission issues or unauthorized access attempts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + From 84b0569c97975361b600f25aa90d5fc1e583bd87 Mon Sep 17 00:00:00 2001 From: samber Date: Mon, 13 May 2024 08:33:30 +0000 Subject: [PATCH 08/32] Publish --- dist/rules/clickhouse/embedded-exporter.yml | 74 ++++++++++----------- 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/dist/rules/clickhouse/embedded-exporter.yml b/dist/rules/clickhouse/embedded-exporter.yml index 19917bb..3efe551 100644 --- a/dist/rules/clickhouse/embedded-exporter.yml +++ b/dist/rules/clickhouse/embedded-exporter.yml @@ -1,7 +1,10 @@ groups: + - name: EmbeddedExporter + rules: - - alert: ClickHouseMemoryUsageCritical + + - alert: ClickhouseMemoryUsageCritical expr: 'ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 90' for: 5m labels: @@ -10,122 +13,119 @@ groups: summary: ClickHouse Memory Usage Critical (instance {{ $labels.instance }}) description: "Memory usage is critically high, over 90%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: ClickHouseMemoryUsageWarning + - alert: ClickhouseMemoryUsageWarning expr: 'ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 80' for: 5m labels: severity: warning annotations: summary: ClickHouse Memory Usage Warning (instance {{ $labels.instance }}) - description: "Memory usage is over 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Memory usage is over 80%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: ClickHouseDiskSpaceLowDefault + - alert: ClickhouseDiskSpaceLowOnDefault expr: 'ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 20' for: 2m labels: severity: warning annotations: summary: ClickHouse Disk Space Low on Default (instance {{ $labels.instance }}) - description: "Disk space on default is below 20%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Disk space on default is below 20%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: ClickHouseDiskSpaceCriticalDefault + - alert: ClickhouseDiskSpaceCriticalOnDefault expr: 'ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 10' for: 2m labels: severity: critical annotations: - summary: ClickHouse Disk Space Critical on Default Disk (instance {{ $labels.instance }}) + summary: ClickHouse Disk Space Critical on Default (instance {{ $labels.instance }}) description: "Disk space on default disk is critically low, below 10%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: ClickHouseDiskSpaceLowBackups + - alert: ClickhouseDiskSpaceLowOnBackups expr: 'ClickHouseAsyncMetrics_DiskAvailable_backups / (ClickHouseAsyncMetrics_DiskAvailable_backups + ClickHouseAsyncMetrics_DiskUsed_backups) * 100 < 20' for: 2m labels: severity: warning annotations: summary: ClickHouse Disk Space Low on Backups (instance {{ $labels.instance }}) - description: "Disk space on backups is below 20%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Disk space on backups is below 20%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: ClickHouseReplicaErrors + - alert: ClickhouseReplicaErrors expr: 'ClickHouseErrorMetric_ALL_REPLICAS_ARE_STALE == 1 or ClickHouseErrorMetric_ALL_REPLICAS_LOST == 1' for: 0m labels: severity: critical annotations: - summary: ClickHouse Replica Errors Detected (instance {{ $labels.instance }}) + summary: ClickHouse Replica Errors (instance {{ $labels.instance }}) description: "Critical replica errors detected, either all replicas are stale or lost.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: ClickHouseNoAvailableReplicas + - alert: ClickhouseNoAvailableReplicas expr: 'ClickHouseErrorMetric_NO_AVAILABLE_REPLICA == 1' for: 0m labels: severity: critical annotations: - summary: No Available Replicas in ClickHouse (instance {{ $labels.instance }}) + summary: ClickHouse No Available Replicas (instance {{ $labels.instance }}) description: "No available replicas in ClickHouse.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: ClickHouseNoLiveReplicas + - alert: ClickhouseNoLiveReplicas expr: 'ClickHouseErrorMetric_TOO_FEW_LIVE_REPLICAS == 1' for: 0m labels: severity: critical annotations: - summary: No Live Replicas in ClickHouse (instance {{ $labels.instance }}) + summary: ClickHouse No Live Replicas (instance {{ $labels.instance }}) description: "There are too few live replicas available, risking data loss and service disruption.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: ClickHouseNetworkUsageHigh - expr: 'ClickHouseMetrics_NetworkSend > 1000 or ClickHouseMetrics_NetworkReceive > 1000' + - alert: ClickhouseHighNetworkTraffic + expr: 'ClickHouseMetrics_NetworkSend > 250 or ClickHouseMetrics_NetworkReceive > 250' for: 5m labels: severity: warning annotations: - summary: High Network Traffic in ClickHouse (instance {{ $labels.instance }}) + summary: ClickHouse High Network Traffic (instance {{ $labels.instance }}) description: "Network traffic is unusually high, may affect cluster performance.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: ClickHouseHighTCPConnections - expr: 'ClickHouseMetrics_TCPConnection > 1500' + - alert: ClickhouseHighTcpConnections + expr: 'ClickHouseMetrics_TCPConnection > 400' for: 5m labels: severity: warning annotations: - summary: High TCP Connections in ClickHouse (instance {{ $labels.instance }}) + summary: ClickHouse High TCP Connections (instance {{ $labels.instance }}) description: "High number of TCP connections, indicating heavy client or inter-cluster communication.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: ClickHouseInterserverConnectionIssues + - alert: ClickhouseInterserverConnectionIssues expr: 'increase(ClickHouseMetrics_InterserverConnection[5m]) > 0' - for: 0m + for: 1m labels: severity: warning annotations: - summary: Interserver Connection Issues in ClickHouse (instance {{ $labels.instance }}) + summary: ClickHouse Interserver Connection Issues (instance {{ $labels.instance }}) description: "An increase in interserver connections may indicate replication or distributed query handling issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: ClickHouseZooKeeperConnectionIssues + - alert: ClickhouseZookeeperConnectionIssues expr: 'avg(ClickHouseMetrics_ZooKeeperSession) != 1' - for: 5m + for: 3m labels: severity: warning annotations: - summary: ZooKeeper Connection Issues in ClickHouse (instance {{ $labels.instance }}) + summary: ClickHouse ZooKeeper Connection Issues (instance {{ $labels.instance }}) description: "ClickHouse is experiencing issues with ZooKeeper connections, which may affect cluster state and coordination.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: ClickHouseAuthenticationFailures + - alert: ClickhouseAuthenticationFailures expr: 'increase(ClickHouseErrorMetric_AUTHENTICATION_FAILED[5m]) > 0' for: 0m labels: - severity: critical + severity: info annotations: - summary: Authentication Failures in ClickHouse (instance {{ $labels.instance }}) + summary: ClickHouse Authentication Failures (instance {{ $labels.instance }}) description: "Authentication failures detected, indicating potential security issues or misconfiguration.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: ClickHouseAccessDeniedErrors + - alert: ClickhouseAccessDeniedErrors expr: 'increase(ClickHouseErrorMetric_RESOURCE_ACCESS_DENIED[5m]) > 0' - for: 1m + for: 0m labels: - severity: critical + severity: info annotations: - summary: Access Denied Errors in ClickHouse (instance {{ $labels.instance }}) + summary: ClickHouse Access Denied Errors (instance {{ $labels.instance }}) description: "Access denied errors have been logged, which could indicate permission issues or unauthorized access attempts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - From 847143ecc94909ff05ebff39b83a91272122a68b Mon Sep 17 00:00:00 2001 From: Samuel Berthe Date: Mon, 13 May 2024 10:42:04 +0200 Subject: [PATCH 09/32] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index c5c241c..16c92c7 100644 --- a/README.md +++ b/README.md @@ -52,6 +52,7 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts - [RabbitMQ](https://samber.github.io/awesome-prometheus-alerts/rules#rabbitmq) - [Elasticsearch](https://samber.github.io/awesome-prometheus-alerts/rules#elasticsearch) - [Cassandra](https://samber.github.io/awesome-prometheus-alerts/rules#cassandra) +- [Clickhouse](https://samber.github.io/awesome-prometheus-alerts/rules#clickhouse) - [Zookeeper](https://samber.github.io/awesome-prometheus-alerts/rules#zookeeper) - [Kafka](https://samber.github.io/awesome-prometheus-alerts/rules#kafka) - [Pulsar](https://samber.github.io/awesome-prometheus-alerts/rules#pulsar) From 870bbd47d2d9b09dee7e286961c3b3043976a76c Mon Sep 17 00:00:00 2001 From: Vijay Dharap Date: Mon, 13 May 2024 09:10:55 +0000 Subject: [PATCH 10/32] Fixed HPA rule to use more correct condition (#408) * Fixed HPA rule to use more correct condition * Update rules.yml --------- Co-authored-by: Samuel Berthe --- _data/rules.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_data/rules.yml b/_data/rules.yml index ffb4604..3f29fb0 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -1981,7 +1981,7 @@ groups: for: 1m - name: Kubernetes HPA scale inability description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to scale - query: 'kube_horizontalpodautoscaler_status_condition{status="false", condition="AbleToScale"} == 1' + query: '(kube_horizontalpodautoscaler_spec_max_replicas - kube_horizontalpodautoscaler_status_desired_replicas) * on (horizontalpodautoscaler,namespace) (kube_horizontalpodautoscaler_status_condition{condition="ScalingLimited", status="true"} == 1) == 0' severity: warning for: 2m - name: Kubernetes HPA metrics unavailability From 613401a9600b5d8f31ec0a4890371b978becee62 Mon Sep 17 00:00:00 2001 From: samber Date: Mon, 13 May 2024 09:12:01 +0000 Subject: [PATCH 11/32] Publish --- dist/rules/kubernetes/kubestate-exporter.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dist/rules/kubernetes/kubestate-exporter.yml b/dist/rules/kubernetes/kubestate-exporter.yml index e43a1fb..9014275 100644 --- a/dist/rules/kubernetes/kubestate-exporter.yml +++ b/dist/rules/kubernetes/kubestate-exporter.yml @@ -122,7 +122,7 @@ groups: description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} went down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesHpaScaleInability - expr: 'kube_horizontalpodautoscaler_status_condition{status="false", condition="AbleToScale"} == 1' + expr: '(kube_horizontalpodautoscaler_spec_max_replicas - kube_horizontalpodautoscaler_status_desired_replicas) * on (horizontalpodautoscaler,namespace) (kube_horizontalpodautoscaler_status_condition{condition="ScalingLimited", status="true"} == 1) == 0' for: 2m labels: severity: warning From 396083a2a1daabeb6b7b60a29c5c0ec3eef215b7 Mon Sep 17 00:00:00 2001 From: Florian Schlichting Date: Mon, 13 May 2024 12:09:04 +0200 Subject: [PATCH 12/32] Fix HaproxyBackendMaxActiveSession: look at current / limit (#413) haproxy_backend_max_sessions is the maximum number of sessions ever encountered during the lifetime of the HAProxy process. That is, it will never go down until HAProxy is restarted, so the alert continues to fire even though the situation has cleared! This doesn't make sense. Look at the currently active sessions instead. --- _data/rules.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_data/rules.yml b/_data/rules.yml index 3f29fb0..61425a5 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -1785,7 +1785,7 @@ groups: severity: critical - name: HAProxy backend max active session description: HAproxy backend {{ $labels.fqdn }}/{{ $labels.backend }} is reaching session limit (> 80%). - query: "((sum by (backend) (avg_over_time(haproxy_backend_max_sessions[2m]) * 100) / sum by (backend) (avg_over_time(haproxy_backend_limit_sessions[2m])))) > 80" + query: "((sum by (backend) (avg_over_time(haproxy_backend_current_sessions[2m]) * 100) / sum by (backend) (avg_over_time(haproxy_backend_limit_sessions[2m])))) > 80" severity: warning for: 2m - name: HAProxy pending requests From 04886da968b4686de0af0c19f0fb4baa05f8265e Mon Sep 17 00:00:00 2001 From: samber Date: Mon, 13 May 2024 10:10:12 +0000 Subject: [PATCH 13/32] Publish --- dist/rules/haproxy/haproxy-exporter-v1.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dist/rules/haproxy/haproxy-exporter-v1.yml b/dist/rules/haproxy/haproxy-exporter-v1.yml index 2b2f93f..7be81a0 100644 --- a/dist/rules/haproxy/haproxy-exporter-v1.yml +++ b/dist/rules/haproxy/haproxy-exporter-v1.yml @@ -77,7 +77,7 @@ groups: description: "Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HaproxyBackendMaxActiveSession - expr: '((sum by (backend) (avg_over_time(haproxy_backend_max_sessions[2m]) * 100) / sum by (backend) (avg_over_time(haproxy_backend_limit_sessions[2m])))) > 80' + expr: '((sum by (backend) (avg_over_time(haproxy_backend_current_sessions[2m]) * 100) / sum by (backend) (avg_over_time(haproxy_backend_limit_sessions[2m])))) > 80' for: 2m labels: severity: warning From 4963331101e42b4a799978891a5c9c5f927623fb Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 14 May 2024 01:41:57 +0200 Subject: [PATCH 14/32] build(deps-dev): bump nokogiri from 1.16.2 to 1.16.5 (#415) Bumps [nokogiri](https://github.com/sparklemotion/nokogiri) from 1.16.2 to 1.16.5. - [Release notes](https://github.com/sparklemotion/nokogiri/releases) - [Changelog](https://github.com/sparklemotion/nokogiri/blob/main/CHANGELOG.md) - [Commits](https://github.com/sparklemotion/nokogiri/compare/v1.16.2...v1.16.5) --- updated-dependencies: - dependency-name: nokogiri dependency-type: indirect ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Gemfile.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Gemfile.lock b/Gemfile.lock index df07b1f..2f8e470 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -231,7 +231,7 @@ GEM jekyll-seo-tag (~> 2.1) minitest (5.17.0) multipart-post (2.1.1) - nokogiri (1.16.2-x86_64-linux) + nokogiri (1.16.5-x86_64-linux) racc (~> 1.4) octokit (4.22.0) faraday (>= 0.9) From 8460f9008e1eb191bb62d445ea17709698ce63db Mon Sep 17 00:00:00 2001 From: "R.Sicart" Date: Tue, 14 May 2024 20:34:43 +0200 Subject: [PATCH 15/32] fix: some kube api alert lint (#416) * fix: apiserver regexp matchers are automatically fully anchored Signed-off-by: R.Sicart * fix: apiserver errors alert is using label but the query removes it Signed-off-by: R.Sicart * fix: apiserver latency alert is using label but the query removes it Signed-off-by: R.Sicart --------- Signed-off-by: R.Sicart --- _data/rules.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/_data/rules.yml b/_data/rules.yml index 61425a5..eb87723 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -2071,7 +2071,7 @@ groups: for: 12h - name: Kubernetes API server errors description: Kubernetes API server is experiencing high error rate - query: 'sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[1m])) / sum(rate(apiserver_request_total{job="apiserver"}[1m])) * 100 > 3' + query: 'sum(rate(apiserver_request_total{job="apiserver",code=~"(?:5..)"}[1m])) by (instance, job) / sum(rate(apiserver_request_total{job="apiserver"}[1m])) by (instance, job) * 100 > 3' severity: critical for: 2m - name: Kubernetes API client errors @@ -2089,7 +2089,7 @@ groups: severity: critical - name: Kubernetes API server latency description: "Kubernetes API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}." - query: 'histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"} [10m])) WITHOUT (instance, resource)) > 1' + query: 'histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{verb!~"(?:CONNECT|WATCHLIST|WATCH|PROXY)"} [10m])) WITHOUT (subresource)) > 1' severity: warning for: 2m From 81079a2a7e9923ce369b35301ddcbf660bea1f09 Mon Sep 17 00:00:00 2001 From: samber Date: Tue, 14 May 2024 18:35:54 +0000 Subject: [PATCH 16/32] Publish --- dist/rules/kubernetes/kubestate-exporter.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dist/rules/kubernetes/kubestate-exporter.yml b/dist/rules/kubernetes/kubestate-exporter.yml index 9014275..3f9dc6a 100644 --- a/dist/rules/kubernetes/kubestate-exporter.yml +++ b/dist/rules/kubernetes/kubestate-exporter.yml @@ -266,7 +266,7 @@ groups: description: "Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} did not complete in time.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesApiServerErrors - expr: 'sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[1m])) / sum(rate(apiserver_request_total{job="apiserver"}[1m])) * 100 > 3' + expr: 'sum(rate(apiserver_request_total{job="apiserver",code=~"(?:5..)"}[1m])) by (instance, job) / sum(rate(apiserver_request_total{job="apiserver"}[1m])) by (instance, job) * 100 > 3' for: 2m labels: severity: critical @@ -302,7 +302,7 @@ groups: description: "A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesApiServerLatency - expr: 'histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"} [10m])) WITHOUT (instance, resource)) > 1' + expr: 'histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{verb!~"(?:CONNECT|WATCHLIST|WATCH|PROXY)"} [10m])) WITHOUT (subresource)) > 1' for: 2m labels: severity: warning From 262e45162569863ea1784dcfc68066be32fcbe71 Mon Sep 17 00:00:00 2001 From: "R.Sicart" Date: Tue, 14 May 2024 20:43:00 +0200 Subject: [PATCH 17/32] kube hpa lint and improvement (#417) * fix: hpa alerts are using label but the queries remove it Signed-off-by: R.Sicart * fix: hpa alert is using label but the query removes it Signed-off-by: R.Sicart * feat: hpa scale max should not alert when min and max are the same Signed-off-by: R.Sicart --------- Signed-off-by: R.Sicart --- _data/rules.yml | 2 +- dist/rules/kubernetes/kubestate-exporter.yml | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/_data/rules.yml b/_data/rules.yml index eb87723..4d0d9f4 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -1990,7 +1990,7 @@ groups: severity: warning - name: Kubernetes HPA scale maximum description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has hit maximum number of desired pods - query: "kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas" + query: '(kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas) and (kube_horizontalpodautoscaler_spec_max_replicas > 1) and (kube_horizontalpodautoscaler_spec_min_replicas != kube_horizontalpodautoscaler_spec_max_replicas)' severity: info for: 2m - name: Kubernetes HPA underutilized diff --git a/dist/rules/kubernetes/kubestate-exporter.yml b/dist/rules/kubernetes/kubestate-exporter.yml index 3f9dc6a..8684fdf 100644 --- a/dist/rules/kubernetes/kubestate-exporter.yml +++ b/dist/rules/kubernetes/kubestate-exporter.yml @@ -127,7 +127,7 @@ groups: labels: severity: warning annotations: - summary: Kubernetes HPA scale inability (instance {{ $labels.instance }}) + summary: Kubernetes HPA scale inability ({{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }}) description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to scale\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesHpaMetricsUnavailability @@ -136,7 +136,7 @@ groups: labels: severity: warning annotations: - summary: Kubernetes HPA metrics unavailability (instance {{ $labels.instance }}) + summary: Kubernetes HPA metrics unavailability ({{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }}) description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to collect metrics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesHpaScaleMaximum @@ -145,16 +145,16 @@ groups: labels: severity: info annotations: - summary: Kubernetes HPA scale maximum (instance {{ $labels.instance }}) + summary: Kubernetes HPA scale maximum ({{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }}) description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has hit maximum number of desired pods\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesHpaUnderutilized - expr: 'max(quantile_over_time(0.5, kube_horizontalpodautoscaler_status_desired_replicas[1d]) == kube_horizontalpodautoscaler_spec_min_replicas) by (horizontalpodautoscaler) > 3' + expr: 'max(quantile_over_time(0.5, kube_horizontalpodautoscaler_status_desired_replicas[1d]) == kube_horizontalpodautoscaler_spec_min_replicas) by (horizontalpodautoscaler, namespace) > 3' for: 0m labels: severity: info annotations: - summary: Kubernetes HPA underutilized (instance {{ $labels.instance }}) + summary: Kubernetes HPA underutilized ({{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }}) description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is constantly at minimum replicas for 50% of the time. Potential cost saving here.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesPodNotHealthy From 826be5877ffc129ca3511cd25af87d67ea67fb48 Mon Sep 17 00:00:00 2001 From: samber Date: Tue, 14 May 2024 18:44:11 +0000 Subject: [PATCH 18/32] Publish --- dist/rules/kubernetes/kubestate-exporter.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/dist/rules/kubernetes/kubestate-exporter.yml b/dist/rules/kubernetes/kubestate-exporter.yml index 8684fdf..2db1d64 100644 --- a/dist/rules/kubernetes/kubestate-exporter.yml +++ b/dist/rules/kubernetes/kubestate-exporter.yml @@ -127,7 +127,7 @@ groups: labels: severity: warning annotations: - summary: Kubernetes HPA scale inability ({{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }}) + summary: Kubernetes HPA scale inability (instance {{ $labels.instance }}) description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to scale\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesHpaMetricsUnavailability @@ -136,25 +136,25 @@ groups: labels: severity: warning annotations: - summary: Kubernetes HPA metrics unavailability ({{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }}) + summary: Kubernetes HPA metrics unavailability (instance {{ $labels.instance }}) description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to collect metrics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesHpaScaleMaximum - expr: 'kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas' + expr: '(kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas) and (kube_horizontalpodautoscaler_spec_max_replicas > 1) and (kube_horizontalpodautoscaler_spec_min_replicas != kube_horizontalpodautoscaler_spec_max_replicas)' for: 2m labels: severity: info annotations: - summary: Kubernetes HPA scale maximum ({{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }}) + summary: Kubernetes HPA scale maximum (instance {{ $labels.instance }}) description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has hit maximum number of desired pods\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesHpaUnderutilized - expr: 'max(quantile_over_time(0.5, kube_horizontalpodautoscaler_status_desired_replicas[1d]) == kube_horizontalpodautoscaler_spec_min_replicas) by (horizontalpodautoscaler, namespace) > 3' + expr: 'max(quantile_over_time(0.5, kube_horizontalpodautoscaler_status_desired_replicas[1d]) == kube_horizontalpodautoscaler_spec_min_replicas) by (horizontalpodautoscaler) > 3' for: 0m labels: severity: info annotations: - summary: Kubernetes HPA underutilized ({{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }}) + summary: Kubernetes HPA underutilized (instance {{ $labels.instance }}) description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is constantly at minimum replicas for 50% of the time. Potential cost saving here.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesPodNotHealthy From 9877561b6cfc2c145db1b7de6f85c084989be10d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Enes=20Yal=C4=B1nkaya?= <49714068+enesyalinkaya@users.noreply.github.com> Date: Wed, 15 May 2024 09:07:55 +0300 Subject: [PATCH 19/32] fix elasticsearch rate rules (#418) * fix elasticsearch rate rules * fix * fix * fix --- _data/rules.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/_data/rules.yml b/_data/rules.yml index 4d0d9f4..bf8ee70 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -1135,12 +1135,12 @@ groups: for: 10m - name: Elasticsearch High Indexing Rate description: "The indexing rate on Elasticsearch cluster is higher than the threshold." - query: "elasticsearch_indices_indexing_index_total > 100000" + query: "sum(rate(elasticsearch_indices_indexing_index_total[1m]))> 100000" severity: warning for: 5m - name: Elasticsearch High Query Rate description: "The query rate on Elasticsearch cluster is higher than the threshold." - query: "elasticsearch_indices_search_query_total > 100000" + query: "sum(rate(elasticsearch_indices_search_query_total[1m])) > 100000" severity: warning for: 5m - name: Elasticsearch High Query Latency From 1adecd9ee79ce65ec546ad77d6d47ab04259f689 Mon Sep 17 00:00:00 2001 From: Samuel Berthe Date: Wed, 15 May 2024 08:08:58 +0200 Subject: [PATCH 20/32] Update rules.yml --- _data/rules.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/_data/rules.yml b/_data/rules.yml index bf8ee70..a41d8a2 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -1125,7 +1125,7 @@ groups: severity: warning for: 15m - name: Elasticsearch no new documents - description: No new documents for 10 min! + description: "No new documents for 10 min!" query: 'increase(elasticsearch_indices_indexing_index_total{es_data_node="true"}[10m]) < 1' severity: warning - name: Elasticsearch High Indexing Latency @@ -1135,12 +1135,12 @@ groups: for: 10m - name: Elasticsearch High Indexing Rate description: "The indexing rate on Elasticsearch cluster is higher than the threshold." - query: "sum(rate(elasticsearch_indices_indexing_index_total[1m]))> 100000" + query: "sum(rate(elasticsearch_indices_indexing_index_total[1m]))> 10000" severity: warning for: 5m - name: Elasticsearch High Query Rate description: "The query rate on Elasticsearch cluster is higher than the threshold." - query: "sum(rate(elasticsearch_indices_search_query_total[1m])) > 100000" + query: "sum(rate(elasticsearch_indices_search_query_total[1m])) > 100" severity: warning for: 5m - name: Elasticsearch High Query Latency From 7dd767c4b4e4bcc2fa8d2b86cee89402f80acacd Mon Sep 17 00:00:00 2001 From: samber Date: Wed, 15 May 2024 06:10:06 +0000 Subject: [PATCH 21/32] Publish --- .../prometheus-community-elasticsearch-exporter.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dist/rules/elasticsearch/prometheus-community-elasticsearch-exporter.yml b/dist/rules/elasticsearch/prometheus-community-elasticsearch-exporter.yml index 9aeadec..5e6bb9d 100644 --- a/dist/rules/elasticsearch/prometheus-community-elasticsearch-exporter.yml +++ b/dist/rules/elasticsearch/prometheus-community-elasticsearch-exporter.yml @@ -149,7 +149,7 @@ groups: description: "The indexing latency on Elasticsearch cluster is higher than the threshold.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ElasticsearchHighIndexingRate - expr: 'elasticsearch_indices_indexing_index_total > 100000' + expr: 'sum(rate(elasticsearch_indices_indexing_index_total[1m]))> 10000' for: 5m labels: severity: warning @@ -158,7 +158,7 @@ groups: description: "The indexing rate on Elasticsearch cluster is higher than the threshold.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ElasticsearchHighQueryRate - expr: 'elasticsearch_indices_search_query_total > 100000' + expr: 'sum(rate(elasticsearch_indices_search_query_total[1m])) > 100' for: 5m labels: severity: warning From 61a40270d96ba9d7ae6489254f9dad2775f8e00c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 16 May 2024 23:28:17 +0200 Subject: [PATCH 22/32] build(deps-dev): bump rexml from 3.2.5 to 3.2.8 (#420) Bumps [rexml](https://github.com/ruby/rexml) from 3.2.5 to 3.2.8. - [Release notes](https://github.com/ruby/rexml/releases) - [Changelog](https://github.com/ruby/rexml/blob/master/NEWS.md) - [Commits](https://github.com/ruby/rexml/compare/v3.2.5...v3.2.8) --- updated-dependencies: - dependency-name: rexml dependency-type: indirect ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Gemfile.lock | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Gemfile.lock b/Gemfile.lock index 2f8e470..ca3c33c 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -243,7 +243,8 @@ GEM rb-fsevent (0.11.1) rb-inotify (0.10.1) ffi (~> 1.0) - rexml (3.2.5) + rexml (3.2.8) + strscan (>= 3.0.9) rouge (3.26.0) ruby2_keywords (0.0.5) rubyzip (2.3.2) @@ -258,6 +259,7 @@ GEM faraday (> 0.8, < 2.0) simpleidn (0.2.1) unf (~> 0.1.4) + strscan (3.1.0) terminal-table (1.8.0) unicode-display_width (~> 1.1, >= 1.1.1) thread_safe (0.3.6) From 9b0ac7d230f18f7a371ece02d7ecb7724007faa9 Mon Sep 17 00:00:00 2001 From: Samuel Berthe Date: Thu, 23 May 2024 14:44:45 +0200 Subject: [PATCH 23/32] Update rules.yml --- _data/rules.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/_data/rules.yml b/_data/rules.yml index a41d8a2..cd0123e 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -876,11 +876,6 @@ groups: query: 'avg by(instance) (rate(mongodb_ss_connections{conn_type="current"}[1m])) / avg by(instance) (sum (mongodb_ss_connections) by (instance)) * 100 > 80' severity: warning for: 2m - - name: MongoDB virtual memory usage - description: High memory usage - query: "(sum(mongodb_ss_mem_virtual) BY (instance) / sum(mongodb_ss_mem_resident) BY (instance)) > 3" - severity: warning - for: 2m - name: dcu/mongodb_exporter slug: dcu-mongodb-exporter From 8759c50440e0c95363c739432e8c3a21b864c8e0 Mon Sep 17 00:00:00 2001 From: samber Date: Thu, 23 May 2024 12:45:56 +0000 Subject: [PATCH 24/32] Publish --- dist/rules/mongodb/percona-mongodb-exporter.yml | 9 --------- 1 file changed, 9 deletions(-) diff --git a/dist/rules/mongodb/percona-mongodb-exporter.yml b/dist/rules/mongodb/percona-mongodb-exporter.yml index 3e1e5e9..1bd446f 100644 --- a/dist/rules/mongodb/percona-mongodb-exporter.yml +++ b/dist/rules/mongodb/percona-mongodb-exporter.yml @@ -66,12 +66,3 @@ groups: annotations: summary: MongoDB too many connections (instance {{ $labels.instance }}) description: "Too many connections (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: MongodbVirtualMemoryUsage - expr: '(sum(mongodb_ss_mem_virtual) BY (instance) / sum(mongodb_ss_mem_resident) BY (instance)) > 3' - for: 2m - labels: - severity: warning - annotations: - summary: MongoDB virtual memory usage (instance {{ $labels.instance }}) - description: "High memory usage\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" From 1e4ea0b3e75cdbf59a56c3e1cad94deaf2ab723f Mon Sep 17 00:00:00 2001 From: Samuel Berthe Date: Thu, 6 Jun 2024 22:53:29 +0200 Subject: [PATCH 25/32] Update rules.yml --- _data/rules.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_data/rules.yml b/_data/rules.yml index cd0123e..9a08f71 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -2421,7 +2421,7 @@ groups: rules: - name: Minio cluster disk offline description: "Minio cluster disk is offline" - query: "minio_cluster_disk_offline_total > 0" + query: "minio_cluster_drive_offline_total > 0" severity: critical - name: Minio node disk offline description: "Minio cluster node disk is offline" From 1ee046b7392426039031b521a433dcc177104134 Mon Sep 17 00:00:00 2001 From: samber Date: Thu, 6 Jun 2024 20:54:49 +0000 Subject: [PATCH 26/32] Publish --- dist/rules/minio/embedded-exporter.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dist/rules/minio/embedded-exporter.yml b/dist/rules/minio/embedded-exporter.yml index 8e19729..1ac2de5 100644 --- a/dist/rules/minio/embedded-exporter.yml +++ b/dist/rules/minio/embedded-exporter.yml @@ -5,7 +5,7 @@ groups: rules: - alert: MinioClusterDiskOffline - expr: 'minio_cluster_disk_offline_total > 0' + expr: 'minio_cluster_drive_offline_total > 0' for: 0m labels: severity: critical From ca4fb01c6dda3514048fb28166d3bd9f40b06ef7 Mon Sep 17 00:00:00 2001 From: Samuel Berthe Date: Fri, 14 Jun 2024 20:15:44 +0200 Subject: [PATCH 27/32] Update rules.yml --- _data/rules.yml | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/_data/rules.yml b/_data/rules.yml index 9a08f71..8994d44 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -240,12 +240,15 @@ groups: query: '(rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' severity: warning for: 5m - - name: Host context switching - description: Context switching is growing on the node (> 10000 / CPU / s) - query: '((rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + - name: Host context switching high + description: Context switching is growing on the node (twice the daily average during the last 15m) + query: | + (rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) + / + (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2 severity: warning comments: | - 10000 context switches is an arbitrary number. + x2 context switches is an arbitrary number. The alert threshold depends on the nature of the application. Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58 - name: Host swap is filling up From 60c235975c4a34354d30031cef12234d3fe7e3f6 Mon Sep 17 00:00:00 2001 From: samber Date: Fri, 14 Jun 2024 18:16:53 +0000 Subject: [PATCH 28/32] Publish --- dist/rules/host-and-hardware/node-exporter.yml | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/dist/rules/host-and-hardware/node-exporter.yml b/dist/rules/host-and-hardware/node-exporter.yml index 6655ef7..0d80c16 100644 --- a/dist/rules/host-and-hardware/node-exporter.yml +++ b/dist/rules/host-and-hardware/node-exporter.yml @@ -175,14 +175,17 @@ groups: summary: Host unusual disk IO (instance {{ $labels.instance }}) description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: HostContextSwitching - expr: '((rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + - alert: HostContextSwitchingHigh + expr: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) +/ +(rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2 +' for: 0m labels: severity: warning annotations: - summary: Host context switching (instance {{ $labels.instance }}) - description: "Context switching is growing on the node (> 10000 / CPU / s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: Host context switching high (instance {{ $labels.instance }}) + description: "Context switching is growing on the node (twice the daily average during the last 15m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostSwapIsFillingUp expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' From b6a6c2e31315873c39c28622ffb2f26ad8f5ce9b Mon Sep 17 00:00:00 2001 From: Samuel Berthe Date: Tue, 2 Jul 2024 09:33:01 +0200 Subject: [PATCH 29/32] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 16c92c7..9188322 100644 --- a/README.md +++ b/README.md @@ -51,6 +51,7 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts - [MongoDB](https://samber.github.io/awesome-prometheus-alerts/rules#mongodb) - [RabbitMQ](https://samber.github.io/awesome-prometheus-alerts/rules#rabbitmq) - [Elasticsearch](https://samber.github.io/awesome-prometheus-alerts/rules#elasticsearch) +- [Meilisearch](https://samber.github.io/awesome-prometheus-alerts/rules#meilisearch) - [Cassandra](https://samber.github.io/awesome-prometheus-alerts/rules#cassandra) - [Clickhouse](https://samber.github.io/awesome-prometheus-alerts/rules#clickhouse) - [Zookeeper](https://samber.github.io/awesome-prometheus-alerts/rules#zookeeper) From 9557d4b50e09f31d2f86ac1c70f87a025a6ac1b6 Mon Sep 17 00:00:00 2001 From: Greg <58505377+nohant@users.noreply.github.com> Date: Tue, 2 Jul 2024 09:33:08 +0200 Subject: [PATCH 30/32] feat(meilisearch): add basic set of rules (#425) * feat(meilisearch): add basic meilisearch rules * fix(query): use == instead of = * fix(data): set correct name and use == * chore(meilisearch): remove index filter --- _data/rules.yml | 15 +++++++++++++ dist/rules/meilisearch/embedded-exporter.yml | 23 ++++++++++++++++++++ 2 files changed, 38 insertions(+) create mode 100644 dist/rules/meilisearch/embedded-exporter.yml diff --git a/_data/rules.yml b/_data/rules.yml index 8994d44..128793f 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -842,6 +842,21 @@ groups: query: "increase(redis_rejected_connections_total[1m]) > 0" severity: critical + - name: Meilisearch + exporters: + - name: Embedded exporter + slug: embedded-exporter + doc_url: https://github.com/orgs/meilisearch/discussions/625 + rules: + - name: Meilisearch index is empty + description: Meilisearch instance is down + query: 'meilisearch_index_docs_count == 0' + severity: warning + - name: Meilisearch http response time + description: Meilisearch http response time is too high + query: "meilisearch_http_response_time_seconds > 0.5" + severity: warning + - name: MongoDB exporters: - name: percona/mongodb_exporter diff --git a/dist/rules/meilisearch/embedded-exporter.yml b/dist/rules/meilisearch/embedded-exporter.yml new file mode 100644 index 0000000..a8824dd --- /dev/null +++ b/dist/rules/meilisearch/embedded-exporter.yml @@ -0,0 +1,23 @@ +groups: + +- name: EmbeddedExporter + + rules: + + - alert: MeilisearchIndexIsEmpty + expr: meilisearch_index_docs_count == 0 + for: 5m + labels: + severity: warning + annotations: + summary: the index {{ $labels.Index }} is empty + description: "The index {{ $labels.Index }} is empty at the moment, and shouldnt be empty\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: MeilisearchHttpResponseTimeIsTooHigh + expr: rate(meilisearch_http_response_time_seconds_sum[5m]) / rate(meilisearch_http_response_time_seconds_count[5m]) > 0.5 + for: 5m + labels: + severity: warning + annotations: + summary: the meilisearch server http response time is too high + description: "The meilisearch server http response time is too high at the moment\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" From 47e74f65e02fbbb60fddaf450cca3178ef5e4ecd Mon Sep 17 00:00:00 2001 From: Samuel Berthe Date: Tue, 2 Jul 2024 09:33:51 +0200 Subject: [PATCH 31/32] Update rules.yml --- _data/rules.yml | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/_data/rules.yml b/_data/rules.yml index 128793f..0216beb 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -842,21 +842,6 @@ groups: query: "increase(redis_rejected_connections_total[1m]) > 0" severity: critical - - name: Meilisearch - exporters: - - name: Embedded exporter - slug: embedded-exporter - doc_url: https://github.com/orgs/meilisearch/discussions/625 - rules: - - name: Meilisearch index is empty - description: Meilisearch instance is down - query: 'meilisearch_index_docs_count == 0' - severity: warning - - name: Meilisearch http response time - description: Meilisearch http response time is too high - query: "meilisearch_http_response_time_seconds > 0.5" - severity: warning - - name: MongoDB exporters: - name: percona/mongodb_exporter @@ -1162,6 +1147,21 @@ groups: severity: warning for: 5m + - name: Meilisearch + exporters: + - name: Embedded exporter + slug: embedded-exporter + doc_url: https://github.com/orgs/meilisearch/discussions/625 + rules: + - name: Meilisearch index is empty + description: Meilisearch instance is down + query: 'meilisearch_index_docs_count == 0' + severity: warning + - name: Meilisearch http response time + description: Meilisearch http response time is too high + query: "meilisearch_http_response_time_seconds > 0.5" + severity: warning + - name: Cassandra exporters: - name: instaclustr/cassandra-exporter From 58ade95b8bfaf20f875c74b8d7a6c509f70a77ab Mon Sep 17 00:00:00 2001 From: samber Date: Tue, 2 Jul 2024 07:34:59 +0000 Subject: [PATCH 32/32] Publish --- dist/rules/meilisearch/embedded-exporter.yml | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/dist/rules/meilisearch/embedded-exporter.yml b/dist/rules/meilisearch/embedded-exporter.yml index a8824dd..8da2803 100644 --- a/dist/rules/meilisearch/embedded-exporter.yml +++ b/dist/rules/meilisearch/embedded-exporter.yml @@ -5,19 +5,19 @@ groups: rules: - alert: MeilisearchIndexIsEmpty - expr: meilisearch_index_docs_count == 0 - for: 5m + expr: 'meilisearch_index_docs_count == 0' + for: 0m labels: severity: warning annotations: - summary: the index {{ $labels.Index }} is empty - description: "The index {{ $labels.Index }} is empty at the moment, and shouldnt be empty\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: MeilisearchHttpResponseTimeIsTooHigh - expr: rate(meilisearch_http_response_time_seconds_sum[5m]) / rate(meilisearch_http_response_time_seconds_count[5m]) > 0.5 - for: 5m + summary: Meilisearch index is empty (instance {{ $labels.instance }}) + description: "Meilisearch instance is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: MeilisearchHttpResponseTime + expr: 'meilisearch_http_response_time_seconds > 0.5' + for: 0m labels: severity: warning annotations: - summary: the meilisearch server http response time is too high - description: "The meilisearch server http response time is too high at the moment\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: Meilisearch http response time (instance {{ $labels.instance }}) + description: "Meilisearch http response time is too high\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"