From e6de4131467c07406514886d62047ed60e252066 Mon Sep 17 00:00:00 2001 From: Fernando Carletti Date: Mon, 18 May 2020 17:38:05 -0500 Subject: [PATCH 01/10] fix: container ContainerMemoryUsage alert --- _data/rules.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_data/rules.yml b/_data/rules.yml index c60ab90..ac9ed97 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -239,7 +239,7 @@ groups: severity: warning - name: Container Memory usage description: Container Memory usage is above 80% - query: "(sum(container_memory_usage_bytes) BY (instance, name) / sum(container_spec_memory_limit_bytes) BY (instance, name) * 100) > 80" + query: "(sum(container_memory_working_set_bytes) BY (instance, name) / sum(container_spec_memory_limit_bytes) BY (instance, name) * 100) > 80" severity: warning - name: Container Volume usage description: Container Volume usage is above 80% From 88e812c78e123d31e055e8e596128e11def602b6 Mon Sep 17 00:00:00 2001 From: Ozarklake Date: Fri, 17 Jul 2020 14:50:09 +0800 Subject: [PATCH 02/10] add sql server rules --- _data/rules.yml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/_data/rules.yml b/_data/rules.yml index c60ab90..5ed391f 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -465,6 +465,20 @@ groups: query: '((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20' severity: critical + - name: SQL Server + exporters: + - name: Ozarklake/prometheus-mssql-exporter + doc_url: https://github.com/Ozarklake/prometheus-mssql-exporter + rules: + - name: SQL Server down + description: SQl server instance is down + query: mssql_up == 0 + severity: critical + - name: SQL Server deadlock + description: SQL Server is having some deadlock. 
+ query: irate(mssql_deadlocks[2m]) > 0 + severity: warning + - name: PGBouncer exporters: - name: spreaker/prometheus-pgbouncer-exporter From 8fb5da83decbde0e54ed67299e462ddeb877f2d8 Mon Sep 17 00:00:00 2001 From: Nirav Chotai Date: Fri, 24 Jul 2020 13:32:44 +0800 Subject: [PATCH 03/10] Fix HPA alerts - Fixing KubernetesHpaMetricAvailability - Fixing KubernetesHpaScalingAbility --- _data/rules.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/_data/rules.yml b/_data/rules.yml index c60ab90..3eee7f0 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -990,11 +990,11 @@ groups: severity: critical - name: Kubernetes HPA scaling ability description: Pod is unable to scale - query: 'kube_hpa_status_condition{condition="false", status="AbleToScale"} == 1' + query: 'kube_hpa_status_condition{status="false", condition ="AbleToScale"} == 1' severity: warning - name: Kubernetes HPA metric availability - description: HPA is not able to colelct metrics - query: 'kube_hpa_status_condition{condition="false", status="ScalingActive"} == 1' + description: HPA is not able to collect metrics + query: 'kube_hpa_status_condition{status="false", condition="ScalingActive"} == 1' severity: warning - name: Kubernetes HPA scale capability description: The maximum number of desired Pods has been hit From 6c5f708179bdfee374e0efcf1b27e521e204876f Mon Sep 17 00:00:00 2001 From: Daniel Andrzejewski Date: Thu, 17 Sep 2020 15:13:42 +0200 Subject: [PATCH 04/10] node_disk_write_time_seconds_total is in seconds, not in milliseconds. node_disk_write_time_seconds_total should be greater than 0, otherwise you get +Inf result. 
--- _data/rules.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/_data/rules.yml b/_data/rules.yml index c60ab90..2983754 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -157,11 +157,11 @@ groups: severity: warning - name: Host unusual disk read latency description: Disk latency is growing (read operations > 100ms) - query: "rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 100" + query: "rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m])" severity: warning - name: Host unusual disk write latency description: Disk latency is growing (write operations > 100ms) - query: "rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 100" + query: "rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0" severity: warning - name: Host high CPU load description: CPU load is > 80% From fc4797db9e523e0f793f0486119e94c3cd7b06e7 Mon Sep 17 00:00:00 2001 From: Daniel Andrzejewski Date: Thu, 17 Sep 2020 15:19:14 +0200 Subject: [PATCH 05/10] small fix --- _data/rules.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_data/rules.yml b/_data/rules.yml index 2983754..86edb23 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -157,7 +157,7 @@ groups: severity: warning - name: Host unusual disk read latency description: Disk latency is growing (read operations > 100ms) - query: "rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m])" + query: "rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0" severity: warning - name: Host unusual disk write latency description: Disk latency is growing (write operations > 
100ms) From 5288c9a2f53e136158ffa2012cd0e5b0ca78a1e6 Mon Sep 17 00:00:00 2001 From: fsschmitt <492108+fsschmitt@users.noreply.github.com> Date: Tue, 6 Oct 2020 13:33:50 +0100 Subject: [PATCH 06/10] Fix node_md_disks state from fail to failed --- _data/rules.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_data/rules.yml b/_data/rules.yml index c60ab90..0985bfb 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -197,7 +197,7 @@ groups: severity: critical - name: Host RAID disk failure description: 'At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap' - query: 'node_md_disks{state="fail"} > 0' + query: 'node_md_disks{state="failed"} > 0' severity: warning - name: Host kernel version deviations description: Different kernel versions are running From 4266b4d3264cf2cfb1440f1bf3570abe048e322e Mon Sep 17 00:00:00 2001 From: fsschmitt <492108+fsschmitt@users.noreply.github.com> Date: Tue, 6 Oct 2020 14:36:22 +0100 Subject: [PATCH 07/10] Fix time unit on disk read/write latency rule --- _data/rules.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/_data/rules.yml b/_data/rules.yml index c60ab90..0fa6a83 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -157,11 +157,11 @@ groups: severity: warning - name: Host unusual disk read latency description: Disk latency is growing (read operations > 100ms) - query: "rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 100" + query: "rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1" severity: warning - name: Host unusual disk write latency description: Disk latency is growing (write operations > 100ms) - query: "rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 100" + query: "rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1" 
severity: warning - name: Host high CPU load description: CPU load is > 80% From cf70272309f27e90580a70e083514f2078f92a68 Mon Sep 17 00:00:00 2001 From: Samuel Berthe Date: Sun, 11 Oct 2020 16:08:54 +0200 Subject: [PATCH 08/10] fix(container memory limit): filter by containers having max memory setting --- _data/rules.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_data/rules.yml b/_data/rules.yml index 651fa44..09231b9 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -239,7 +239,7 @@ groups: severity: warning - name: Container Memory usage description: Container Memory usage is above 80% - query: "(sum(container_memory_working_set_bytes) BY (instance, name) / sum(container_spec_memory_limit_bytes) BY (instance, name) * 100) > 80" + query: "(sum(container_memory_working_set_bytes) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80" severity: warning - name: Container Volume usage description: Container Volume usage is above 80% From 7a609adf18f760946ee0404e1bbad983573822a7 Mon Sep 17 00:00:00 2001 From: Samuel Berthe Date: Sun, 11 Oct 2020 16:11:44 +0200 Subject: [PATCH 09/10] adding comment to container OOM killer warning --- _data/rules.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/_data/rules.yml b/_data/rules.yml index 09231b9..aae0373 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -241,6 +241,7 @@ groups: description: Container Memory usage is above 80% query: "(sum(container_memory_working_set_bytes) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80" severity: warning + comments: See https://medium.com/faun/how-much-is-too-much-the-linux-oomkiller-and-used-memory-d32186f29c9d - name: Container Volume usage description: Container Volume usage is above 80% query: "(1 - (sum(container_fs_inodes_free) BY (instance) / sum(container_fs_inodes_total) BY (instance)) * 100) > 80" From bafcd1e9220ecf1f98575337ad90cf79e31c75ac Mon Sep 
17 00:00:00 2001 From: Samuel Berthe Date: Sun, 11 Oct 2020 17:35:46 +0200 Subject: [PATCH 10/10] Update rules.yml --- _data/rules.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_data/rules.yml b/_data/rules.yml index 5ed391f..1ecd043 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -476,7 +476,7 @@ groups: severity: critical - name: SQL Server deadlock description: SQL Server is having some deadlock. - query: irate(mssql_deadlocks[2m]) > 0 + query: rate(mssql_deadlocks[1m]) > 0 severity: warning - name: PGBouncer