From e6de4131467c07406514886d62047ed60e252066 Mon Sep 17 00:00:00 2001 From: Fernando Carletti Date: Mon, 18 May 2020 17:38:05 -0500 Subject: [PATCH 1/5] fix: container ContainerMemoryUsage alert --- _data/rules.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_data/rules.yml b/_data/rules.yml index c60ab90..ac9ed97 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -239,7 +239,7 @@ groups: severity: warning - name: Container Memory usage description: Container Memory usage is above 80% - query: "(sum(container_memory_usage_bytes) BY (instance, name) / sum(container_spec_memory_limit_bytes) BY (instance, name) * 100) > 80" + query: "(sum(container_memory_working_set_bytes) BY (instance, name) / sum(container_spec_memory_limit_bytes) BY (instance, name) * 100) > 80" severity: warning - name: Container Volume usage description: Container Volume usage is above 80% From 5288c9a2f53e136158ffa2012cd0e5b0ca78a1e6 Mon Sep 17 00:00:00 2001 From: fsschmitt <492108+fsschmitt@users.noreply.github.com> Date: Tue, 6 Oct 2020 13:33:50 +0100 Subject: [PATCH 2/5] Fix node_md_disks state from fail to failed --- _data/rules.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_data/rules.yml b/_data/rules.yml index c60ab90..0985bfb 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -197,7 +197,7 @@ groups: severity: critical - name: Host RAID disk failure description: 'At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap' - query: 'node_md_disks{state="fail"} > 0' + query: 'node_md_disks{state="failed"} > 0' severity: warning - name: Host kernel version deviations description: Different kernel versions are running From 4266b4d3264cf2cfb1440f1bf3570abe048e322e Mon Sep 17 00:00:00 2001 From: fsschmitt <492108+fsschmitt@users.noreply.github.com> Date: Tue, 6 Oct 2020 14:36:22 +0100 Subject: [PATCH 3/5] Fix time unit on disk read/write latency rule --- _data/rules.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/_data/rules.yml b/_data/rules.yml index c60ab90..0fa6a83 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -157,11 +157,11 @@ groups: severity: warning - name: Host unusual disk read latency description: Disk latency is growing (read operations > 100ms) - query: "rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 100" + query: "rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1" severity: warning - name: Host unusual disk write latency description: Disk latency is growing (write operations > 100ms) - query: "rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 100" + query: "rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1" severity: warning - name: Host high CPU load description: CPU load is > 80% From cf70272309f27e90580a70e083514f2078f92a68 Mon Sep 17 00:00:00 2001 From: Samuel Berthe Date: Sun, 11 Oct 2020 16:08:54 +0200 Subject: [PATCH 4/5] fix(container memory limit): filter by containers having max memory setting --- _data/rules.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_data/rules.yml b/_data/rules.yml index 651fa44..09231b9 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -239,7 +239,7 @@ groups: severity: warning - name: Container Memory usage description: Container Memory usage is above 80% - query: "(sum(container_memory_working_set_bytes) BY (instance, name) / sum(container_spec_memory_limit_bytes) BY (instance, name) * 100) > 80" + query: "(sum(container_memory_working_set_bytes) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80" severity: warning - name: Container Volume usage description: Container Volume usage is above 80% From 7a609adf18f760946ee0404e1bbad983573822a7 Mon Sep 17 00:00:00 2001 From: Samuel Berthe Date: Sun, 11 Oct 2020 16:11:44 +0200 Subject: [PATCH 5/5] adding comment to container OOM killer warning --- _data/rules.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/_data/rules.yml b/_data/rules.yml index 09231b9..aae0373 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -241,6 +241,7 @@ groups: description: Container Memory usage is above 80% query: "(sum(container_memory_working_set_bytes) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80" severity: warning + comments: See https://medium.com/faun/how-much-is-too-much-the-linux-oomkiller-and-used-memory-d32186f29c9d - name: Container Volume usage description: Container Volume usage is above 80% query: "(1 - (sum(container_fs_inodes_free) BY (instance) / sum(container_fs_inodes_total) BY (instance)) * 100) > 80"