From fbe1e4d444ba5bc001f60a26b87df26b46763dda Mon Sep 17 00:00:00 2001 From: Simon Matic Langford Date: Wed, 17 Dec 2025 16:28:36 +0000 Subject: [PATCH] Convert cpu alert expressions to without() rather than by() --- _data/rules.yml | 8 ++++---- dist/rules/host-and-hardware/node-exporter.yml | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/_data/rules.yml b/_data/rules.yml index f4c66c1..19cbe35 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -207,23 +207,23 @@ groups: for: 2m - name: Host high CPU load description: CPU load is > 80% - query: '1 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > .80' + query: '1 - (avg without (cpu) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > .80' severity: warning for: 10m - name: Host CPU is underutilized description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs." - query: '(min by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8' + query: '(min without (cpu) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8' severity: info for: 1w comments: | You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly - name: Host CPU steal noisy neighbor description: CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit. - query: 'avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10' + query: 'avg without (cpu) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10' severity: warning - name: Host CPU high iowait description: CPU iowait > 10%. Your CPU is idling waiting for storage to respond. - query: 'avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10' + query: 'avg without (cpu) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10' severity: warning - name: Host unusual disk IO description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities. 
Check storage for issues." diff --git a/dist/rules/host-and-hardware/node-exporter.yml b/dist/rules/host-and-hardware/node-exporter.yml index 5eef86e..e45c687 100644 --- a/dist/rules/host-and-hardware/node-exporter.yml +++ b/dist/rules/host-and-hardware/node-exporter.yml @@ -130,7 +130,7 @@ groups: description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostHighCpuLoad - expr: '1 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > .80' + expr: '1 - (avg without (cpu) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > .80' for: 10m labels: severity: warning @@ -140,7 +140,7 @@ groups: # You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly - alert: HostCpuIsUnderutilized - expr: '(min by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8' + expr: '(min without (cpu) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8' for: 1w labels: severity: info @@ -149,7 +149,7 @@ groups: description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostCpuStealNoisyNeighbor - expr: 'avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10' + expr: 'avg without (cpu) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10' for: 0m labels: severity: warning @@ -158,7 +158,7 @@ groups: description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostCpuHighIowait - expr: 'avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10' + expr: 'avg without (cpu) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10' for: 0m labels: severity: warning