Convert CPU alert expressions to aggregate with without (cpu) rather than by (instance)

This commit is contained in:
Simon Matic Langford 2025-12-17 16:28:36 +00:00
parent ce0a93df78
commit fbe1e4d444
2 changed files with 8 additions and 8 deletions

View file

@ -207,23 +207,23 @@ groups:
for: 2m for: 2m
- name: Host high CPU load - name: Host high CPU load
description: CPU load is > 80% description: CPU load is > 80%
query: '1 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > .80' query: '1 - (avg without (cpu) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > .80'
severity: warning severity: warning
for: 10m for: 10m
- name: Host CPU is underutilized - name: Host CPU is underutilized
description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs." description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs."
query: '(min by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8' query: '(min without (cpu) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8'
severity: info severity: info
for: 1w for: 1w
comments: | comments: |
You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
- name: Host CPU steal noisy neighbor - name: Host CPU steal noisy neighbor
description: CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit. description: CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.
query: 'avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10' query: 'avg without (cpu) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10'
severity: warning severity: warning
- name: Host CPU high iowait - name: Host CPU high iowait
description: CPU iowait > 10%. Your CPU is idling waiting for storage to respond. description: CPU iowait > 10%. Your CPU is idling waiting for storage to respond.
query: 'avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10' query: 'avg without (cpu) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10'
severity: warning severity: warning
- name: Host unusual disk IO - name: Host unusual disk IO
description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities. Check storage for issues." description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities. Check storage for issues."

View file

@ -130,7 +130,7 @@ groups:
description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostHighCpuLoad - alert: HostHighCpuLoad
expr: '1 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > .80' expr: '1 - (avg without (cpu) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > .80'
for: 10m for: 10m
labels: labels:
severity: warning severity: warning
@ -140,7 +140,7 @@ groups:
# You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly # You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
- alert: HostCpuIsUnderutilized - alert: HostCpuIsUnderutilized
expr: '(min by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8' expr: '(min without (cpu) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8'
for: 1w for: 1w
labels: labels:
severity: info severity: info
@ -149,7 +149,7 @@ groups:
description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostCpuStealNoisyNeighbor - alert: HostCpuStealNoisyNeighbor
expr: 'avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10' expr: 'avg without (cpu) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10'
for: 0m for: 0m
labels: labels:
severity: warning severity: warning
@ -158,7 +158,7 @@ groups:
description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostCpuHighIowait - alert: HostCpuHighIowait
expr: 'avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10' expr: 'avg without (cpu) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10'
for: 0m for: 0m
labels: labels:
severity: warning severity: warning