mirror of
https://github.com/samber/awesome-prometheus-alerts.git
synced 2026-06-26 19:37:27 +08:00
Convert CPU alert expressions to use without (cpu) rather than by (instance)
This commit is contained in:
parent
ce0a93df78
commit
fbe1e4d444
2 changed files with 8 additions and 8 deletions
|
|
@ -207,23 +207,23 @@ groups:
|
||||||
for: 2m
|
for: 2m
|
||||||
- name: Host high CPU load
|
- name: Host high CPU load
|
||||||
description: CPU load is > 80%
|
description: CPU load is > 80%
|
||||||
query: '1 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > .80'
|
query: '1 - (avg without (cpu) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > .80'
|
||||||
severity: warning
|
severity: warning
|
||||||
for: 10m
|
for: 10m
|
||||||
- name: Host CPU is underutilized
|
- name: Host CPU is underutilized
|
||||||
description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs."
|
description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs."
|
||||||
query: '(min by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8'
|
query: '(min without (cpu) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8'
|
||||||
severity: info
|
severity: info
|
||||||
for: 1w
|
for: 1w
|
||||||
comments: |
|
comments: |
|
||||||
You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
|
You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
|
||||||
- name: Host CPU steal noisy neighbor
|
- name: Host CPU steal noisy neighbor
|
||||||
description: CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.
|
description: CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.
|
||||||
query: 'avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10'
|
query: 'avg without (cpu) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10'
|
||||||
severity: warning
|
severity: warning
|
||||||
- name: Host CPU high iowait
|
- name: Host CPU high iowait
|
||||||
description: CPU iowait > 10%. Your CPU is idling waiting for storage to respond.
|
description: CPU iowait > 10%. Your CPU is idling waiting for storage to respond.
|
||||||
query: 'avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10'
|
query: 'avg without (cpu) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10'
|
||||||
severity: warning
|
severity: warning
|
||||||
- name: Host unusual disk IO
|
- name: Host unusual disk IO
|
||||||
description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities. Check storage for issues."
|
description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities. Check storage for issues."
|
||||||
|
|
|
||||||
|
|
@ -130,7 +130,7 @@ groups:
|
||||||
description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: HostHighCpuLoad
|
- alert: HostHighCpuLoad
|
||||||
expr: '1 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > .80'
|
expr: '1 - (avg without (cpu) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > .80'
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
|
@ -140,7 +140,7 @@ groups:
|
||||||
|
|
||||||
# You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
|
# You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
|
||||||
- alert: HostCpuIsUnderutilized
|
- alert: HostCpuIsUnderutilized
|
||||||
expr: '(min by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8'
|
expr: '(min without (cpu) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8'
|
||||||
for: 1w
|
for: 1w
|
||||||
labels:
|
labels:
|
||||||
severity: info
|
severity: info
|
||||||
|
|
@ -149,7 +149,7 @@ groups:
|
||||||
description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: HostCpuStealNoisyNeighbor
|
- alert: HostCpuStealNoisyNeighbor
|
||||||
expr: 'avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10'
|
expr: 'avg without (cpu) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10'
|
||||||
for: 0m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
|
@ -158,7 +158,7 @@ groups:
|
||||||
description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: HostCpuHighIowait
|
- alert: HostCpuHighIowait
|
||||||
expr: 'avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10'
|
expr: 'avg without (cpu) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10'
|
||||||
for: 0m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue