awesome-prometheus-alerts/dist/rules/host-and-hardware/node-exporter.yml
2026-04-06 18:38:45 +00:00

335 lines
16 KiB
YAML
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

groups:
- name: NodeExporter
rules:
- alert: HostOutOfMemory
expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < .10)'
for: 2m
labels:
severity: warning
annotations:
summary: Host out of memory (instance {{ $labels.instance }})
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# node_vmstat_pgmajfault is exposed as untyped/gauge by node_exporter (from /proc/vmstat), so deriv() is used instead of rate().
- alert: HostMemoryUnderMemoryPressure
expr: '(deriv(node_vmstat_pgmajfault[5m]) > 1000)'
for: 0m
labels:
severity: warning
annotations:
summary: Host memory under memory pressure (instance {{ $labels.instance }})
description: "The node is under heavy memory pressure. High rate of major page faults ({{ $value }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
- alert: HostMemoryIsUnderutilized
expr: 'min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * .8'
for: 0m
labels:
severity: info
annotations:
summary: Host Memory is underutilized (instance {{ $labels.instance }})
description: "Node memory usage is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualNetworkThroughputIn
expr: '((rate(node_network_receive_bytes_total[5m]) / node_network_speed_bytes) > .80) and node_network_speed_bytes > 0'
for: 0m
labels:
severity: warning
annotations:
summary: Host unusual network throughput in (instance {{ $labels.instance }})
description: "Host receive bandwidth is high (>80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualNetworkThroughputOut
expr: '((rate(node_network_transmit_bytes_total[5m]) / node_network_speed_bytes) > .80) and node_network_speed_bytes > 0'
for: 0m
labels:
severity: warning
annotations:
summary: Host unusual network throughput out (instance {{ $labels.instance }})
description: "Host transmit bandwidth is high (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostDiskIoUtilizationHigh
expr: '(rate(node_disk_io_time_seconds_total[5m]) > .80)'
for: 0m
labels:
severity: warning
annotations:
summary: Host disk IO utilization high (instance {{ $labels.instance }})
description: "Disk utilization is high (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Please add ignored mountpoints in node_exporter parameters like
# "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
# Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
- alert: HostOutOfDiskSpace
expr: '(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .10 and on (instance, device, mountpoint) node_filesystem_readonly == 0)'
for: 2m
labels:
severity: critical
annotations:
summary: Host out of disk space (instance {{ $labels.instance }})
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Please add ignored mountpoints in node_exporter parameters like
# "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
# Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
- alert: HostDiskMayFillIn24Hours
expr: 'predict_linear(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[3h], 86400) <= 0 and node_filesystem_avail_bytes > 0'
for: 2m
labels:
severity: warning
annotations:
summary: Host disk may fill in 24 hours (instance {{ $labels.instance }})
description: "Filesystem will likely run out of space within the next 24 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOutOfInodes
expr: '(node_filesystem_files_free / node_filesystem_files < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) and node_filesystem_files > 0'
for: 2m
labels:
severity: critical
annotations:
summary: Host out of inodes (instance {{ $labels.instance }})
description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostFilesystemDeviceError
expr: 'node_filesystem_device_error{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} == 1'
for: 2m
labels:
severity: critical
annotations:
summary: Host filesystem device error (instance {{ $labels.instance }})
description: "Error stat-ing the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostInodesMayFillIn24Hours
expr: 'predict_linear(node_filesystem_files_free{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_files_free > 0'
for: 2m
labels:
severity: warning
annotations:
summary: Host inodes may fill in 24 hours (instance {{ $labels.instance }})
description: "Filesystem will likely run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskReadLatency
expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0)'
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk read latency (instance {{ $labels.instance }})
description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskWriteLatency
expr: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0)'
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk write latency (instance {{ $labels.instance }})
description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostHighCpuLoad
expr: '1 - (avg without (cpu) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > .80'
for: 10m
labels:
severity: warning
annotations:
summary: Host high CPU load (instance {{ $labels.instance }})
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
- alert: HostCpuIsUnderutilized
expr: '(min without (cpu) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8'
for: 1w
labels:
severity: info
annotations:
summary: Host CPU is underutilized (instance {{ $labels.instance }})
description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostCpuStealNoisyNeighbor
expr: 'avg without (cpu) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10'
for: 0m
labels:
severity: warning
annotations:
summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostCpuHighIowait
expr: 'avg without (cpu) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10'
for: 0m
labels:
severity: warning
annotations:
summary: Host CPU high iowait (instance {{ $labels.instance }})
description: "CPU iowait > 10%. Your CPU is idling waiting for storage to respond.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskIo
expr: 'rate(node_disk_io_time_seconds_total[5m]) > 0.8'
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual disk IO (instance {{ $labels.instance }})
description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# x2 context switches is an arbitrary number.
# The alert threshold depends on the nature of the application.
# Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
- alert: HostContextSwitchingHigh
expr: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2 and rate(node_context_switches_total[1d]) > 0'
for: 0m
labels:
severity: warning
annotations:
summary: Host context switching high (instance {{ $labels.instance }})
description: "Context switching is growing on the node (twice the daily average during the last 15m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostSwapIsFillingUp
expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) and node_memory_SwapTotal_bytes > 0'
for: 2m
labels:
severity: warning
annotations:
summary: Host swap is filling up (instance {{ $labels.instance }})
description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostSystemdServiceCrashed
expr: '(node_systemd_unit_state{state="failed"} == 1)'
for: 0m
labels:
severity: warning
annotations:
summary: Host systemd service crashed (instance {{ $labels.instance }})
description: "systemd service {{ $labels.name }} crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostPhysicalComponentTooHot
expr: 'node_hwmon_temp_celsius > node_hwmon_temp_max_celsius'
for: 5m
labels:
severity: warning
annotations:
summary: Host physical component too hot (instance {{ $labels.instance }})
description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNodeOvertemperatureAlarm
expr: '((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1))'
for: 0m
labels:
severity: critical
annotations:
summary: Host node overtemperature alarm (instance {{ $labels.instance }})
description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Uses ignoring(state) to handle additional labels on node_md_disks.
- alert: HostSoftwareRaidInsufficientDrives
expr: '((node_md_disks_required - ignoring(state) node_md_disks{state="active"}) > 0)'
for: 0m
labels:
severity: critical
annotations:
summary: Host software RAID insufficient drives (instance {{ $labels.instance }})
description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} has insufficient drives remaining.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostSoftwareRaidDiskFailure
expr: '(node_md_disks{state="failed"} > 0)'
for: 2m
labels:
severity: warning
annotations:
summary: Host software RAID disk failure (instance {{ $labels.instance }})
description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} needs attention.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostKernelVersionDeviations
expr: 'changes(node_uname_info[1h]) > 0'
for: 0m
labels:
severity: info
annotations:
summary: Host kernel version deviations (instance {{ $labels.instance }})
description: "Kernel version for {{ $labels.instance }} has changed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# When a machine runs out of memory, the node exporter can become unresponsive for several minutes. Even if the system takes 1520 minutes to recover, the alert should still trigger.
- alert: HostOomKillDetected
expr: '(delta(node_vmstat_oom_kill[30m]) > 0)'
for: 0m
labels:
severity: warning
annotations:
summary: Host OOM kill detected (instance {{ $labels.instance }})
description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostEdacCorrectableErrorsDetected
expr: '(increase(node_edac_correctable_errors_total[1m]) > 0)'
for: 0m
labels:
severity: info
annotations:
summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 1 minute.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostEdacUncorrectableErrorsDetected
expr: '(node_edac_uncorrectable_errors_total > 0)'
for: 0m
labels:
severity: warning
annotations:
summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkReceiveErrors
expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) and rate(node_network_receive_packets_total[2m]) > 0'
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Receive Errors (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkTransmitErrors
expr: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) and rate(node_network_transmit_packets_total[2m]) > 0'
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Transmit Errors (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkBondDegraded
expr: '((node_bonding_active - node_bonding_slaves) != 0)'
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Bond Degraded (instance {{ $labels.instance }})
description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostConntrackLimit
expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) and node_nf_conntrack_entries_limit > 0'
for: 5m
labels:
severity: warning
annotations:
summary: Host conntrack limit (instance {{ $labels.instance }})
description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostClockSkew
expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0))'
for: 10m
labels:
severity: warning
annotations:
summary: Host clock skew (instance {{ $labels.instance }})
description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostClockNotSynchronising
expr: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16)'
for: 2m
labels:
severity: warning
annotations:
summary: Host clock not synchronising (instance {{ $labels.instance }})
description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"