diff --git a/_data/rules.yml b/_data/rules.yml index 1bb6ece..7f4da58 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -273,6 +273,8 @@ groups: description: OOM kill detected query: "(increase(node_vmstat_oom_kill[30m]) > 0)" severity: warning + comments: | + When a machine runs out of memory, the node exporter can become unresponsive for several minutes. The query uses a 30m window so that the alert still triggers even if the system takes 15–20 minutes to recover. - name: Host EDAC Correctable Errors detected description: 'Host {{ $labels.instance }} has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last 5 minutes.' query: "(increase(node_edac_correctable_errors_total[1m]) > 0)"