diff --git a/_data/rules.yml b/_data/rules.yml
index 715f181..7f4da58 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -271,8 +271,10 @@ groups:
                 severity: info
               - name: Host OOM kill detected
                 description: OOM kill detected
-                query: "(increase(node_vmstat_oom_kill[1m]) > 0)"
+                query: "(increase(node_vmstat_oom_kill[30m]) > 0)"
                 severity: warning
+                comments: |
+                  When a machine runs out of memory, the node exporter can become unresponsive for several minutes. Even if the system takes 15–20 minutes to recover, the alert should still trigger.
               - name: Host EDAC Correctable Errors detected
                 description: 'Host {{ $labels.instance }} has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'
                 query: "(increase(node_edac_correctable_errors_total[1m]) > 0)"