From 6179475625dca95aeb4c333bfc973806d0a1295d Mon Sep 17 00:00:00 2001 From: Per Lundberg Date: Fri, 30 Jan 2026 09:07:59 +0200 Subject: [PATCH] Adjust OOM kill detected rule When a machine runs out of memory, the node exporter can stop responding for several minutes. This change widens the rule's lookback window from 1m to 30m to take that into account: even if it takes 15-20 minutes before the machine becomes responsive again, the alert should still fire. --- _data/rules.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_data/rules.yml b/_data/rules.yml index 715f181..1bb6ece 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -271,7 +271,7 @@ groups: severity: info - name: Host OOM kill detected description: OOM kill detected - query: "(increase(node_vmstat_oom_kill[1m]) > 0)" + query: "(increase(node_vmstat_oom_kill[30m]) > 0)" severity: warning - name: Host EDAC Correctable Errors detected description: 'Host {{ $labels.instance }} has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'