diff --git a/_data/rules.yml b/_data/rules.yml index 8cb0963..afd85dd 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -262,9 +262,9 @@ groups: description: "Physical node temperature alarm triggered" query: '(node_hwmon_temp_crit_alarm_celsius == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' severity: critical - - name: Host Software RAID is not active + - name: Host Software RAID insufficient drives description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} has insufficient drives remaining." - query: '(node_md_disks_required - on(device, instance) node_md_disks{state="active"}) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + query: '((node_md_disks_required - on(device, instance) node_md_disks{state="active"}) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' severity: critical - name: Host Software RAID disk failure description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} needs attention." @@ -273,7 +273,7 @@ groups: for: 2m - name: Host kernel version deviations description: Kernel version for {{ $labels.instance }} has changed - query: 'changes(node_uname_info[1h]) == 0' + query: 'changes(node_uname_info[1h]) > 0' severity: info for: 6h - name: Host OOM kill detected