Minor bug fixes

This commit is contained in:
Evi Vanoost 2024-02-24 19:30:22 -05:00
parent d6ef8e7449
commit 87ee1292e7

View file

@@ -262,9 +262,9 @@ groups:
description: "Physical node temperature alarm triggered"
query: '(node_hwmon_temp_crit_alarm_celsius == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
severity: critical
- name: Host Software RAID is not active
- name: Host Software RAID insufficient drives
description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} has insufficient drives remaining."
query: '(node_md_disks_required - on(device, instance) node_md_disks{state="active"}) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
query: '((node_md_disks_required - on(device, instance) node_md_disks{state="active"}) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
severity: critical
- name: Host Software RAID disk failure
description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} needs attention."
@@ -273,7 +273,7 @@ groups:
for: 2m
- name: Host kernel version deviations
description: Kernel version for {{ $labels.instance }} has changed
query: 'changes(node_uname_info[1h]) == 0'
query: 'changes(node_uname_info[1h]) > 0'
severity: info
for: 6h
- name: Host OOM kill detected