Update rules.yml

This commit is contained in:
Samuel Berthe 2023-06-22 18:40:33 +02:00 committed by GitHub
parent a4dbefd853
commit 47b7748618
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@@ -237,12 +237,12 @@ groups:
severity: warning
for: 5m
- name: Host context switching
description: Context switching is growing on node (> 1000 / s)
query: '((rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 1000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
description: Context switching is growing on the node (> 10000 / s)
query: '((rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
severity: warning
comments: |
1000 context switches is an arbitrary number.
Alert threshold depends on nature of application.
10000 context switches is an arbitrary number.
The alert threshold depends on the nature of the application.
Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
- name: Host swap is filling up
description: Swap is filling up (>80%)
@@ -263,7 +263,7 @@ groups:
query: '(node_hwmon_temp_crit_alarm_celsius == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
severity: critical
- name: Host RAID array got inactive
description: 'RAID array {{ $labels.device }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.'
description: 'RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.'
query: '(node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
severity: critical
- name: Host RAID disk failure