mirror of
https://github.com/samber/awesome-prometheus-alerts.git
synced 2026-06-23 18:06:58 +08:00
Update rules.yml
This commit is contained in:
parent
a4dbefd853
commit
47b7748618
1 changed file with 5 additions and 5 deletions
|
|
@@ -237,12 +237,12 @@ groups:
|
||||||
severity: warning
|
severity: warning
|
||||||
for: 5m
|
for: 5m
|
||||||
- name: Host context switching
|
- name: Host context switching
|
||||||
description: Context switching is growing on node (> 1000 / s)
|
description: Context switching is growing on the node (> 10000 / s)
|
||||||
query: '((rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 1000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
query: '((rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||||
severity: warning
|
severity: warning
|
||||||
comments: |
|
comments: |
|
||||||
1000 context switches is an arbitrary number.
|
10000 context switches is an arbitrary number.
|
||||||
Alert threshold depends on nature of application.
|
The alert threshold depends on the nature of the application.
|
||||||
Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
|
Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
|
||||||
- name: Host swap is filling up
|
- name: Host swap is filling up
|
||||||
description: Swap is filling up (>80%)
|
description: Swap is filling up (>80%)
|
||||||
|
|
@@ -263,7 +263,7 @@ groups:
|
||||||
query: '(node_hwmon_temp_crit_alarm_celsius == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
query: '(node_hwmon_temp_crit_alarm_celsius == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||||
severity: critical
|
severity: critical
|
||||||
- name: Host RAID array got inactive
|
- name: Host RAID array got inactive
|
||||||
description: 'RAID array {{ $labels.device }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.'
|
description: 'RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.'
|
||||||
query: '(node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
query: '(node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||||
severity: critical
|
severity: critical
|
||||||
- name: Host RAID disk failure
|
- name: Host RAID disk failure
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue