diff --git a/_data/rules.yml b/_data/rules.yml
index 9a08f71..8994d44 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -240,12 +240,15 @@ groups:
                 query: '(rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
                 severity: warning
                 for: 5m
-              - name: Host context switching
-                description: Context switching is growing on the node (> 10000 / CPU / s)
-                query: '((rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+              - name: Host context switching high
+                description: Context switching is growing on the node (twice the daily average during the last 15m)
+                query: |
+                  (rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"}))
+                  /
+                  (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2
                 severity: warning
                 comments: |
-                  10000 context switches is an arbitrary number.
+                  x2 context switches is an arbitrary number.
                   The alert threshold depends on the nature of the application.
                   Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
               - name: Host swap is filling up