Patch summary (free text before the first "diff --git" header).

Hunk 1: the "Host high CPU load" query now sums, per instance, the per-mode
average rate of every non-idle CPU mode and compares that 0-1 fraction with
0.8, instead of computing 100 minus the idle percentage and comparing with 80.
Hunk 2: the "Host CPU high iowait" threshold goes from 5% to 10% in both the
description and the query.

NOTE(review): the YAML indentation inside the hunks was reconstructed (list
item at 14 spaces, keys at 16) - confirm against _data/rules.yml before
applying this patch.

diff --git a/_data/rules.yml b/_data/rules.yml
index f34bf98..4a2501a 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -213,7 +213,7 @@ groups:
                 for: 2m
               - name: Host high CPU load
                 description: CPU load is > 80%
-                query: '100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80'
+                query: 'sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8'
                 severity: warning
               - name: Host CPU is under utilized
                 description: 'CPU load is < 20% for 1 week. Consider reducing the number of CPUs.'
@@ -227,8 +227,8 @@ groups:
                 query: 'avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10'
                 severity: warning
               - name: Host CPU high iowait
-                description: CPU iowait > 5%. A high iowait means that you are disk or network bound.
-                query: 'avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 5'
+                description: CPU iowait > 10%. A high iowait means that you are disk or network bound.
+                query: 'avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10'
                 severity: warning
               - name: Host unusual disk IO
                 description: 'Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.'