diff --git a/_data/rules.yml b/_data/rules.yml index f4c66c1..ee6a302 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -152,11 +152,11 @@ groups: You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly - name: Host unusual network throughput in description: Host receive bandwidth is high (>80%). - query: "((rate(node_network_receive_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80)" + query: "((rate(node_network_receive_bytes_total[5m]) / node_network_speed_bytes) > .80)" severity: warning - name: Host unusual network throughput out description: Host transmit bandwidth is high (>80%) - query: "((rate(node_network_transmit_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80)" + query: "((rate(node_network_transmit_bytes_total[5m]) / node_network_speed_bytes) > .80)" severity: warning - name: Host unusual disk read rate description: Disk is too busy (IO wait > 80%) @@ -207,23 +207,23 @@ groups: for: 2m - name: Host high CPU load description: CPU load is > 80% - query: '1 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > .80' + query: '1 - (avg without (cpu) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > .80' severity: warning for: 10m - name: Host CPU is underutilized description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs." - query: '(min by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8' + query: '(min without (cpu) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8' severity: info for: 1w comments: | You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly - name: Host CPU steal noisy neighbor description: CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit. - query: 'avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10' + query: 'avg without (cpu) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10' severity: warning - name: Host CPU high iowait description: CPU iowait > 10%. Your CPU is idling waiting for storage to respond. - query: 'avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10' + query: 'avg without (cpu) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10' severity: warning - name: Host unusual disk IO description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities. Check storage for issues." diff --git a/dist/rules/host-and-hardware/node-exporter.yml b/dist/rules/host-and-hardware/node-exporter.yml index 5eef86e..5902305 100644 --- a/dist/rules/host-and-hardware/node-exporter.yml +++ b/dist/rules/host-and-hardware/node-exporter.yml @@ -34,7 +34,7 @@ groups: description: "Node memory usage is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostUnusualNetworkThroughputIn - expr: '((rate(node_network_receive_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80)' + expr: '((rate(node_network_receive_bytes_total[5m]) / node_network_speed_bytes) > .80)' for: 0m labels: severity: warning @@ -43,7 +43,7 @@ groups: description: "Host receive bandwidth is high (>80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostUnusualNetworkThroughputOut - expr: '((rate(node_network_transmit_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80)' + expr: '((rate(node_network_transmit_bytes_total[5m]) / node_network_speed_bytes) > .80)' for: 0m labels: severity: warning @@ -130,7 +130,7 @@ groups: description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostHighCpuLoad - expr: '1 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > .80' + expr: '1 - (avg without (cpu) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > .80' for: 10m labels: severity: warning @@ -140,7 +140,7 @@ groups: # You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly - alert: HostCpuIsUnderutilized - expr: '(min by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8' + expr: '(min without (cpu) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8' for: 1w labels: severity: info @@ -149,7 +149,7 @@ groups: description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostCpuStealNoisyNeighbor - expr: 'avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10' + expr: 'avg without (cpu) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10' for: 0m labels: severity: warning @@ -158,7 +158,7 @@ groups: description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostCpuHighIowait - expr: 'avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10' + expr: 'avg without (cpu) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10' for: 0m labels: severity: warning