Node exporter rules to preserve instance labels (#488)

* Jenkins node offline for clause (#2)
* Convert CPU alert expressions to without() rather than by()
* Remove on() expression from network throughput alerts as labels fully match

---------

Co-authored-by: Simon Matic Langford <simon@longshotsystems.co.uk>
2026-06-21 17:07:24 +08:00 · 2026-01-06 15:24:18 +00:00 · 2026-01-06 15:24:18 +00:00 · f810ff531d
commit f810ff531d
parent 74ba870f05
2 changed files with 12 additions and 12 deletions
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -152,11 +152,11 @@ groups:
                  You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
              - name: Host unusual network throughput in
                description: Host receive bandwidth is high (>80%).
-                query: "((rate(node_network_receive_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80)"
+                query: "((rate(node_network_receive_bytes_total[5m]) / node_network_speed_bytes) > .80)"
                severity: warning
              - name: Host unusual network throughput out
                description: Host transmit bandwidth is high (>80%)
-                query: "((rate(node_network_transmit_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80)"
+                query: "((rate(node_network_transmit_bytes_total[5m]) / node_network_speed_bytes) > .80)"
                severity: warning
              - name: Host unusual disk read rate
                description: Disk is too busy (IO wait > 80%)
@@ -207,23 +207,23 @@ groups:
                for: 2m
              - name: Host high CPU load
                description: CPU load is > 80%
-                query: '1 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > .80'
+                query: '1 - (avg without (cpu) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > .80'
                severity: warning
                for: 10m
              - name: Host CPU is underutilized
                description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs."
-                query: '(min by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8'
+                query: '(min without (cpu) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8'
                severity: info
                for: 1w
                comments: |
                  You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
              - name: Host CPU steal noisy neighbor
                description: CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.
-                query: 'avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10'
+                query: 'avg without (cpu) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10'
                severity: warning
              - name: Host CPU high iowait
                description: CPU iowait > 10%. Your CPU is idling waiting for storage to respond.
-                query: 'avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10'
+                query: 'avg without (cpu) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10'
                severity: warning
              - name: Host unusual disk IO
                description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities. Check storage for issues."
--- a/dist/rules/host-and-hardware/node-exporter.yml
+++ b/dist/rules/host-and-hardware/node-exporter.yml
@@ -34,7 +34,7 @@ groups:
        description: "Node memory usage is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HostUnusualNetworkThroughputIn
-      expr: '((rate(node_network_receive_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80)'
+      expr: '((rate(node_network_receive_bytes_total[5m]) / node_network_speed_bytes) > .80)'
      for: 0m
      labels:
        severity: warning
@@ -43,7 +43,7 @@ groups:
        description: "Host receive bandwidth is high (>80%).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HostUnusualNetworkThroughputOut
-      expr: '((rate(node_network_transmit_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80)'
+      expr: '((rate(node_network_transmit_bytes_total[5m]) / node_network_speed_bytes) > .80)'
      for: 0m
      labels:
        severity: warning
@@ -130,7 +130,7 @@ groups:
        description: "Disk latency is growing (write operations > 100ms)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HostHighCpuLoad
-      expr: '1 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > .80'
+      expr: '1 - (avg without (cpu) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > .80'
      for: 10m
      labels:
        severity: warning
@@ -140,7 +140,7 @@ groups:

    # You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
    - alert: HostCpuIsUnderutilized
-      expr: '(min by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8'
+      expr: '(min without (cpu) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8'
      for: 1w
      labels:
        severity: info
@@ -149,7 +149,7 @@ groups:
        description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HostCpuStealNoisyNeighbor
-      expr: 'avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10'
+      expr: 'avg without (cpu) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10'
      for: 0m
      labels:
        severity: warning
@@ -158,7 +158,7 @@ groups:
        description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HostCpuHighIowait
-      expr: 'avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10'
+      expr: 'avg without (cpu) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10'
      for: 0m
      labels:
        severity: warning